imw 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. data/.gitignore +4 -1
  2. data/Rakefile +10 -0
  3. data/TODO +18 -0
  4. data/VERSION +1 -1
  5. data/bin/imw +1 -1
  6. data/etc/imwrc.rb +0 -50
  7. data/examples/dataset.rb +12 -0
  8. data/lib/imw/boot.rb +55 -9
  9. data/lib/imw/dataset/paths.rb +15 -24
  10. data/lib/imw/dataset/workflow.rb +131 -72
  11. data/lib/imw/dataset.rb +94 -186
  12. data/lib/imw/parsers/html_parser.rb +1 -1
  13. data/lib/imw/parsers.rb +1 -1
  14. data/lib/imw/repository.rb +3 -27
  15. data/lib/imw/resource.rb +190 -0
  16. data/lib/imw/resources/archive.rb +97 -0
  17. data/lib/imw/resources/archives_and_compressed/bz2.rb +18 -0
  18. data/lib/imw/resources/archives_and_compressed/gz.rb +18 -0
  19. data/lib/imw/resources/archives_and_compressed/rar.rb +23 -0
  20. data/lib/imw/resources/archives_and_compressed/tar.rb +23 -0
  21. data/lib/imw/resources/archives_and_compressed/tarbz2.rb +78 -0
  22. data/lib/imw/resources/archives_and_compressed/targz.rb +78 -0
  23. data/lib/imw/resources/archives_and_compressed/zip.rb +57 -0
  24. data/lib/imw/resources/archives_and_compressed.rb +32 -0
  25. data/lib/imw/resources/compressed_file.rb +89 -0
  26. data/lib/imw/resources/compressible.rb +77 -0
  27. data/lib/imw/resources/formats/delimited.rb +92 -0
  28. data/lib/imw/resources/formats/excel.rb +125 -0
  29. data/lib/imw/resources/formats/json.rb +53 -0
  30. data/lib/imw/resources/formats/sgml.rb +72 -0
  31. data/lib/imw/resources/formats/yaml.rb +53 -0
  32. data/lib/imw/resources/formats.rb +32 -0
  33. data/lib/imw/resources/local.rb +198 -0
  34. data/lib/imw/resources/remote.rb +110 -0
  35. data/lib/imw/resources/schemes/hdfs.rb +242 -0
  36. data/lib/imw/resources/schemes/http.rb +161 -0
  37. data/lib/imw/resources/schemes/s3.rb +137 -0
  38. data/lib/imw/resources/schemes.rb +19 -0
  39. data/lib/imw/resources.rb +118 -0
  40. data/lib/imw/runner.rb +5 -4
  41. data/lib/imw/transforms/archiver.rb +215 -0
  42. data/lib/imw/transforms/transferer.rb +103 -0
  43. data/lib/imw/transforms.rb +8 -0
  44. data/lib/imw/utils/error.rb +26 -30
  45. data/lib/imw/utils/extensions/array.rb +5 -15
  46. data/lib/imw/utils/extensions/hash.rb +6 -16
  47. data/lib/imw/utils/extensions/hpricot.rb +0 -14
  48. data/lib/imw/utils/extensions/string.rb +5 -15
  49. data/lib/imw/utils/extensions/symbol.rb +0 -13
  50. data/lib/imw/utils/extensions.rb +65 -0
  51. data/lib/imw/utils/log.rb +14 -13
  52. data/lib/imw/utils/misc.rb +0 -6
  53. data/lib/imw/utils/paths.rb +101 -42
  54. data/lib/imw/utils/version.rb +8 -9
  55. data/lib/imw/utils.rb +2 -18
  56. data/lib/imw.rb +92 -17
  57. data/spec/data/sample.csv +1 -1
  58. data/spec/data/sample.json +1 -0
  59. data/spec/data/sample.tsv +1 -1
  60. data/spec/data/sample.txt +1 -1
  61. data/spec/data/sample.xml +1 -1
  62. data/spec/data/sample.yaml +1 -1
  63. data/spec/imw/dataset/paths_spec.rb +32 -0
  64. data/spec/imw/dataset/workflow_spec.rb +41 -0
  65. data/spec/imw/resource_spec.rb +79 -0
  66. data/spec/imw/resources/archive_spec.rb +69 -0
  67. data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +15 -0
  68. data/spec/imw/resources/archives_and_compressed/gz_spec.rb +15 -0
  69. data/spec/imw/resources/archives_and_compressed/rar_spec.rb +16 -0
  70. data/spec/imw/resources/archives_and_compressed/tar_spec.rb +16 -0
  71. data/spec/imw/resources/archives_and_compressed/tarbz2_spec.rb +24 -0
  72. data/spec/imw/resources/archives_and_compressed/targz_spec.rb +21 -0
  73. data/spec/imw/resources/archives_and_compressed/zip_spec.rb +16 -0
  74. data/spec/imw/resources/compressed_file_spec.rb +48 -0
  75. data/spec/imw/resources/compressible_spec.rb +36 -0
  76. data/spec/imw/resources/formats/delimited_spec.rb +33 -0
  77. data/spec/imw/resources/formats/json_spec.rb +32 -0
  78. data/spec/imw/resources/formats/sgml_spec.rb +24 -0
  79. data/spec/imw/resources/formats/yaml_spec.rb +41 -0
  80. data/spec/imw/resources/local_spec.rb +98 -0
  81. data/spec/imw/resources/remote_spec.rb +35 -0
  82. data/spec/imw/resources/schemes/hdfs_spec.rb +61 -0
  83. data/spec/imw/resources/schemes/http_spec.rb +19 -0
  84. data/spec/imw/resources/schemes/s3_spec.rb +19 -0
  85. data/spec/imw/transforms/archiver_spec.rb +120 -0
  86. data/spec/imw/transforms/transferer_spec.rb +113 -0
  87. data/spec/imw/utils/paths_spec.rb +5 -33
  88. data/spec/imw/utils/shared_paths_spec.rb +29 -0
  89. data/spec/spec_helper.rb +5 -5
  90. data/spec/support/paths_matcher.rb +67 -0
  91. data/spec/support/random.rb +39 -36
  92. metadata +88 -75
  93. data/lib/imw/dataset/task.rb +0 -41
  94. data/lib/imw/files/archive.rb +0 -113
  95. data/lib/imw/files/basicfile.rb +0 -122
  96. data/lib/imw/files/binary.rb +0 -28
  97. data/lib/imw/files/compressed_file.rb +0 -93
  98. data/lib/imw/files/compressed_files_and_archives.rb +0 -334
  99. data/lib/imw/files/compressible.rb +0 -103
  100. data/lib/imw/files/csv.rb +0 -113
  101. data/lib/imw/files/directory.rb +0 -62
  102. data/lib/imw/files/excel.rb +0 -84
  103. data/lib/imw/files/json.rb +0 -41
  104. data/lib/imw/files/sgml.rb +0 -46
  105. data/lib/imw/files/text.rb +0 -68
  106. data/lib/imw/files/yaml.rb +0 -46
  107. data/lib/imw/files.rb +0 -125
  108. data/lib/imw/packagers/archiver.rb +0 -126
  109. data/lib/imw/packagers/s3_mover.rb +0 -36
  110. data/lib/imw/packagers.rb +0 -8
  111. data/lib/imw/utils/components.rb +0 -61
  112. data/lib/imw/utils/config.rb +0 -46
  113. data/lib/imw/utils/extensions/class/attribute_accessors.rb +0 -8
  114. data/lib/imw/utils/extensions/core.rb +0 -27
  115. data/lib/imw/utils/extensions/dir.rb +0 -24
  116. data/lib/imw/utils/extensions/file_core.rb +0 -64
  117. data/lib/imw/utils/extensions/typed_struct.rb +0 -22
  118. data/lib/imw/utils/extensions/uri.rb +0 -59
  119. data/lib/imw/utils/view/dump_csv.rb +0 -112
  120. data/lib/imw/utils/view/dump_csv_older.rb +0 -117
  121. data/lib/imw/utils/view.rb +0 -113
  122. data/spec/imw/dataset/datamapper/uri_spec.rb +0 -43
  123. data/spec/imw/dataset/datamapper_spec_helper.rb +0 -11
  124. data/spec/imw/files/archive_spec.rb +0 -118
  125. data/spec/imw/files/basicfile_spec.rb +0 -121
  126. data/spec/imw/files/bz2_spec.rb +0 -32
  127. data/spec/imw/files/compressed_file_spec.rb +0 -96
  128. data/spec/imw/files/compressible_spec.rb +0 -100
  129. data/spec/imw/files/file_spec.rb +0 -144
  130. data/spec/imw/files/gz_spec.rb +0 -32
  131. data/spec/imw/files/rar_spec.rb +0 -33
  132. data/spec/imw/files/tar_spec.rb +0 -31
  133. data/spec/imw/files/text_spec.rb +0 -23
  134. data/spec/imw/files/zip_spec.rb +0 -31
  135. data/spec/imw/files_spec.rb +0 -38
  136. data/spec/imw/packagers/archiver_spec.rb +0 -125
  137. data/spec/imw/packagers/s3_mover_spec.rb +0 -7
  138. data/spec/imw/utils/extensions/file_core_spec.rb +0 -72
  139. data/spec/imw/utils/extensions/find_spec.rb +0 -113
  140. data/spec/imw/workflow/rip/local_spec.rb +0 -89
  141. data/spec/imw/workflow/rip_spec.rb +0 -27
  142. data/spec/support/archive_contents_matcher.rb +0 -94
  143. data/spec/support/directory_contents_matcher.rb +0 -61
@@ -1,334 +0,0 @@
1
- module IMW
2
- module Files
3
-
4
- # A class to wrap a +tar+ archive.
5
- #
6
- # Creation, appending, listing, and extraction flags are stored in
7
- # <tt>IMW::Files::Tar::DEFAULT_FLAGS</tt> and all are passed to
8
- # the <tt>:tar</tt> program in <tt>IMW::EXTERNAL_PROGRAMS</tt>.
9
- class Tar
10
-
11
- include IMW::Files::BasicFile
12
- include IMW::Files::Archive
13
- include IMW::Files::Compressible
14
-
15
- # The default flags used creating, appending to, listing, and
16
- # extracting a tar archive.
17
- DEFAULT_FLAGS = {
18
- :create => "-cf",
19
- :append => "-rf",
20
- :list => "-tf",
21
- :extract => "-xf",
22
- :program => :tar
23
- }
24
-
25
- def initialize uri, *args
26
- self.uri= uri
27
- @archive = {
28
- :program => DEFAULT_FLAGS[:program],
29
- :create_flags => DEFAULT_FLAGS[:create],
30
- :append_flags => DEFAULT_FLAGS[:append],
31
- :list_flags => DEFAULT_FLAGS[:list],
32
- :extract_flags => DEFAULT_FLAGS[:extract]
33
- }
34
- end
35
- end # Tar
36
-
37
- # A class to wrap a <tt>tar.gz</tt> archive.
38
- #
39
- # Creation, appending, listing, and extraction flags are stored in
40
- # <tt>IMW::Files::Targz::DEFAULT_FLAGS</tt> and all are passed to
41
- # the <tt>:tar</tt> program in <tt>IMW::EXTERNAL_PROGRAMS</tt>.
42
- class Targz
43
-
44
- include IMW::Files::BasicFile
45
- include IMW::Files::Archive
46
- include IMW::Files::CompressedFile
47
-
48
- # The default flags used creating, appending to, listing, and
49
- # extracting a <tt>tar.gz</tt> archive.
50
- DEFAULT_FLAGS = {
51
- :decompression_program => :gzip,
52
- :decompression_flags => '-fd',
53
- :archive_program => :tar,
54
- :archive_list_flags => "-tf",
55
- :archive_extract_flags => "-xzf"
56
- }
57
-
58
- def initialize uri, *args
59
- self.uri= uri
60
- @compression = {
61
- :program => DEFAULT_FLAGS[:decompression_program],
62
- :decompression_flags => DEFAULT_FLAGS[:decompression_flags]
63
- }
64
- @archive = {
65
- :program => DEFAULT_FLAGS[:archive_program],
66
- :list_flags => DEFAULT_FLAGS[:archive_list_flags],
67
- :extract_flags => DEFAULT_FLAGS[:archive_extract_flags]
68
- }
69
- end
70
-
71
- # Returns the path of the file after decompression.
72
- def decompressed_path
73
- if /\.tar\.gz$/.match @path then
74
- @path.gsub /\.tar\.gz$/, ".tar"
75
- elsif /\.tgz$/.match @path then
76
- @path.gsub /\.tgz$/, ".tar"
77
- end
78
- end
79
-
80
- def self.extname path
81
- if /\.tar\.gz$/.match path then
82
- ".tar.gz"
83
- elsif /\.tgz$/.match path then
84
- ".tgz"
85
- end
86
- end
87
-
88
- end # Targz
89
-
90
- # A class to wrap a <tt>tar.bz2</tt> archive.
91
- #
92
- # Creation, appending, listing, and extraction flags are stored in
93
- # <tt>IMW::Files::Tarbz2::DEFAULT_FLAGS</tt> and all are passed to
94
- # the <tt>:tar</tt> program in <tt>IMW::EXTERNAL_PROGRAMS</tt>.
95
- class Tarbz2
96
-
97
- include IMW::Files::BasicFile
98
- include IMW::Files::Archive
99
- include IMW::Files::CompressedFile
100
-
101
- # The default flags used creating, appending to, listing, and
102
- # extracting a <tt>tar.bz2</tt> archive.
103
- DEFAULT_FLAGS = {
104
- :decompression_program => :bzip2,
105
- :decompression_flags => '-fd',
106
- :archive_program => :tar,
107
- :archive_create_flags => '-cf',
108
- :archive_list_flags => "-tf",
109
- :archive_extract_flags => "-xjf"
110
- }
111
-
112
- def self.extname path
113
- if /\.tar\.bz2$/.match path then
114
- ".tar.bz2"
115
- elsif /\.tbz2$/.match path then
116
- ".tbz2"
117
- end
118
- end
119
-
120
- def initialize uri, *args
121
- self.uri= uri
122
- @compression = {
123
- :program => DEFAULT_FLAGS[:decompression_program],
124
- :decompression_flags => DEFAULT_FLAGS[:decompression]
125
- }
126
- @archive = {
127
- :program => DEFAULT_FLAGS[:archive_program],
128
- :list_flags => DEFAULT_FLAGS[:archive_list_flags],
129
- :extract_flags => DEFAULT_FLAGS[:archive_extract_flags],
130
- :create_flags => DEFAULT_FLAGS[:archive_create_flags]
131
- }
132
- end
133
-
134
- # Returns the path of the file after decompression.
135
- def decompressed_path
136
- if /\.tar\.bz2$/.match @path then
137
- @path.gsub /\.tar\.bz2$/, ".tar"
138
- elsif /\.tbz2$/.match @path then
139
- @path.gsub /\.tbz2$/, ".tar"
140
- end
141
- end
142
-
143
- # Overrides default behavior of IMW::Files::Archive#create to
144
- # compress files after creating them.
145
- def create paths, opts={}
146
- opts = opts.reverse_merge({:force => false})
147
- raise IMW::Error.new("An archive already exists at #{@path}.") if exist? and not opts[:force]
148
- paths = [paths] if paths.class == String
149
- IMW.system IMW::EXTERNAL_PROGRAMS[@archive[:program]], @archive[:create_flags], path_between_archive_and_compression, *paths
150
- IMW.open(path_between_archive_and_compression).compress!(:bzip2)
151
- end
152
-
153
- protected
154
- def path_between_archive_and_compression
155
- File.join(dirname,name + '.tar')
156
- end
157
-
158
- end # Tarbz2
159
-
160
- # A class to wrap a +rar+ archive.
161
- #
162
- # Creation, appending, listing, and extraction flags are stored in
163
- # <tt>IMW::Files::Rar::DEFAULT_FLAGS</tt> and all are passed to
164
- # the <tt>:rar</tt> program in <tt>IMW::EXTERNAL_PROGRAMS</tt>.
165
- class Rar
166
-
167
- include IMW::Files::BasicFile
168
- include IMW::Files::Archive
169
-
170
- # The default flags used creating, appending to, listing, and
171
- # extracting a rar archive.
172
- DEFAULT_FLAGS = {
173
- :create => "a -r -o+ -inul",
174
- :append => "a -r -o+ -inul",
175
- :list => "vb",
176
- :extract => "x -o+ -inul"
177
- }
178
-
179
- def initialize uri, *args
180
- self.uri= uri
181
- @archive = {
182
- :program => :rar,
183
- :create_flags => DEFAULT_FLAGS[:create],
184
- :append_flags => DEFAULT_FLAGS[:append],
185
- :list_flags => DEFAULT_FLAGS[:list],
186
- :extract_flags => DEFAULT_FLAGS[:extract]
187
- }
188
- end
189
- end # Rar
190
-
191
- # A class to wrap a +zip+ archive.
192
- #
193
- # Creation, appending, listing, and extraction flags are stored in
194
- # <tt>IMW::Files::Zip::DEFAULT_FLAGS</tt> and all are passed to
195
- # the <tt>:zip</tt> and <tt>:unzip</tt> programs in
196
- # <tt>IMW::EXTERNAL_PROGRAMS</tt>.
197
- class Zip
198
-
199
- include IMW::Files::BasicFile
200
- include IMW::Files::Archive
201
-
202
- # The default flags used creating, appending to, listing, and
203
- # extracting a zip archive.
204
- DEFAULT_FLAGS = {
205
- :create => "-q -r",
206
- :append => "-q -g",
207
- :list => "-l",
208
- :extract => "-q -o",
209
- :unarchiving_program => :unzip
210
- }
211
-
212
- def initialize uri, *args
213
- self.uri= uri
214
- @archive = {
215
- :program => :zip,
216
- :create_flags => DEFAULT_FLAGS[:create],
217
- :append_flags => DEFAULT_FLAGS[:append],
218
- :list_flags => DEFAULT_FLAGS[:list],
219
- :extract_flags => DEFAULT_FLAGS[:extract],
220
- :unarchiving_program => DEFAULT_FLAGS[:unarchiving_program]
221
- }
222
- end
223
-
224
- # The `unzip' program outputs data in a very annoying format:
225
- #
226
- # Archive: data.zip
227
- # Length Date Time Name
228
- # -------- ---- ---- ----
229
- # 18510 07-28-08 15:58 data/4d7Qrgz7.csv
230
- # 3418 07-28-08 15:41 data/7S.csv
231
- # 23353 07-28-08 15:41 data/g.csv
232
- # 711 07-28-08 15:58 data/g.xml
233
- # 1095 07-28-08 15:41 data/L.xml
234
- # 2399 07-28-08 15:58 data/mTAu9H3.xml
235
- # 152 07-28-08 15:58 data/vaHBS2t5R.dat
236
- # -------- -------
237
- # 49638 7 files
238
- #
239
- # which is parsed by this method.
240
- def archive_contents_string_to_array string
241
- rows = string.split("\n")
242
- # ignore the first 3 lines of the output and also discard the
243
- # last 2 (5 = 2 + 3)
244
- file_rows = rows[3,(rows.length - 5)]
245
- file_rows.map! do |row|
246
- # discard extra whitespace before and after the main text
247
- row.lstrip!.rstrip!
248
- # split the remaining text at spaces...columns beyond the
249
- # third are part of the filename and should be joined with a
250
- # space again in case of a filename with a space
251
- row.split(' ')[3,row.size].join(' ')
252
- end
253
- file_rows
254
- end
255
- end # Zip
256
-
257
- # A class to wrap a <tt>gz</tt> compressed file.
258
- #
259
- # The decompressing flags are stored in
260
- # <tt>IMW::Files::Gz::DEFAULT_FLAGS</tt> and all are passed to the
261
- # <tt>:gzip</tt> program in <tt>IMW::EXTERNAL_PROGRAMS</tt>.
262
- class Gz
263
-
264
- include IMW::Files::BasicFile
265
- include IMW::Files::CompressedFile
266
-
267
- # The default flags used in extracting a <tt>gz</tt> file.
268
- DEFAULT_FLAGS = {
269
- :program => :gzip,
270
- :decompression => '-fd'
271
- }
272
-
273
- def initialize uri, *args
274
- self.uri= uri
275
- @compression = {
276
- :program => DEFAULT_FLAGS[:program],
277
- :decompression_flags => DEFAULT_FLAGS[:decompression]
278
- }
279
- end
280
-
281
- def decompressed_path
282
- @path.gsub /\.gz$/, ""
283
- end
284
- end # Gz
285
-
286
- # A class to wrap a <tt>bz2</tt> compressed file.
287
- #
288
- # The decompressing flags are stored in
289
- # <tt>IMW::Files::Bz2::DEFAULT_FLAGS</tt> and all are passed to
290
- # the <tt>:bzip2</tt> program in <tt>IMW::EXTERNAL_PROGRAMS</tt>.
291
- class Bz2
292
-
293
- include IMW::Files::BasicFile
294
- include IMW::Files::CompressedFile
295
-
296
- # The default flags used in extracting a <tt>bz2</tt> file.
297
- DEFAULT_FLAGS = {
298
- :program => :bzip2,
299
- :decompression => '-fd'
300
- }
301
-
302
- def initialize uri, *args
303
- self.uri= uri
304
- raise IMW::Error.new("#{@extname} is not a valid extension for a bzip2 compressed file.") unless @extname == '.bz2'
305
- @compression = {
306
- :program => DEFAULT_FLAGS[:program],
307
- :decompression_flags => DEFAULT_FLAGS[:decompression]
308
- }
309
- end
310
-
311
- # Returns the path of the file after decompression.
312
- def decompressed_path
313
- @path.gsub /\.bz2$/, ""
314
- end
315
- end # Bz2
316
-
317
-
318
- # make sure that tar.bz2 precedes bz2 and so on...
319
- FILE_REGEXPS << [/\.tar\.bz2$/, IMW::Files::Tarbz2]
320
- FILE_REGEXPS << [/\.tbz2$/, IMW::Files::Tarbz2]
321
-
322
- FILE_REGEXPS << [/\.tar\.gz$/, IMW::Files::Targz]
323
- FILE_REGEXPS << [/\.tgz$/, IMW::Files::Targz]
324
-
325
- FILE_REGEXPS << [/\.tar$/, IMW::Files::Tar]
326
- FILE_REGEXPS << [/\.bz2$/, IMW::Files::Bz2]
327
- FILE_REGEXPS << [/\.gz$/, IMW::Files::Gz]
328
- FILE_REGEXPS << [/\.rar$/, IMW::Files::Rar]
329
- FILE_REGEXPS << [/\.zip$/, IMW::Files::Zip]
330
-
331
- end # Files
332
- end # IMW
333
-
334
-
@@ -1,103 +0,0 @@
1
- #
2
- # h2. lib/imw/files/compressible.rb -- compression module
3
- #
4
- # == About
5
- #
6
- # Module used for compression of files. An including
7
- # <tt>IMW::Files::BasicFile</tt> object gains +compress+ and
8
- # <tt>compress!</tt> methods.
9
- #
10
- # By default, bzip2 is used for compression though gzip can also be
11
- # specified (the full list of known compression programs is in
12
- # <tt>IMW::Files::Compressible::COMPRESSION_PROGS</tt>). Zip and Rar
13
- # compression are handled by the <tt>IMW::Files::Archive</tt> module.
14
- #
15
- # Decompression should be handled via the
16
- # <tt>IMW::Files::CompressedFile</tt> class.
17
- #
18
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
19
- # Copyright:: Copyright (c) 2008 infochimps.org
20
- # License:: GPL 3.0
21
- # Website:: http://infinitemonkeywrench.org/
22
- #
23
- # puts "#{File.basename(__FILE__)}: Why is it that when you squeeze a lemon you get lemonade but when you squeeze a banana you just get a mess?" # at bottom
24
- module IMW
25
- module Files
26
- module Compressible
27
-
28
- # Known compression programs.
29
- COMPRESSION_PROGS = [:bzip2, :gzip]
30
-
31
- # Extensions that are appended by each compression program.
32
- COMPRESSION_EXTS = {
33
- :bzip2 => '.bz2',
34
- :gzip => '.gz'
35
- }
36
-
37
- # Compression flags for each program
38
- COMPRESSION_FLAGS = {
39
- :bzip2 => "-f",
40
- :gzip => "-f"
41
- }
42
-
43
- protected
44
- # Check that +program+ is a valid compression program.
45
- def ensure_valid_compression_program program
46
- raise IMW::Error.new("#{program} is not a valid compression program (#{COMPRESSION_PROGS.join(' ,')}).") unless COMPRESSION_PROGS.include? program
47
- end
48
-
49
- # Construct the command passed to the shell to compress this
50
- # file using the given +program+.
51
- def compression_command program
52
- ensure_valid_compression_program program
53
- [IMW::EXTERNAL_PROGRAMS[program],COMPRESSION_FLAGS[program],self.path].join ' '
54
- end
55
-
56
- # Return the path this file will have after being compressed with
57
- # +program+.
58
- def compressed_file_path program
59
- ensure_valid_compression_program program
60
- path = File.join(self.dirname,self.basename + COMPRESSION_EXTS[program])
61
- end
62
-
63
- public
64
- # Compress this file in its present directory using +program+,
65
- # overwriting any existing compressed files and without saving
66
- # the original file. Returns an
67
- # <tt>IMW::Files::CompressedFile</tt> object corresponding to
68
- # the compressed file.
69
- #
70
- # Options:
71
- #
72
- # <tt>:program</tt> (<tt>:bzip2</tt>):: names the compression
73
- # program from the choices in <tt>IMW::EXTERNAL_PROGRAMS</tt>.
74
- def compress! program = :bzip2
75
- raise IMW::PathError.new("cannot compress #{@path}, doesn't exist!") unless exist?
76
- FileUtils.cd(@dirname) { IMW.system(self.compression_command(program)) }
77
- IMW.open(self.compressed_file_path(program))
78
- end
79
-
80
- # Compress this file in its present directory, overwriting any
81
- # existing compressed files while keeping the original file.
82
- # Returns an <tt>IMW::Files::CompressedFile</tt> object
83
- # corresponding to the compressed file.
84
- #
85
- # Options:
86
- #
87
- # <tt>:program</tt> (<tt>:bzip2</tt>):: names the compression
88
- # program from the choices in <tt>IMW::EXTERNAL_PROGRAMS</tt>.
89
- def compress program = :bzip2
90
- raise IMW::PathError.new("cannot compress #{@path}, doesn't exist!") unless exist?
91
- begin
92
- FileUtils.cp(self.path,self.path + 'copy')
93
- compress! program
94
- ensure
95
- FileUtils.mv(self.path + 'copy',self.path)
96
- end
97
- IMW.open(self.compressed_file_path(program))
98
- end
99
-
100
- end
101
- end
102
- end
103
-
data/lib/imw/files/csv.rb DELETED
@@ -1,113 +0,0 @@
1
- #
2
- # h2. lib/imw/files/csv.rb -- CSV, TSV files
3
- #
4
- # == About
5
- #
6
- # For "comma-separated value" (CSV) and "tab-separated value" (TSV)
7
- # files.
8
- #
9
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
10
- # Copyright:: Copyright (c) 2008 infochimps.org
11
- # License:: GPL 3.0
12
- # Website:: http://infinitemonkeywrench.org/
13
- #
14
- # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
15
-
16
- require 'fastercsv'
17
- module IMW
18
- module Files
19
-
20
- # A base class from which to subclass various types of tabular
21
- # data files (CSV, TSV, &c.)
22
- class TabularDataFile < FasterCSV
23
-
24
- include IMW::Files::BasicFile
25
- include IMW::Files::Compressible
26
-
27
- # Default options to be passed to
28
- # FasterCSV[http://fastercsv.rubyforge.org/]; see its
29
- # documentation for more information.
30
- DEFAULT_OPTIONS = {
31
- :col_sep => ',',
32
- :headers => false,
33
- :return_headers => false,
34
- :write_headers => true,
35
- :skip_blanks => false,
36
- :force_quotes => false
37
- }
38
-
39
- def initialize uri, mode='r', options = {}
40
- options.reverse_merge!(self.class::DEFAULT_OPTIONS)
41
- self.uri= uri
42
- options.delete(:write) # FasterCSV complains about unknown options
43
- super open(uri,mode), options
44
- end
45
-
46
- # Return the contents of this CSV file as an array of arrays.
47
- def load
48
- entries
49
- end
50
-
51
- # Dump +data+ to this file.
52
- #
53
- # Options include:
54
- # <tt>:flush</tt> (true):: flush the file buffer, writing it to disk
55
- # <tt>:close</tt> (true):: close the file after writing +data+
56
- def dump data, options = {}
57
- options = options.reverse_merge :close => true, :flush => true
58
- data.each {|row| self << row}
59
- self.flush if options[:flush]
60
- self.close if options[:close]
61
- self
62
- end
63
-
64
- # Return a random sample of rows.
65
- def sample length=10
66
- rows, indices = [], Set.new
67
- begin
68
- each_with_index do |row, index|
69
- break if rows.size == length
70
- next if index != 0 && rand < 0.75 # skip 3/4 of rows after the 1st
71
- rows << row
72
- indices << index
73
- end
74
- # now fill up to length if not there already
75
- while rows.length < length
76
- each_with_index do |row, index|
77
- break if rows.size == length
78
- next if index indices.include?(index)
79
- rows << row
80
- end
81
- end
82
- rows
83
- rescue FasterCSV::MalformedCSVError
84
- rows
85
- end
86
- end
87
- end
88
-
89
-
90
- # Represents a file of comma-separated values (CSV). This class
91
- # is a subclass of <tt>FasterCSV</tt> so the methods of that
92
- # library are available for use.
93
- #
94
- # See <tt>IMW::Files::TabularDataFile</tt> for more complete
95
- # documentation.
96
- class Csv < TabularDataFile
97
- end
98
-
99
- # Represents a file of tab-separated values (TSV). This class
100
- # is a subclass of <tt>FasterCSV</tt> so the methods of that
101
- # library are available for use.
102
- #
103
- # See <tt>IMW::Files::TabularDataFile</tt> for more complete
104
- # documentation.
105
- class Tsv < TabularDataFile
106
- DEFAULT_OPTIONS = {:col_sep => "\t"}.reverse_merge DEFAULT_OPTIONS
107
- end
108
-
109
- FILE_REGEXPS << [/\.csv$/, IMW::Files::Csv]
110
- FILE_REGEXPS << [/\.tsv$/, IMW::Files::Tsv]
111
-
112
- end
113
- end
@@ -1,62 +0,0 @@
1
- require 'imw/files/basicfile'
2
- module IMW
3
- module Files
4
- class Directory
5
-
6
- include IMW::Files::BasicFile
7
-
8
- # FIXME these should be defined by BasicFile and then removed here but I don't see how...
9
- # [:executable?, :executable_real?, :pipe?, :socket?, :rm, :rm!, :extname, :extname=, :name, :name=].each do |method|
10
- # instance_eval do
11
- # remove_method method
12
- # end
13
- # end
14
-
15
- def uri= uri
16
- @uri = uri.is_a?(URI::Generic) ? uri : URI.parse(uri)
17
- @host = self.uri.host
18
- @path = local? ? ::File.expand_path(self.uri.path) : self.uri.path
19
- @dirname = ::File.dirname path
20
- @basename = ::File.basename path
21
- end
22
-
23
- def initialize uri
24
- self.uri = uri
25
- end
26
-
27
- def [] selector='*'
28
- Dir[File.join(path, selector)] if local?
29
- end
30
-
31
- # Copy the contents of this directory to +new_dir+.
32
- def cp new_dir
33
- raise IMW::PathError.new("cannot copy from #{path}, doesn't exist!") unless exist?
34
- if local?
35
- FileUtils.cp_r path, new_dir
36
- else
37
- raise IMW::PathError.new("cannot recursively copy remote directories (yet!)")
38
- end
39
- self.class.new(new_dir)
40
- end
41
-
42
- # Move this directory to +new_dir+.
43
- def mv new_dir
44
- raise IMW::PathError.new("cannot move from #{path}, doesn't exist!") unless exist?
45
- if local?
46
- FileUtils.mv path, new_dir
47
- else
48
- raise IMW::PathError.new("cannot move remote directories (yet!)")
49
- end
50
- self.class.new(new_dir)
51
- end
52
- alias_method :mv!, :mv
53
-
54
- # Move this directory so it sits beneath +dir+.
55
- def mv_to_dir dir
56
- mv File.join(File.expand_path(dir),basename)
57
- end
58
- alias_method :mv_to_dir!, :mv_to_dir
59
-
60
- end
61
- end
62
- end
@@ -1,84 +0,0 @@
1
- require 'spreadsheet'
2
-
3
- # FIXME Main issue with this:
4
- # You can make a new excel book and dump data to it no problem.
5
- # However, something that doesn't seem to work is dumping to a file, opening,
6
- # and dumping to it again. At the moment this is probably not a big deal.
7
-
8
-
9
- module IMW
10
- module Files
11
- class Excel
12
- include IMW::Files::BasicFile
13
- include IMW::Files::Compressible
14
-
15
- #need to initialize, load, and dump
16
- attr_accessor :book,:idx, :max_lines, :sht_idx, :sht_row, :book_idx
17
- def initialize uri, mode, options={}
18
- self.uri = uri
19
- @max_lines = options[:max_lines] || 65000
20
- @idx = 0
21
- @book_idx = 0
22
- @sht_idx = 0
23
- unless self.exist?
24
- make_new_book
25
- make_new_sheet
26
- else
27
- get_existing_book
28
- end
29
- end
30
-
31
- def load
32
- @sheet.map{|row| row.to_a}
33
- end
34
-
35
- def dump data
36
- data.each do |line|
37
- raise "too many lines" if too_many?
38
- self << line
39
- end
40
- save unless no_data?
41
- end
42
-
43
- def << line
44
- @sheet.row(@sht_row).concat( line )
45
- @sht_row += 1
46
- @idx += 1
47
- end
48
-
49
- def make_new_book
50
- @book = Spreadsheet::Workbook.new
51
- @book_idx += 1
52
- end
53
-
54
- def make_new_sheet
55
- @sheet = @book.create_worksheet
56
- @sht_idx += 1
57
- @sht_row = 0 #always start at row 0 in a new sheet
58
- end
59
-
60
- def get_existing_book
61
- @book = Spreadsheet.open path
62
- @sheet = book.worksheet 0
63
- @sht_row = @sheet.row_count #would like to be able to dump new data, doesn't work
64
- @sht_idx += 1
65
- end
66
-
67
- def incr_sheet
68
- @sheet = book.worksheet @sht_idx
69
- end
70
-
71
- def too_many?
72
- @sht_row >= @max_lines
73
- end
74
-
75
- def no_data?
76
- @sht_row == 0
77
- end
78
-
79
- def save
80
- @book.write path
81
- end
82
- end
83
- end
84
- end