imw 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (143) hide show
  1. data/.gitignore +4 -1
  2. data/Rakefile +10 -0
  3. data/TODO +18 -0
  4. data/VERSION +1 -1
  5. data/bin/imw +1 -1
  6. data/etc/imwrc.rb +0 -50
  7. data/examples/dataset.rb +12 -0
  8. data/lib/imw/boot.rb +55 -9
  9. data/lib/imw/dataset/paths.rb +15 -24
  10. data/lib/imw/dataset/workflow.rb +131 -72
  11. data/lib/imw/dataset.rb +94 -186
  12. data/lib/imw/parsers/html_parser.rb +1 -1
  13. data/lib/imw/parsers.rb +1 -1
  14. data/lib/imw/repository.rb +3 -27
  15. data/lib/imw/resource.rb +190 -0
  16. data/lib/imw/resources/archive.rb +97 -0
  17. data/lib/imw/resources/archives_and_compressed/bz2.rb +18 -0
  18. data/lib/imw/resources/archives_and_compressed/gz.rb +18 -0
  19. data/lib/imw/resources/archives_and_compressed/rar.rb +23 -0
  20. data/lib/imw/resources/archives_and_compressed/tar.rb +23 -0
  21. data/lib/imw/resources/archives_and_compressed/tarbz2.rb +78 -0
  22. data/lib/imw/resources/archives_and_compressed/targz.rb +78 -0
  23. data/lib/imw/resources/archives_and_compressed/zip.rb +57 -0
  24. data/lib/imw/resources/archives_and_compressed.rb +32 -0
  25. data/lib/imw/resources/compressed_file.rb +89 -0
  26. data/lib/imw/resources/compressible.rb +77 -0
  27. data/lib/imw/resources/formats/delimited.rb +92 -0
  28. data/lib/imw/resources/formats/excel.rb +125 -0
  29. data/lib/imw/resources/formats/json.rb +53 -0
  30. data/lib/imw/resources/formats/sgml.rb +72 -0
  31. data/lib/imw/resources/formats/yaml.rb +53 -0
  32. data/lib/imw/resources/formats.rb +32 -0
  33. data/lib/imw/resources/local.rb +198 -0
  34. data/lib/imw/resources/remote.rb +110 -0
  35. data/lib/imw/resources/schemes/hdfs.rb +242 -0
  36. data/lib/imw/resources/schemes/http.rb +161 -0
  37. data/lib/imw/resources/schemes/s3.rb +137 -0
  38. data/lib/imw/resources/schemes.rb +19 -0
  39. data/lib/imw/resources.rb +118 -0
  40. data/lib/imw/runner.rb +5 -4
  41. data/lib/imw/transforms/archiver.rb +215 -0
  42. data/lib/imw/transforms/transferer.rb +103 -0
  43. data/lib/imw/transforms.rb +8 -0
  44. data/lib/imw/utils/error.rb +26 -30
  45. data/lib/imw/utils/extensions/array.rb +5 -15
  46. data/lib/imw/utils/extensions/hash.rb +6 -16
  47. data/lib/imw/utils/extensions/hpricot.rb +0 -14
  48. data/lib/imw/utils/extensions/string.rb +5 -15
  49. data/lib/imw/utils/extensions/symbol.rb +0 -13
  50. data/lib/imw/utils/extensions.rb +65 -0
  51. data/lib/imw/utils/log.rb +14 -13
  52. data/lib/imw/utils/misc.rb +0 -6
  53. data/lib/imw/utils/paths.rb +101 -42
  54. data/lib/imw/utils/version.rb +8 -9
  55. data/lib/imw/utils.rb +2 -18
  56. data/lib/imw.rb +92 -17
  57. data/spec/data/sample.csv +1 -1
  58. data/spec/data/sample.json +1 -0
  59. data/spec/data/sample.tsv +1 -1
  60. data/spec/data/sample.txt +1 -1
  61. data/spec/data/sample.xml +1 -1
  62. data/spec/data/sample.yaml +1 -1
  63. data/spec/imw/dataset/paths_spec.rb +32 -0
  64. data/spec/imw/dataset/workflow_spec.rb +41 -0
  65. data/spec/imw/resource_spec.rb +79 -0
  66. data/spec/imw/resources/archive_spec.rb +69 -0
  67. data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +15 -0
  68. data/spec/imw/resources/archives_and_compressed/gz_spec.rb +15 -0
  69. data/spec/imw/resources/archives_and_compressed/rar_spec.rb +16 -0
  70. data/spec/imw/resources/archives_and_compressed/tar_spec.rb +16 -0
  71. data/spec/imw/resources/archives_and_compressed/tarbz2_spec.rb +24 -0
  72. data/spec/imw/resources/archives_and_compressed/targz_spec.rb +21 -0
  73. data/spec/imw/resources/archives_and_compressed/zip_spec.rb +16 -0
  74. data/spec/imw/resources/compressed_file_spec.rb +48 -0
  75. data/spec/imw/resources/compressible_spec.rb +36 -0
  76. data/spec/imw/resources/formats/delimited_spec.rb +33 -0
  77. data/spec/imw/resources/formats/json_spec.rb +32 -0
  78. data/spec/imw/resources/formats/sgml_spec.rb +24 -0
  79. data/spec/imw/resources/formats/yaml_spec.rb +41 -0
  80. data/spec/imw/resources/local_spec.rb +98 -0
  81. data/spec/imw/resources/remote_spec.rb +35 -0
  82. data/spec/imw/resources/schemes/hdfs_spec.rb +61 -0
  83. data/spec/imw/resources/schemes/http_spec.rb +19 -0
  84. data/spec/imw/resources/schemes/s3_spec.rb +19 -0
  85. data/spec/imw/transforms/archiver_spec.rb +120 -0
  86. data/spec/imw/transforms/transferer_spec.rb +113 -0
  87. data/spec/imw/utils/paths_spec.rb +5 -33
  88. data/spec/imw/utils/shared_paths_spec.rb +29 -0
  89. data/spec/spec_helper.rb +5 -5
  90. data/spec/support/paths_matcher.rb +67 -0
  91. data/spec/support/random.rb +39 -36
  92. metadata +88 -75
  93. data/lib/imw/dataset/task.rb +0 -41
  94. data/lib/imw/files/archive.rb +0 -113
  95. data/lib/imw/files/basicfile.rb +0 -122
  96. data/lib/imw/files/binary.rb +0 -28
  97. data/lib/imw/files/compressed_file.rb +0 -93
  98. data/lib/imw/files/compressed_files_and_archives.rb +0 -334
  99. data/lib/imw/files/compressible.rb +0 -103
  100. data/lib/imw/files/csv.rb +0 -113
  101. data/lib/imw/files/directory.rb +0 -62
  102. data/lib/imw/files/excel.rb +0 -84
  103. data/lib/imw/files/json.rb +0 -41
  104. data/lib/imw/files/sgml.rb +0 -46
  105. data/lib/imw/files/text.rb +0 -68
  106. data/lib/imw/files/yaml.rb +0 -46
  107. data/lib/imw/files.rb +0 -125
  108. data/lib/imw/packagers/archiver.rb +0 -126
  109. data/lib/imw/packagers/s3_mover.rb +0 -36
  110. data/lib/imw/packagers.rb +0 -8
  111. data/lib/imw/utils/components.rb +0 -61
  112. data/lib/imw/utils/config.rb +0 -46
  113. data/lib/imw/utils/extensions/class/attribute_accessors.rb +0 -8
  114. data/lib/imw/utils/extensions/core.rb +0 -27
  115. data/lib/imw/utils/extensions/dir.rb +0 -24
  116. data/lib/imw/utils/extensions/file_core.rb +0 -64
  117. data/lib/imw/utils/extensions/typed_struct.rb +0 -22
  118. data/lib/imw/utils/extensions/uri.rb +0 -59
  119. data/lib/imw/utils/view/dump_csv.rb +0 -112
  120. data/lib/imw/utils/view/dump_csv_older.rb +0 -117
  121. data/lib/imw/utils/view.rb +0 -113
  122. data/spec/imw/dataset/datamapper/uri_spec.rb +0 -43
  123. data/spec/imw/dataset/datamapper_spec_helper.rb +0 -11
  124. data/spec/imw/files/archive_spec.rb +0 -118
  125. data/spec/imw/files/basicfile_spec.rb +0 -121
  126. data/spec/imw/files/bz2_spec.rb +0 -32
  127. data/spec/imw/files/compressed_file_spec.rb +0 -96
  128. data/spec/imw/files/compressible_spec.rb +0 -100
  129. data/spec/imw/files/file_spec.rb +0 -144
  130. data/spec/imw/files/gz_spec.rb +0 -32
  131. data/spec/imw/files/rar_spec.rb +0 -33
  132. data/spec/imw/files/tar_spec.rb +0 -31
  133. data/spec/imw/files/text_spec.rb +0 -23
  134. data/spec/imw/files/zip_spec.rb +0 -31
  135. data/spec/imw/files_spec.rb +0 -38
  136. data/spec/imw/packagers/archiver_spec.rb +0 -125
  137. data/spec/imw/packagers/s3_mover_spec.rb +0 -7
  138. data/spec/imw/utils/extensions/file_core_spec.rb +0 -72
  139. data/spec/imw/utils/extensions/find_spec.rb +0 -113
  140. data/spec/imw/workflow/rip/local_spec.rb +0 -89
  141. data/spec/imw/workflow/rip_spec.rb +0 -27
  142. data/spec/support/archive_contents_matcher.rb +0 -94
  143. data/spec/support/directory_contents_matcher.rb +0 -61
@@ -1,334 +0,0 @@
1
- module IMW
2
- module Files
3
-
4
# Wrapper around a plain +tar+ archive.
#
# The flags for creating, appending to, listing, and extracting are
# kept in <tt>IMW::Files::Tar::DEFAULT_FLAGS</tt> and are all passed to
# the <tt>:tar</tt> program in <tt>IMW::EXTERNAL_PROGRAMS</tt>.
class Tar

  include IMW::Files::BasicFile
  include IMW::Files::Archive
  include IMW::Files::Compressible

  # Archiving program and the flags it takes for each operation.
  DEFAULT_FLAGS = {
    :create  => "-cf",
    :append  => "-rf",
    :list    => "-tf",
    :extract => "-xf",
    :program => :tar
  }

  # Point this object at +uri+ and record the archive settings taken
  # from DEFAULT_FLAGS.  Extra positional arguments are accepted but
  # not used.
  def initialize uri, *args
    self.uri = uri
    @archive = { :program => DEFAULT_FLAGS[:program] }
    [:create, :append, :list, :extract].each do |action|
      @archive[:"#{action}_flags"] = DEFAULT_FLAGS[action]
    end
  end
end # Tar
36
-
37
# Wrapper around a <tt>tar.gz</tt> (or <tt>.tgz</tt>) archive.
#
# The flags for listing, extracting, and decompressing are kept in
# <tt>IMW::Files::Targz::DEFAULT_FLAGS</tt> and are passed to the
# <tt>:tar</tt> program in <tt>IMW::EXTERNAL_PROGRAMS</tt>.
class Targz

  include IMW::Files::BasicFile
  include IMW::Files::Archive
  include IMW::Files::CompressedFile

  # Programs and flags for decompressing, listing, and extracting a
  # <tt>tar.gz</tt> archive.
  DEFAULT_FLAGS = {
    :decompression_program => :gzip,
    :decompression_flags   => '-fd',
    :archive_program       => :tar,
    :archive_list_flags    => "-tf",
    :archive_extract_flags => "-xzf"
  }

  # Point this object at +uri+ and record both the compression and the
  # archive settings.  Extra positional arguments are accepted but not
  # used.
  def initialize uri, *args
    self.uri = uri
    flags = DEFAULT_FLAGS
    @compression = {
      :program             => flags[:decompression_program],
      :decompression_flags => flags[:decompression_flags]
    }
    @archive = {
      :program       => flags[:archive_program],
      :list_flags    => flags[:archive_list_flags],
      :extract_flags => flags[:archive_extract_flags]
    }
  end

  # Returns the path this file would have after decompression (the
  # <tt>.tar.gz</tt> or <tt>.tgz</tt> ending becomes <tt>.tar</tt>), or
  # +nil+ when the path has neither ending.
  def decompressed_path
    case @path
    when /\.tar\.gz$/ then @path.gsub(/\.tar\.gz$/, ".tar")
    when /\.tgz$/     then @path.gsub(/\.tgz$/, ".tar")
    end
  end

  # Returns whichever of ".tar.gz" or ".tgz" +path+ ends with, or +nil+
  # when it ends with neither.
  def self.extname path
    return ".tar.gz" if path =~ /\.tar\.gz$/
    return ".tgz"    if path =~ /\.tgz$/
    nil
  end

end # Targz
89
-
90
# A class to wrap a <tt>tar.bz2</tt> (or <tt>.tbz2</tt>) archive.
#
# Creation, listing, extraction, and decompression flags are stored in
# <tt>IMW::Files::Tarbz2::DEFAULT_FLAGS</tt> and are passed to the
# <tt>:tar</tt> program in <tt>IMW::EXTERNAL_PROGRAMS</tt>.
class Tarbz2

  include IMW::Files::BasicFile
  include IMW::Files::Archive
  include IMW::Files::CompressedFile

  # The default programs and flags used for creating, listing,
  # extracting, and decompressing a <tt>tar.bz2</tt> archive.
  DEFAULT_FLAGS = {
    :decompression_program => :bzip2,
    :decompression_flags   => '-fd',
    :archive_program       => :tar,
    :archive_create_flags  => '-cf',
    :archive_list_flags    => "-tf",
    :archive_extract_flags => "-xjf"
  }

  # Returns whichever of ".tar.bz2" or ".tbz2" +path+ ends with, or
  # +nil+ when it ends with neither.
  def self.extname path
    if /\.tar\.bz2$/.match path then
      ".tar.bz2"
    elsif /\.tbz2$/.match path then
      ".tbz2"
    end
  end

  # Point this object at +uri+ and record the compression and archive
  # settings.  Extra positional arguments are accepted but not used.
  def initialize uri, *args
    self.uri = uri
    @compression = {
      :program             => DEFAULT_FLAGS[:decompression_program],
      # The key in DEFAULT_FLAGS is :decompression_flags; looking up
      # :decompression here would silently store nil.
      :decompression_flags => DEFAULT_FLAGS[:decompression_flags]
    }
    @archive = {
      :program       => DEFAULT_FLAGS[:archive_program],
      :list_flags    => DEFAULT_FLAGS[:archive_list_flags],
      :extract_flags => DEFAULT_FLAGS[:archive_extract_flags],
      :create_flags  => DEFAULT_FLAGS[:archive_create_flags]
    }
  end

  # Returns the path of the file after decompression (the
  # <tt>.tar.bz2</tt> or <tt>.tbz2</tt> ending becomes <tt>.tar</tt>),
  # or +nil+ when the path has neither ending.
  def decompressed_path
    if /\.tar\.bz2$/.match @path then
      @path.gsub(/\.tar\.bz2$/, ".tar")
    elsif /\.tbz2$/.match @path then
      @path.gsub(/\.tbz2$/, ".tar")
    end
  end

  # Overrides the default behavior of IMW::Files::Archive#create to
  # compress the archive after creating it.
  #
  # +paths+ is a single path (String) or a list of paths to archive.
  #
  # Options:
  #
  # <tt>:force</tt> (false):: proceed even if something already exists
  # at this archive's path.
  #
  # Raises IMW::Error when the archive exists and <tt>:force</tt> is
  # not set.  Returns the result of <tt>compress!(:bzip2)</tt> on the
  # intermediate tar file.
  def create paths, opts={}
    opts = {:force => false}.merge(opts)
    raise IMW::Error.new("An archive already exists at #{@path}.") if exist? && !opts[:force]
    paths = [paths] if paths.is_a?(String)
    # First build an uncompressed tar next to the target, then bzip2 it.
    IMW.system IMW::EXTERNAL_PROGRAMS[@archive[:program]], @archive[:create_flags], path_between_archive_and_compression, *paths
    IMW.open(path_between_archive_and_compression).compress!(:bzip2)
  end

  protected
  # Path of the intermediate, uncompressed tar file written by #create.
  def path_between_archive_and_compression
    File.join(dirname, name + '.tar')
  end

end # Tarbz2
159
-
160
# Wrapper around a +rar+ archive.
#
# The flags for creating, appending to, listing, and extracting are
# kept in <tt>IMW::Files::Rar::DEFAULT_FLAGS</tt> and are all passed to
# the <tt>:rar</tt> program in <tt>IMW::EXTERNAL_PROGRAMS</tt>.
class Rar

  include IMW::Files::BasicFile
  include IMW::Files::Archive

  # Flags handed to rar for each supported operation.
  DEFAULT_FLAGS = {
    :create  => "a -r -o+ -inul",
    :append  => "a -r -o+ -inul",
    :list    => "vb",
    :extract => "x -o+ -inul"
  }

  # Point this object at +uri+ and record the archive settings; the
  # program is fixed to <tt>:rar</tt>.  Extra positional arguments are
  # accepted but not used.
  def initialize uri, *args
    self.uri = uri
    @archive = { :program => :rar }
    [:create, :append, :list, :extract].each do |action|
      @archive[:"#{action}_flags"] = DEFAULT_FLAGS[action]
    end
  end
end # Rar
190
-
191
# A class to wrap a +zip+ archive.
#
# Creation, appending, listing, and extraction flags are stored in
# <tt>IMW::Files::Zip::DEFAULT_FLAGS</tt> and all are passed to
# the <tt>:zip</tt> and <tt>:unzip</tt> programs in
# <tt>IMW::EXTERNAL_PROGRAMS</tt>.
class Zip

  include IMW::Files::BasicFile
  include IMW::Files::Archive

  # The default flags used creating, appending to, listing, and
  # extracting a zip archive.
  DEFAULT_FLAGS = {
    :create  => "-q -r",
    :append  => "-q -g",
    :list    => "-l",
    :extract => "-q -o",
    :unarchiving_program => :unzip
  }

  # Point this object at +uri+ and record the archive settings.  Extra
  # positional arguments are accepted but not used.
  def initialize uri, *args
    self.uri = uri
    @archive = {
      :program       => :zip,
      :create_flags  => DEFAULT_FLAGS[:create],
      :append_flags  => DEFAULT_FLAGS[:append],
      :list_flags    => DEFAULT_FLAGS[:list],
      :extract_flags => DEFAULT_FLAGS[:extract],
      :unarchiving_program => DEFAULT_FLAGS[:unarchiving_program]
    }
  end

  # The `unzip' program outputs data in a very annoying format:
  #
  #   Archive:  data.zip
  #     Length     Date   Time    Name
  #    --------    ----   ----    ----
  #       18510  07-28-08 15:58   data/4d7Qrgz7.csv
  #        3418  07-28-08 15:41   data/7S.csv
  #       23353  07-28-08 15:41   data/g.csv
  #         711  07-28-08 15:58   data/g.xml
  #        1095  07-28-08 15:41   data/L.xml
  #        2399  07-28-08 15:58   data/mTAu9H3.xml
  #         152  07-28-08 15:58   data/vaHBS2t5R.dat
  #    --------                   -------
  #       49638                   7 files
  #
  # which is parsed by this method.
  #
  # Takes the listing as a single String and returns an Array of the
  # file names it contains.  A listing too short to hold any file rows
  # yields an empty Array, and rows with fewer than four columns are
  # dropped.
  def archive_contents_string_to_array string
    rows = string.split("\n")
    # Ignore the first 3 lines of the output and also discard the
    # last 2 (5 = 2 + 3).  The slice is nil for very short output.
    file_rows = rows[3, rows.length - 5] || []
    file_rows.map do |row|
      # Use the non-bang strip: the bang variants return nil when there
      # is nothing to strip, so chaining them raises.  Limiting the
      # split to 4 fields keeps everything after the third column as
      # the file name, so names containing spaces survive unchanged.
      row.strip.split(' ', 4)[3]
    end.compact
  end
end # Zip
256
-
257
# Wrapper around a <tt>gz</tt> compressed file.
#
# The decompression flags are kept in
# <tt>IMW::Files::Gz::DEFAULT_FLAGS</tt> and are passed to the
# <tt>:gzip</tt> program in <tt>IMW::EXTERNAL_PROGRAMS</tt>.
class Gz

  include IMW::Files::BasicFile
  include IMW::Files::CompressedFile

  # Program and flags used to decompress a <tt>gz</tt> file.
  DEFAULT_FLAGS = {
    :program       => :gzip,
    :decompression => '-fd'
  }

  # Point this object at +uri+ and record the compression settings.
  # Extra positional arguments are accepted but not used.
  def initialize uri, *args
    self.uri = uri
    program, flags = DEFAULT_FLAGS.values_at(:program, :decompression)
    @compression = { :program => program, :decompression_flags => flags }
  end

  # Returns the path of the file after decompression, i.e. the path
  # with a trailing <tt>.gz</tt> removed.
  def decompressed_path
    @path.gsub(/\.gz$/, "")
  end
end # Gz
285
-
286
# Wrapper around a <tt>bz2</tt> compressed file.
#
# The decompression flags are kept in
# <tt>IMW::Files::Bz2::DEFAULT_FLAGS</tt> and are passed to the
# <tt>:bzip2</tt> program in <tt>IMW::EXTERNAL_PROGRAMS</tt>.
class Bz2

  include IMW::Files::BasicFile
  include IMW::Files::CompressedFile

  # Program and flags used to decompress a <tt>bz2</tt> file.
  DEFAULT_FLAGS = {
    :program       => :bzip2,
    :decompression => '-fd'
  }

  # Point this object at +uri+ and record the compression settings.
  # Raises IMW::Error unless the extension is exactly <tt>.bz2</tt>.
  # Extra positional arguments are accepted but not used.
  def initialize uri, *args
    self.uri = uri
    unless @extname == '.bz2'
      raise IMW::Error.new("#{@extname} is not a valid extension for a bzip2 compressed file.")
    end
    program, flags = DEFAULT_FLAGS.values_at(:program, :decompression)
    @compression = { :program => program, :decompression_flags => flags }
  end

  # Returns the path of the file after decompression, i.e. the path
  # with a trailing <tt>.bz2</tt> removed.
  def decompressed_path
    @path.gsub(/\.bz2$/, "")
  end
end # Bz2
316
-
317
-
318
# Register the archive and compressed-file classes.  Order matters:
# the compound extensions (tar.bz2, tbz2, tar.gz, tgz) are appended
# before the bare tar, bz2 and gz patterns.
[
  [/\.tar\.bz2$/, IMW::Files::Tarbz2],
  [/\.tbz2$/,     IMW::Files::Tarbz2],
  [/\.tar\.gz$/,  IMW::Files::Targz],
  [/\.tgz$/,      IMW::Files::Targz],
  [/\.tar$/,      IMW::Files::Tar],
  [/\.bz2$/,      IMW::Files::Bz2],
  [/\.gz$/,       IMW::Files::Gz],
  [/\.rar$/,      IMW::Files::Rar],
  [/\.zip$/,      IMW::Files::Zip]
].each { |pattern_and_class| FILE_REGEXPS << pattern_and_class }
330
-
331
- end # Files
332
- end # IMW
333
-
334
-
@@ -1,103 +0,0 @@
1
- #
2
- # h2. lib/imw/files/compressible.rb -- compression module
3
- #
4
- # == About
5
- #
6
- # Module used for compression of files. An including
7
- # <tt>IMW::Files::BasicFile</tt> object gains +compress+ and
8
- # <tt>compress!</tt> methods.
9
- #
10
- # By default, bzip2 is used for compression though gzip can also be
11
- # specified (the full list of known compression programs is in
12
- # <tt>IMW::Files::Compressible::COMPRESSION_PROGS</tt>). Zip and Rar
13
- # compression are handled by the <tt>IMW::Files::Archive</tt> module.
14
- #
15
- # Decompression should be handled via the
16
- # <tt>IMW::Files::CompressedFile</tt> class.
17
- #
18
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
19
- # Copyright:: Copyright (c) 2008 infochimps.org
20
- # License:: GPL 3.0
21
- # Website:: http://infinitemonkeywrench.org/
22
- #
23
- # puts "#{File.basename(__FILE__)}: Why is it that when you squeeze a lemon you get lemonade but when you squeeze a banana you just get a mess?" # at bottom
24
module IMW
  module Files
    # Mixin that gives a file object +compress+ and <tt>compress!</tt>
    # methods.  The including object is expected to respond to +path+,
    # +dirname+, +basename+ and <tt>exist?</tt>, and to carry +@path+
    # and +@dirname+.
    module Compressible

      # Known compression programs.
      COMPRESSION_PROGS = [:bzip2, :gzip]

      # Extensions that are appended by each compression program.
      COMPRESSION_EXTS = {
        :bzip2 => '.bz2',
        :gzip  => '.gz'
      }

      # Compression flags for each program.
      COMPRESSION_FLAGS = {
        :bzip2 => "-f",
        :gzip  => "-f"
      }

      protected

      # Check that +program+ is a valid compression program; raises
      # IMW::Error (listing the valid choices) when it is not.
      def ensure_valid_compression_program program
        return if COMPRESSION_PROGS.include?(program)
        raise IMW::Error.new("#{program} is not a valid compression program (#{COMPRESSION_PROGS.join(', ')}).")
      end

      # Construct the command passed to the shell to compress this
      # file using the given +program+: the program looked up in
      # IMW::EXTERNAL_PROGRAMS, its flags, and this file's path,
      # joined by single spaces.
      def compression_command program
        ensure_valid_compression_program program
        [IMW::EXTERNAL_PROGRAMS[program], COMPRESSION_FLAGS[program], self.path].join ' '
      end

      # Return the path this file will have once compressed with
      # +program+ (its basename plus the program's extension, in the
      # same directory).
      def compressed_file_path program
        ensure_valid_compression_program program
        File.join(self.dirname, self.basename + COMPRESSION_EXTS[program])
      end

      public

      # Compress this file in its present directory using +program+,
      # overwriting any existing compressed files and without saving
      # the original file.  Returns whatever <tt>IMW.open</tt> gives
      # for the path of the compressed file.
      #
      # +program+ (default <tt>:bzip2</tt>) names the compression
      # program; it must be one of COMPRESSION_PROGS.
      #
      # Raises IMW::PathError when this file does not exist.
      def compress! program = :bzip2
        raise IMW::PathError.new("cannot compress #{@path}, doesn't exist!") unless exist?
        FileUtils.cd(@dirname) { IMW.system(self.compression_command(program)) }
        IMW.open(self.compressed_file_path(program))
      end

      # Compress this file in its present directory, overwriting any
      # existing compressed files while keeping the original file.
      # Returns whatever <tt>IMW.open</tt> gives for the path of the
      # compressed file.
      #
      # +program+ (default <tt>:bzip2</tt>) names the compression
      # program; it must be one of COMPRESSION_PROGS.
      #
      # Raises IMW::PathError when this file does not exist.
      def compress program = :bzip2
        raise IMW::PathError.new("cannot compress #{@path}, doesn't exist!") unless exist?
        backup = self.path + 'copy'
        # Make the backup before entering the protected region so that a
        # failed copy is reported as such instead of being masked by the
        # restoring move below.
        FileUtils.cp(self.path, backup)
        begin
          compress! program
        ensure
          # Put the original back whether or not compression succeeded.
          FileUtils.mv(backup, self.path)
        end
        IMW.open(self.compressed_file_path(program))
      end

    end
  end
end
103
-
data/lib/imw/files/csv.rb DELETED
@@ -1,113 +0,0 @@
1
- #
2
- # h2. lib/imw/files/csv.rb -- CSV, TSV files
3
- #
4
- # == About
5
- #
6
- # For "comma-separated value" (CSV) and "tab-separated value" (TSV)
7
- # files.
8
- #
9
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
10
- # Copyright:: Copyright (c) 2008 infochimps.org
11
- # License:: GPL 3.0
12
- # Website:: http://infinitemonkeywrench.org/
13
- #
14
- # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
15
-
16
- require 'fastercsv'
17
- module IMW
18
- module Files
19
-
20
# A base class from which to subclass various types of tabular
# data files (CSV, TSV, &c.)
class TabularDataFile < FasterCSV

  include IMW::Files::BasicFile
  include IMW::Files::Compressible

  # Default options to be passed to
  # FasterCSV[http://fastercsv.rubyforge.org/]; see its
  # documentation for more information.
  DEFAULT_OPTIONS = {
    :col_sep        => ',',
    :headers        => false,
    :return_headers => false,
    :write_headers  => true,
    :skip_blanks    => false,
    :force_quotes   => false
  }

  # Open the file at +uri+ in +mode+ and hand it to FasterCSV.
  #
  # +options+ are layered over the class's DEFAULT_OPTIONS; the
  # caller's hash is left untouched.
  def initialize uri, mode='r', options = {}
    # Merge into a fresh hash instead of mutating the argument.
    options = self.class::DEFAULT_OPTIONS.merge(options)
    self.uri = uri
    options.delete(:write) # FasterCSV complains about unknown options
    super open(uri, mode), options
  end

  # Return the contents of this CSV file as an array of arrays.
  def load
    entries
  end

  # Dump +data+ to this file, one row per element, and return +self+.
  #
  # Options include:
  # <tt>:flush</tt> (true):: flush the file buffer, writing it to disk
  # <tt>:close</tt> (true):: close the file after writing +data+
  def dump data, options = {}
    options = {:close => true, :flush => true}.merge(options)
    data.each { |row| self << row }
    self.flush if options[:flush]
    self.close if options[:close]
    self
  end

  # Return a random sample of up to +length+ rows.
  #
  # The first row is always taken; each later row is taken with
  # probability 1/4 until +length+ rows are collected.  If that pass
  # falls short, the file is rewound and the rows not yet taken are
  # appended in file order.  A file with fewer than +length+ rows
  # therefore yields all of its rows.  If the CSV turns out to be
  # malformed, the rows gathered so far are returned.
  def sample length=10
    rows   = []
    picked = {} # indices of rows taken during the random pass
    begin
      each_with_index do |row, index|
        break if rows.size == length
        next if index != 0 && rand < 0.75 # skip 3/4 of rows after the 1st
        rows << row
        picked[index] = true
      end
      # Now fill up to length if not there already.  The random pass
      # only stops short by reaching the end of the file, so start
      # again from the top; a single pass is enough and guarantees
      # termination even when the file is shorter than +length+.
      if rows.size < length
        rewind
        each_with_index do |row, index|
          break if rows.size == length
          next if picked.key?(index)
          rows << row
        end
      end
      rows
    rescue FasterCSV::MalformedCSVError
      rows
    end
  end
end
88
-
89
-
90
# A file of comma-separated values (CSV).  Through
# <tt>IMW::Files::TabularDataFile</tt> this class descends from
# <tt>FasterCSV</tt>, so that library's methods are available.
#
# See <tt>IMW::Files::TabularDataFile</tt> for more complete
# documentation.
class Csv < TabularDataFile; end
98
-
99
# A file of tab-separated values (TSV).  Through
# <tt>IMW::Files::TabularDataFile</tt> this class descends from
# <tt>FasterCSV</tt>, so that library's methods are available.
#
# See <tt>IMW::Files::TabularDataFile</tt> for more complete
# documentation.
class Tsv < TabularDataFile
  # Same as the parent's options, but with a tab as column separator.
  DEFAULT_OPTIONS = TabularDataFile::DEFAULT_OPTIONS.merge(:col_sep => "\t")
end
108
-
109
# Register the CSV and TSV classes under their file extensions.
[
  [/\.csv$/, IMW::Files::Csv],
  [/\.tsv$/, IMW::Files::Tsv]
].each { |pattern_and_class| FILE_REGEXPS << pattern_and_class }
111
-
112
- end
113
- end
@@ -1,62 +0,0 @@
1
- require 'imw/files/basicfile'
2
- module IMW
3
- module Files
4
# A directory addressed by URI.  Listing, copying and moving are only
# implemented for local directories.
class Directory

  include IMW::Files::BasicFile

  # FIXME these should be defined by BasicFile and then removed here but I don't see how...
  #     [:executable?, :executable_real?, :pipe?, :socket?, :rm, :rm!, :extname, :extname=, :name, :name=].each do |method|
  #       instance_eval do
  #         remove_method method
  #       end
  #     end

  # Set the location of this directory from +uri+ (a String or a
  # URI::Generic) and derive +@host+, +@path+, +@dirname+ and
  # +@basename+ from it.  Local paths are expanded to absolute form.
  def uri= uri
    @uri      = uri.is_a?(URI::Generic) ? uri : URI.parse(uri)
    @host     = self.uri.host
    @path     = local? ? ::File.expand_path(self.uri.path) : self.uri.path
    @dirname  = ::File.dirname path
    @basename = ::File.basename path
  end

  # Create a Directory pointing at +uri+.
  def initialize uri
    self.uri = uri
  end

  # Return the paths inside this directory matching the glob
  # +selector+ (everything by default).  Returns +nil+ when the
  # directory is not local.
  def [] selector='*'
    # Qualify the core classes explicitly, as #uri= does, so that a
    # same-named constant under IMW::Files can never shadow them.
    ::Dir[::File.join(path, selector)] if local?
  end

  # Copy the contents of this directory to +new_dir+ and return a new
  # object of the same class for +new_dir+.
  #
  # Raises IMW::PathError when this directory does not exist or is
  # not local.
  def cp new_dir
    raise IMW::PathError.new("cannot copy from #{path}, doesn't exist!") unless exist?
    if local?
      FileUtils.cp_r path, new_dir
    else
      raise IMW::PathError.new("cannot recursively copy remote directories (yet!)")
    end
    self.class.new(new_dir)
  end

  # Move this directory to +new_dir+ and return a new object of the
  # same class for +new_dir+.
  #
  # Raises IMW::PathError when this directory does not exist or is
  # not local.
  def mv new_dir
    raise IMW::PathError.new("cannot move from #{path}, doesn't exist!") unless exist?
    if local?
      FileUtils.mv path, new_dir
    else
      raise IMW::PathError.new("cannot move remote directories (yet!)")
    end
    self.class.new(new_dir)
  end
  alias_method :mv!, :mv

  # Move this directory so it sits beneath +dir+.
  def mv_to_dir dir
    mv ::File.join(::File.expand_path(dir), basename)
  end
  alias_method :mv_to_dir!, :mv_to_dir

end
61
- end
62
- end
@@ -1,84 +0,0 @@
1
- require 'spreadsheet'
2
-
3
- # FIXME Main issue with this:
4
- # You can make a new excel book and dump data to it no problem.
5
- # However, something that doesn't seem to work is dumping to a file, opening,
6
- # and dumping to it again. At the moment this is probably not a big deal.
7
-
8
-
9
- module IMW
10
- module Files
11
# An Excel workbook handled through the +spreadsheet+ library.  A new
# book with one sheet is created when nothing exists at the path;
# otherwise the existing book's first sheet is opened.
class Excel
  include IMW::Files::BasicFile
  include IMW::Files::Compressible

  # need to initialize, load, and dump
  attr_accessor :book, :idx, :max_lines, :sht_idx, :sht_row, :book_idx

  # Point this object at +uri+ and set up the workbook.  +mode+ is
  # accepted but not used here.
  #
  # Options:
  # <tt>:max_lines</tt> (65000):: row limit checked by #dump.
  def initialize uri, mode, options={}
    self.uri   = uri
    @max_lines = options[:max_lines] || 65000
    @idx       = 0
    @book_idx  = 0
    @sht_idx   = 0
    if self.exist?
      get_existing_book
    else
      make_new_book
      make_new_sheet
    end
  end

  # Return the rows of the current sheet as an array of arrays.
  def load
    @sheet.map { |sheet_row| sheet_row.to_a }
  end

  # Append each element of +data+ as a row, raising "too many lines"
  # once the sheet has reached +max_lines+, then save the book unless
  # the sheet is still empty.
  def dump data
    data.each do |line|
      raise "too many lines" if too_many?
      self << line
    end
    save unless no_data?
  end

  # Append +line+ to the current sheet and advance the row counters.
  def << line
    current_row = @sheet.row(@sht_row)
    current_row.concat(line)
    @sht_row += 1
    @idx += 1
  end

  # Start a fresh workbook and count it.
  def make_new_book
    @book = Spreadsheet::Workbook.new
    @book_idx += 1
  end

  # Add a worksheet to the book, count it, and reset the row position.
  def make_new_sheet
    @sheet = @book.create_worksheet
    @sht_idx += 1
    @sht_row = 0 # always start at row 0 in a new sheet
  end

  # Open the workbook already at +path+ and position after its last row.
  def get_existing_book
    @book    = Spreadsheet.open path
    @sheet   = book.worksheet 0
    @sht_row = @sheet.row_count # would like to be able to dump new data, doesn't work
    @sht_idx += 1
  end

  # Make the worksheet at index +@sht_idx+ the current sheet.
  def incr_sheet
    @sheet = book.worksheet @sht_idx
  end

  # True once the current sheet holds +max_lines+ rows or more.
  def too_many?
    @sht_row >= @max_lines
  end

  # True while the row position is still 0.
  def no_data?
    @sht_row == 0
  end

  # Write the workbook to +path+.
  def save
    @book.write path
  end
end
83
- end
84
- end