imw 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -1
- data/Rakefile +10 -0
- data/TODO +18 -0
- data/VERSION +1 -1
- data/bin/imw +1 -1
- data/etc/imwrc.rb +0 -50
- data/examples/dataset.rb +12 -0
- data/lib/imw/boot.rb +55 -9
- data/lib/imw/dataset/paths.rb +15 -24
- data/lib/imw/dataset/workflow.rb +131 -72
- data/lib/imw/dataset.rb +94 -186
- data/lib/imw/parsers/html_parser.rb +1 -1
- data/lib/imw/parsers.rb +1 -1
- data/lib/imw/repository.rb +3 -27
- data/lib/imw/resource.rb +190 -0
- data/lib/imw/resources/archive.rb +97 -0
- data/lib/imw/resources/archives_and_compressed/bz2.rb +18 -0
- data/lib/imw/resources/archives_and_compressed/gz.rb +18 -0
- data/lib/imw/resources/archives_and_compressed/rar.rb +23 -0
- data/lib/imw/resources/archives_and_compressed/tar.rb +23 -0
- data/lib/imw/resources/archives_and_compressed/tarbz2.rb +78 -0
- data/lib/imw/resources/archives_and_compressed/targz.rb +78 -0
- data/lib/imw/resources/archives_and_compressed/zip.rb +57 -0
- data/lib/imw/resources/archives_and_compressed.rb +32 -0
- data/lib/imw/resources/compressed_file.rb +89 -0
- data/lib/imw/resources/compressible.rb +77 -0
- data/lib/imw/resources/formats/delimited.rb +92 -0
- data/lib/imw/resources/formats/excel.rb +125 -0
- data/lib/imw/resources/formats/json.rb +53 -0
- data/lib/imw/resources/formats/sgml.rb +72 -0
- data/lib/imw/resources/formats/yaml.rb +53 -0
- data/lib/imw/resources/formats.rb +32 -0
- data/lib/imw/resources/local.rb +198 -0
- data/lib/imw/resources/remote.rb +110 -0
- data/lib/imw/resources/schemes/hdfs.rb +242 -0
- data/lib/imw/resources/schemes/http.rb +161 -0
- data/lib/imw/resources/schemes/s3.rb +137 -0
- data/lib/imw/resources/schemes.rb +19 -0
- data/lib/imw/resources.rb +118 -0
- data/lib/imw/runner.rb +5 -4
- data/lib/imw/transforms/archiver.rb +215 -0
- data/lib/imw/transforms/transferer.rb +103 -0
- data/lib/imw/transforms.rb +8 -0
- data/lib/imw/utils/error.rb +26 -30
- data/lib/imw/utils/extensions/array.rb +5 -15
- data/lib/imw/utils/extensions/hash.rb +6 -16
- data/lib/imw/utils/extensions/hpricot.rb +0 -14
- data/lib/imw/utils/extensions/string.rb +5 -15
- data/lib/imw/utils/extensions/symbol.rb +0 -13
- data/lib/imw/utils/extensions.rb +65 -0
- data/lib/imw/utils/log.rb +14 -13
- data/lib/imw/utils/misc.rb +0 -6
- data/lib/imw/utils/paths.rb +101 -42
- data/lib/imw/utils/version.rb +8 -9
- data/lib/imw/utils.rb +2 -18
- data/lib/imw.rb +92 -17
- data/spec/data/sample.csv +1 -1
- data/spec/data/sample.json +1 -0
- data/spec/data/sample.tsv +1 -1
- data/spec/data/sample.txt +1 -1
- data/spec/data/sample.xml +1 -1
- data/spec/data/sample.yaml +1 -1
- data/spec/imw/dataset/paths_spec.rb +32 -0
- data/spec/imw/dataset/workflow_spec.rb +41 -0
- data/spec/imw/resource_spec.rb +79 -0
- data/spec/imw/resources/archive_spec.rb +69 -0
- data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +15 -0
- data/spec/imw/resources/archives_and_compressed/gz_spec.rb +15 -0
- data/spec/imw/resources/archives_and_compressed/rar_spec.rb +16 -0
- data/spec/imw/resources/archives_and_compressed/tar_spec.rb +16 -0
- data/spec/imw/resources/archives_and_compressed/tarbz2_spec.rb +24 -0
- data/spec/imw/resources/archives_and_compressed/targz_spec.rb +21 -0
- data/spec/imw/resources/archives_and_compressed/zip_spec.rb +16 -0
- data/spec/imw/resources/compressed_file_spec.rb +48 -0
- data/spec/imw/resources/compressible_spec.rb +36 -0
- data/spec/imw/resources/formats/delimited_spec.rb +33 -0
- data/spec/imw/resources/formats/json_spec.rb +32 -0
- data/spec/imw/resources/formats/sgml_spec.rb +24 -0
- data/spec/imw/resources/formats/yaml_spec.rb +41 -0
- data/spec/imw/resources/local_spec.rb +98 -0
- data/spec/imw/resources/remote_spec.rb +35 -0
- data/spec/imw/resources/schemes/hdfs_spec.rb +61 -0
- data/spec/imw/resources/schemes/http_spec.rb +19 -0
- data/spec/imw/resources/schemes/s3_spec.rb +19 -0
- data/spec/imw/transforms/archiver_spec.rb +120 -0
- data/spec/imw/transforms/transferer_spec.rb +113 -0
- data/spec/imw/utils/paths_spec.rb +5 -33
- data/spec/imw/utils/shared_paths_spec.rb +29 -0
- data/spec/spec_helper.rb +5 -5
- data/spec/support/paths_matcher.rb +67 -0
- data/spec/support/random.rb +39 -36
- metadata +88 -75
- data/lib/imw/dataset/task.rb +0 -41
- data/lib/imw/files/archive.rb +0 -113
- data/lib/imw/files/basicfile.rb +0 -122
- data/lib/imw/files/binary.rb +0 -28
- data/lib/imw/files/compressed_file.rb +0 -93
- data/lib/imw/files/compressed_files_and_archives.rb +0 -334
- data/lib/imw/files/compressible.rb +0 -103
- data/lib/imw/files/csv.rb +0 -113
- data/lib/imw/files/directory.rb +0 -62
- data/lib/imw/files/excel.rb +0 -84
- data/lib/imw/files/json.rb +0 -41
- data/lib/imw/files/sgml.rb +0 -46
- data/lib/imw/files/text.rb +0 -68
- data/lib/imw/files/yaml.rb +0 -46
- data/lib/imw/files.rb +0 -125
- data/lib/imw/packagers/archiver.rb +0 -126
- data/lib/imw/packagers/s3_mover.rb +0 -36
- data/lib/imw/packagers.rb +0 -8
- data/lib/imw/utils/components.rb +0 -61
- data/lib/imw/utils/config.rb +0 -46
- data/lib/imw/utils/extensions/class/attribute_accessors.rb +0 -8
- data/lib/imw/utils/extensions/core.rb +0 -27
- data/lib/imw/utils/extensions/dir.rb +0 -24
- data/lib/imw/utils/extensions/file_core.rb +0 -64
- data/lib/imw/utils/extensions/typed_struct.rb +0 -22
- data/lib/imw/utils/extensions/uri.rb +0 -59
- data/lib/imw/utils/view/dump_csv.rb +0 -112
- data/lib/imw/utils/view/dump_csv_older.rb +0 -117
- data/lib/imw/utils/view.rb +0 -113
- data/spec/imw/dataset/datamapper/uri_spec.rb +0 -43
- data/spec/imw/dataset/datamapper_spec_helper.rb +0 -11
- data/spec/imw/files/archive_spec.rb +0 -118
- data/spec/imw/files/basicfile_spec.rb +0 -121
- data/spec/imw/files/bz2_spec.rb +0 -32
- data/spec/imw/files/compressed_file_spec.rb +0 -96
- data/spec/imw/files/compressible_spec.rb +0 -100
- data/spec/imw/files/file_spec.rb +0 -144
- data/spec/imw/files/gz_spec.rb +0 -32
- data/spec/imw/files/rar_spec.rb +0 -33
- data/spec/imw/files/tar_spec.rb +0 -31
- data/spec/imw/files/text_spec.rb +0 -23
- data/spec/imw/files/zip_spec.rb +0 -31
- data/spec/imw/files_spec.rb +0 -38
- data/spec/imw/packagers/archiver_spec.rb +0 -125
- data/spec/imw/packagers/s3_mover_spec.rb +0 -7
- data/spec/imw/utils/extensions/file_core_spec.rb +0 -72
- data/spec/imw/utils/extensions/find_spec.rb +0 -113
- data/spec/imw/workflow/rip/local_spec.rb +0 -89
- data/spec/imw/workflow/rip_spec.rb +0 -27
- data/spec/support/archive_contents_matcher.rb +0 -94
- data/spec/support/directory_contents_matcher.rb +0 -61
@@ -1,334 +0,0 @@
|
|
1
|
-
module IMW
|
2
|
-
module Files
|
3
|
-
|
4
|
-
# Wraps a +tar+ archive.
#
# The flags for creating, appending to, listing, and extracting are
# kept in <tt>IMW::Files::Tar::DEFAULT_FLAGS</tt>; all of them are
# handed to the <tt>:tar</tt> program named in
# <tt>IMW::EXTERNAL_PROGRAMS</tt>.
class Tar

  include IMW::Files::BasicFile
  include IMW::Files::Archive
  include IMW::Files::Compressible

  # Default program and per-action flags for a tar archive.
  DEFAULT_FLAGS = {
    :create  => "-cf",
    :append  => "-rf",
    :list    => "-tf",
    :extract => "-xf",
    :program => :tar
  }

  # Record +uri+ and fill <tt>@archive</tt> from DEFAULT_FLAGS.
  # Any extra positional arguments are accepted and ignored.
  def initialize(uri, *args)
    self.uri = uri
    @archive = { :program => DEFAULT_FLAGS[:program] }
    # Each action's flags are stored under "<action>_flags".
    [:create, :append, :list, :extract].each do |action|
      @archive["#{action}_flags".to_sym] = DEFAULT_FLAGS[action]
    end
  end
end # Tar
|
36
|
-
|
37
|
-
# Wraps a <tt>tar.gz</tt> (or <tt>.tgz</tt>) archive.
#
# Decompression, listing, and extraction flags are kept in
# <tt>IMW::Files::Targz::DEFAULT_FLAGS</tt>; the archive flags are
# handed to the <tt>:tar</tt> program named in
# <tt>IMW::EXTERNAL_PROGRAMS</tt>.
class Targz

  include IMW::Files::BasicFile
  include IMW::Files::Archive
  include IMW::Files::CompressedFile

  # Default programs and flags for decompressing, listing, and
  # extracting a <tt>tar.gz</tt> archive.
  DEFAULT_FLAGS = {
    :decompression_program => :gzip,
    :decompression_flags   => '-fd',
    :archive_program       => :tar,
    :archive_list_flags    => "-tf",
    :archive_extract_flags => "-xzf"
  }

  # Record +uri+ and fill <tt>@compression</tt> and <tt>@archive</tt>
  # from DEFAULT_FLAGS. Extra positional arguments are ignored.
  def initialize(uri, *args)
    self.uri = uri
    defaults = DEFAULT_FLAGS
    @compression = {
      :program             => defaults[:decompression_program],
      :decompression_flags => defaults[:decompression_flags]
    }
    @archive = {
      :program       => defaults[:archive_program],
      :list_flags    => defaults[:archive_list_flags],
      :extract_flags => defaults[:archive_extract_flags]
    }
  end

  # Returns the path of the file after decompression, or +nil+ when
  # the path ends in neither <tt>.tar.gz</tt> nor <tt>.tgz</tt>.
  def decompressed_path
    return @path.gsub(/\.tar\.gz$/, ".tar") if /\.tar\.gz$/.match(@path)
    @path.gsub(/\.tgz$/, ".tar") if /\.tgz$/.match(@path)
  end

  # Returns the archive extension of +path+ (<tt>".tar.gz"</tt> or
  # <tt>".tgz"</tt>), or +nil+ when it has neither.
  def self.extname(path)
    return ".tar.gz" if /\.tar\.gz$/.match(path)
    ".tgz" if /\.tgz$/.match(path)
  end

end # Targz
|
89
|
-
|
90
|
-
# Wraps a <tt>tar.bz2</tt> (or <tt>.tbz2</tt>) archive.
#
# Creation, listing, extraction, and decompression flags are kept in
# <tt>IMW::Files::Tarbz2::DEFAULT_FLAGS</tt>; the archive flags are
# handed to the <tt>:tar</tt> program named in
# <tt>IMW::EXTERNAL_PROGRAMS</tt>.
class Tarbz2

  include IMW::Files::BasicFile
  include IMW::Files::Archive
  include IMW::Files::CompressedFile

  # Default programs and flags for creating, listing, extracting, and
  # decompressing a <tt>tar.bz2</tt> archive.
  DEFAULT_FLAGS = {
    :decompression_program => :bzip2,
    :decompression_flags   => '-fd',
    :archive_program       => :tar,
    :archive_create_flags  => '-cf',
    :archive_list_flags    => "-tf",
    :archive_extract_flags => "-xjf"
  }

  # Returns the archive extension of +path+ (<tt>".tar.bz2"</tt> or
  # <tt>".tbz2"</tt>), or +nil+ when it has neither.
  def self.extname(path)
    if /\.tar\.bz2$/.match(path)
      ".tar.bz2"
    elsif /\.tbz2$/.match(path)
      ".tbz2"
    end
  end

  # Record +uri+ and fill <tt>@compression</tt> and <tt>@archive</tt>
  # from DEFAULT_FLAGS. Extra positional arguments are ignored.
  def initialize(uri, *args)
    self.uri = uri
    @compression = {
      :program             => DEFAULT_FLAGS[:decompression_program],
      # The key is :decompression_flags; the former :decompression
      # lookup did not exist in DEFAULT_FLAGS and always produced nil.
      :decompression_flags => DEFAULT_FLAGS[:decompression_flags]
    }
    @archive = {
      :program       => DEFAULT_FLAGS[:archive_program],
      :list_flags    => DEFAULT_FLAGS[:archive_list_flags],
      :extract_flags => DEFAULT_FLAGS[:archive_extract_flags],
      :create_flags  => DEFAULT_FLAGS[:archive_create_flags]
    }
  end

  # Returns the path of the file after decompression, or +nil+ when
  # the path ends in neither <tt>.tar.bz2</tt> nor <tt>.tbz2</tt>.
  def decompressed_path
    if /\.tar\.bz2$/.match(@path)
      @path.gsub(/\.tar\.bz2$/, ".tar")
    elsif /\.tbz2$/.match(@path)
      @path.gsub(/\.tbz2$/, ".tar")
    end
  end

  # Overrides the default behavior of IMW::Files::Archive#create: a
  # plain tar archive is written first and then compressed with bzip2.
  #
  # +paths+ is a single path or an array of paths to archive.
  #
  # Options:
  # <tt>:force</tt> (false):: overwrite an archive that already exists
  #
  # Raises IMW::Error if the archive exists and <tt>:force</tt> is not
  # set.
  def create(paths, opts = {})
    opts = opts.reverse_merge({:force => false})
    raise IMW::Error.new("An archive already exists at #{@path}.") if exist? && !opts[:force]
    paths = [paths] if paths.is_a?(String)
    IMW.system IMW::EXTERNAL_PROGRAMS[@archive[:program]], @archive[:create_flags], path_between_archive_and_compression, *paths
    IMW.open(path_between_archive_and_compression).compress!(:bzip2)
  end

  protected

  # Path of the intermediate <tt>.tar</tt> file that exists between
  # archiving and compression.
  def path_between_archive_and_compression
    File.join(dirname, name + '.tar')
  end

end # Tarbz2
|
159
|
-
|
160
|
-
# Wraps a +rar+ archive.
#
# The flags for creating, appending to, listing, and extracting are
# kept in <tt>IMW::Files::Rar::DEFAULT_FLAGS</tt>; all of them are
# handed to the <tt>:rar</tt> program named in
# <tt>IMW::EXTERNAL_PROGRAMS</tt>.
class Rar

  include IMW::Files::BasicFile
  include IMW::Files::Archive

  # Default per-action flags for a rar archive.
  DEFAULT_FLAGS = {
    :create  => "a -r -o+ -inul",
    :append  => "a -r -o+ -inul",
    :list    => "vb",
    :extract => "x -o+ -inul"
  }

  # Record +uri+ and fill <tt>@archive</tt> from DEFAULT_FLAGS.
  # Any extra positional arguments are accepted and ignored.
  def initialize(uri, *args)
    self.uri = uri
    @archive = { :program => :rar }
    # Each action's flags are stored under "<action>_flags".
    [:create, :append, :list, :extract].each do |action|
      @archive["#{action}_flags".to_sym] = DEFAULT_FLAGS[action]
    end
  end
end # Rar
|
190
|
-
|
191
|
-
# Wraps a +zip+ archive.
#
# The flags for creating, appending to, listing, and extracting are
# kept in <tt>IMW::Files::Zip::DEFAULT_FLAGS</tt>; they are handed to
# the <tt>:zip</tt> and <tt>:unzip</tt> programs named in
# <tt>IMW::EXTERNAL_PROGRAMS</tt>.
class Zip

  include IMW::Files::BasicFile
  include IMW::Files::Archive

  # Default per-action flags for a zip archive, plus the program used
  # for unarchiving.
  DEFAULT_FLAGS = {
    :create  => "-q -r",
    :append  => "-q -g",
    :list    => "-l",
    :extract => "-q -o",
    :unarchiving_program => :unzip
  }

  # Record +uri+ and fill <tt>@archive</tt> from DEFAULT_FLAGS.
  # Any extra positional arguments are accepted and ignored.
  def initialize(uri, *args)
    self.uri = uri
    @archive = {
      :program             => :zip,
      :create_flags        => DEFAULT_FLAGS[:create],
      :append_flags        => DEFAULT_FLAGS[:append],
      :list_flags          => DEFAULT_FLAGS[:list],
      :extract_flags       => DEFAULT_FLAGS[:extract],
      :unarchiving_program => DEFAULT_FLAGS[:unarchiving_program]
    }
  end

  # The `unzip' program outputs data in a very annoying format:
  #
  #   Archive:  data.zip
  #     Length     Date   Time    Name
  #    --------    ----   ----    ----
  #       18510  07-28-08 15:58   data/4d7Qrgz7.csv
  #        3418  07-28-08 15:41   data/7S.csv
  #       23353  07-28-08 15:41   data/g.csv
  #         711  07-28-08 15:58   data/g.xml
  #        1095  07-28-08 15:41   data/L.xml
  #        2399  07-28-08 15:58   data/mTAu9H3.xml
  #         152  07-28-08 15:58   data/vaHBS2t5R.dat
  #    --------                   -------
  #       49638                   7 files
  #
  # which is parsed by this method. Returns an array of the file names
  # listed in +string+; an empty array when the listing has no file
  # rows.
  def archive_contents_string_to_array(string)
    rows = string.split("\n")
    # Ignore the first 3 lines of the output and also discard the last
    # 2 (5 = 2 + 3). Array#[] returns nil when the listing is too
    # short, so fall back to an empty list instead of crashing.
    file_rows = rows[3, rows.length - 5] || []
    file_rows.map do |row|
      # The first three whitespace-separated columns are length, date,
      # and time; everything after them is the file name. Limiting the
      # split to 4 fields keeps spaces inside the name intact. A
      # non-bang strip is used because String#lstrip! returns nil when
      # there is nothing to strip.
      row.strip.split(/\s+/, 4)[3].to_s
    end
  end
end # Zip
|
256
|
-
|
257
|
-
# Wraps a <tt>gz</tt> compressed file.
#
# The decompression flags are kept in
# <tt>IMW::Files::Gz::DEFAULT_FLAGS</tt> and are handed to the
# <tt>:gzip</tt> program named in <tt>IMW::EXTERNAL_PROGRAMS</tt>.
class Gz

  include IMW::Files::BasicFile
  include IMW::Files::CompressedFile

  # Default program and flags used when extracting a <tt>gz</tt> file.
  DEFAULT_FLAGS = {
    :program       => :gzip,
    :decompression => '-fd'
  }

  # Record +uri+ and fill <tt>@compression</tt> from DEFAULT_FLAGS.
  # Any extra positional arguments are accepted and ignored.
  def initialize(uri, *args)
    self.uri = uri
    defaults = DEFAULT_FLAGS
    @compression = {
      :program             => defaults[:program],
      :decompression_flags => defaults[:decompression]
    }
  end

  # Returns the path of the file after decompression, i.e. the path
  # with its <tt>.gz</tt> suffix removed.
  def decompressed_path
    @path.gsub(/\.gz$/, "")
  end
end # Gz
|
285
|
-
|
286
|
-
# Wraps a <tt>bz2</tt> compressed file.
#
# The decompression flags are kept in
# <tt>IMW::Files::Bz2::DEFAULT_FLAGS</tt> and are handed to the
# <tt>:bzip2</tt> program named in <tt>IMW::EXTERNAL_PROGRAMS</tt>.
class Bz2

  include IMW::Files::BasicFile
  include IMW::Files::CompressedFile

  # Default program and flags used when extracting a <tt>bz2</tt> file.
  DEFAULT_FLAGS = {
    :program       => :bzip2,
    :decompression => '-fd'
  }

  # Record +uri+ and fill <tt>@compression</tt> from DEFAULT_FLAGS.
  # Any extra positional arguments are accepted and ignored.
  #
  # Raises IMW::Error when the extension is not <tt>.bz2</tt>.
  def initialize(uri, *args)
    self.uri = uri
    unless @extname == '.bz2'
      raise IMW::Error.new("#{@extname} is not a valid extension for a bzip2 compressed file.")
    end
    defaults = DEFAULT_FLAGS
    @compression = {
      :program             => defaults[:program],
      :decompression_flags => defaults[:decompression]
    }
  end

  # Returns the path of the file after decompression, i.e. the path
  # with its <tt>.bz2</tt> suffix removed.
  def decompressed_path
    @path.gsub(/\.bz2$/, "")
  end
end # Bz2
|
316
|
-
|
317
|
-
|
318
|
-
# Register the archive and compressed-file classes. Entries are
# appended in this order so that compound extensions (tar.bz2, tar.gz)
# precede their simple counterparts (bz2, gz).
[
  [/\.tar\.bz2$/, IMW::Files::Tarbz2],
  [/\.tbz2$/,     IMW::Files::Tarbz2],
  [/\.tar\.gz$/,  IMW::Files::Targz],
  [/\.tgz$/,      IMW::Files::Targz],
  [/\.tar$/,      IMW::Files::Tar],
  [/\.bz2$/,      IMW::Files::Bz2],
  [/\.gz$/,       IMW::Files::Gz],
  [/\.rar$/,      IMW::Files::Rar],
  [/\.zip$/,      IMW::Files::Zip]
].each { |regexp_and_class| FILE_REGEXPS << regexp_and_class }
|
330
|
-
|
331
|
-
end # Files
|
332
|
-
end # IMW
|
333
|
-
|
334
|
-
|
@@ -1,103 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# h2. lib/imw/files/compressible.rb -- compression module
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# Module used for compression of files. An including
|
7
|
-
# <tt>IMW::Files::BasicFile</tt> object gains +compress+ and
|
8
|
-
# <tt>compress!</tt> methods.
|
9
|
-
#
|
10
|
-
# By default, bzip2 is used for compression though gzip can also be
|
11
|
-
# specified (the full list of known compression programs is in
|
12
|
-
# <tt>IMW::Files::Compressible::COMPRESSION_PROGS</tt>). Zip and Rar
|
13
|
-
# compression are handled by the <tt>IMW::Files::Archive</tt> module.
|
14
|
-
#
|
15
|
-
# Decompression should be handled via the
|
16
|
-
# <tt>IMW::Files::CompressedFile</tt> class.
|
17
|
-
#
|
18
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
19
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
20
|
-
# License:: GPL 3.0
|
21
|
-
# Website:: http://infinitemonkeywrench.org/
|
22
|
-
#
|
23
|
-
# puts "#{File.basename(__FILE__)}: Why is it that when you squeeze a lemon you get lemonade but when you squeeze a banana you just get a mess?" # at bottom
|
24
|
-
module IMW
|
25
|
-
module Files
|
26
|
-
# Mixin that adds +compress+ and <tt>compress!</tt> to a file object.
# The including object is expected to provide +path+, +dirname+,
# +basename+ and <tt>exist?</tt>.
module Compressible

  # Known compression programs.
  COMPRESSION_PROGS = [:bzip2, :gzip]

  # Extensions that are appended by each compression program.
  COMPRESSION_EXTS = {
    :bzip2 => '.bz2',
    :gzip  => '.gz'
  }

  # Compression flags for each program.
  COMPRESSION_FLAGS = {
    :bzip2 => "-f",
    :gzip  => "-f"
  }

  protected

  # Check that +program+ is a valid compression program.
  #
  # Raises IMW::Error when +program+ is not listed in
  # COMPRESSION_PROGS.
  def ensure_valid_compression_program(program)
    return if COMPRESSION_PROGS.include?(program)
    raise IMW::Error.new("#{program} is not a valid compression program (#{COMPRESSION_PROGS.join(', ')}).")
  end

  # Construct the command string passed to the shell to compress this
  # file using the given +program+.
  #
  # NOTE(review): the path is interpolated unescaped, so paths that
  # contain spaces or shell metacharacters will not survive -- confirm
  # how IMW.system treats a single string argument.
  def compression_command(program)
    ensure_valid_compression_program program
    [IMW::EXTERNAL_PROGRAMS[program], COMPRESSION_FLAGS[program], self.path].join ' '
  end

  # Return the path this file will have after being compressed with
  # +program+.
  def compressed_file_path(program)
    ensure_valid_compression_program program
    File.join(self.dirname, self.basename + COMPRESSION_EXTS[program])
  end

  public

  # Compress this file in its present directory using +program+
  # (<tt>:bzip2</tt> by default; one of COMPRESSION_PROGS),
  # overwriting any existing compressed file and without saving the
  # original file. Returns the result of <tt>IMW.open</tt> on the
  # compressed file's path.
  #
  # Raises IMW::PathError when this file does not exist and IMW::Error
  # when +program+ is not a known compression program.
  def compress!(program = :bzip2)
    raise IMW::PathError.new("cannot compress #{@path}, doesn't exist!") unless exist?
    FileUtils.cd(@dirname) { IMW.system(self.compression_command(program)) }
    IMW.open(self.compressed_file_path(program))
  end

  # Compress this file in its present directory using +program+
  # (<tt>:bzip2</tt> by default; one of COMPRESSION_PROGS),
  # overwriting any existing compressed file while keeping the
  # original file. Returns the result of <tt>IMW.open</tt> on the
  # compressed file's path.
  #
  # Raises IMW::PathError when this file does not exist and IMW::Error
  # when +program+ is not a known compression program.
  def compress(program = :bzip2)
    raise IMW::PathError.new("cannot compress #{@path}, doesn't exist!") unless exist?
    backup = self.path + 'copy'
    begin
      FileUtils.cp(self.path, backup)
      compress! program
    ensure
      # Restore the original only if the backup was actually written;
      # otherwise a failed copy would be masked by a second error here.
      FileUtils.mv(backup, self.path) if ::File.exist?(backup)
    end
    IMW.open(self.compressed_file_path(program))
  end

end
|
101
|
-
end
|
102
|
-
end
|
103
|
-
|
data/lib/imw/files/csv.rb
DELETED
@@ -1,113 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# h2. lib/imw/files/csv.rb -- CSV, TSV files
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# For "comma-separated value" (CSV) and "tab-separated value" (TSV)
|
7
|
-
# files.
|
8
|
-
#
|
9
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
10
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
11
|
-
# License:: GPL 3.0
|
12
|
-
# Website:: http://infinitemonkeywrench.org/
|
13
|
-
#
|
14
|
-
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
15
|
-
|
16
|
-
require 'fastercsv'
|
17
|
-
module IMW
|
18
|
-
module Files
|
19
|
-
|
20
|
-
# A base class from which to subclass various types of tabular data
# files (CSV, TSV, &c.). It inherits from FasterCSV, so the methods of
# that library are available on instances.
class TabularDataFile < FasterCSV

  include IMW::Files::BasicFile
  include IMW::Files::Compressible

  # Default options to be passed to
  # FasterCSV[http://fastercsv.rubyforge.org/]; see its
  # documentation for more information.
  DEFAULT_OPTIONS = {
    :col_sep        => ',',
    :headers        => false,
    :return_headers => false,
    :write_headers  => true,
    :skip_blanks    => false,
    :force_quotes   => false
  }

  # Open the file at +uri+ in +mode+ and hand it to FasterCSV together
  # with +options+ merged over the class's DEFAULT_OPTIONS. The
  # caller's +options+ hash is left untouched.
  def initialize(uri, mode = 'r', options = {})
    options = options.reverse_merge(self.class::DEFAULT_OPTIONS)
    self.uri = uri
    options.delete(:write) # FasterCSV complains about unknown options
    super open(uri, mode), options
  end

  # Return the contents of this CSV file as an array of arrays.
  def load
    entries
  end

  # Dump +data+ (an enumerable of rows) to this file and return self.
  #
  # Options include:
  # <tt>:flush</tt> (true):: flush the file buffer, writing it to disk
  # <tt>:close</tt> (true):: close the file after writing +data+
  def dump(data, options = {})
    options = options.reverse_merge :close => true, :flush => true
    data.each { |row| self << row }
    self.flush if options[:flush]
    self.close if options[:close]
    self
  end

  # Return a random sample of up to +length+ rows. Fewer rows are
  # returned when the file is shorter than +length+ or when malformed
  # CSV is encountered part-way through.
  def sample(length = 10)
    rows, indices = [], Set.new
    begin
      each_with_index do |row, index|
        break if rows.size == length
        next if index != 0 && rand < 0.75 # skip 3/4 of rows after the 1st
        rows << row
        indices << index
      end
      # Now fill up to length if not there already. The reader is
      # exhausted after the first pass, so it must be rewound; a single
      # extra pass is enough because it takes every row not yet chosen.
      if rows.size < length
        rewind
        each_with_index do |row, index|
          break if rows.size == length
          next if indices.include?(index)
          rows << row
        end
      end
      rows
    rescue FasterCSV::MalformedCSVError
      rows
    end
  end
end
|
88
|
-
|
89
|
-
|
90
|
-
# A file of comma-separated values (CSV). Because TabularDataFile
# descends from <tt>FasterCSV</tt>, the methods of that library are
# available for use.
#
# See <tt>IMW::Files::TabularDataFile</tt> for more complete
# documentation.
class Csv < TabularDataFile; end
|
98
|
-
|
99
|
-
# A file of tab-separated values (TSV). Because TabularDataFile
# descends from <tt>FasterCSV</tt>, the methods of that library are
# available for use.
#
# See <tt>IMW::Files::TabularDataFile</tt> for more complete
# documentation.
class Tsv < TabularDataFile
  # Same as the inherited defaults, but with a tab column separator.
  DEFAULT_OPTIONS = DEFAULT_OPTIONS.merge(:col_sep => "\t")
end
|
108
|
-
|
109
|
-
# Register the tabular file classes by extension.
[
  [/\.csv$/, IMW::Files::Csv],
  [/\.tsv$/, IMW::Files::Tsv]
].each { |regexp_and_class| FILE_REGEXPS << regexp_and_class }
|
111
|
-
|
112
|
-
end
|
113
|
-
end
|
data/lib/imw/files/directory.rb
DELETED
@@ -1,62 +0,0 @@
|
|
1
|
-
require 'imw/files/basicfile'
|
2
|
-
module IMW
|
3
|
-
module Files
|
4
|
-
# Wraps a directory identified by a URI.
class Directory

  include IMW::Files::BasicFile

  # FIXME these should be defined by BasicFile and then removed here but I don't see how...
  # [:executable?, :executable_real?, :pipe?, :socket?, :rm, :rm!, :extname, :extname=, :name, :name=].each do |method|
  #   instance_eval do
  #     remove_method method
  #   end
  # end

  # Set the URI of this directory and derive <tt>@host</tt>,
  # <tt>@path</tt>, <tt>@dirname</tt> and <tt>@basename</tt> from it.
  # +uri+ may be a URI::Generic or anything URI.parse accepts. Local
  # paths are expanded; remote paths are kept as given.
  def uri=(uri)
    @uri =
      if uri.is_a?(URI::Generic)
        uri
      else
        URI.parse(uri)
      end
    @host = self.uri.host
    @path =
      if local?
        ::File.expand_path(self.uri.path)
      else
        self.uri.path
      end
    @dirname  = ::File.dirname(path)
    @basename = ::File.basename(path)
  end

  # Create a directory object for +uri+.
  def initialize(uri)
    self.uri = uri
  end

  # Return the entries of this directory matching the glob +selector+
  # (everything by default). Returns +nil+ for a remote directory.
  def [](selector = '*')
    return unless local?
    Dir[File.join(path, selector)]
  end

  # Copy the contents of this directory to +new_dir+ and return a new
  # object of the same class for +new_dir+.
  #
  # Raises IMW::PathError when this directory does not exist or is
  # remote.
  def cp(new_dir)
    raise IMW::PathError.new("cannot copy from #{path}, doesn't exist!") unless exist?
    raise IMW::PathError.new("cannot recursively copy remote directories (yet!)") unless local?
    FileUtils.cp_r path, new_dir
    self.class.new(new_dir)
  end

  # Move this directory to +new_dir+ and return a new object of the
  # same class for +new_dir+.
  #
  # Raises IMW::PathError when this directory does not exist or is
  # remote.
  def mv(new_dir)
    raise IMW::PathError.new("cannot move from #{path}, doesn't exist!") unless exist?
    raise IMW::PathError.new("cannot move remote directories (yet!)") unless local?
    FileUtils.mv path, new_dir
    self.class.new(new_dir)
  end
  alias_method :mv!, :mv

  # Move this directory so it sits beneath +dir+.
  def mv_to_dir(dir)
    destination = File.join(File.expand_path(dir), basename)
    mv destination
  end
  alias_method :mv_to_dir!, :mv_to_dir

end
|
61
|
-
end
|
62
|
-
end
|
data/lib/imw/files/excel.rb
DELETED
@@ -1,84 +0,0 @@
|
|
1
|
-
require 'spreadsheet'
|
2
|
-
|
3
|
-
# FIXME Main issue with this:
|
4
|
-
# You can make a new excel book and dump data to it no problem.
|
5
|
-
# However, something that doesn't seem to work is dumping to a file, opening,
|
6
|
-
# and dumping to it again. At the moment this is probably not a big deal.
|
7
|
-
|
8
|
-
|
9
|
-
module IMW
|
10
|
-
module Files
|
11
|
-
# Wraps an Excel workbook handled through the Spreadsheet library.
# Supports loading the first worksheet and dumping rows into it.
class Excel
  include IMW::Files::BasicFile
  include IMW::Files::Compressible

  # need to initialize, load, and dump
  attr_accessor :book, :idx, :max_lines, :sht_idx, :sht_row, :book_idx

  # Record +uri+ and either open the workbook found there or start a
  # new workbook with one sheet. +mode+ is accepted but not used here.
  #
  # Options:
  # <tt>:max_lines</tt> (65000):: row count at which +dump+ refuses to
  #                               write further rows
  def initialize(uri, mode, options = {})
    self.uri = uri
    @max_lines = options[:max_lines] || 65000
    @idx = @book_idx = @sht_idx = 0
    if self.exist?
      get_existing_book
    else
      make_new_book
      make_new_sheet
    end
  end

  # Return the rows of the current sheet as an array of arrays.
  def load
    @sheet.map { |sheet_row| sheet_row.to_a }
  end

  # Append each row of +data+ to the current sheet, then save the
  # workbook unless the sheet is still empty. Raises a RuntimeError
  # ("too many lines") once the sheet has reached +max_lines+ rows.
  def dump(data)
    data.each do |record|
      raise "too many lines" if too_many?
      self << record
    end
    no_data? ? nil : save
  end

  # Append +line+ (an array of cell values) as the next row of the
  # current sheet and advance the row counters.
  def <<(line)
    @sheet.row(@sht_row).concat(line)
    @sht_row += 1
    @idx += 1
  end

  # Start a fresh workbook.
  def make_new_book
    @book = Spreadsheet::Workbook.new
    @book_idx += 1
  end

  # Add a worksheet to the current workbook and make it current.
  def make_new_sheet
    @sheet = @book.create_worksheet
    @sht_idx += 1
    @sht_row = 0 # always start at row 0 in a new sheet
  end

  # Open the workbook at +path+ and make its first worksheet current.
  def get_existing_book
    @book = Spreadsheet.open(path)
    @sheet = book.worksheet(0)
    @sht_row = @sheet.row_count # would like to be able to dump new data, doesn't work
    @sht_idx += 1
  end

  # Make the worksheet at index <tt>@sht_idx</tt> current.
  def incr_sheet
    @sheet = book.worksheet(@sht_idx)
  end

  # True once the current sheet holds +max_lines+ rows or more.
  def too_many?
    @sht_row >= @max_lines
  end

  # True while the current sheet has no rows.
  def no_data?
    @sht_row == 0
  end

  # Write the workbook to +path+.
  def save
    @book.write(path)
  end
end
|
83
|
-
end
|
84
|
-
end
|