imw 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -1
- data/Rakefile +10 -0
- data/TODO +18 -0
- data/VERSION +1 -1
- data/bin/imw +1 -1
- data/etc/imwrc.rb +0 -50
- data/examples/dataset.rb +12 -0
- data/lib/imw/boot.rb +55 -9
- data/lib/imw/dataset/paths.rb +15 -24
- data/lib/imw/dataset/workflow.rb +131 -72
- data/lib/imw/dataset.rb +94 -186
- data/lib/imw/parsers/html_parser.rb +1 -1
- data/lib/imw/parsers.rb +1 -1
- data/lib/imw/repository.rb +3 -27
- data/lib/imw/resource.rb +190 -0
- data/lib/imw/resources/archive.rb +97 -0
- data/lib/imw/resources/archives_and_compressed/bz2.rb +18 -0
- data/lib/imw/resources/archives_and_compressed/gz.rb +18 -0
- data/lib/imw/resources/archives_and_compressed/rar.rb +23 -0
- data/lib/imw/resources/archives_and_compressed/tar.rb +23 -0
- data/lib/imw/resources/archives_and_compressed/tarbz2.rb +78 -0
- data/lib/imw/resources/archives_and_compressed/targz.rb +78 -0
- data/lib/imw/resources/archives_and_compressed/zip.rb +57 -0
- data/lib/imw/resources/archives_and_compressed.rb +32 -0
- data/lib/imw/resources/compressed_file.rb +89 -0
- data/lib/imw/resources/compressible.rb +77 -0
- data/lib/imw/resources/formats/delimited.rb +92 -0
- data/lib/imw/resources/formats/excel.rb +125 -0
- data/lib/imw/resources/formats/json.rb +53 -0
- data/lib/imw/resources/formats/sgml.rb +72 -0
- data/lib/imw/resources/formats/yaml.rb +53 -0
- data/lib/imw/resources/formats.rb +32 -0
- data/lib/imw/resources/local.rb +198 -0
- data/lib/imw/resources/remote.rb +110 -0
- data/lib/imw/resources/schemes/hdfs.rb +242 -0
- data/lib/imw/resources/schemes/http.rb +161 -0
- data/lib/imw/resources/schemes/s3.rb +137 -0
- data/lib/imw/resources/schemes.rb +19 -0
- data/lib/imw/resources.rb +118 -0
- data/lib/imw/runner.rb +5 -4
- data/lib/imw/transforms/archiver.rb +215 -0
- data/lib/imw/transforms/transferer.rb +103 -0
- data/lib/imw/transforms.rb +8 -0
- data/lib/imw/utils/error.rb +26 -30
- data/lib/imw/utils/extensions/array.rb +5 -15
- data/lib/imw/utils/extensions/hash.rb +6 -16
- data/lib/imw/utils/extensions/hpricot.rb +0 -14
- data/lib/imw/utils/extensions/string.rb +5 -15
- data/lib/imw/utils/extensions/symbol.rb +0 -13
- data/lib/imw/utils/extensions.rb +65 -0
- data/lib/imw/utils/log.rb +14 -13
- data/lib/imw/utils/misc.rb +0 -6
- data/lib/imw/utils/paths.rb +101 -42
- data/lib/imw/utils/version.rb +8 -9
- data/lib/imw/utils.rb +2 -18
- data/lib/imw.rb +92 -17
- data/spec/data/sample.csv +1 -1
- data/spec/data/sample.json +1 -0
- data/spec/data/sample.tsv +1 -1
- data/spec/data/sample.txt +1 -1
- data/spec/data/sample.xml +1 -1
- data/spec/data/sample.yaml +1 -1
- data/spec/imw/dataset/paths_spec.rb +32 -0
- data/spec/imw/dataset/workflow_spec.rb +41 -0
- data/spec/imw/resource_spec.rb +79 -0
- data/spec/imw/resources/archive_spec.rb +69 -0
- data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +15 -0
- data/spec/imw/resources/archives_and_compressed/gz_spec.rb +15 -0
- data/spec/imw/resources/archives_and_compressed/rar_spec.rb +16 -0
- data/spec/imw/resources/archives_and_compressed/tar_spec.rb +16 -0
- data/spec/imw/resources/archives_and_compressed/tarbz2_spec.rb +24 -0
- data/spec/imw/resources/archives_and_compressed/targz_spec.rb +21 -0
- data/spec/imw/resources/archives_and_compressed/zip_spec.rb +16 -0
- data/spec/imw/resources/compressed_file_spec.rb +48 -0
- data/spec/imw/resources/compressible_spec.rb +36 -0
- data/spec/imw/resources/formats/delimited_spec.rb +33 -0
- data/spec/imw/resources/formats/json_spec.rb +32 -0
- data/spec/imw/resources/formats/sgml_spec.rb +24 -0
- data/spec/imw/resources/formats/yaml_spec.rb +41 -0
- data/spec/imw/resources/local_spec.rb +98 -0
- data/spec/imw/resources/remote_spec.rb +35 -0
- data/spec/imw/resources/schemes/hdfs_spec.rb +61 -0
- data/spec/imw/resources/schemes/http_spec.rb +19 -0
- data/spec/imw/resources/schemes/s3_spec.rb +19 -0
- data/spec/imw/transforms/archiver_spec.rb +120 -0
- data/spec/imw/transforms/transferer_spec.rb +113 -0
- data/spec/imw/utils/paths_spec.rb +5 -33
- data/spec/imw/utils/shared_paths_spec.rb +29 -0
- data/spec/spec_helper.rb +5 -5
- data/spec/support/paths_matcher.rb +67 -0
- data/spec/support/random.rb +39 -36
- metadata +88 -75
- data/lib/imw/dataset/task.rb +0 -41
- data/lib/imw/files/archive.rb +0 -113
- data/lib/imw/files/basicfile.rb +0 -122
- data/lib/imw/files/binary.rb +0 -28
- data/lib/imw/files/compressed_file.rb +0 -93
- data/lib/imw/files/compressed_files_and_archives.rb +0 -334
- data/lib/imw/files/compressible.rb +0 -103
- data/lib/imw/files/csv.rb +0 -113
- data/lib/imw/files/directory.rb +0 -62
- data/lib/imw/files/excel.rb +0 -84
- data/lib/imw/files/json.rb +0 -41
- data/lib/imw/files/sgml.rb +0 -46
- data/lib/imw/files/text.rb +0 -68
- data/lib/imw/files/yaml.rb +0 -46
- data/lib/imw/files.rb +0 -125
- data/lib/imw/packagers/archiver.rb +0 -126
- data/lib/imw/packagers/s3_mover.rb +0 -36
- data/lib/imw/packagers.rb +0 -8
- data/lib/imw/utils/components.rb +0 -61
- data/lib/imw/utils/config.rb +0 -46
- data/lib/imw/utils/extensions/class/attribute_accessors.rb +0 -8
- data/lib/imw/utils/extensions/core.rb +0 -27
- data/lib/imw/utils/extensions/dir.rb +0 -24
- data/lib/imw/utils/extensions/file_core.rb +0 -64
- data/lib/imw/utils/extensions/typed_struct.rb +0 -22
- data/lib/imw/utils/extensions/uri.rb +0 -59
- data/lib/imw/utils/view/dump_csv.rb +0 -112
- data/lib/imw/utils/view/dump_csv_older.rb +0 -117
- data/lib/imw/utils/view.rb +0 -113
- data/spec/imw/dataset/datamapper/uri_spec.rb +0 -43
- data/spec/imw/dataset/datamapper_spec_helper.rb +0 -11
- data/spec/imw/files/archive_spec.rb +0 -118
- data/spec/imw/files/basicfile_spec.rb +0 -121
- data/spec/imw/files/bz2_spec.rb +0 -32
- data/spec/imw/files/compressed_file_spec.rb +0 -96
- data/spec/imw/files/compressible_spec.rb +0 -100
- data/spec/imw/files/file_spec.rb +0 -144
- data/spec/imw/files/gz_spec.rb +0 -32
- data/spec/imw/files/rar_spec.rb +0 -33
- data/spec/imw/files/tar_spec.rb +0 -31
- data/spec/imw/files/text_spec.rb +0 -23
- data/spec/imw/files/zip_spec.rb +0 -31
- data/spec/imw/files_spec.rb +0 -38
- data/spec/imw/packagers/archiver_spec.rb +0 -125
- data/spec/imw/packagers/s3_mover_spec.rb +0 -7
- data/spec/imw/utils/extensions/file_core_spec.rb +0 -72
- data/spec/imw/utils/extensions/find_spec.rb +0 -113
- data/spec/imw/workflow/rip/local_spec.rb +0 -89
- data/spec/imw/workflow/rip_spec.rb +0 -27
- data/spec/support/archive_contents_matcher.rb +0 -94
- data/spec/support/directory_contents_matcher.rb +0 -61
|
@@ -1,334 +0,0 @@
|
|
|
1
|
-
module IMW
|
|
2
|
-
module Files
|
|
3
|
-
|
|
4
|
-
# Wraps a +tar+ archive.
#
# The flags used to create, append to, list, and extract the archive
# are kept in <tt>IMW::Files::Tar::DEFAULT_FLAGS</tt>, next to the key
# (<tt>:tar</tt>) naming the program in
# <tt>IMW::EXTERNAL_PROGRAMS</tt> they are meant for.
class Tar

  include IMW::Files::BasicFile
  include IMW::Files::Archive
  include IMW::Files::Compressible

  # Default program key and command-line flags, one entry per archive
  # operation.
  DEFAULT_FLAGS = {
    :create  => "-cf",
    :append  => "-rf",
    :list    => "-tf",
    :extract => "-xf",
    :program => :tar
  }

  # Assigns +uri+ through the +uri=+ writer and fills
  # <tt>@archive</tt> with the program and per-operation flags copied
  # from DEFAULT_FLAGS.  Any further +args+ are accepted and ignored.
  def initialize uri, *args
    self.uri = uri
    @archive = { :program => DEFAULT_FLAGS[:program] }
    [:create, :append, :list, :extract].each do |operation|
      @archive[:"#{operation}_flags"] = DEFAULT_FLAGS[operation]
    end
  end
end # Tar
|
|
36
|
-
|
|
37
|
-
# Wraps a <tt>tar.gz</tt> (or <tt>.tgz</tt>) archive.
#
# Listing, extraction, and decompression settings are kept in
# <tt>IMW::Files::Targz::DEFAULT_FLAGS</tt>.  The program keys stored
# there (<tt>:tar</tt>, <tt>:gzip</tt>) refer to entries of
# <tt>IMW::EXTERNAL_PROGRAMS</tt>.
class Targz

  include IMW::Files::BasicFile
  include IMW::Files::Archive
  include IMW::Files::CompressedFile

  # Default programs and flags for decompressing, listing, and
  # extracting a <tt>tar.gz</tt> archive.
  DEFAULT_FLAGS = {
    :decompression_program => :gzip,
    :decompression_flags   => '-fd',
    :archive_program       => :tar,
    :archive_list_flags    => "-tf",
    :archive_extract_flags => "-xzf"
  }

  # Assigns +uri+ through the +uri=+ writer, then copies the defaults
  # into <tt>@compression</tt> and <tt>@archive</tt>.  Extra +args+
  # are accepted and ignored.
  def initialize uri, *args
    self.uri = uri
    defaults = DEFAULT_FLAGS
    @compression = {
      :program             => defaults[:decompression_program],
      :decompression_flags => defaults[:decompression_flags]
    }
    @archive = {
      :program       => defaults[:archive_program],
      :list_flags    => defaults[:archive_list_flags],
      :extract_flags => defaults[:archive_extract_flags]
    }
  end

  # Returns the path of the file after decompression: <tt>@path</tt>
  # with a trailing <tt>.tar.gz</tt> or <tt>.tgz</tt> swapped for
  # <tt>.tar</tt>.  Returns +nil+ when neither suffix is present.
  def decompressed_path
    return @path.gsub(/\.tar\.gz$/, ".tar") if /\.tar\.gz$/.match(@path)
    return @path.gsub(/\.tgz$/, ".tar") if /\.tgz$/.match(@path)
    nil
  end

  # Returns the archive extension carried by +path+
  # (<tt>".tar.gz"</tt> or <tt>".tgz"</tt>), or +nil+ if it has neither.
  def self.extname path
    return ".tar.gz" if /\.tar\.gz$/.match(path)
    return ".tgz" if /\.tgz$/.match(path)
    nil
  end

end # Targz
|
|
89
|
-
|
|
90
|
-
# Wraps a <tt>tar.bz2</tt> (or <tt>.tbz2</tt>) archive.
#
# Creation, listing, extraction, and decompression settings are kept
# in <tt>IMW::Files::Tarbz2::DEFAULT_FLAGS</tt>.  The program keys
# stored there (<tt>:tar</tt>, <tt>:bzip2</tt>) refer to entries of
# <tt>IMW::EXTERNAL_PROGRAMS</tt>.
class Tarbz2

  include IMW::Files::BasicFile
  include IMW::Files::Archive
  include IMW::Files::CompressedFile

  # Default programs and flags for creating, listing, extracting, and
  # decompressing a <tt>tar.bz2</tt> archive.
  DEFAULT_FLAGS = {
    :decompression_program => :bzip2,
    :decompression_flags   => '-fd',
    :archive_program       => :tar,
    :archive_create_flags  => '-cf',
    :archive_list_flags    => "-tf",
    :archive_extract_flags => "-xjf"
  }

  # Returns the archive extension carried by +path+
  # (<tt>".tar.bz2"</tt> or <tt>".tbz2"</tt>), or +nil+ if it has
  # neither.
  def self.extname path
    if /\.tar\.bz2$/.match(path)
      ".tar.bz2"
    elsif /\.tbz2$/.match(path)
      ".tbz2"
    end
  end

  # Assigns +uri+ through the +uri=+ writer, then copies the defaults
  # into <tt>@compression</tt> and <tt>@archive</tt>.  Extra +args+
  # are accepted and ignored.
  def initialize uri, *args
    self.uri = uri
    @compression = {
      :program             => DEFAULT_FLAGS[:decompression_program],
      # The key is :decompression_flags; looking up :decompression (as
      # this code used to) always produced nil.
      :decompression_flags => DEFAULT_FLAGS[:decompression_flags]
    }
    @archive = {
      :program       => DEFAULT_FLAGS[:archive_program],
      :list_flags    => DEFAULT_FLAGS[:archive_list_flags],
      :extract_flags => DEFAULT_FLAGS[:archive_extract_flags],
      :create_flags  => DEFAULT_FLAGS[:archive_create_flags]
    }
  end

  # Returns the path of the file after decompression: <tt>@path</tt>
  # with a trailing <tt>.tar.bz2</tt> or <tt>.tbz2</tt> swapped for
  # <tt>.tar</tt>.  Returns +nil+ when neither suffix is present.
  def decompressed_path
    if /\.tar\.bz2$/.match(@path)
      @path.gsub(/\.tar\.bz2$/, ".tar")
    elsif /\.tbz2$/.match(@path)
      @path.gsub(/\.tbz2$/, ".tar")
    end
  end

  # Overrides the default behavior of IMW::Files::Archive#create so
  # that the freshly created tar file is compressed with bzip2.
  #
  # +paths+ is a single path (String) or a list of paths to archive.
  #
  # Options:
  #
  # <tt>:force</tt> (false):: proceed even if an archive already exists
  #
  # Raises IMW::Error if the archive exists and <tt>:force</tt> is not
  # set.  Returns whatever <tt>compress!(:bzip2)</tt> returns.
  def create paths, opts={}
    opts = opts.reverse_merge({:force => false})
    raise IMW::Error.new("An archive already exists at #{@path}.") if exist? && !opts[:force]
    paths = [paths] if paths.is_a?(String)
    # First build the plain tar file, then compress it.
    IMW.system IMW::EXTERNAL_PROGRAMS[@archive[:program]], @archive[:create_flags], path_between_archive_and_compression, *paths
    IMW.open(path_between_archive_and_compression).compress!(:bzip2)
  end

  protected

  # Path of the intermediate <tt>.tar</tt> file: +name+ plus
  # <tt>".tar"</tt> inside +dirname+.
  # NOTE(review): +dirname+ and +name+ come from a mixin not shown
  # here; presumably +name+ is the basename without its extension.
  def path_between_archive_and_compression
    File.join(dirname, name + '.tar')
  end

end # Tarbz2
|
|
159
|
-
|
|
160
|
-
# Wraps a +rar+ archive.
#
# The flags used to create, append to, list, and extract the archive
# are kept in <tt>IMW::Files::Rar::DEFAULT_FLAGS</tt>; the archive
# program key is <tt>:rar</tt>, an entry of
# <tt>IMW::EXTERNAL_PROGRAMS</tt>.
class Rar

  include IMW::Files::BasicFile
  include IMW::Files::Archive

  # Default command-line flags, one entry per archive operation.
  DEFAULT_FLAGS = {
    :create  => "a -r -o+ -inul",
    :append  => "a -r -o+ -inul",
    :list    => "vb",
    :extract => "x -o+ -inul"
  }

  # Assigns +uri+ through the +uri=+ writer and fills
  # <tt>@archive</tt> with the <tt>:rar</tt> program key plus the
  # per-operation flags copied from DEFAULT_FLAGS.  Any further +args+
  # are accepted and ignored.
  def initialize uri, *args
    self.uri = uri
    @archive = { :program => :rar }
    [:create, :append, :list, :extract].each do |operation|
      @archive[:"#{operation}_flags"] = DEFAULT_FLAGS[operation]
    end
  end
end # Rar
|
|
190
|
-
|
|
191
|
-
# A class to wrap a +zip+ archive.
|
|
192
|
-
#
|
|
193
|
-
# Creation, appending, listing, and extraction flags are stored in
|
|
194
|
-
# <tt>IMW::Files::Zip::DEFAULT_FLAGS</tt> and all are passed to
|
|
195
|
-
# the <tt>:zip</tt> and <tt>:unzip</tt> programs in
|
|
196
|
-
# <tt>IMW::EXTERNAL_PROGRAMS</tt>.
|
|
197
|
-
class Zip
|
|
198
|
-
|
|
199
|
-
include IMW::Files::BasicFile
|
|
200
|
-
include IMW::Files::Archive
|
|
201
|
-
|
|
202
|
-
# The default flags used creating, appending to, listing, and
|
|
203
|
-
# extracting a zip archive.
|
|
204
|
-
DEFAULT_FLAGS = {
|
|
205
|
-
:create => "-q -r",
|
|
206
|
-
:append => "-q -g",
|
|
207
|
-
:list => "-l",
|
|
208
|
-
:extract => "-q -o",
|
|
209
|
-
:unarchiving_program => :unzip
|
|
210
|
-
}
|
|
211
|
-
|
|
212
|
-
# Assigns +uri+ through the +uri=+ writer and fills <tt>@archive</tt>
# with the <tt>:zip</tt> program key, the per-operation flags copied
# from DEFAULT_FLAGS, and the program key used for unarchiving.  Any
# further +args+ are accepted and ignored.
def initialize uri, *args
  self.uri = uri
  @archive = { :program => :zip }
  [:create, :append, :list, :extract].each do |operation|
    @archive[:"#{operation}_flags"] = DEFAULT_FLAGS[operation]
  end
  @archive[:unarchiving_program] = DEFAULT_FLAGS[:unarchiving_program]
end
|
|
223
|
-
|
|
224
|
-
# The `unzip' program outputs data in a very annoying format:
#
#   Archive:  data.zip
#     Length     Date   Time    Name
#    --------    ----   ----    ----
#       18510  07-28-08 15:58   data/4d7Qrgz7.csv
#        3418  07-28-08 15:41   data/7S.csv
#       23353  07-28-08 15:41   data/g.csv
#         711  07-28-08 15:58   data/g.xml
#        1095  07-28-08 15:41   data/L.xml
#        2399  07-28-08 15:58   data/mTAu9H3.xml
#         152  07-28-08 15:58   data/vaHBS2t5R.dat
#    --------                   -------
#       49638                   7 files
#
# which is parsed by this method.
#
# +string+ is the complete listing.  Returns an array with one file
# name per listed entry; a listing too short to contain any entries
# yields an empty array.
def archive_contents_string_to_array string
  rows = string.split("\n")
  # Ignore the first 3 lines of the output and also discard the last
  # 2 (5 = 2 + 3).  The slice is nil when there are too few lines.
  file_rows = rows[3, rows.length - 5] || []
  file_rows.map do |row|
    # The first three whitespace-separated columns are length, date,
    # and time; everything after them is the file name, kept verbatim
    # so that names containing spaces survive intact.
    row.strip.split(' ', 4)[3].to_s
  end
end
|
|
255
|
-
end # Zip
|
|
256
|
-
|
|
257
|
-
# Wraps a <tt>gz</tt> compressed file.
#
# The decompression flags are kept in
# <tt>IMW::Files::Gz::DEFAULT_FLAGS</tt>; the program key stored there
# (<tt>:gzip</tt>) refers to an entry of
# <tt>IMW::EXTERNAL_PROGRAMS</tt>.
class Gz

  include IMW::Files::BasicFile
  include IMW::Files::CompressedFile

  # Default program key and flags for extracting a <tt>gz</tt> file.
  DEFAULT_FLAGS = {
    :program       => :gzip,
    :decompression => '-fd'
  }

  # Assigns +uri+ through the +uri=+ writer and copies the defaults
  # into <tt>@compression</tt>.  Extra +args+ are accepted and ignored.
  def initialize uri, *args
    self.uri = uri
    program, flags = DEFAULT_FLAGS.values_at(:program, :decompression)
    @compression = { :program => program, :decompression_flags => flags }
  end

  # Returns the path of the file after decompression: <tt>@path</tt>
  # without its trailing <tt>.gz</tt>.
  def decompressed_path
    @path.gsub(/\.gz$/, "")
  end
end # Gz
|
|
285
|
-
|
|
286
|
-
# Wraps a <tt>bz2</tt> compressed file.
#
# The decompression flags are kept in
# <tt>IMW::Files::Bz2::DEFAULT_FLAGS</tt>; the program key stored
# there (<tt>:bzip2</tt>) refers to an entry of
# <tt>IMW::EXTERNAL_PROGRAMS</tt>.
class Bz2

  include IMW::Files::BasicFile
  include IMW::Files::CompressedFile

  # Default program key and flags for extracting a <tt>bz2</tt> file.
  DEFAULT_FLAGS = {
    :program       => :bzip2,
    :decompression => '-fd'
  }

  # Assigns +uri+ through the +uri=+ writer, rejects anything whose
  # <tt>@extname</tt> is not <tt>.bz2</tt>, and copies the defaults
  # into <tt>@compression</tt>.  Extra +args+ are accepted and ignored.
  #
  # Raises IMW::Error for any other extension.
  def initialize uri, *args
    self.uri = uri
    if @extname != '.bz2'
      raise IMW::Error.new("#{@extname} is not a valid extension for a bzip2 compressed file.")
    end
    program, flags = DEFAULT_FLAGS.values_at(:program, :decompression)
    @compression = { :program => program, :decompression_flags => flags }
  end

  # Returns the path of the file after decompression: <tt>@path</tt>
  # without its trailing <tt>.bz2</tt>.
  def decompressed_path
    @path.gsub(/\.bz2$/, "")
  end
end # Bz2
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
# Append one [pattern, class] pair per supported extension to
# FILE_REGEXPS.  Compound extensions go in ahead of the simple ones
# they end with (tar.bz2 before bz2, tar.gz before gz, and so on).
[
  [/\.tar\.bz2$/, IMW::Files::Tarbz2],
  [/\.tbz2$/,     IMW::Files::Tarbz2],
  [/\.tar\.gz$/,  IMW::Files::Targz],
  [/\.tgz$/,      IMW::Files::Targz],
  [/\.tar$/,      IMW::Files::Tar],
  [/\.bz2$/,      IMW::Files::Bz2],
  [/\.gz$/,       IMW::Files::Gz],
  [/\.rar$/,      IMW::Files::Rar],
  [/\.zip$/,      IMW::Files::Zip]
].each { |registration| FILE_REGEXPS << registration }
|
|
330
|
-
|
|
331
|
-
end # Files
|
|
332
|
-
end # IMW
|
|
333
|
-
|
|
334
|
-
|
|
@@ -1,103 +0,0 @@
|
|
|
1
|
-
#
|
|
2
|
-
# h2. lib/imw/files/compressible.rb -- compression module
|
|
3
|
-
#
|
|
4
|
-
# == About
|
|
5
|
-
#
|
|
6
|
-
# Module used for compression of files. An including
|
|
7
|
-
# <tt>IMW::Files::BasicFile</tt> object gains +compress+ and
|
|
8
|
-
# <tt>compress!</tt> methods.
|
|
9
|
-
#
|
|
10
|
-
# By default, bzip2 is used for compression though gzip can also be
|
|
11
|
-
# specified (the full list of known compression programs is in
|
|
12
|
-
# <tt>IMW::Files::Compressible::COMPRESSION_PROGS</tt>). Zip and Rar
|
|
13
|
-
# compression are handled by the <tt>IMW::Files::Archive</tt> module.
|
|
14
|
-
#
|
|
15
|
-
# Decompression should be handled via the
|
|
16
|
-
# <tt>IMW::Files::CompressedFile</tt> class.
|
|
17
|
-
#
|
|
18
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
|
19
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
|
20
|
-
# License:: GPL 3.0
|
|
21
|
-
# Website:: http://infinitemonkeywrench.org/
|
|
22
|
-
#
|
|
23
|
-
# puts "#{File.basename(__FILE__)}: Why is it that when you squeeze a lemon you get lemonade but when you squeeze a banana you just get a mess?" # at bottom
|
|
24
|
-
module IMW
|
|
25
|
-
module Files
|
|
26
|
-
# Mixin that adds +compress+ and <tt>compress!</tt> to a file object.
#
# The including object must respond to +path+, +dirname+, +basename+,
# and <tt>exist?</tt>, and must have <tt>@path</tt> and
# <tt>@dirname</tt> set.
module Compressible

  # Known compression programs.
  COMPRESSION_PROGS = [:bzip2, :gzip]

  # Extensions that are appended by each compression program.
  COMPRESSION_EXTS = {
    :bzip2 => '.bz2',
    :gzip  => '.gz'
  }

  # Compression flags for each program.
  COMPRESSION_FLAGS = {
    :bzip2 => "-f",
    :gzip  => "-f"
  }

  protected

  # Check that +program+ is a valid compression program.
  #
  # Raises IMW::Error (listing the known programs) if it is not.
  def ensure_valid_compression_program program
    return if COMPRESSION_PROGS.include? program
    raise IMW::Error.new("#{program} is not a valid compression program (#{COMPRESSION_PROGS.join(', ')}).")
  end

  # Construct the command passed to the shell to compress this file
  # using the given +program+: the executable from
  # <tt>IMW::EXTERNAL_PROGRAMS</tt>, its flags, and this file's path,
  # separated by single spaces.
  #
  # NOTE(review): the path is not quoted, so a path containing
  # whitespace will presumably be split by the shell -- confirm how
  # IMW.system treats a single string.
  def compression_command program
    ensure_valid_compression_program program
    [IMW::EXTERNAL_PROGRAMS[program], COMPRESSION_FLAGS[program], self.path].join ' '
  end

  # Return the path this file will have once compressed with
  # +program+: its basename plus the program's extension, inside its
  # own directory.
  def compressed_file_path program
    ensure_valid_compression_program program
    File.join(self.dirname, self.basename + COMPRESSION_EXTS[program])
  end

  public

  # Compress this file in its present directory using +program+
  # (<tt>:bzip2</tt> by default, or any other entry of
  # COMPRESSION_PROGS).  Uses the <tt>-f</tt> flag from
  # COMPRESSION_FLAGS and makes no copy of the original file.
  #
  # Returns the result of <tt>IMW.open</tt> on the compressed path.
  # Raises IMW::PathError if this file does not exist.
  def compress! program = :bzip2
    raise IMW::PathError.new("cannot compress #{@path}, doesn't exist!") unless exist?
    FileUtils.cd(@dirname) { IMW.system(self.compression_command(program)) }
    IMW.open(self.compressed_file_path(program))
  end

  # Compress this file in its present directory using +program+
  # (<tt>:bzip2</tt> by default) while keeping the original file.
  #
  # Returns the result of <tt>IMW.open</tt> on the compressed path.
  # Raises IMW::PathError if this file does not exist.
  def compress program = :bzip2
    raise IMW::PathError.new("cannot compress #{@path}, doesn't exist!") unless exist?
    backup = self.path + 'copy'
    begin
      # Set a copy aside first, then move it back into place whether
      # or not compression succeeded.
      FileUtils.cp(self.path, backup)
      compress! program
    ensure
      FileUtils.mv(backup, self.path)
    end
    IMW.open(self.compressed_file_path(program))
  end

end
|
|
101
|
-
end
|
|
102
|
-
end
|
|
103
|
-
|
data/lib/imw/files/csv.rb
DELETED
|
@@ -1,113 +0,0 @@
|
|
|
1
|
-
#
|
|
2
|
-
# h2. lib/imw/files/csv.rb -- CSV, TSV files
|
|
3
|
-
#
|
|
4
|
-
# == About
|
|
5
|
-
#
|
|
6
|
-
# For "comma-separated value" (CSV) and "tab-separated value" (TSV)
|
|
7
|
-
# files.
|
|
8
|
-
#
|
|
9
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
|
10
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
|
11
|
-
# License:: GPL 3.0
|
|
12
|
-
# Website:: http://infinitemonkeywrench.org/
|
|
13
|
-
#
|
|
14
|
-
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
|
15
|
-
|
|
16
|
-
require 'fastercsv'
|
|
17
|
-
module IMW
|
|
18
|
-
module Files
|
|
19
|
-
|
|
20
|
-
# A base class from which to subclass various types of tabular data
# files (CSV, TSV, &c.).  It is itself a subclass of FasterCSV.
class TabularDataFile < FasterCSV

  include IMW::Files::BasicFile
  include IMW::Files::Compressible

  # Default options to be passed to
  # FasterCSV[http://fastercsv.rubyforge.org/]; see its
  # documentation for more information.
  DEFAULT_OPTIONS = {
    :col_sep        => ',',
    :headers        => false,
    :return_headers => false,
    :write_headers  => true,
    :skip_blanks    => false,
    :force_quotes   => false
  }

  # Opens +uri+ in +mode+ and hands the resulting IO to FasterCSV.
  #
  # +options+ are completed from the class's own DEFAULT_OPTIONS
  # (so subclasses may override them) before being passed on.
  def initialize uri, mode='r', options = {}
    # Merge into a copy so the caller's hash is not modified.
    options = options.reverse_merge(self.class::DEFAULT_OPTIONS)
    self.uri = uri
    options.delete(:write) # FasterCSV complains about unknown options
    super open(uri,mode), options
  end

  # Return the contents of this CSV file as an array of arrays.
  def load
    entries
  end

  # Dump +data+ to this file, one row per element.  Returns +self+.
  #
  # Options include:
  # <tt>:flush</tt> (true):: flush the file buffer, writing it to disk
  # <tt>:close</tt> (true):: close the file after writing +data+
  def dump data, options = {}
    options = options.reverse_merge :close => true, :flush => true
    data.each {|row| self << row}
    self.flush if options[:flush]
    self.close if options[:close]
    self
  end

  # Return a random sample of at most +length+ rows.
  #
  # The first pass always takes the first row it reads and roughly a
  # quarter of the rows after it.  If that leaves the sample short, a
  # second pass from the start of the file adds rows that were skipped
  # until +length+ is reached or the file runs out.  If FasterCSV
  # reports malformed CSV, the rows gathered so far are returned.
  def sample length=10
    rows, indices = [], Set.new
    begin
      each_with_index do |row, index|
        break if rows.size == length
        next if index != 0 && rand < 0.75 # skip 3/4 of rows after the 1st
        rows << row
        indices << index
      end
      # Now fill up to length if not there already.  A single pass is
      # enough: it either reaches +length+ or exhausts the file, so
      # looping again could never add anything.
      if rows.size < length
        rewind
        each_with_index do |row, index|
          break if rows.size == length
          next if indices.include?(index)
          rows << row
        end
      end
      rows
    rescue FasterCSV::MalformedCSVError
      rows
    end
  end
end
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
# A file of comma-separated values (CSV).  Everything is inherited
# from <tt>IMW::Files::TabularDataFile</tt>, which is a subclass of
# <tt>FasterCSV</tt>, so the methods of that library are available.
#
# See <tt>IMW::Files::TabularDataFile</tt> for more complete
# documentation.
class Csv < TabularDataFile; end
|
|
98
|
-
|
|
99
|
-
# A file of tab-separated values (TSV).  Behaves like its parent,
# <tt>IMW::Files::TabularDataFile</tt> (a <tt>FasterCSV</tt>
# subclass), except that columns are separated by a tab.
#
# See <tt>IMW::Files::TabularDataFile</tt> for more complete
# documentation.
class Tsv < TabularDataFile
  # The parent's defaults with the column separator replaced.
  DEFAULT_OPTIONS = TabularDataFile::DEFAULT_OPTIONS.merge(:col_sep => "\t")
end
|
|
108
|
-
|
|
109
|
-
# Append the [pattern, class] pairs for the tabular formats to
# FILE_REGEXPS.
[
  [/\.csv$/, IMW::Files::Csv],
  [/\.tsv$/, IMW::Files::Tsv]
].each { |registration| FILE_REGEXPS << registration }
|
|
111
|
-
|
|
112
|
-
end
|
|
113
|
-
end
|
data/lib/imw/files/directory.rb
DELETED
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
require 'imw/files/basicfile'
|
|
2
|
-
module IMW
|
|
3
|
-
module Files
|
|
4
|
-
# Wraps a directory identified by a URI.  Copying, moving, and globbing
# are only implemented for local directories.
class Directory

  include IMW::Files::BasicFile

  # FIXME these should be defined by BasicFile and then removed here but I don't see how...
  # [:executable?, :executable_real?, :pipe?, :socket?, :rm, :rm!, :extname, :extname=, :name, :name=].each do |method|
  #   instance_eval do
  #     remove_method method
  #   end
  # end

  # Stores +uri+ (parsing it unless it already is a URI::Generic) and
  # derives <tt>@host</tt>, <tt>@path</tt>, <tt>@dirname</tt>, and
  # <tt>@basename</tt> from it.  The path is expanded only when the
  # directory is local.
  def uri= uri
    @uri =
      if uri.is_a?(URI::Generic)
        uri
      else
        URI.parse(uri)
      end
    @host = self.uri.host
    @path =
      if local?
        ::File.expand_path(self.uri.path)
      else
        self.uri.path
      end
    @dirname  = ::File.dirname(path)
    @basename = ::File.basename(path)
  end

  # Builds a Directory for +uri+.
  def initialize uri
    self.uri = uri
  end

  # Returns the paths inside this directory matching the glob
  # +selector+ (everything by default), or +nil+ for a directory that
  # is not local.
  def [] selector='*'
    return nil unless local?
    Dir[File.join(path, selector)]
  end

  # Copy the contents of this directory to +new_dir+ and return a new
  # object of the same class for +new_dir+.
  #
  # Raises IMW::PathError if this directory does not exist or is not
  # local.
  def cp new_dir
    raise IMW::PathError.new("cannot copy from #{path}, doesn't exist!") unless exist?
    unless local?
      raise IMW::PathError.new("cannot recursively copy remote directories (yet!)")
    end
    FileUtils.cp_r path, new_dir
    self.class.new(new_dir)
  end

  # Move this directory to +new_dir+ and return a new object of the
  # same class for +new_dir+.
  #
  # Raises IMW::PathError if this directory does not exist or is not
  # local.
  def mv new_dir
    raise IMW::PathError.new("cannot move from #{path}, doesn't exist!") unless exist?
    unless local?
      raise IMW::PathError.new("cannot move remote directories (yet!)")
    end
    FileUtils.mv path, new_dir
    self.class.new(new_dir)
  end
  alias_method :mv!, :mv

  # Move this directory so it sits beneath +dir+, keeping its
  # basename.
  def mv_to_dir dir
    destination = File.join(File.expand_path(dir), basename)
    mv destination
  end
  alias_method :mv_to_dir!, :mv_to_dir

end
|
|
61
|
-
end
|
|
62
|
-
end
|
data/lib/imw/files/excel.rb
DELETED
|
@@ -1,84 +0,0 @@
|
|
|
1
|
-
require 'spreadsheet'
|
|
2
|
-
|
|
3
|
-
# FIXME Main issue with this:
|
|
4
|
-
# You can make a new excel book and dump data to it no problem.
|
|
5
|
-
# However, something that doesn't seem to work is dumping to a file, opening,
|
|
6
|
-
# and dumping to it again. At the moment this is probably not a big deal.
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
module IMW
|
|
10
|
-
module Files
|
|
11
|
-
# Wraps an Excel workbook through the Spreadsheet library, tracking
# the current sheet and the next row to write.
class Excel
  include IMW::Files::BasicFile
  include IMW::Files::Compressible

  # Bookkeeping: the workbook, the running count of rows appended, the
  # row limit, and counters for sheets, rows, and books.
  attr_accessor :book, :idx, :max_lines, :sht_idx, :sht_row, :book_idx

  # Assigns +uri+, resets the counters, and either opens the workbook
  # already at that location or starts a new book with one sheet.
  # +mode+ is accepted but not used here.
  #
  # Options:
  # <tt>:max_lines</tt> (65000):: row limit enforced by +dump+
  def initialize uri, mode, options={}
    self.uri = uri
    @max_lines = options[:max_lines] || 65000
    @idx = @book_idx = @sht_idx = 0
    if self.exist?
      get_existing_book
    else
      make_new_book
      make_new_sheet
    end
  end

  # Returns the rows of the current sheet, each converted with +to_a+.
  def load
    @sheet.map { |sheet_row| sheet_row.to_a }
  end

  # Appends every element of +data+ as a row, then saves the workbook
  # unless the current sheet has no rows.
  #
  # Raises a RuntimeError ("too many lines") once the row limit has
  # been reached.
  def dump data
    data.each do |record|
      if too_many? then raise "too many lines" end
      self << record
    end
    save if !no_data?
  end

  # Appends +line+ to the current row of the sheet and advances both
  # the sheet row and the overall row counter.
  def << line
    @sheet.row(@sht_row).concat(line)
    @sht_row = @sht_row + 1
    @idx = @idx + 1
  end

  # Starts a fresh workbook and counts it.
  def make_new_book
    @book = Spreadsheet::Workbook.new
    @book_idx = @book_idx + 1
  end

  # Adds a worksheet to the book, counts it, and makes it current.
  def make_new_sheet
    @sheet = @book.create_worksheet
    @sht_idx = @sht_idx + 1
    @sht_row = 0 # always start at row 0 in a new sheet
  end

  # Opens the workbook at +path+ and makes its first sheet current,
  # with the next row set to the sheet's row count.
  def get_existing_book
    @book = Spreadsheet.open(path)
    @sheet = book.worksheet(0)
    @sht_row = @sheet.row_count # would like to be able to dump new data, doesn't work
    @sht_idx = @sht_idx + 1
  end

  # Makes the worksheet at index <tt>@sht_idx</tt> the current one.
  def incr_sheet
    @sheet = book.worksheet(@sht_idx)
  end

  # True once the current sheet has reached the row limit.
  def too_many?
    @sht_row >= @max_lines
  end

  # True when the current sheet has no rows.
  def no_data?
    @sht_row == 0
  end

  # Writes the workbook to +path+.
  def save
    @book.write(path)
  end
end
|
|
83
|
-
end
|
|
84
|
-
end
|