imw 0.2.18 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +7 -26
- data/Gemfile.lock +13 -38
- data/{LICENSE → LICENSE.txt} +1 -1
- data/README.textile +35 -0
- data/Rakefile +45 -22
- data/VERSION +1 -1
- data/examples/foo.rb +19 -0
- data/examples/html_selector.rb +22 -0
- data/examples/nes_game_list.csv +625 -0
- data/examples/nes_gamespot.csv +1371 -0
- data/examples/nes_nintendo.csv +624 -0
- data/examples/nes_unlicensed.csv +89 -0
- data/examples/nes_wikipedia.csv +710 -0
- data/examples/nibbler_test.rb +24 -0
- data/examples/script.rb +19 -0
- data/lib/imw.rb +28 -140
- data/lib/imw/error.rb +9 -0
- data/lib/imw/recordizer.rb +8 -0
- data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
- data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
- data/lib/imw/resource.rb +3 -119
- data/lib/imw/serializer.rb +7 -0
- data/lib/imw/serializer/json_serializer.rb +17 -0
- data/lib/imw/uri.rb +41 -0
- data/spec/resource_spec.rb +78 -0
- data/spec/uri_spec.rb +55 -0
- metadata +81 -232
- data/README.rdoc +0 -371
- data/bin/imw +0 -5
- data/bin/tsv_to_json.rb +0 -29
- data/etc/imwrc.rb +0 -26
- data/examples/dataset.rb +0 -12
- data/examples/metadata.yml +0 -10
- data/lib/imw/archives.rb +0 -120
- data/lib/imw/archives/rar.rb +0 -19
- data/lib/imw/archives/tar.rb +0 -19
- data/lib/imw/archives/tarbz2.rb +0 -73
- data/lib/imw/archives/targz.rb +0 -73
- data/lib/imw/archives/zip.rb +0 -51
- data/lib/imw/boot.rb +0 -87
- data/lib/imw/compressed_files.rb +0 -94
- data/lib/imw/compressed_files/bz2.rb +0 -16
- data/lib/imw/compressed_files/compressible.rb +0 -75
- data/lib/imw/compressed_files/gz.rb +0 -16
- data/lib/imw/dataset.rb +0 -125
- data/lib/imw/dataset/paths.rb +0 -29
- data/lib/imw/dataset/workflow.rb +0 -195
- data/lib/imw/formats.rb +0 -33
- data/lib/imw/formats/delimited.rb +0 -170
- data/lib/imw/formats/excel.rb +0 -100
- data/lib/imw/formats/json.rb +0 -41
- data/lib/imw/formats/pdf.rb +0 -71
- data/lib/imw/formats/sgml.rb +0 -69
- data/lib/imw/formats/yaml.rb +0 -41
- data/lib/imw/metadata.rb +0 -83
- data/lib/imw/metadata/contains_metadata.rb +0 -54
- data/lib/imw/metadata/dsl.rb +0 -111
- data/lib/imw/metadata/field.rb +0 -37
- data/lib/imw/metadata/has_metadata.rb +0 -98
- data/lib/imw/metadata/has_summary.rb +0 -57
- data/lib/imw/metadata/schema.rb +0 -17
- data/lib/imw/parsers.rb +0 -8
- data/lib/imw/parsers/flat.rb +0 -44
- data/lib/imw/parsers/html_parser.rb +0 -387
- data/lib/imw/parsers/html_parser/matchers.rb +0 -289
- data/lib/imw/parsers/line_parser.rb +0 -87
- data/lib/imw/parsers/regexp_parser.rb +0 -72
- data/lib/imw/repository.rb +0 -12
- data/lib/imw/runner.rb +0 -118
- data/lib/imw/schemes.rb +0 -23
- data/lib/imw/schemes/ftp.rb +0 -142
- data/lib/imw/schemes/hdfs.rb +0 -251
- data/lib/imw/schemes/http.rb +0 -165
- data/lib/imw/schemes/local.rb +0 -409
- data/lib/imw/schemes/remote.rb +0 -119
- data/lib/imw/schemes/s3.rb +0 -143
- data/lib/imw/schemes/sql.rb +0 -129
- data/lib/imw/tools.rb +0 -12
- data/lib/imw/tools/aggregator.rb +0 -148
- data/lib/imw/tools/archiver.rb +0 -220
- data/lib/imw/tools/downloader.rb +0 -63
- data/lib/imw/tools/extension_analyzer.rb +0 -114
- data/lib/imw/tools/summarizer.rb +0 -83
- data/lib/imw/tools/transferer.rb +0 -167
- data/lib/imw/utils.rb +0 -74
- data/lib/imw/utils/dynamically_extendable.rb +0 -137
- data/lib/imw/utils/error.rb +0 -59
- data/lib/imw/utils/extensions/hpricot.rb +0 -34
- data/lib/imw/utils/has_uri.rb +0 -131
- data/lib/imw/utils/log.rb +0 -92
- data/lib/imw/utils/misc.rb +0 -57
- data/lib/imw/utils/paths.rb +0 -146
- data/lib/imw/utils/uri.rb +0 -59
- data/lib/imw/utils/uuid.rb +0 -33
- data/lib/imw/utils/validate.rb +0 -38
- data/lib/imw/utils/version.rb +0 -11
- data/spec/data/formats/delimited/sample.csv +0 -131
- data/spec/data/formats/delimited/sample.tsv +0 -131
- data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
- data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
- data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
- data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
- data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
- data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
- data/spec/data/formats/excel/sample.xls +0 -0
- data/spec/data/formats/json/sample.json +0 -1
- data/spec/data/formats/none/sample +0 -650
- data/spec/data/formats/sgml/sample.xml +0 -617
- data/spec/data/formats/text/sample.txt +0 -650
- data/spec/data/formats/yaml/sample.yaml +0 -410
- data/spec/data/schema-tabular.yaml +0 -11
- data/spec/imw/archives/rar_spec.rb +0 -16
- data/spec/imw/archives/tar_spec.rb +0 -16
- data/spec/imw/archives/tarbz2_spec.rb +0 -24
- data/spec/imw/archives/targz_spec.rb +0 -21
- data/spec/imw/archives/zip_spec.rb +0 -16
- data/spec/imw/archives_spec.rb +0 -77
- data/spec/imw/compressed_files/bz2_spec.rb +0 -15
- data/spec/imw/compressed_files/compressible_spec.rb +0 -36
- data/spec/imw/compressed_files/gz_spec.rb +0 -15
- data/spec/imw/compressed_files_spec.rb +0 -47
- data/spec/imw/dataset/paths_spec.rb +0 -32
- data/spec/imw/dataset/workflow_spec.rb +0 -41
- data/spec/imw/formats/delimited_spec.rb +0 -44
- data/spec/imw/formats/excel_spec.rb +0 -55
- data/spec/imw/formats/json_spec.rb +0 -18
- data/spec/imw/formats/sgml_spec.rb +0 -24
- data/spec/imw/formats/yaml_spec.rb +0 -19
- data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
- data/spec/imw/metadata/field_spec.rb +0 -25
- data/spec/imw/metadata/has_metadata_spec.rb +0 -58
- data/spec/imw/metadata/has_summary_spec.rb +0 -32
- data/spec/imw/metadata/schema_spec.rb +0 -24
- data/spec/imw/metadata_spec.rb +0 -86
- data/spec/imw/parsers/line_parser_spec.rb +0 -96
- data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
- data/spec/imw/resource_spec.rb +0 -32
- data/spec/imw/schemes/hdfs_spec.rb +0 -67
- data/spec/imw/schemes/http_spec.rb +0 -19
- data/spec/imw/schemes/local_spec.rb +0 -165
- data/spec/imw/schemes/remote_spec.rb +0 -38
- data/spec/imw/schemes/s3_spec.rb +0 -31
- data/spec/imw/schemes/sql_spec.rb +0 -3
- data/spec/imw/tools/aggregator_spec.rb +0 -71
- data/spec/imw/tools/archiver_spec.rb +0 -120
- data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
- data/spec/imw/tools/summarizer_spec.rb +0 -8
- data/spec/imw/tools/transferer_spec.rb +0 -195
- data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
- data/spec/imw/utils/has_uri_spec.rb +0 -61
- data/spec/imw/utils/paths_spec.rb +0 -10
- data/spec/imw/utils/shared_paths_spec.rb +0 -29
- data/spec/imw_spec.rb +0 -14
- data/spec/rcov.opts +0 -1
- data/spec/spec_helper.rb +0 -31
- data/spec/support/custom_matchers.rb +0 -28
- data/spec/support/file_contents_matcher.rb +0 -30
- data/spec/support/paths_matcher.rb +0 -66
- data/spec/support/random.rb +0 -213
- data/spec/support/without_regard_to_order_matcher.rb +0 -41
data/lib/imw/tools/archiver.rb
DELETED
|
@@ -1,220 +0,0 @@
|
|
|
1
|
-
require 'imw/resource'
|
|
2
|
-
|
|
3
|
-
module IMW
|
|
4
|
-
module Tools
|
|
5
|
-
|
|
6
|
-
# Packages an Array of input files into a single output archive.
|
|
7
|
-
# When the archive is extracted, all the input files given will be
|
|
8
|
-
# in a single directory with a chosen name. The path to the output
|
|
9
|
-
# archive determines both the name of the archive and its type (tar,
|
|
10
|
-
# tar.bz2, zip, &c.).
|
|
11
|
-
#
|
|
12
|
-
# If any of the input files are themselves archives, they will first
|
|
13
|
-
# be extracted, with only their contents winding up in the final
|
|
14
|
-
# directory (the file hierarchy of the archive will be preserved).
|
|
15
|
-
# If any of the input files are compressed, they will first be
|
|
16
|
-
# uncompressed before being added to the directory.
|
|
17
|
-
#
|
|
18
|
-
# Both local and remote files can be archived. An example:
|
|
19
|
-
#
|
|
20
|
-
# archiver = IMW::Tools::Archiver.new 'my_archive', '/path/to/my/regular_file.tsv', '/path/to/an/archive.tar.bz2', '/path/to/my_compressed_file.gz', 'http://mywebsite.com/index.html'
|
|
21
|
-
# archiver.package! '/path/to/my_archive.zip'
|
|
22
|
-
#
|
|
23
|
-
# This will create a ZIP archive at
|
|
24
|
-
# <tt>/path/to/my_archive.zip</tt>. When the ZIP archive is
|
|
25
|
-
# extracted its contents will look like
|
|
26
|
-
#
|
|
27
|
-
# my_archive
|
|
28
|
-
# |-- regular_file.tsv
|
|
29
|
-
# |-- archive_file1
|
|
30
|
-
# |-- archive_dir
|
|
31
|
-
# | |-- archive_file2
|
|
32
|
-
# | `-- archive_file3
|
|
33
|
-
# |-- archive_file3
|
|
34
|
-
# |-- my_compressed_file
|
|
35
|
-
# `-- index.html
|
|
36
|
-
#
|
|
37
|
-
# Notice that
|
|
38
|
-
#
|
|
39
|
-
# - the name of the extracted directory is given by the first
|
|
40
|
-
# argument to the Archiver when it was instantiated.
|
|
41
|
-
#
|
|
42
|
-
# - all files wind up in the top-level of this extracted directory
|
|
43
|
-
# when possible (<tt>regular_file.tsv</tt>, <tt>index.html</tt>)
|
|
44
|
-
#
|
|
45
|
-
# - /path/to/archive.tar.bz2 was not directly included, but its
|
|
46
|
-
# contents (<tt>archive_file1</tt>,
|
|
47
|
-
# <tt>archive_dir/archive_file2</tt>,
|
|
48
|
-
# <tt>archive_dir/archive_file3</tt>) were included instead.
|
|
49
|
-
#
|
|
50
|
-
# - /path/to/my_compressed_file.gz was first uncompressed before
|
|
51
|
-
# being added to the archive.
|
|
52
|
-
#
|
|
53
|
-
# - the remote file <tt>http://mywebsite.com/index.html</tt> was
|
|
54
|
-
# downloaded and included
|
|
55
|
-
#
|
|
56
|
-
# This process can take a while when the constituent files are
|
|
57
|
-
# large because there is quite a lot of preparation done to the
|
|
58
|
-
# files to make this nice output structure in the final archive.
|
|
59
|
-
# Further calls to <tt>package!</tt> on the same instance of
|
|
60
|
-
# Archiver will skip the preparation step (the intermediate
|
|
61
|
-
# results of which are sitting in IMW's temporary directory) and
|
|
62
|
-
# directly create the package, saving time when attempting to
|
|
63
|
-
# create multiple package formats from the same input data.
|
|
64
|
-
class Archiver

  # +name+ is the name of the directory (created inside +tmp_dir+)
  # that collects all the inputs.  +local_inputs+ and
  # +remote_inputs+ hold the opened inputs, split according to
  # whether each one answers true to +is_local?+.
  attr_accessor :name, :local_inputs, :remote_inputs

  # Build an archiver named +name+ for the given inputs.
  #
  # @param [String] name the name of the directory the inputs are gathered in
  # @param [Array<String, IMW::Resource>] raw_inputs the inputs to archive, local or remote
  def initialize(name, *raw_inputs)
    @name = name
    self.inputs = raw_inputs
  end

  # Set the inputs for this archiver.
  #
  # Nested arrays are flattened, every element is opened with
  # IMW.open, and the result is filed under +local_inputs+ or
  # +remote_inputs+ depending on +is_local?+.
  #
  # @param [Array<String, IMW::Resource>] raw_inputs the inputs to archive, local or remote
  def inputs=(raw_inputs)
    @local_inputs, @remote_inputs = [], []
    raw_inputs.flatten.each do |raw_input|
      input = IMW.open(raw_input)
      (input.is_local? ? @local_inputs : @remote_inputs) << input
    end
    @local_inputs.flatten!
  end

  # Return a list of error messages for this archiver.
  #
  # @return [Array] the error messages
  def errors
    @errors ||= []
  end

  # Was this archiver successful (did it not have any errors)?
  #
  # @return [true, false]
  def success?
    errors.empty?
  end

  # A temporary directory to work in.  Its contents will
  # ultimately consist of a directory named for the package
  # containing all the input files.
  #
  # The directory name is built from the current epoch second and
  # the process ID and is memoized per instance.  Two archivers
  # created by the same process within the same second would
  # therefore compute the same path.
  #
  # @return [String]
  def tmp_dir
    @tmp_dir ||= File.join(IMW.path_to(:tmp_root, 'packager'), "#{Time.now.to_i}-#{$$}")
  end

  # A directory which will contain all the content being packaged,
  # including the contents of any archives that were included in
  # the list of files to process.
  #
  # @return [String]
  def dir
    @dir ||= File.join(tmp_dir, name.to_s)
  end

  # Remove the +tmp_dir+ entirely, getting rid of all temporary
  # files.
  def clean!
    IMW.announce_if_verbose("Cleaning temporary directory #{tmp_dir}...")
    FileUtils.rm_rf(tmp_dir)
  end

  # Copy, decompress, or extract the input paths to the temporary
  # directory, readying them for packaging.
  #
  # Local archives are extracted from within +dir+, local compressed
  # files are copied into +dir+ and then decompressed, every other
  # local file is copied as is, and remote inputs are copied into
  # +dir+ under their +effective_basename+.
  def prepare!
    FileUtils.mkdir_p(dir) unless File.exist?(dir)

    local_inputs.each do |existing_file|
      new_path = File.join(dir, existing_file.basename)
      if existing_file.is_archive?
        IMW.announce_if_verbose("Extracting #{existing_file}...")
        FileUtils.cd(dir) { existing_file.extract }
      elsif existing_file.is_compressed?
        IMW.announce_if_verbose("Decompressing #{existing_file}...")
        existing_file.cp(new_path).decompress!
      else
        IMW.announce_if_verbose("Copying #{existing_file}...")
        existing_file.cp(new_path)
      end
    end

    remote_inputs.each do |remote_input|
      IMW.announce_if_verbose("Downloading #{remote_input}...")
      remote_input.cp(File.join(dir, remote_input.effective_basename))
    end
  end

  # Checks to see if all expected files exist in the temporary
  # directory for this packager.
  #
  # For an archive every path listed by its +contents+ must exist
  # below +dir+; for a compressed file its +decompressed_basename+;
  # for any other local file its +basename+; for a remote input its
  # +effective_basename+.
  #
  # @return [true, false]
  def prepared?
    locals_in_place = local_inputs.all? do |existing_file|
      if existing_file.is_archive?
        existing_file.contents.all? do |archived_file_path|
          File.exist?(File.join(dir, archived_file_path))
        end
      elsif existing_file.is_compressed?
        File.exist?(File.join(dir, existing_file.decompressed_basename))
      else
        File.exist?(File.join(dir, existing_file.basename))
      end
    end

    locals_in_place && remote_inputs.all? do |remote_input|
      File.exist?(File.join(dir, remote_input.effective_basename))
    end
  end

  # Package the contents of the temporary directory to an archive
  # at +output+ but return exceptions instead of raising them.
  #
  # The given +options+ are handed to +package!+ unchanged.
  #
  # @param [String, IMW::Resource] output the path to the output package
  # @param [Hash] options
  # @return [StandardError, IMW::Resource] either the completed package or the error which was raised
  def package(output, options = {})
    package!(output, options)
  rescue StandardError => e
    e
  end

  # Package the contents of the temporary directory to an archive
  # at +output+.  The extension of +output+ determines the kind of
  # archive.
  #
  # Runs +prepare!+ first unless +prepared?+ already holds, and
  # removes any existing resource at +output+ before the archive is
  # created.  If the output does not exist afterwards a processing
  # error is recorded (see +errors+) rather than raised.
  #
  # @param [String, IMW::Resource] output the path to the output package
  # @param [Hash] options accepted for interface compatibility; not read by this method
  # @return [IMW::Resource] the completed package
  def package!(output, options = {})
    prepare! unless prepared?
    output = IMW.open(output)
    FileUtils.mkdir_p(output.dirname) unless File.exist?(output.dirname)
    output.rm! if output.exist?
    # Build the archive from inside tmp_dir so that its entries are
    # relative to the directory called +name+, then move it into place.
    FileUtils.cd(tmp_dir) { IMW.open(output.basename).create(name).mv(output.path) }
    add_processing_error "Archiver: couldn't create archive #{output.path}" unless output.exist?
    output
  end

  protected

  # Log +error+ as a warning and remember it in +errors+.
  def add_processing_error(error) # :nodoc:
    IMW.logger.warn error
    errors << error
  end

end
|
|
219
|
-
end
|
|
220
|
-
end
|
data/lib/imw/tools/downloader.rb
DELETED
|
@@ -1,63 +0,0 @@
|
|
|
1
|
-
module IMW
|
|
2
|
-
module Tools
|
|
3
|
-
|
|
4
|
-
# A class to download a collection of resources to a shared
|
|
5
|
-
# directory.
|
|
6
|
-
class Downloader

  # +dir+ is the opened target directory; +inputs+ are the opened
  # resources that will be copied into it.
  attr_reader :dir, :inputs

  # Build a downloader that copies +inputs+ into +dir+.
  #
  # @param [String, IMW::Resource] dir the local directory to download into
  # @param [Array<String, IMW::Resource>] inputs the resources to download
  # @raise [IMW::PathError] if +dir+ is not a local directory
  def initialize(dir, *inputs)
    self.dir    = dir
    # Always assign so that +inputs+ is an Array (possibly empty)
    # and never nil when no inputs were given.
    self.inputs = inputs
  end

  # Set the directory to download into.
  #
  # This is an instance-level writer; +initialize+ relies on it.
  #
  # @param [String, IMW::Resource] new_dir
  # @return [IMW::Resource] the opened directory
  # @raise [IMW::PathError] if +new_dir+ is not a local directory
  def dir=(new_dir)
    candidate = IMW.open(new_dir)
    unless candidate.is_local? && candidate.is_directory?
      raise IMW::PathError.new("#{candidate} must be a local directory")
    end
    @dir = candidate
  end

  # Set the resources to download.  Nested arrays are flattened
  # and nil entries dropped before each element is opened with
  # IMW.open.
  #
  # @param [Array<String, IMW::Resource>] new_inputs
  def inputs=(new_inputs)
    @inputs = new_inputs.flatten.compact.map { |raw_input| IMW.open(raw_input) }
  end

  # The path inside +dir+ that +input+ is downloaded to.  Uses the
  # input's +effective_basename+ when it responds to that method and
  # its +basename+ otherwise.
  #
  # @param [IMW::Resource] input
  # @return whatever <tt>dir.join</tt> returns for that basename
  def downloaded_path_for(input)
    basename = input.respond_to?(:effective_basename) ? input.effective_basename : input.basename
    dir.join(basename)
  end

  # Copy every input to its downloaded path, calling
  # +before_download+ first and +after_download+ last.
  def download!
    before_download
    inputs.each do |input|
      downloaded_path = downloaded_path_for(input)
      IMW.log_if_verbose "Downloading #{input} to #{downloaded_path}"
      input.cp(downloaded_path)
    end
    after_download
  end

  # Do all the downloaded resources exist?
  #
  # @return [true, false]
  def downloaded?
    downloaded_resources.all? { |resource| resource.exist? }
  end

  # The downloaded counterpart of every input, opened with IMW.open.
  #
  # @return [Array]
  def downloaded_resources
    inputs.map { |input| IMW.open(downloaded_path_for(input)) }
  end

  # Delete the download directory via <tt>dir.rm_rf!</tt>.
  def clean!
    IMW.log_if_verbose("Deleting downloader directory #{dir}")
    dir.rm_rf!
  end

  # Hook called by +download!+ before any input is copied.  Does
  # nothing by default.
  def before_download
  end

  # Hook called by +download!+ after all inputs have been copied.
  # Does nothing by default.
  def after_download
  end

end
|
|
61
|
-
end
|
|
62
|
-
end
|
|
63
|
-
|
data/lib/imw/tools/extension_analyzer.rb
DELETED
|
@@ -1,114 +0,0 @@
|
|
|
1
|
-
module IMW
|
|
2
|
-
module Tools
|
|
3
|
-
|
|
4
|
-
# Mixin with some heuristic methods for identifying common
|
|
5
|
-
# extensions and likely data formats for a collection of files.
|
|
6
|
-
#
|
|
7
|
-
# Requires the including class to define a method +resources+
|
|
8
|
-
# which returns an array of IMW::Resource objects as well as a
|
|
9
|
-
# method +total_size+ which gives the total size of the resources
|
|
10
|
-
# (for weighting extensions by size).
|
|
11
|
-
module ExtensionAnalyzer

  # Return the file counts of each extension.  Resources that are
  # directories are skipped.  The result is memoized.
  #
  # @return [Hash] extension => number of files
  def extension_counts
    @extension_counts ||= {}.tap do |counts|
      resources.each do |resource|
        next if resource.is_directory?
        counts[resource.extension] = counts.fetch(resource.extension, 0) + 1
      end
    end
  end

  # Return the most common extension by count of files, or 'flat'
  # when there are no files or the winning extension is blank.  On a
  # tie the extension encountered first wins.  The result is memoized.
  #
  # @return [String]
  def most_common_extension_by_count
    return @most_common_extension_by_count if @most_common_extension_by_count
    current_count, current_extension = 0, ''
    extension_counts.each_pair do |extension, count|
      if count > current_count
        current_extension = extension
        # Remember the best count seen so far so that a later, rarer
        # extension cannot displace the current leader.
        current_count     = count
      end
    end
    current_extension = 'flat' if current_extension.strip.empty?
    @most_common_extension_by_count = current_extension
  end

  # Return the file counts of each extension, normalized by the
  # total number of files (directories excluded).
  #
  # @return [Hash] extension => fraction of files
  def normalized_extension_counts
    @normalized_extension_counts ||= {}.tap do |weighted|
      num_files = resources.reject(&:is_directory?).length.to_f
      extension_counts.each_pair do |extension, count|
        weighted[extension] = count.to_f / num_files
      end
    end
  end

  # Return the amount of data corresponding to each extension.
  # Resources that are directories are skipped.
  #
  # @return [Hash] extension => summed +size+ of its files
  def extension_sizes
    @extension_sizes ||= {}.tap do |sizes|
      resources.each do |resource|
        next if resource.is_directory?
        sizes[resource.extension] = sizes.fetch(resource.extension, 0) + resource.size
      end
    end
  end

  # Return the most common extension by amount of data, or 'flat'
  # when no extension has a size above zero or the winning extension
  # is blank.  On a tie the extension encountered first wins.
  #
  # @return [String]
  def most_common_extension_by_size
    return @most_common_extension_by_size if @most_common_extension_by_size
    current_size, current_extension = 0, ''
    extension_sizes.each_pair do |extension, size|
      if size > current_size
        current_extension = extension
        current_size      = size
      end
    end
    current_extension = 'flat' if current_extension.strip.empty?
    @most_common_extension_by_size = current_extension
  end

  # Return the fractional share of each extension by file size,
  # relative to the +total_size+ supplied by the including class.
  #
  # @return [Hash] extension => fraction of +total_size+
  def normalized_extension_sizes
    @normalized_extension_sizes ||= {}.tap do |weighted|
      extension_sizes.each_pair do |extension, size|
        weighted[extension] = size.to_f / total_size.to_f
      end
    end
  end

  # Return a guess as to the most common extension for the
  # including object's resources.
  #
  # When the by-count and by-size winners agree that extension is
  # returned.  Otherwise the by-count winner is chosen only if it
  # covers at least half of the files while the by-size winner
  # covers less than half of the data; in every other case the
  # by-size winner is returned.
  #
  # @return [String]
  def most_common_extension
    by_size  = most_common_extension_by_size
    by_count = most_common_extension_by_count
    return by_size if by_size == by_count # no contest

    # A winner of 'flat' may be absent from the normalized hashes,
    # hence the 0.0 fallback.
    count_fraction = normalized_extension_counts[by_count] || 0.0
    size_fraction  = normalized_extension_sizes[by_size]   || 0.0
    return by_count if count_fraction >= 0.5 && size_fraction < 0.5 # FIXME arbitrary
    by_size # default to size
  end

  # Returns a guess as to the most common data format for the
  # including object's resources: 'archive' when the most common
  # extension is one of the listed archive extensions, otherwise the
  # extension itself.
  #
  # @return [String]
  def most_common_data_format
    extension = most_common_extension
    ['tar', 'tar.bz2', 'tar.gz', 'tgz', 'tbz2', 'zip', 'rar'].include?(extension) ? 'archive' : extension
  end
end
|
|
112
|
-
end
|
|
113
|
-
end
|
|
114
|
-
|
data/lib/imw/tools/summarizer.rb
DELETED
|
@@ -1,83 +0,0 @@
|
|
|
1
|
-
require 'imw/tools/extension_analyzer'
|
|
2
|
-
|
|
3
|
-
module IMW
|
|
4
|
-
module Tools
|
|
5
|
-
|
|
6
|
-
# A class for producing summary data about a collection of
|
|
7
|
-
# resources.
|
|
8
|
-
#
|
|
9
|
-
# The Summarizer needs to recursively IMW.open all files and
|
|
10
|
-
# directories given so will be very cumbersome if given many
|
|
11
|
-
# files. Few large files will not cause a problem.
|
|
12
|
-
class Summarizer

  # Options for this Summarizer (the trailing Hash handed to
  # +new+, or an empty Hash).
  attr_accessor :options

  # The inputs given to this Summarizer, each opened with IMW.open.
  attr_reader :inputs

  # The resources analyzed: the +inputs+, with every local
  # directory replaced by its +all_resources+.
  attr_reader :resources

  include IMW::Tools::ExtensionAnalyzer

  # Initialize a new Summarizer with the given +inputs+.
  #
  # A Hash of options can be given as the last parameter; it is
  # removed from the inputs and stored in +options+.
  #
  # @param [Array<String, IMW::Resource>] inputs
  # @return [IMW::Tools::Summarizer]
  def initialize(*inputs)
    trailing_options = inputs.last.is_a?(Hash) ? inputs.pop : nil
    self.options = trailing_options || {}
    self.inputs  = inputs.flatten
  end

  # Return the total size of all resources.  Memoized.
  #
  # @return [Integer]
  def total_size
    @total_size ||= resources.map(&:size).inject(0) { |running_total, size| size + running_total }
  end

  # Return a summary of the +inputs+ to this Summarizer.
  #
  # Will swallow errors: if +summary!+ raises a StandardError an
  # empty Array is returned (and memoized) instead.
  #
  # @return [Array<Hash>]
  def summary
    @summary ||= begin
      summary!
    rescue StandardError
      []
    end
  end

  # Return a summary of the +inputs+ to this summarizer.
  #
  # Delegates to the +summary+ method of each constituent input;
  # an input that does not respond to +summary+ contributes an
  # empty Hash.
  #
  # @return [Array]
  def summary!
    inputs.map do |input|
      if input.respond_to?(:summary)
        input.summary
      else
        {}
      end
    end
  end

  protected

  # Set new inputs for this summarizer.
  #
  # Summarizer statistics are cached as instance variables and are
  # not reset here, so be careful about changing inputs and then
  # using old statistics...
  #
  # @param [Array<String, IMW::Resource>] new_inputs
  def inputs=(new_inputs)
    @inputs = new_inputs.map { |path_or_resource| IMW.open(path_or_resource) }
    expanded = @inputs.map do |input|
      (input.is_local? && input.is_directory?) ? input.all_resources : input
    end
    @resources = expanded.compact.flatten
  end

end
|
|
82
|
-
end
|
|
83
|
-
end
|