imw 0.2.18 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (172) hide show
  1. data/Gemfile +7 -26
  2. data/Gemfile.lock +13 -38
  3. data/{LICENSE → LICENSE.txt} +1 -1
  4. data/README.textile +35 -0
  5. data/Rakefile +45 -22
  6. data/VERSION +1 -1
  7. data/examples/foo.rb +19 -0
  8. data/examples/html_selector.rb +22 -0
  9. data/examples/nes_game_list.csv +625 -0
  10. data/examples/nes_gamespot.csv +1371 -0
  11. data/examples/nes_nintendo.csv +624 -0
  12. data/examples/nes_unlicensed.csv +89 -0
  13. data/examples/nes_wikipedia.csv +710 -0
  14. data/examples/nibbler_test.rb +24 -0
  15. data/examples/script.rb +19 -0
  16. data/lib/imw.rb +28 -140
  17. data/lib/imw/error.rb +9 -0
  18. data/lib/imw/recordizer.rb +8 -0
  19. data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
  20. data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
  21. data/lib/imw/resource.rb +3 -119
  22. data/lib/imw/serializer.rb +7 -0
  23. data/lib/imw/serializer/json_serializer.rb +17 -0
  24. data/lib/imw/uri.rb +41 -0
  25. data/spec/resource_spec.rb +78 -0
  26. data/spec/uri_spec.rb +55 -0
  27. metadata +81 -232
  28. data/README.rdoc +0 -371
  29. data/bin/imw +0 -5
  30. data/bin/tsv_to_json.rb +0 -29
  31. data/etc/imwrc.rb +0 -26
  32. data/examples/dataset.rb +0 -12
  33. data/examples/metadata.yml +0 -10
  34. data/lib/imw/archives.rb +0 -120
  35. data/lib/imw/archives/rar.rb +0 -19
  36. data/lib/imw/archives/tar.rb +0 -19
  37. data/lib/imw/archives/tarbz2.rb +0 -73
  38. data/lib/imw/archives/targz.rb +0 -73
  39. data/lib/imw/archives/zip.rb +0 -51
  40. data/lib/imw/boot.rb +0 -87
  41. data/lib/imw/compressed_files.rb +0 -94
  42. data/lib/imw/compressed_files/bz2.rb +0 -16
  43. data/lib/imw/compressed_files/compressible.rb +0 -75
  44. data/lib/imw/compressed_files/gz.rb +0 -16
  45. data/lib/imw/dataset.rb +0 -125
  46. data/lib/imw/dataset/paths.rb +0 -29
  47. data/lib/imw/dataset/workflow.rb +0 -195
  48. data/lib/imw/formats.rb +0 -33
  49. data/lib/imw/formats/delimited.rb +0 -170
  50. data/lib/imw/formats/excel.rb +0 -100
  51. data/lib/imw/formats/json.rb +0 -41
  52. data/lib/imw/formats/pdf.rb +0 -71
  53. data/lib/imw/formats/sgml.rb +0 -69
  54. data/lib/imw/formats/yaml.rb +0 -41
  55. data/lib/imw/metadata.rb +0 -83
  56. data/lib/imw/metadata/contains_metadata.rb +0 -54
  57. data/lib/imw/metadata/dsl.rb +0 -111
  58. data/lib/imw/metadata/field.rb +0 -37
  59. data/lib/imw/metadata/has_metadata.rb +0 -98
  60. data/lib/imw/metadata/has_summary.rb +0 -57
  61. data/lib/imw/metadata/schema.rb +0 -17
  62. data/lib/imw/parsers.rb +0 -8
  63. data/lib/imw/parsers/flat.rb +0 -44
  64. data/lib/imw/parsers/html_parser.rb +0 -387
  65. data/lib/imw/parsers/html_parser/matchers.rb +0 -289
  66. data/lib/imw/parsers/line_parser.rb +0 -87
  67. data/lib/imw/parsers/regexp_parser.rb +0 -72
  68. data/lib/imw/repository.rb +0 -12
  69. data/lib/imw/runner.rb +0 -118
  70. data/lib/imw/schemes.rb +0 -23
  71. data/lib/imw/schemes/ftp.rb +0 -142
  72. data/lib/imw/schemes/hdfs.rb +0 -251
  73. data/lib/imw/schemes/http.rb +0 -165
  74. data/lib/imw/schemes/local.rb +0 -409
  75. data/lib/imw/schemes/remote.rb +0 -119
  76. data/lib/imw/schemes/s3.rb +0 -143
  77. data/lib/imw/schemes/sql.rb +0 -129
  78. data/lib/imw/tools.rb +0 -12
  79. data/lib/imw/tools/aggregator.rb +0 -148
  80. data/lib/imw/tools/archiver.rb +0 -220
  81. data/lib/imw/tools/downloader.rb +0 -63
  82. data/lib/imw/tools/extension_analyzer.rb +0 -114
  83. data/lib/imw/tools/summarizer.rb +0 -83
  84. data/lib/imw/tools/transferer.rb +0 -167
  85. data/lib/imw/utils.rb +0 -74
  86. data/lib/imw/utils/dynamically_extendable.rb +0 -137
  87. data/lib/imw/utils/error.rb +0 -59
  88. data/lib/imw/utils/extensions/hpricot.rb +0 -34
  89. data/lib/imw/utils/has_uri.rb +0 -131
  90. data/lib/imw/utils/log.rb +0 -92
  91. data/lib/imw/utils/misc.rb +0 -57
  92. data/lib/imw/utils/paths.rb +0 -146
  93. data/lib/imw/utils/uri.rb +0 -59
  94. data/lib/imw/utils/uuid.rb +0 -33
  95. data/lib/imw/utils/validate.rb +0 -38
  96. data/lib/imw/utils/version.rb +0 -11
  97. data/spec/data/formats/delimited/sample.csv +0 -131
  98. data/spec/data/formats/delimited/sample.tsv +0 -131
  99. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
  100. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
  101. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
  102. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
  103. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
  104. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
  105. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
  106. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
  107. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
  108. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
  109. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
  110. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
  111. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
  112. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
  113. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
  114. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
  115. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
  116. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
  117. data/spec/data/formats/excel/sample.xls +0 -0
  118. data/spec/data/formats/json/sample.json +0 -1
  119. data/spec/data/formats/none/sample +0 -650
  120. data/spec/data/formats/sgml/sample.xml +0 -617
  121. data/spec/data/formats/text/sample.txt +0 -650
  122. data/spec/data/formats/yaml/sample.yaml +0 -410
  123. data/spec/data/schema-tabular.yaml +0 -11
  124. data/spec/imw/archives/rar_spec.rb +0 -16
  125. data/spec/imw/archives/tar_spec.rb +0 -16
  126. data/spec/imw/archives/tarbz2_spec.rb +0 -24
  127. data/spec/imw/archives/targz_spec.rb +0 -21
  128. data/spec/imw/archives/zip_spec.rb +0 -16
  129. data/spec/imw/archives_spec.rb +0 -77
  130. data/spec/imw/compressed_files/bz2_spec.rb +0 -15
  131. data/spec/imw/compressed_files/compressible_spec.rb +0 -36
  132. data/spec/imw/compressed_files/gz_spec.rb +0 -15
  133. data/spec/imw/compressed_files_spec.rb +0 -47
  134. data/spec/imw/dataset/paths_spec.rb +0 -32
  135. data/spec/imw/dataset/workflow_spec.rb +0 -41
  136. data/spec/imw/formats/delimited_spec.rb +0 -44
  137. data/spec/imw/formats/excel_spec.rb +0 -55
  138. data/spec/imw/formats/json_spec.rb +0 -18
  139. data/spec/imw/formats/sgml_spec.rb +0 -24
  140. data/spec/imw/formats/yaml_spec.rb +0 -19
  141. data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
  142. data/spec/imw/metadata/field_spec.rb +0 -25
  143. data/spec/imw/metadata/has_metadata_spec.rb +0 -58
  144. data/spec/imw/metadata/has_summary_spec.rb +0 -32
  145. data/spec/imw/metadata/schema_spec.rb +0 -24
  146. data/spec/imw/metadata_spec.rb +0 -86
  147. data/spec/imw/parsers/line_parser_spec.rb +0 -96
  148. data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
  149. data/spec/imw/resource_spec.rb +0 -32
  150. data/spec/imw/schemes/hdfs_spec.rb +0 -67
  151. data/spec/imw/schemes/http_spec.rb +0 -19
  152. data/spec/imw/schemes/local_spec.rb +0 -165
  153. data/spec/imw/schemes/remote_spec.rb +0 -38
  154. data/spec/imw/schemes/s3_spec.rb +0 -31
  155. data/spec/imw/schemes/sql_spec.rb +0 -3
  156. data/spec/imw/tools/aggregator_spec.rb +0 -71
  157. data/spec/imw/tools/archiver_spec.rb +0 -120
  158. data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
  159. data/spec/imw/tools/summarizer_spec.rb +0 -8
  160. data/spec/imw/tools/transferer_spec.rb +0 -195
  161. data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
  162. data/spec/imw/utils/has_uri_spec.rb +0 -61
  163. data/spec/imw/utils/paths_spec.rb +0 -10
  164. data/spec/imw/utils/shared_paths_spec.rb +0 -29
  165. data/spec/imw_spec.rb +0 -14
  166. data/spec/rcov.opts +0 -1
  167. data/spec/spec_helper.rb +0 -31
  168. data/spec/support/custom_matchers.rb +0 -28
  169. data/spec/support/file_contents_matcher.rb +0 -30
  170. data/spec/support/paths_matcher.rb +0 -66
  171. data/spec/support/random.rb +0 -213
  172. data/spec/support/without_regard_to_order_matcher.rb +0 -41
@@ -1,220 +0,0 @@
1
- require 'imw/resource'
2
-
3
- module IMW
4
- module Tools
5
-
6
- # Packages an Array of input files into a single output archive.
7
- # When the archive is extracted, all the input files given will be
8
- # in a single directory with a chosen name. The path to the output
9
- # archive determines both the name of the archive and its type (tar,
10
- # tar.bz2, zip, &c.).
11
- #
12
- # If any of the input files are themselves archives, they will first
13
- # be extracted, with only their contents winding up in the final
14
- # directory (the file hierarchy of the archive will be preserved).
15
- # If any of the input files are compressed, they will first be
16
- # uncompressed before being added to the directory.
17
- #
18
- # Both local and remote files can be archived. An example:
19
- #
20
- # archiver = IMW::Tools::Archiver.new 'my_archive', '/path/to/my/regular_file.tsv', '/path/to/an/archive.tar.bz2', '/path/to/my_compressed_file.gz', 'http://mywebsite.com/index.html'
21
- # archiver.package! '/path/to/my_archive.zip'
22
- #
23
- # This will create a ZIP archive at
24
- # <tt>/path/to/my_archive.zip</tt>. When the ZIP archive is
25
- # extracted its contents will look like
26
- #
27
- # my_archive
28
- # |-- regular_file.tsv
29
- # |-- archive_file1
30
- # |-- archive_dir
31
- # | |-- archive_file2
32
- # | `-- archive_file3
33
- # |-- archive_file3
34
- # |-- my_compressed_file
35
- # `-- index.html
36
- #
37
- # Notice that
38
- #
39
- # - the name of the extracted directory is given by the first
40
- # argument to the Archiver when it was instantiated.
41
- #
42
- # - all files wind up in the top-level of this extracted directory
43
- # when possible (<tt>regular_file.tsv</tt>, <tt>index.html</tt>)
44
- #
45
- # - /path/to/archive.tar.bz2 was not directly included, but its
46
- # contents (<tt>archive_file1</tt>,
47
- # <tt>archive_dir/archive_file2</tt>,
48
- # <tt>archive_dir/archive_file3</tt>) were included instead.
49
- #
50
- # - /path/to/my_compressed_file.gz was first uncompressed before
51
- # being added to the archive.
52
- #
53
- # - the remote file <tt>http://mywebsite.com/index.html</tt> was
54
- # downloaded and included
55
- #
56
- # This process can take a while when the constituent files are
57
- # large because there is quite a lot of preparation done to the
58
- # files to make this nice output structure in the final archive.
59
- # Further calls to <tt>package!</tt> on the same instance of
60
- # Archiver will skip the preparation step (the intermediate
61
- # results of which are sitting in IMW's temporary directory) and
62
- # directly create the package, saving time when attempting to
63
- # create multiple package formats from the same input data.
64
- class Archiver
65
-
66
- attr_accessor :name, :local_inputs, :remote_inputs
67
-
68
- def initialize name, *raw_inputs
69
- @name = name
70
- self.inputs = raw_inputs
71
- end
72
-
73
- # Set the inputs for this archiver.
74
- #
75
- # @param [String, IMW::Resource] raw_inputs the inputs to archive, local or remote
76
- def inputs= raw_inputs
77
- @local_inputs, @remote_inputs = [], []
78
- raw_inputs.flatten.each do |raw_input|
79
- input = IMW.open(raw_input)
80
- if input.is_local?
81
- @local_inputs << input
82
- else
83
- @remote_inputs << input
84
- end
85
- end
86
- @local_inputs.flatten!
87
- end
88
-
89
- # Return a list of error messages for this archiver.
90
- #
91
- # @return [Array] the error messages
92
- def errors
93
- @errors ||= []
94
- end
95
-
96
- # Was this archiver successful (did it not have any errors)?
97
- #
98
- # @return [true, false]
99
- def success?
100
- errors.empty?
101
- end
102
-
103
- # A temporary directory to work in. Its contents will
104
- # ultimately consist of a directory named for the package
105
- # containing all the input files.
106
- #
107
- # @return [String]
108
- def tmp_dir
109
- @tmp_dir ||= File.join(IMW.path_to(:tmp_root, 'packager'), (Time.now.to_i.to_s + "-" + $$.to_s)) # guaranteed unique on a node
110
- end
111
-
112
- # A directory which will contain all the content being packaged,
113
- # including the contents of any archives that were included in
114
- # the list of files to process.
115
- #
116
- # @return [String]
117
- def dir
118
- @dir ||= File.join(tmp_dir, name.to_s)
119
- end
120
-
121
- # Remove the +tmp_dir+ entirely, getting rid of all temporary
122
- # files.
123
- def clean!
124
- IMW.announce_if_verbose("Cleaning temporary directory #{tmp_dir}...")
125
- FileUtils.rm_rf(tmp_dir)
126
- end
127
-
128
- # Copy, decompress, or extract the input paths to the temporary
129
- # directory, readying them for packaging.
130
- def prepare!
131
- FileUtils.mkdir_p dir unless File.exist?(dir)
132
-
133
- local_inputs.each do |existing_file|
134
- new_path = File.join(dir, existing_file.basename)
135
- case
136
- when existing_file.is_archive?
137
- IMW.announce_if_verbose("Extracting #{existing_file}...")
138
- FileUtils.cd(dir) do
139
- existing_file.extract
140
- end
141
- when existing_file.is_compressed?
142
- IMW.announce_if_verbose("Decompressing #{existing_file}...")
143
- existing_file.cp(new_path).decompress!
144
- else
145
- IMW.announce_if_verbose("Copying #{existing_file}...")
146
- existing_file.cp(new_path)
147
- end
148
- end
149
-
150
- remote_inputs.each do |remote_input|
151
- IMW.announce_if_verbose("Downloading #{remote_input}...")
152
- remote_input.cp(File.join(dir, remote_input.effective_basename))
153
- end
154
- end
155
-
156
- # Checks to see if all expected files exist in the temporary
157
- # directory for this packager.
158
- #
159
- # @return [true, false]
160
- def prepared?
161
- local_inputs.each do |existing_file|
162
- case
163
- when existing_file.is_archive?
164
- existing_file.contents.each do |archived_file_path|
165
- return false unless File.exist?(File.join(dir, archived_file_path))
166
- end
167
- when existing_file.is_compressed?
168
- return false unless File.exist?(File.join(dir, existing_file.decompressed_basename))
169
- else
170
- return false unless File.exist?(File.join(dir, existing_file.basename))
171
- end
172
- end
173
-
174
- remote_inputs.each do |remote_input|
175
- return false unless File.exist?(File.join(dir, remote_input.effective_basename))
176
- end
177
-
178
- true
179
- end
180
-
181
- # Package the contents of the temporary directory to an archive
182
- # at +output+ but return exceptions instead of raising them.
183
- #
184
- # @param [String, IMW::Resource] output the path to the output package
185
- # @param [Hash] options
186
- # @return [StandardError, IMW::Resource] either the completed package or the error which was raised
187
- def package output, options={}
188
- begin
189
- package! output, options={}
190
- rescue StandardError => e
191
- return e
192
- end
193
- end
194
-
195
- # Package the contents of the temporary directory to an archive
196
- # at +output+. The extension of +output+ determines the kind of
197
- # archive.
198
- #
199
- # @param [String, IMW::Resource] output the path to the output package
200
- # @param [Hash] options
201
- # @return [IMW::Resource] the completed package
202
- def package! output, options={}
203
- prepare! unless prepared?
204
- output = IMW.open(output)
205
- FileUtils.mkdir_p(output.dirname) unless File.exist?(output.dirname)
206
- output.rm! if output.exist?
207
- FileUtils.cd(tmp_dir) { IMW.open(output.basename).create(name).mv(output.path) }
208
- add_processing_error "Archiver: couldn't create archive #{output.path}" unless output.exists?
209
- output
210
- end
211
-
212
- protected
213
- def add_processing_error error # :nodoc:
214
- IMW.logger.warn error
215
- errors << error
216
- end
217
-
218
- end
219
- end
220
- end
@@ -1,63 +0,0 @@
1
- module IMW
2
- module Tools
3
-
4
- # A class to download a collection of resources to a shared
5
- # directory.
6
- class Downloader
7
-
8
- def initialize dir, *inputs
9
- self.dir = dir
10
- self.inputs = inputs unless inputs.blank?
11
- end
12
-
13
- def self.dir= new_dir
14
- @dir = IMW.open(new_dir)
15
- raise IMW::PathError.new("#{@dir} must be a local directory") unless @dir.is_local? && @dir.is_directory?
16
- @dir
17
- end
18
- attr_reader :dir
19
-
20
- def inputs= new_inputs
21
- @inputs = new_inputs.flatten.compact.map { |raw_input| IMW.open(raw_input) }
22
- end
23
- attr_reader :inputs
24
-
25
- def downloaded_path_for input
26
- dir.join(input.respond_to?(:effective_basename) ? input.effective_basename : input.basename)
27
- end
28
-
29
- def download!
30
- before_download
31
- inputs.each do |input|
32
- downloaded_path = downloaded_path_for(input)
33
- IMW.log_if_verbose "Downloading #{input} to #{downloaded_path}"
34
- input.cp(downloaded_path)
35
- end
36
- after_download
37
- end
38
-
39
- def downloaded?
40
- downloaded_resources.all? { |resource| resource.exist? }
41
- end
42
-
43
- def downloaded_resources
44
- inputs.map do |input|
45
- IMW.open(downloaded_path_for(input))
46
- end
47
- end
48
-
49
- def clean!
50
- IMW.log_if_verbose("Deleting downloader directory #{dir}")
51
- dir.rm_rf!
52
- end
53
-
54
- def before_download
55
- end
56
-
57
- def after_download
58
- end
59
-
60
- end
61
- end
62
- end
63
-
@@ -1,114 +0,0 @@
1
- module IMW
2
- module Tools
3
-
4
- # Mixin with some heuristic methods for identifying common
5
- # extensions and likely data formats for a collection of files.
6
- #
7
- # Requires the including class to define a method +resources+
8
- # which returns an array of IMW::Resource objects as well as a
9
- # method +total_size+ which gives the total size of the resources
10
- # (for weighting extensions by size).
11
- module ExtensionAnalyzer
12
-
13
- # Return the file counts of each extension.
14
- #
15
- # @return [Hash]
16
- def extension_counts
17
- @extension_counts ||= {}.tap do |counts|
18
- resources.each do |resource|
19
- next if resource.is_directory?
20
- counts[resource.extension] = 0 unless counts.has_key?(resource.extension)
21
- counts[resource.extension] += 1
22
- end
23
- end
24
- end
25
-
26
- # Return the most common extension by count of files.
27
- def most_common_extension_by_count
28
- return @most_common_extension_by_count if @most_common_extension_by_count
29
- current_count, current_extension = 0, ''
30
- extension_counts.each_pair do |extension, count|
31
- current_extension = extension if count > current_count
32
- end
33
- if current_extension.strip.blank? then current_extension = 'flat' end
34
- @most_common_extension_by_count = current_extension
35
- end
36
-
37
- # Return the file counts of each extension, normalized by the
38
- # total number of files.
39
- #
40
- # @return [Hash]
41
- def normalized_extension_counts
42
- @normalized_extension_counts ||= {}.tap do |weighted|
43
- num_files = resources.reject(&:is_directory?).length.to_f
44
- extension_counts.each_pair do |extension, count|
45
- weighted[extension] = count.to_f / num_files
46
- end
47
- end
48
- end
49
-
50
- # Return the amount of data corresponding to each extension.
51
- #
52
- # @return [Hash]
53
- def extension_sizes
54
- @extension_sizes ||= {}.tap do |sizes|
55
- resources.each do |resource|
56
- next if resource.is_directory?
57
- sizes[resource.extension] = 0 unless sizes.has_key?(resource.extension)
58
- sizes[resource.extension] += resource.size
59
- end
60
- end
61
- end
62
-
63
- # Return the most common extension by amount of data.
64
- #
65
- # @return [String]
66
- def most_common_extension_by_size
67
- return @most_common_extension_by_size if @most_common_extension_by_size
68
- current_size, current_extension = 0, ''
69
- extension_sizes.each_pair do |extension, size|
70
- if size > current_size
71
- current_extension = extension
72
- current_size = size
73
- end
74
- end
75
- current_extension = 'flat' if current_extension.strip.blank?
76
- @most_common_extension_by_size = current_extension
77
- end
78
-
79
- # Return the fractional share of each extension by file size.
80
- #
81
- # @return [Hash]
82
- def normalized_extension_sizes
83
- @normalized_extension_sizes ||= {}.tap do |weighted|
84
- extension_sizes.each_pair do |extension, size|
85
- weighted[extension] = size.to_f / total_size.to_f
86
- end
87
- end
88
- end
89
-
90
- # Return a guess as to the most common extension format for this
91
- # Summarizer's resources.
92
- #
93
- # @return [String]
94
- def most_common_extension
95
- return most_common_extension_by_size if most_common_extension_by_size == most_common_extension_by_count # no contest
96
- count_fraction = (normalized_extension_counts[most_common_extension_by_count] or 0.0)
97
- size_fraction = (normalized_extension_sizes[most_common_extension_by_size] or 0.0)
98
- return most_common_extension_by_count if count_fraction >= 0.5 and size_fraction < 0.5 # FIXME arbitrary
99
- return most_common_extension_by_size if count_fraction < 0.5 and size_fraction >= 0.5
100
- most_common_extension_by_size # default to size
101
- end
102
-
103
- # Returns a guess as to the most common data format for this
104
- # Summarizer's resources.
105
- #
106
- # @return [String]
107
- def most_common_data_format
108
- extension = most_common_extension
109
- ['tar', 'tar.bz2', 'tar.gz', 'tgz', 'tbz2', 'zip', 'rar'].include?(extension) ? 'archive' : extension
110
- end
111
- end
112
- end
113
- end
114
-
@@ -1,83 +0,0 @@
1
- require 'imw/tools/extension_analyzer'
2
-
3
- module IMW
4
- module Tools
5
-
6
- # A class for producing summary data about a collection of
7
- # resources.
8
- #
9
- # The Summarizer needs to recursively IMW.open all files and
10
- # directories given, so it will be very cumbersome if given many
11
- # files. Few large files will not cause a problem.
12
- class Summarizer
13
-
14
- # Options for this Summarizer.
15
- attr_accessor :options
16
-
17
- # The inputs given to this Summarizer.
18
- attr_reader :inputs
19
-
20
- # The resources analyzed, calculated recursively from the
21
- # +inputs+.
22
- attr_reader :resources
23
-
24
- include IMW::Tools::ExtensionAnalyzer
25
-
26
- # Initialize a new Summarizer with the given +inputs+.
27
- #
28
- # A Hash of options can be given as the last parameter.
29
- #
30
- # @param [Array<String, IMW::Resource>] inputs
31
- # @return [IMW::Tools::Summarizer]
32
- def initialize *inputs
33
- self.options = (inputs.last.is_a?(Hash) && inputs.pop) || {}
34
- self.inputs = inputs.flatten
35
- end
36
-
37
- # Return the total size of all resources.
38
- #
39
- # @return [Integer]
40
- def total_size
41
- @total_size ||= resources.map(&:size).inject(0) { |e, sum| sum += e }
42
- end
43
-
44
- # Return a summary of the +inputs+ to this Summarizer.
45
- #
46
- # Will swallow errors.
47
- #
48
- # @return [Array<Hash>]
49
- def summary
50
- @summary ||= summary! rescue []
51
- end
52
-
53
- # Return a summary of the +inputs+ to this summarizer.
54
- #
55
- # Delegates to the +summary+ method of each constituent
56
- # IMW::Resource in +inputs+.
57
- #
58
- # @return [Array]
59
- def summary!
60
- inputs.map do |input|
61
- (input.respond_to?(:summary) ? input.summary : {})
62
- end
63
- end
64
-
65
- protected
66
- # Set new inputs for this summarizer.
67
- #
68
- # Summarizer statistics are cached as instance variables so be
69
- # careful about changing inputs and then using old statistics...
70
- #
71
- # @param [Array<String, IMW::Resource>] new_inputs
72
- def inputs= new_inputs
73
- @inputs = new_inputs.map do |path_or_resource|
74
- input = IMW.open(path_or_resource)
75
- end
76
- @resources = inputs.map do |input|
77
- input.is_local? && input.is_directory? ? input.all_resources : input
78
- end.compact.flatten
79
- end
80
-
81
- end
82
- end
83
- end