imw 0.2.18 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. data/Gemfile +7 -26
  2. data/Gemfile.lock +13 -38
  3. data/{LICENSE → LICENSE.txt} +1 -1
  4. data/README.textile +35 -0
  5. data/Rakefile +45 -22
  6. data/VERSION +1 -1
  7. data/examples/foo.rb +19 -0
  8. data/examples/html_selector.rb +22 -0
  9. data/examples/nes_game_list.csv +625 -0
  10. data/examples/nes_gamespot.csv +1371 -0
  11. data/examples/nes_nintendo.csv +624 -0
  12. data/examples/nes_unlicensed.csv +89 -0
  13. data/examples/nes_wikipedia.csv +710 -0
  14. data/examples/nibbler_test.rb +24 -0
  15. data/examples/script.rb +19 -0
  16. data/lib/imw.rb +28 -140
  17. data/lib/imw/error.rb +9 -0
  18. data/lib/imw/recordizer.rb +8 -0
  19. data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
  20. data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
  21. data/lib/imw/resource.rb +3 -119
  22. data/lib/imw/serializer.rb +7 -0
  23. data/lib/imw/serializer/json_serializer.rb +17 -0
  24. data/lib/imw/uri.rb +41 -0
  25. data/spec/resource_spec.rb +78 -0
  26. data/spec/uri_spec.rb +55 -0
  27. metadata +81 -232
  28. data/README.rdoc +0 -371
  29. data/bin/imw +0 -5
  30. data/bin/tsv_to_json.rb +0 -29
  31. data/etc/imwrc.rb +0 -26
  32. data/examples/dataset.rb +0 -12
  33. data/examples/metadata.yml +0 -10
  34. data/lib/imw/archives.rb +0 -120
  35. data/lib/imw/archives/rar.rb +0 -19
  36. data/lib/imw/archives/tar.rb +0 -19
  37. data/lib/imw/archives/tarbz2.rb +0 -73
  38. data/lib/imw/archives/targz.rb +0 -73
  39. data/lib/imw/archives/zip.rb +0 -51
  40. data/lib/imw/boot.rb +0 -87
  41. data/lib/imw/compressed_files.rb +0 -94
  42. data/lib/imw/compressed_files/bz2.rb +0 -16
  43. data/lib/imw/compressed_files/compressible.rb +0 -75
  44. data/lib/imw/compressed_files/gz.rb +0 -16
  45. data/lib/imw/dataset.rb +0 -125
  46. data/lib/imw/dataset/paths.rb +0 -29
  47. data/lib/imw/dataset/workflow.rb +0 -195
  48. data/lib/imw/formats.rb +0 -33
  49. data/lib/imw/formats/delimited.rb +0 -170
  50. data/lib/imw/formats/excel.rb +0 -100
  51. data/lib/imw/formats/json.rb +0 -41
  52. data/lib/imw/formats/pdf.rb +0 -71
  53. data/lib/imw/formats/sgml.rb +0 -69
  54. data/lib/imw/formats/yaml.rb +0 -41
  55. data/lib/imw/metadata.rb +0 -83
  56. data/lib/imw/metadata/contains_metadata.rb +0 -54
  57. data/lib/imw/metadata/dsl.rb +0 -111
  58. data/lib/imw/metadata/field.rb +0 -37
  59. data/lib/imw/metadata/has_metadata.rb +0 -98
  60. data/lib/imw/metadata/has_summary.rb +0 -57
  61. data/lib/imw/metadata/schema.rb +0 -17
  62. data/lib/imw/parsers.rb +0 -8
  63. data/lib/imw/parsers/flat.rb +0 -44
  64. data/lib/imw/parsers/html_parser.rb +0 -387
  65. data/lib/imw/parsers/html_parser/matchers.rb +0 -289
  66. data/lib/imw/parsers/line_parser.rb +0 -87
  67. data/lib/imw/parsers/regexp_parser.rb +0 -72
  68. data/lib/imw/repository.rb +0 -12
  69. data/lib/imw/runner.rb +0 -118
  70. data/lib/imw/schemes.rb +0 -23
  71. data/lib/imw/schemes/ftp.rb +0 -142
  72. data/lib/imw/schemes/hdfs.rb +0 -251
  73. data/lib/imw/schemes/http.rb +0 -165
  74. data/lib/imw/schemes/local.rb +0 -409
  75. data/lib/imw/schemes/remote.rb +0 -119
  76. data/lib/imw/schemes/s3.rb +0 -143
  77. data/lib/imw/schemes/sql.rb +0 -129
  78. data/lib/imw/tools.rb +0 -12
  79. data/lib/imw/tools/aggregator.rb +0 -148
  80. data/lib/imw/tools/archiver.rb +0 -220
  81. data/lib/imw/tools/downloader.rb +0 -63
  82. data/lib/imw/tools/extension_analyzer.rb +0 -114
  83. data/lib/imw/tools/summarizer.rb +0 -83
  84. data/lib/imw/tools/transferer.rb +0 -167
  85. data/lib/imw/utils.rb +0 -74
  86. data/lib/imw/utils/dynamically_extendable.rb +0 -137
  87. data/lib/imw/utils/error.rb +0 -59
  88. data/lib/imw/utils/extensions/hpricot.rb +0 -34
  89. data/lib/imw/utils/has_uri.rb +0 -131
  90. data/lib/imw/utils/log.rb +0 -92
  91. data/lib/imw/utils/misc.rb +0 -57
  92. data/lib/imw/utils/paths.rb +0 -146
  93. data/lib/imw/utils/uri.rb +0 -59
  94. data/lib/imw/utils/uuid.rb +0 -33
  95. data/lib/imw/utils/validate.rb +0 -38
  96. data/lib/imw/utils/version.rb +0 -11
  97. data/spec/data/formats/delimited/sample.csv +0 -131
  98. data/spec/data/formats/delimited/sample.tsv +0 -131
  99. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
  100. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
  101. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
  102. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
  103. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
  104. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
  105. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
  106. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
  107. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
  108. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
  109. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
  110. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
  111. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
  112. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
  113. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
  114. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
  115. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
  116. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
  117. data/spec/data/formats/excel/sample.xls +0 -0
  118. data/spec/data/formats/json/sample.json +0 -1
  119. data/spec/data/formats/none/sample +0 -650
  120. data/spec/data/formats/sgml/sample.xml +0 -617
  121. data/spec/data/formats/text/sample.txt +0 -650
  122. data/spec/data/formats/yaml/sample.yaml +0 -410
  123. data/spec/data/schema-tabular.yaml +0 -11
  124. data/spec/imw/archives/rar_spec.rb +0 -16
  125. data/spec/imw/archives/tar_spec.rb +0 -16
  126. data/spec/imw/archives/tarbz2_spec.rb +0 -24
  127. data/spec/imw/archives/targz_spec.rb +0 -21
  128. data/spec/imw/archives/zip_spec.rb +0 -16
  129. data/spec/imw/archives_spec.rb +0 -77
  130. data/spec/imw/compressed_files/bz2_spec.rb +0 -15
  131. data/spec/imw/compressed_files/compressible_spec.rb +0 -36
  132. data/spec/imw/compressed_files/gz_spec.rb +0 -15
  133. data/spec/imw/compressed_files_spec.rb +0 -47
  134. data/spec/imw/dataset/paths_spec.rb +0 -32
  135. data/spec/imw/dataset/workflow_spec.rb +0 -41
  136. data/spec/imw/formats/delimited_spec.rb +0 -44
  137. data/spec/imw/formats/excel_spec.rb +0 -55
  138. data/spec/imw/formats/json_spec.rb +0 -18
  139. data/spec/imw/formats/sgml_spec.rb +0 -24
  140. data/spec/imw/formats/yaml_spec.rb +0 -19
  141. data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
  142. data/spec/imw/metadata/field_spec.rb +0 -25
  143. data/spec/imw/metadata/has_metadata_spec.rb +0 -58
  144. data/spec/imw/metadata/has_summary_spec.rb +0 -32
  145. data/spec/imw/metadata/schema_spec.rb +0 -24
  146. data/spec/imw/metadata_spec.rb +0 -86
  147. data/spec/imw/parsers/line_parser_spec.rb +0 -96
  148. data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
  149. data/spec/imw/resource_spec.rb +0 -32
  150. data/spec/imw/schemes/hdfs_spec.rb +0 -67
  151. data/spec/imw/schemes/http_spec.rb +0 -19
  152. data/spec/imw/schemes/local_spec.rb +0 -165
  153. data/spec/imw/schemes/remote_spec.rb +0 -38
  154. data/spec/imw/schemes/s3_spec.rb +0 -31
  155. data/spec/imw/schemes/sql_spec.rb +0 -3
  156. data/spec/imw/tools/aggregator_spec.rb +0 -71
  157. data/spec/imw/tools/archiver_spec.rb +0 -120
  158. data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
  159. data/spec/imw/tools/summarizer_spec.rb +0 -8
  160. data/spec/imw/tools/transferer_spec.rb +0 -195
  161. data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
  162. data/spec/imw/utils/has_uri_spec.rb +0 -61
  163. data/spec/imw/utils/paths_spec.rb +0 -10
  164. data/spec/imw/utils/shared_paths_spec.rb +0 -29
  165. data/spec/imw_spec.rb +0 -14
  166. data/spec/rcov.opts +0 -1
  167. data/spec/spec_helper.rb +0 -31
  168. data/spec/support/custom_matchers.rb +0 -28
  169. data/spec/support/file_contents_matcher.rb +0 -30
  170. data/spec/support/paths_matcher.rb +0 -66
  171. data/spec/support/random.rb +0 -213
  172. data/spec/support/without_regard_to_order_matcher.rb +0 -41
@@ -1,220 +0,0 @@
1
- require 'imw/resource'
2
-
3
- module IMW
4
- module Tools
5
-
6
- # Packages an Array of input files into a single output archive.
7
- # When the archive is extracted, all the input files given will be
8
- # in a single directory with a chosen name. The path to the output
9
- # archive determines both the name of the archive and its type (tar,
10
- # tar.bz2, zip, &c.).
11
- #
12
- # If any of the input files are themselves archives, they will first
13
- # be extracted, with only their contents winding up in the final
14
- # directory (the file hierarchy of the archive will be preserved).
15
- # If any of the input files are compressed, they will first be
16
- # uncompressed before being added to the directory.
17
- #
18
- # Both local and remote files can be archived. An example:
19
- #
20
- # archiver = IMW::Tools::Archiver.new 'my_archive', '/path/to/my/regular_file.tsv', '/path/to/an/archive.tar.bz2', '/path/to/my_compressed_file.gz', 'http://mywebsite.com/index.html'
21
- # archiver.package! '/path/to/my_archive.zip'
22
- #
23
- # This will create a ZIP archive at
24
- # <tt>/path/to/my_archive.zip</tt>. When the ZIP archive is
25
- # extracted its contents will look like
26
- #
27
- # my_archive
28
- # |-- regular_file.tsv
29
- # |-- archive_file1
30
- # |-- archive_dir
31
- # | |-- archive_file2
32
- # | `-- archive_file3
33
- # |-- archive_file3
34
- # |-- my_compressed_file
35
- # `-- index.html
36
- #
37
- # Notice that
38
- #
39
- # - the name of the extracted directory is given by the first
40
- # argument to the Archiver when it was instantiated.
41
- #
42
- # - all files wind up in the top-level of this extracted directory
43
- # when possible (<tt>regular_file.tsv</tt>, <tt>index.html</tt>)
44
- #
45
- # - /path/to/archive.tar.bz2 was not directly included, but its
46
- # contents (<tt>archive_file1</tt>,
47
- # <tt>archive_dir/archive_file2</tt>,
48
- # <tt>archive_dir/archive_file3</tt>) were included instead.
49
- #
50
- # - /path/to/my_compressed_file.gz was first uncompressed before
51
- # being added to the archive.
52
- #
53
- # - the remote file <tt>http://mywebsite.com/index.html</tt> was
54
- # downloaded and included
55
- #
56
- # This process can take a while when the constituent files are
57
- # large because there is quite a lot of preparation done to the
58
- # files to make this nice output structure in the final archive.
59
- # Further calls to <tt>package!</tt> on the same instance of
60
- # Archiver will skip the preparation step (the intermediate
61
- # results of which are sitting in IMW's temporary directory) and
62
- # directly create the package, saving time when attempting to
63
- # create multiple package formats from the same input data.
64
- class Archiver
65
-
66
- attr_accessor :name, :local_inputs, :remote_inputs
67
-
68
- def initialize name, *raw_inputs
69
- @name = name
70
- self.inputs = raw_inputs
71
- end
72
-
73
- # Set the inputs for this archiver.
74
- #
75
- # @param [String, IMW::Resource] raw_inputs the inputs to archive, local or remote
76
- def inputs= raw_inputs
77
- @local_inputs, @remote_inputs = [], []
78
- raw_inputs.flatten.each do |raw_input|
79
- input = IMW.open(raw_input)
80
- if input.is_local?
81
- @local_inputs << input
82
- else
83
- @remote_inputs << input
84
- end
85
- end
86
- @local_inputs.flatten!
87
- end
88
-
89
- # Return a list of error messages for this archiver.
90
- #
91
- # @return [Array] the error messages
92
- def errors
93
- @errors ||= []
94
- end
95
-
96
- # Was this archiver successful (did it not have any errors)?
97
- #
98
- # @return [true, false]
99
- def success?
100
- errors.empty?
101
- end
102
-
103
- # A temporary directory to work in. Its contents will
104
- # ultimately consist of a directory named for the package
105
- # containing all the input files.
106
- #
107
- # @return [String]
108
- def tmp_dir
109
- @tmp_dir ||= File.join(IMW.path_to(:tmp_root, 'packager'), (Time.now.to_i.to_s + "-" + $$.to_s)) # guaranteed unique on a node
110
- end
111
-
112
- # A directory which will contain all the content being packaged,
113
- # including the contents of any archives that were included in
114
- # the list of files to process.
115
- #
116
- # @return [String]
117
- def dir
118
- @dir ||= File.join(tmp_dir, name.to_s)
119
- end
120
-
121
- # Remove the +tmp_dir+ entirely, getting rid of all temporary
122
- # files.
123
- def clean!
124
- IMW.announce_if_verbose("Cleaning temporary directory #{tmp_dir}...")
125
- FileUtils.rm_rf(tmp_dir)
126
- end
127
-
128
- # Copy, decompress, or extract the input paths to the temporary
129
- # directory, readying them for packaging.
130
- def prepare!
131
- FileUtils.mkdir_p dir unless File.exist?(dir)
132
-
133
- local_inputs.each do |existing_file|
134
- new_path = File.join(dir, existing_file.basename)
135
- case
136
- when existing_file.is_archive?
137
- IMW.announce_if_verbose("Extracting #{existing_file}...")
138
- FileUtils.cd(dir) do
139
- existing_file.extract
140
- end
141
- when existing_file.is_compressed?
142
- IMW.announce_if_verbose("Decompressing #{existing_file}...")
143
- existing_file.cp(new_path).decompress!
144
- else
145
- IMW.announce_if_verbose("Copying #{existing_file}...")
146
- existing_file.cp(new_path)
147
- end
148
- end
149
-
150
- remote_inputs.each do |remote_input|
151
- IMW.announce_if_verbose("Downloading #{remote_input}...")
152
- remote_input.cp(File.join(dir, remote_input.effective_basename))
153
- end
154
- end
155
-
156
- # Checks to see if all expected files exist in the temporary
157
- # directory for this packager.
158
- #
159
- # @return [true, false]
160
- def prepared?
161
- local_inputs.each do |existing_file|
162
- case
163
- when existing_file.is_archive?
164
- existing_file.contents.each do |archived_file_path|
165
- return false unless File.exist?(File.join(dir, archived_file_path))
166
- end
167
- when existing_file.is_compressed?
168
- return false unless File.exist?(File.join(dir, existing_file.decompressed_basename))
169
- else
170
- return false unless File.exist?(File.join(dir, existing_file.basename))
171
- end
172
- end
173
-
174
- remote_inputs.each do |remote_input|
175
- return false unless File.exist?(File.join(dir, remote_input.effective_basename))
176
- end
177
-
178
- true
179
- end
180
-
181
- # Package the contents of the temporary directory to an archive
182
- # at +output+ but return exceptions instead of raising them.
183
- #
184
- # @param [String, IMW::Resource] output the path to the output package
185
- # @param [Hash] options
186
- # @return [StandardError, IMW::Resource] either the completed package or the error which was raised
187
- def package output, options={}
188
- begin
189
- package! output, options={}
190
- rescue StandardError => e
191
- return e
192
- end
193
- end
194
-
195
- # Package the contents of the temporary directory to an archive
196
- # at +output+. The extension of +output+ determines the kind of
197
- # archive.
198
- #
199
- # @param [String, IMW::Resource] output the path to the output package
200
- # @param [Hash] options
201
- # @return [IMW::Resource] the completed package
202
- def package! output, options={}
203
- prepare! unless prepared?
204
- output = IMW.open(output)
205
- FileUtils.mkdir_p(output.dirname) unless File.exist?(output.dirname)
206
- output.rm! if output.exist?
207
- FileUtils.cd(tmp_dir) { IMW.open(output.basename).create(name).mv(output.path) }
208
- add_processing_error "Archiver: couldn't create archive #{output.path}" unless output.exists?
209
- output
210
- end
211
-
212
- protected
213
- def add_processing_error error # :nodoc:
214
- IMW.logger.warn error
215
- errors << error
216
- end
217
-
218
- end
219
- end
220
- end
@@ -1,63 +0,0 @@
1
- module IMW
2
- module Tools
3
-
4
- # A class to download a collection of resources to a shared
5
- # directory.
6
- class Downloader
7
-
8
- def initialize dir, *inputs
9
- self.dir = dir
10
- self.inputs = inputs unless inputs.blank?
11
- end
12
-
13
- def self.dir= new_dir
14
- @dir = IMW.open(new_dir)
15
- raise IMW::PathError.new("#{@dir} must be a local directory") unless @dir.is_local? && @dir.is_directory?
16
- @dir
17
- end
18
- attr_reader :dir
19
-
20
- def inputs= new_inputs
21
- @inputs = new_inputs.flatten.compact.map { |raw_input| IMW.open(raw_input) }
22
- end
23
- attr_reader :inputs
24
-
25
- def downloaded_path_for input
26
- dir.join(input.respond_to?(:effective_basename) ? input.effective_basename : input.basename)
27
- end
28
-
29
- def download!
30
- before_download
31
- inputs.each do |input|
32
- downloaded_path = downloaded_path_for(input)
33
- IMW.log_if_verbose "Downloading #{input} to #{downloaded_path}"
34
- input.cp(downloaded_path)
35
- end
36
- after_download
37
- end
38
-
39
- def downloaded?
40
- downloaded_resources.all? { |resource| resource.exist? }
41
- end
42
-
43
- def downloaded_resources
44
- inputs.map do |input|
45
- IMW.open(downloaded_path_for(input))
46
- end
47
- end
48
-
49
- def clean!
50
- IMW.log_if_verbose("Deleting downloader directory #{dir}")
51
- dir.rm_rf!
52
- end
53
-
54
- def before_download
55
- end
56
-
57
- def after_download
58
- end
59
-
60
- end
61
- end
62
- end
63
-
@@ -1,114 +0,0 @@
1
- module IMW
2
- module Tools
3
-
4
- # Mixin with some heuristic methods for identifying common
5
- # extensions and likely data formats for a collection of files.
6
- #
7
- # Requires the including class to define a method +resources+
8
- # which returns an array of IMW::Resource objects as well as a
9
- # method +total_size+ which gives the total size of the resources
10
- # (for weighting extensions by size).
11
- module ExtensionAnalyzer
12
-
13
- # Return the file counts of each extension.
14
- #
15
- # @return [Hash]
16
- def extension_counts
17
- @extension_counts ||= {}.tap do |counts|
18
- resources.each do |resource|
19
- next if resource.is_directory?
20
- counts[resource.extension] = 0 unless counts.has_key?(resource.extension)
21
- counts[resource.extension] += 1
22
- end
23
- end
24
- end
25
-
26
- # Return the most common extension by count of files.
27
- def most_common_extension_by_count
28
- return @most_common_extension_by_count if @most_common_extension_by_count
29
- current_count, current_extension = 0, ''
30
- extension_counts.each_pair do |extension, count|
31
- current_extension = extension if count > current_count
32
- end
33
- if current_extension.strip.blank? then current_extension = 'flat' end
34
- @most_common_extension_by_count = current_extension
35
- end
36
-
37
- # Return the file counts of each extension, normalized by the
38
- # total number of files.
39
- #
40
- # @return [Hash]
41
- def normalized_extension_counts
42
- @normalized_extension_counts ||= {}.tap do |weighted|
43
- num_files = resources.reject(&:is_directory?).length.to_f
44
- extension_counts.each_pair do |extension, count|
45
- weighted[extension] = count.to_f / num_files
46
- end
47
- end
48
- end
49
-
50
- # Return the amount of data corresponding to each extension.
51
- #
52
- # @return [Hash]
53
- def extension_sizes
54
- @extension_sizes ||= {}.tap do |sizes|
55
- resources.each do |resource|
56
- next if resource.is_directory?
57
- sizes[resource.extension] = 0 unless sizes.has_key?(resource.extension)
58
- sizes[resource.extension] += resource.size
59
- end
60
- end
61
- end
62
-
63
- # Return the most common extension by amount of data.
64
- #
65
- # @return [String]
66
- def most_common_extension_by_size
67
- return @most_common_extension_by_size if @most_common_extension_by_size
68
- current_size, current_extension = 0, ''
69
- extension_sizes.each_pair do |extension, size|
70
- if size > current_size
71
- current_extension = extension
72
- current_size = size
73
- end
74
- end
75
- current_extension = 'flat' if current_extension.strip.blank?
76
- @most_common_extension_by_size = current_extension
77
- end
78
-
79
- # Return the fractional share of each extension by file size.
80
- #
81
- # @return [Hash]
82
- def normalized_extension_sizes
83
- @normalized_extension_sizes ||= {}.tap do |weighted|
84
- extension_sizes.each_pair do |extension, size|
85
- weighted[extension] = size.to_f / total_size.to_f
86
- end
87
- end
88
- end
89
-
90
- # Return a guess as to the most common extension format for this
91
- # Summarizer's resources.
92
- #
93
- # @return [String]
94
- def most_common_extension
95
- return most_common_extension_by_size if most_common_extension_by_size == most_common_extension_by_count # no contest
96
- count_fraction = (normalized_extension_counts[most_common_extension_by_count] or 0.0)
97
- size_fraction = (normalized_extension_sizes[most_common_extension_by_size] or 0.0)
98
- return most_common_extension_by_count if count_fraction >= 0.5 and size_fraction < 0.5 # FIXME arbitrary
99
- return most_common_extension_by_size if count_fraction < 0.5 and size_fraction >= 0.5
100
- most_common_extension_by_size # default to size
101
- end
102
-
103
- # Returns a guess as to the most common data format for this
104
- # Summarizer's resources.
105
- #
106
- # @return [String]
107
- def most_common_data_format
108
- extension = most_common_extension
109
- ['tar', 'tar.bz2', 'tar.gz', 'tgz', 'tbz2', 'zip', 'rar'].include?(extension) ? 'archive' : extension
110
- end
111
- end
112
- end
113
- end
114
-
@@ -1,83 +0,0 @@
1
- require 'imw/tools/extension_analyzer'
2
-
3
- module IMW
4
- module Tools
5
-
6
- # A class for producing summary data about a collection of
7
- # resources.
8
- #
9
- # The Summarizer needs to recursively IMW.open all files and
10
- # directories given so will be very cumbersome if given many
11
- # files. A few large files will not cause a problem.
12
- class Summarizer
13
-
14
- # Options for this Summarizer.
15
- attr_accessor :options
16
-
17
- # The inputs given to this Summarizer.
18
- attr_reader :inputs
19
-
20
- # The resources analyzed, calculated recursively from the
21
- # +inputs+.
22
- attr_reader :resources
23
-
24
- include IMW::Tools::ExtensionAnalyzer
25
-
26
- # Initialize a new Summarizer with the given +inputs+.
27
- #
28
- # A Hash of options can be given as the last parameter.
29
- #
30
- # @param [Array<String, IMW::Resource>] inputs
31
- # @return [IMW::Tools::Summarizer]
32
- def initialize *inputs
33
- self.options = (inputs.last.is_a?(Hash) && inputs.pop) || {}
34
- self.inputs = inputs.flatten
35
- end
36
-
37
- # Return the total size of all resources.
38
- #
39
- # @return [Integer]
40
- def total_size
41
- @total_size ||= resources.map(&:size).inject(0) { |e, sum| sum += e }
42
- end
43
-
44
- # Return a summary of the +inputs+ to this Summarizer.
45
- #
46
- # Will swallow errors.
47
- #
48
- # @return [Array<Hash>]
49
- def summary
50
- @summary ||= summary! rescue []
51
- end
52
-
53
- # Return a summary of the +inputs+ to this summarizer.
54
- #
55
- # Delegates to the +summary+ method of each constituent
56
- # IMW::Resource in +inputs+.
57
- #
58
- # @return [Array]
59
- def summary!
60
- inputs.map do |input|
61
- (input.respond_to?(:summary) ? input.summary : {})
62
- end
63
- end
64
-
65
- protected
66
- # Set new inputs for this summarizer.
67
- #
68
- # Summarizer statistics are cached as instance variables so be
69
- # careful about changing inputs and then using old statistics...
70
- #
71
- # @param [Array<String, IMW::Resource>] new_inputs
72
- def inputs= new_inputs
73
- @inputs = new_inputs.map do |path_or_resource|
74
- input = IMW.open(path_or_resource)
75
- end
76
- @resources = inputs.map do |input|
77
- input.is_local? && input.is_directory? ? input.all_resources : input
78
- end.compact.flatten
79
- end
80
-
81
- end
82
- end
83
- end