imw 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,108 @@
1
+ module IMW
2
+ module Tools
3
+
4
+ # Mixin with some heuristic methods for identifying common
5
+ # extensions and likely data formats for a collection of files.
6
+ #
7
+ # Requires the including class to define a method +resources+ which
8
+ # returns an array of IMW::Resource objects.
9
+ module ExtensionAnalyzer
10
+
11
# Return the file counts of each extension.
#
# Walks +resources+, skips every resource that reports itself as a
# directory, and tallies the remaining ones under their +extension+.
# The finished tally is memoized in +@extension_counts+.
#
# @return [Hash] maps each extension to the number of files carrying it
def extension_counts
  return @extension_counts if @extension_counts

  counts = {}
  resources.each do |resource|
    next if resource.is_directory?
    extension = resource.extension # look the extension up once per resource
    counts[extension] = (counts[extension] || 0) + 1
  end
  @extension_counts = counts
end
23
+
24
# Return the most common extension by count of files.
#
# Scans +extension_counts+ for the largest count; on a tie the extension
# met first during iteration is kept.  If no extension wins (there are no
# files) or the winner is blank, the label 'flat' is returned instead.
# The answer is memoized in +@most_common_extension_by_count+.
#
# @return [String]
def most_common_extension_by_count
  return @most_common_extension_by_count if @most_common_extension_by_count

  best_count, best_extension = 0, nil
  extension_counts.each_pair do |extension, count|
    next unless count > best_count
    # Track the running maximum together with its extension; without
    # updating best_count the last extension seen would always win.
    best_count, best_extension = count, extension
  end
  # to_s guards the "no files at all" case where best_extension is still nil.
  best_extension = 'flat' if best_extension.to_s.strip.empty?
  @most_common_extension_by_count = best_extension
end
34
+
35
# Return the file counts of each extension, normalized by the
# total number of files.
#
# The total is the sum of the values in +extension_counts+, so the
# returned shares add up to 1 and only +extension_counts+ is needed to
# compute them.  Memoized in +@normalized_extension_counts+.
#
# @return [Hash] maps each extension to its Float share of the file count
def normalized_extension_counts
  return @normalized_extension_counts if @normalized_extension_counts

  total = extension_counts.values.inject(0) { |sum, count| sum + count }.to_f
  shares = {}
  # When there are no files the loop body never runs, so total == 0.0 is
  # never used as a divisor.
  extension_counts.each_pair do |extension, count|
    shares[extension] = count / total
  end
  @normalized_extension_counts = shares
end
46
+
47
# Return the amount of data corresponding to each extension.
#
# Walks +resources+, skips every resource that reports itself as a
# directory, and adds each remaining resource's +size+ to the running
# total kept under its +extension+.  Memoized in +@extension_sizes+.
#
# @return [Hash] maps each extension to the summed size of its files
def extension_sizes
  return @extension_sizes if @extension_sizes

  sizes = {}
  resources.each do |resource|
    next if resource.is_directory?
    extension = resource.extension # look the extension up once per resource
    sizes[extension] = (sizes[extension] || 0) + resource.size
  end
  @extension_sizes = sizes
end
59
+
60
# Return the most common extension by amount of data.
#
# Scans +extension_sizes+ for the largest total; on a tie the extension
# met first during iteration is kept, and an extension whose total is 0
# never wins.  If no extension wins or the winner is blank, the label
# 'flat' is returned instead.  Memoized in
# +@most_common_extension_by_size+.
#
# @return [String]
def most_common_extension_by_size
  return @most_common_extension_by_size if @most_common_extension_by_size

  best_size, best_extension = 0, nil
  extension_sizes.each_pair do |extension, size|
    next unless size > best_size
    # Track the running maximum together with its extension; without
    # updating best_size the last extension seen would always win.
    best_size, best_extension = size, extension
  end
  # to_s guards the case where nothing won and best_extension is still nil.
  best_extension = 'flat' if best_extension.to_s.strip.empty?
  @most_common_extension_by_size = best_extension
end
72
+
73
# Return the fractional share of each extension by file size.
#
# Each entry of +extension_sizes+ is divided by +total_size+.  When
# +total_size+ is zero every share is reported as 0.0 rather than NaN.
# Memoized in +@normalized_extension_sizes+.
#
# @return [Hash] maps each extension to its Float share of the data
def normalized_extension_sizes
  return @normalized_extension_sizes if @normalized_extension_sizes

  total = total_size.to_f
  shares = {}
  extension_sizes.each_pair do |extension, size|
    # Float division by zero would silently produce NaN, which then fails
    # every threshold comparison downstream.
    shares[extension] = total.zero? ? 0.0 : size / total
  end
  @normalized_extension_sizes = shares
end
83
+
84
# Return a guess as to the most common extension format for this
# Summarizer's resources.
#
# If the winners by size and by count agree, that extension is returned.
# Otherwise the winner by count is chosen only when it covers more than
# half of the files while the winner by size covers less than half of the
# data; in every other case the winner by size is returned.
#
# @return [String]
def most_common_extension
  by_size  = most_common_extension_by_size
  by_count = most_common_extension_by_count
  return by_size if by_size == by_count # no contest

  # The per-metric winners report blank extensions under the label 'flat',
  # which is then usually not a key of the normalized hashes.  In that case
  # use the combined share of the blank extensions instead of nil.
  share_of = lambda do |shares, extension|
    if shares.has_key?(extension)
      shares[extension]
    else
      shares.inject(0.0) do |sum, (key, share)|
        key.to_s.strip.empty? ? sum + share : sum
      end
    end
  end
  count_fraction = share_of.call(normalized_extension_counts, by_count)
  size_fraction  = share_of.call(normalized_extension_sizes, by_size)

  return by_count if count_fraction > 0.5 && size_fraction < 0.5 # FIXME arbitrary
  return by_size  if count_fraction < 0.5 && size_fraction > 0.5
  by_size # default to size
end
96
+
97
# Returns a guess as to the most common data format for this
# Summarizer's resources.
#
# The guess is the result of +most_common_extension+, except that the
# listed archive extensions are all reported as 'archive'.
#
# @return [String]
def most_common_data_format
  guess = most_common_extension
  archive_extensions = ['tar', 'tar.bz2', 'tar.gz', 'tgz', 'tbz2', 'zip', 'rar']
  if archive_extensions.include?(guess)
    'archive'
  else
    guess
  end
end
105
+ end
106
+ end
107
+ end
108
+
@@ -1,16 +1,25 @@
1
+ require 'imw/tools/extension_analyzer'
2
+
1
3
  module IMW
2
4
  module Tools
3
5
 
4
6
  # A class for producing summary data about a collection of
5
7
  # resources.
6
8
  #
7
- # This summary data includes the directory tree, file sizes, file
8
- # formats, record counts, &c.
9
+ # The Summarizer needs to recursively IMW.open all files and
10
+ # directories given so will be very cumbersome if given many
11
+ # files. Few large files will not cause a problem.
9
12
  class Summarizer
10
13
 
11
- # The inputs to this Summarizer.
14
+ # The inputs given to this Summarizer.
12
15
  attr_reader :inputs
13
16
 
17
+ # The resources of this Summarizer, calculated recursively from
18
+ # its +inputs+.
19
+ attr_reader :resources
20
+
21
+ include IMW::Tools::ExtensionAnalyzer
22
+
14
23
  # Initialize a new Summarizer with the given +inputs+.
15
24
  #
16
25
  # @param [Array<String, IMW::Resource>] inputs
@@ -19,151 +28,40 @@ module IMW
19
28
  self.inputs = inputs.flatten
20
29
  end
21
30
 
22
- # Set new inputs for this summarizer.
23
- #
24
- # Clears any cached summary statistics
25
- #
26
- # @param [Array<String, IMW::Resource>] new_inputs
27
- def inputs= new_inputs
28
- @inputs = new_inputs.map do |input|
29
- i = IMW.open(input)
30
- raise PathError.new("Invalid input, #{i.path}") if i.is_local? && !i.exist? # don't check for remote files
31
- i.is_directory? ? i.resources : i
32
- end.compact.flatten
33
- clear_cached_statistics!
34
- end
35
-
36
- # Reset all the cached statistics of this summarizer to +nil+.
37
- def clear_cached_statistics!
38
- [:num_files,
39
- :num_direcories,
40
- :total_size,
41
- :extension_counts,
42
- :most_common_extension_by_count,
43
- :normalized_extension_counts,
44
- :extension_sizes,
45
- :most_common_extension_by_size,
46
- :normalized_extension_sizes].each do |instance_variable|
47
- self.instance_variable_set("@#{instance_variable}", nil)
48
- end
49
- end
50
-
51
- # Return the number of files.
52
- #
53
- # @return [Integer]
54
- def num_files
55
- @num_files ||= inputs.size
56
- end
57
-
58
- # Return the number of directories.
59
- #
60
- # @return [Integer]
61
- def num_directories
62
- @num_directories ||= inputs.collect { |input| input.is_directory? }
63
- end
64
-
65
31
  # Return the total size.
66
32
  #
67
33
  # @return [Integer]
68
34
  def total_size
69
- @total_size ||= inputs.map(&:size).inject(0) { |e, sum| sum += e }
35
+ @total_size ||= resources.map(&:size).inject(0) { |e, sum| sum += e }
70
36
  end
71
37
 
72
- # Return the file counts of each extension.
38
+ # Return a summary of the +inputs+ to this Summarizer.
73
39
  #
74
- # @return [Hash]
75
- def extension_counts
76
- @extension_counts ||= returning({}) do |counts|
77
- inputs.each do |input|
78
- next if input.is_directory?
79
- counts[input.extension] = 0 unless counts.has_key?(input.extension)
80
- counts[input.extension] += 1
81
- end
82
- end
83
- end
84
-
85
- # Return the most common extension by count of files.
86
- def most_common_extension_by_count
87
- return @most_common_extension_by_count if @most_common_extension_by_count
88
- current_count, current_extension = 0, nil
89
- extension_counts.each_pair do |extension, count|
90
- current_extension = extension if count > current_count
91
- end
92
- if current_extension.strip.blank? then current_extension = 'flat' end
93
- @most_common_extension_by_count = current_extension
94
- end
95
-
96
- # Return the file counts of each extension, normalized by the
97
- # total number of files.
40
+ # Delegates to the +summary+ method of each constituent
41
+ # IMW::Resource in +inputs+.
98
42
  #
99
- # @return [Hash]
100
- def normalized_extension_counts
101
- @normalized_extension_counts ||= returning({}) do |weighted|
102
- extension_counts.each_pair do |extension, count|
103
- weighted[extension] = count.to_f / num_files.to_f
104
- end
105
- end
43
+ # @return [Array<Hash>]
44
+ def summary
45
+ @summary ||= inputs.map(&:summary)
106
46
  end
107
47
 
108
- # Return the amount of data corresponding to each extension.
109
- #
110
- # @return [Hash]
111
- def extension_sizes
112
- @extension_sizes ||= returning({}) do |sizes|
113
- inputs.each do |input|
114
- next if input.is_directory?
115
- sizes[input.extension] = 0 unless sizes.has_key?(input.extension)
116
- sizes[input.extension] += input.size
117
- end
118
- end
119
- end
120
-
121
- # Return the most common extension by amount of data.
48
+ protected
49
+ # Set new inputs for this summarizer.
122
50
  #
123
- # @return [String]
124
- def most_common_extension_by_size
125
- return @most_common_extension_by_size if @most_common_extension_by_size
126
- current_size, current_extension = 0, nil
127
- extension_sizes.each_pair do |extension, size|
128
- current_extension = extension if size > current_size
129
- end
130
- if current_extension.strip.blank? then current_extension = 'flat' end
131
- @most_common_extension_by_size = current_extension
132
- end
133
-
134
- # Return the fractional share of each extension by file size.
51
+ # Summarizer statistics are cached as instance variables so be
52
+ # careful about changing inputs and then using old statistics...
135
53
  #
136
- # @return [Hash]
137
- def normalized_extension_sizes
138
- @normalized_extension_sizes ||= returning({}) do |weighted|
139
- extension_sizes.each_pair do |extension, size|
140
- weighted[extension] = size.to_f / total_size.to_f
141
- end
54
+ # @param [Array<String, IMW::Resource>] new_inputs
55
+ def inputs= new_inputs
56
+ @inputs = new_inputs.map do |path_or_resource|
57
+ input = IMW.open(path_or_resource)
58
+ input.should_exist!("Cannot summarize.")
142
59
  end
60
+ @resources = inputs.map do |input|
61
+ input.is_directory? ? input.all_resources : input
62
+ end.compact.flatten
143
63
  end
144
64
 
145
- # Return a guess as to the most common extension format for this
146
- # Summarizer's inputs.
147
- #
148
- # @return [String]
149
- def most_common_extension
150
- return most_common_extension_by_size if most_common_extension_by_size == most_common_extension_by_count # no contest
151
- count_fraction = normalized_extension_counts[most_common_extension_by_count]
152
- size_fraction = normalized_extension_sizes[most_common_extension_by_size]
153
- return most_common_extension_by_count if count_fraction > 0.5 and size_fraction < 0.5 # choose the winner based on differential
154
- return most_common_extension_by_size if count_fraction < 0.5 and size_fraction > 0.5
155
- most_common_extension_by_size # default to size
156
- end
157
-
158
- # Returns a guess as to the most common data format for this
159
- # Summarizer's inputs.
160
- #
161
- # @return [String]
162
- def most_common_data_format
163
- extension = most_common_extension
164
- ['tar', 'tar.bz2', 'tar.gz', 'tgz', 'tbz2', 'zip', 'rar'].include?(extension) ? 'archive' : extension
165
- end
166
-
167
65
  end
168
66
  end
169
67
  end
data/lib/imw/utils/log.rb CHANGED
@@ -9,7 +9,7 @@ module IMW
9
9
  LOG_TIMEFORMAT = "%Y-%m-%d %H:%M:%S " unless defined?(LOG_TIMEFORMAT)
10
10
 
11
11
  # Default verbosity
12
- VERBOSE = false
12
+ VERBOSE = false unless defined?(VERBOSE)
13
13
 
14
14
  class << self; attr_accessor :log, :verbose end
15
15
 
@@ -36,7 +36,7 @@ module IMW
36
36
  def self.announce *events
37
37
  options = events.flatten.extract_options!
38
38
  options.reverse_merge! :level => Logger::INFO
39
- IMW.log.add options[:level], events.join("\n")
39
+ IMW.log.add options[:level], "IMW: " + events.join("\n")
40
40
  end
41
41
  def self.announce_if_verbose *events
42
42
  announce(*events) if IMW.verbose?