imw 0.2.4 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,108 @@
1
module IMW
  module Tools

    # Mixin with some heuristic methods for identifying common
    # extensions and likely data formats for a collection of files.
    #
    # Requires the including class to define a method +resources+ which
    # returns an array of IMW::Resource objects.  Each resource is asked
    # for +is_directory?+, +extension+ and +size+; directories are
    # ignored by every tally.
    #
    # If the including class also defines +num_files+ and/or
    # +total_size+, those values are used as the denominators of the
    # normalized tallies.  Otherwise the denominators are computed from
    # the tallies themselves, so +resources+ really is the only
    # requirement.
    #
    # Every public result is memoized in an instance variable of the
    # including object, so results do not change if +resources+ changes
    # after the first call.
    module ExtensionAnalyzer

      # Label reported in place of a missing or blank extension.
      FLAT_EXTENSION = 'flat'

      # Extensions reported as the 'archive' data format.
      ARCHIVE_EXTENSIONS = ['tar', 'tar.bz2', 'tar.gz', 'tgz', 'tbz2', 'zip', 'rar'].freeze

      # Return the file counts of each extension.
      #
      # @return [Hash] extension => number of non-directory resources
      def extension_counts
        @extension_counts ||= tally_by_extension { |_resource| 1 }
      end

      # Return the most common extension by count of files.
      #
      # A blank extension is reported as 'flat'; so is the case where
      # there are no files at all.  On a tie the extension seen first
      # wins.
      #
      # @return [String]
      def most_common_extension_by_count
        @most_common_extension_by_count ||= extension_label(dominant_extension(extension_counts))
      end

      # Return the file counts of each extension, normalized by the
      # total number of files.
      #
      # @return [Hash] extension => Float share of the file count
      def normalized_extension_counts
        @normalized_extension_counts ||= normalize_tallies(extension_counts, total_file_count)
      end

      # Return the amount of data corresponding to each extension.
      #
      # @return [Hash] extension => summed +size+ of its resources
      def extension_sizes
        @extension_sizes ||= tally_by_extension { |resource| resource.size }
      end

      # Return the most common extension by amount of data.
      #
      # A blank extension is reported as 'flat'; so is the case where
      # no extension has a size greater than zero.  On a tie the
      # extension seen first wins.
      #
      # @return [String]
      def most_common_extension_by_size
        @most_common_extension_by_size ||= extension_label(dominant_extension(extension_sizes))
      end

      # Return the fractional share of each extension by file size.
      #
      # @return [Hash] extension => Float share of the total size
      def normalized_extension_sizes
        @normalized_extension_sizes ||= normalize_tallies(extension_sizes, total_data_size)
      end

      # Return a guess as to the most common extension format for this
      # collection's resources.
      #
      # When the winner by count and the winner by size disagree, the
      # winner by count is chosen only if it covers more than half of
      # the files while the winner by size covers less than half of the
      # data; in every other case the winner by size is returned.
      #
      # @return [String]
      def most_common_extension
        by_count = most_common_extension_by_count
        by_size  = most_common_extension_by_size
        return by_size if by_size == by_count # no contest

        # Look the shares up by the raw extension keys: the public
        # labels may have been rewritten to 'flat', which is never a key
        # of the normalized hashes.
        count_fraction = normalized_extension_counts.fetch(dominant_extension(extension_counts), 0.0)
        size_fraction  = normalized_extension_sizes.fetch(dominant_extension(extension_sizes), 0.0)

        return by_count if count_fraction > 0.5 && size_fraction < 0.5 # FIXME arbitrary
        by_size # default to size
      end

      # Returns a guess as to the most common data format for this
      # collection's resources.
      #
      # @return [String] 'archive' for known archive extensions,
      #   otherwise the most common extension itself
      def most_common_data_format
        extension = most_common_extension
        ARCHIVE_EXTENSIONS.include?(extension) ? 'archive' : extension
      end

      private

      # Sum the block's value for every non-directory resource, grouped
      # by the resource's extension.
      #
      # @yield [resource] the amount this resource adds to its extension
      # @return [Hash] a plain Hash (no default value)
      def tally_by_extension
        tallies = {}
        resources.each do |resource|
          next if resource.is_directory?
          extension = resource.extension
          tallies[extension] = tallies.fetch(extension, 0) + yield(resource)
        end
        tallies
      end

      # Return the key with the largest value strictly greater than
      # zero, or +nil+ if there is none.  The first key wins a tie.
      def dominant_extension(tallies)
        best_extension = nil
        best_value     = 0
        tallies.each_pair do |extension, value|
          if value > best_value
            best_extension = extension
            best_value     = value
          end
        end
        best_extension
      end

      # Map a missing or blank extension to the 'flat' label.
      def extension_label(extension)
        extension.to_s.strip.empty? ? FLAT_EXTENSION : extension
      end

      # Divide every tally by +total+; a zero total yields 0.0 shares
      # rather than NaN or Infinity.
      def normalize_tallies(tallies, total)
        denominator = total.to_f
        shares = {}
        tallies.each_pair do |extension, value|
          shares[extension] = denominator.zero? ? 0.0 : value.to_f / denominator
        end
        shares
      end

      # Denominator for the normalized counts: the includer's
      # +num_files+ when it defines one, else the number of tallied files.
      def total_file_count
        return num_files if respond_to?(:num_files, true)
        extension_counts.values.inject(0) { |sum, count| sum + count }
      end

      # Denominator for the normalized sizes: the includer's
      # +total_size+ when it defines one, else the summed tallied sizes.
      def total_data_size
        return total_size if respond_to?(:total_size, true)
        extension_sizes.values.inject(0) { |sum, size| sum + size }
      end
    end
  end
end
108
+
@@ -1,16 +1,25 @@
1
+ require 'imw/tools/extension_analyzer'
2
+
1
3
  module IMW
2
4
  module Tools
3
5
 
4
6
  # A class for producing summary data about a collection of
5
7
  # resources.
6
8
  #
7
- # This summary data includes the directory tree, file sizes, file
8
- # formats, record counts, &c.
9
+ # The Summarizer needs to recursively IMW.open all files and
10
+ # directories given so will be very cumbersome if given many
11
+ # files. A few large files will not cause a problem.
9
12
  class Summarizer
10
13
 
11
- # The inputs to this Summarizer.
14
+ # The inputs given to this Summarizer.
12
15
  attr_reader :inputs
13
16
 
17
+ # The resources of this Summarizer, calculated recursively from
18
+ # its +inputs+.
19
+ attr_reader :resources
20
+
21
+ include IMW::Tools::ExtensionAnalyzer
22
+
14
23
  # Initialize a new Summarizer with the given +inputs+.
15
24
  #
16
25
  # @param [Array<String, IMW::Resource>] inputs
@@ -19,151 +28,40 @@ module IMW
19
28
  self.inputs = inputs.flatten
20
29
  end
21
30
 
22
- # Set new inputs for this summarizer.
23
- #
24
- # Clears any cached summary statistics
25
- #
26
- # @param [Array<String, IMW::Resource>] new_inputs
27
- def inputs= new_inputs
28
- @inputs = new_inputs.map do |input|
29
- i = IMW.open(input)
30
- raise PathError.new("Invalid input, #{i.path}") if i.is_local? && !i.exist? # don't check for remote files
31
- i.is_directory? ? i.resources : i
32
- end.compact.flatten
33
- clear_cached_statistics!
34
- end
35
-
36
- # Reset all the cached statistics of this summarizer to +nil+.
37
- def clear_cached_statistics!
38
- [:num_files,
39
- :num_direcories,
40
- :total_size,
41
- :extension_counts,
42
- :most_common_extension_by_count,
43
- :normalized_extension_counts,
44
- :extension_sizes,
45
- :most_common_extension_by_size,
46
- :normalized_extension_sizes].each do |instance_variable|
47
- self.instance_variable_set("@#{instance_variable}", nil)
48
- end
49
- end
50
-
51
- # Return the number of files.
52
- #
53
- # @return [Integer]
54
- def num_files
55
- @num_files ||= inputs.size
56
- end
57
-
58
- # Return the number of directories.
59
- #
60
- # @return [Integer]
61
- def num_directories
62
- @num_directories ||= inputs.collect { |input| input.is_directory? }
63
- end
64
-
65
31
  # Return the total size.
66
32
  #
67
33
  # @return [Integer]
68
34
  def total_size
69
- @total_size ||= inputs.map(&:size).inject(0) { |e, sum| sum += e }
35
+ @total_size ||= resources.map(&:size).inject(0) { |e, sum| sum += e }
70
36
  end
71
37
 
72
- # Return the file counts of each extension.
38
+ # Return a summary of the +inputs+ to this Summarizer.
73
39
  #
74
- # @return [Hash]
75
- def extension_counts
76
- @extension_counts ||= returning({}) do |counts|
77
- inputs.each do |input|
78
- next if input.is_directory?
79
- counts[input.extension] = 0 unless counts.has_key?(input.extension)
80
- counts[input.extension] += 1
81
- end
82
- end
83
- end
84
-
85
- # Return the most common extension by count of files.
86
- def most_common_extension_by_count
87
- return @most_common_extension_by_count if @most_common_extension_by_count
88
- current_count, current_extension = 0, nil
89
- extension_counts.each_pair do |extension, count|
90
- current_extension = extension if count > current_count
91
- end
92
- if current_extension.strip.blank? then current_extension = 'flat' end
93
- @most_common_extension_by_count = current_extension
94
- end
95
-
96
- # Return the file counts of each extension, normalized by the
97
- # total number of files.
40
+ # Delegates to the +summary+ method of each constituent
41
+ # IMW::Resource in +inputs+.
98
42
  #
99
- # @return [Hash]
100
- def normalized_extension_counts
101
- @normalized_extension_counts ||= returning({}) do |weighted|
102
- extension_counts.each_pair do |extension, count|
103
- weighted[extension] = count.to_f / num_files.to_f
104
- end
105
- end
43
+ # @return [Array<Hash>]
44
+ def summary
45
+ @summary ||= inputs.map(&:summary)
106
46
  end
107
47
 
108
- # Return the amount of data corresponding to each extension.
109
- #
110
- # @return [Hash]
111
- def extension_sizes
112
- @extension_sizes ||= returning({}) do |sizes|
113
- inputs.each do |input|
114
- next if input.is_directory?
115
- sizes[input.extension] = 0 unless sizes.has_key?(input.extension)
116
- sizes[input.extension] += input.size
117
- end
118
- end
119
- end
120
-
121
- # Return the most common extension by amount of data.
48
+ protected
49
+ # Set new inputs for this summarizer.
122
50
  #
123
- # @return [String]
124
- def most_common_extension_by_size
125
- return @most_common_extension_by_size if @most_common_extension_by_size
126
- current_size, current_extension = 0, nil
127
- extension_sizes.each_pair do |extension, size|
128
- current_extension = extension if size > current_size
129
- end
130
- if current_extension.strip.blank? then current_extension = 'flat' end
131
- @most_common_extension_by_size = current_extension
132
- end
133
-
134
- # Return the fractional share of each extension by file size.
51
+ # Summarizer statistics are cached as instance variables so be
52
+ # careful about changing inputs and then using old statistics...
135
53
  #
136
- # @return [Hash]
137
- def normalized_extension_sizes
138
- @normalized_extension_sizes ||= returning({}) do |weighted|
139
- extension_sizes.each_pair do |extension, size|
140
- weighted[extension] = size.to_f / total_size.to_f
141
- end
54
+ # @param [Array<String, IMW::Resource>] new_inputs
55
+ def inputs= new_inputs
56
+ @inputs = new_inputs.map do |path_or_resource|
57
+ input = IMW.open(path_or_resource)
58
+ input.should_exist!("Cannot summarize.")
142
59
  end
60
+ @resources = inputs.map do |input|
61
+ input.is_directory? ? input.all_resources : input
62
+ end.compact.flatten
143
63
  end
144
64
 
145
- # Return a guess as to the most common extension format for this
146
- # Summarizer's inputs.
147
- #
148
- # @return [String]
149
- def most_common_extension
150
- return most_common_extension_by_size if most_common_extension_by_size == most_common_extension_by_count # no contest
151
- count_fraction = normalized_extension_counts[most_common_extension_by_count]
152
- size_fraction = normalized_extension_sizes[most_common_extension_by_size]
153
- return most_common_extension_by_count if count_fraction > 0.5 and size_fraction < 0.5 # choose the winner based on differential
154
- return most_common_extension_by_size if count_fraction < 0.5 and size_fraction > 0.5
155
- most_common_extension_by_size # default to size
156
- end
157
-
158
- # Returns a guess as to the most common data format for this
159
- # Summarizer's inputs.
160
- #
161
- # @return [String]
162
- def most_common_data_format
163
- extension = most_common_extension
164
- ['tar', 'tar.bz2', 'tar.gz', 'tgz', 'tbz2', 'zip', 'rar'].include?(extension) ? 'archive' : extension
165
- end
166
-
167
65
  end
168
66
  end
169
67
  end
data/lib/imw/utils/log.rb CHANGED
@@ -9,7 +9,7 @@ module IMW
9
9
  LOG_TIMEFORMAT = "%Y-%m-%d %H:%M:%S " unless defined?(LOG_TIMEFORMAT)
10
10
 
11
11
  # Default verbosity
12
- VERBOSE = false
12
+ VERBOSE = false unless defined?(VERBOSE)
13
13
 
14
14
  class << self; attr_accessor :log, :verbose end
15
15
 
@@ -36,7 +36,7 @@ module IMW
36
36
  def self.announce *events
37
37
  options = events.flatten.extract_options!
38
38
  options.reverse_merge! :level => Logger::INFO
39
- IMW.log.add options[:level], events.join("\n")
39
+ IMW.log.add options[:level], "IMW: " + events.join("\n")
40
40
  end
41
41
  def self.announce_if_verbose *events
42
42
  announce(*events) if IMW.verbose?