imw 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +174 -86
- data/VERSION +1 -1
- data/lib/imw/formats/delimited.rb +5 -5
- data/lib/imw/formats/json.rb +10 -18
- data/lib/imw/formats/yaml.rb +11 -19
- data/lib/imw/resource.rb +26 -0
- data/lib/imw/schemes/local.rb +59 -10
- data/lib/imw/tools/extension_analyzer.rb +108 -0
- data/lib/imw/tools/summarizer.rb +31 -133
- data/lib/imw/utils/log.rb +2 -2
- data/spec/data/sample.json +782 -1
- data/spec/data/sample.yaml +650 -651
- data/spec/imw/formats/delimited_spec.rb +0 -12
- data/spec/imw/formats/json_spec.rb +1 -15
- data/spec/imw/formats/yaml_spec.rb +1 -23
- data/spec/imw/resource_spec.rb +26 -0
- data/spec/imw/schemes/local_spec.rb +1 -1
- metadata +3 -2
@@ -0,0 +1,108 @@
|
|
1
|
+
module IMW
  module Tools

    # Mixin with some heuristic methods for identifying common
    # extensions and likely data formats for a collection of files.
    #
    # Requires the including class to define a method +resources+ which
    # returns an array of IMW::Resource objects.  Every resource is sent
    # +is_directory?+, +extension+ and +size+.
    #
    # If the including class also defines +num_files+ and/or
    # +total_size+ they are used as the denominators of the normalized
    # statistics; otherwise the totals are computed from the
    # non-directory resources themselves.
    #
    # Every statistic is memoized in an instance variable named after
    # its method, so results do not change if +resources+ changes
    # afterwards.
    module ExtensionAnalyzer

      # Extensions that most_common_data_format reports as 'archive'.
      ARCHIVE_EXTENSIONS = ['tar', 'tar.bz2', 'tar.gz', 'tgz', 'tbz2', 'zip', 'rar'].freeze

      # Return the file counts of each extension.
      #
      # Directories are skipped.
      #
      # @return [Hash] extension => number of files
      def extension_counts
        @extension_counts ||= tally_by_extension { 1 }
      end

      # Return the most common extension by count of files.
      #
      # A blank extension (or no files at all) is reported as 'flat'.
      #
      # @return [String]
      def most_common_extension_by_count
        @most_common_extension_by_count ||= extension_label(dominant_extension(extension_counts))
      end

      # Return the file counts of each extension, normalized by the
      # total number of files.
      #
      # @return [Hash] extension => Float fraction
      def normalized_extension_counts
        @normalized_extension_counts ||= normalize_tallies(extension_counts, :num_files)
      end

      # Return the amount of data corresponding to each extension.
      #
      # Directories are skipped.
      #
      # @return [Hash] extension => summed +size+ of its files
      def extension_sizes
        @extension_sizes ||= tally_by_extension { |resource| resource.size }
      end

      # Return the most common extension by amount of data.
      #
      # A blank extension (or no data at all) is reported as 'flat'.
      #
      # @return [String]
      def most_common_extension_by_size
        @most_common_extension_by_size ||= extension_label(dominant_extension(extension_sizes))
      end

      # Return the fractional share of each extension by file size.
      #
      # @return [Hash] extension => Float fraction
      def normalized_extension_sizes
        @normalized_extension_sizes ||= normalize_tallies(extension_sizes, :total_size)
      end

      # Return a guess as to the most common extension format for the
      # including object's resources.
      #
      # When the winner by count and the winner by size disagree, the
      # winner by count is chosen only if it covers more than half of
      # the files while the winner by size covers less than half of the
      # data; in every other case the winner by size is returned.
      #
      # @return [String]
      def most_common_extension
        by_size  = most_common_extension_by_size
        by_count = most_common_extension_by_count
        return by_size if by_size == by_count # no contest

        # Look the fractions up under the raw extension: the public
        # winners may have been relabelled 'flat', which is never a key
        # of the tallies.  A missing entry counts as 0.0.
        count_fraction = normalized_extension_counts[dominant_extension(extension_counts)].to_f
        size_fraction  = normalized_extension_sizes[dominant_extension(extension_sizes)].to_f

        return by_count if count_fraction > 0.5 && size_fraction < 0.5 # FIXME arbitrary
        by_size # default to size
      end

      # Returns a guess as to the most common data format for the
      # including object's resources.
      #
      # @return [String] 'archive' for any of ARCHIVE_EXTENSIONS,
      #   otherwise the most common extension itself
      def most_common_data_format
        extension = most_common_extension
        ARCHIVE_EXTENSIONS.include?(extension) ? 'archive' : extension
      end

      private

      # Sum the value yielded for each non-directory resource, keyed by
      # that resource's extension.
      def tally_by_extension
        tallies = {}
        resources.each do |resource|
          next if resource.is_directory?
          extension = resource.extension
          tallies[extension] = tallies.fetch(extension, 0) + yield(resource)
        end
        tallies
      end

      # Return the key of +tallies+ with the largest positive value, or
      # +nil+ when there is none.
      def dominant_extension tallies
        best_extension, best_value = nil, 0
        tallies.each_pair do |extension, value|
          # Track the running maximum alongside its extension.
          best_extension, best_value = extension, value if value > best_value
        end
        best_extension
      end

      # Map a missing or whitespace-only extension to 'flat'.
      def extension_label extension
        extension.to_s.strip.empty? ? 'flat' : extension
      end

      # Divide every value of +tallies+ by a total: the including
      # object's +total_method+ when it defines one, else the sum of
      # the tallies.  A zero total yields 0.0 shares rather than NaN.
      def normalize_tallies tallies, total_method
        total = if respond_to?(total_method, true)
                  send(total_method)
                else
                  tallies.values.inject(0) { |sum, value| sum + value }
                end
        total = total.to_f

        weighted = {}
        tallies.each_pair do |extension, value|
          weighted[extension] = total.zero? ? 0.0 : value.to_f / total
        end
        weighted
      end
    end
  end
end
|
108
|
+
|
data/lib/imw/tools/summarizer.rb
CHANGED
@@ -1,16 +1,25 @@
|
|
1
|
+
require 'imw/tools/extension_analyzer'
|
2
|
+
|
1
3
|
module IMW
|
2
4
|
module Tools
|
3
5
|
|
4
6
|
# A class for producing summary data about a collection of
|
5
7
|
# resources.
|
6
8
|
#
|
7
|
-
#
|
8
|
-
#
|
9
|
+
# The Summarizer needs to recursively IMW.open all files and
|
10
|
+
# directories given so will be very cumbersome if given many
|
11
|
+
# files. Few large files will not cause a problem.
|
9
12
|
class Summarizer
|
10
13
|
|
11
|
-
# The inputs to this Summarizer.
|
14
|
+
# The inputs given to this Summarizer.
|
12
15
|
attr_reader :inputs
|
13
16
|
|
17
|
+
# The resources of this Summarizer, calculated recursively from
|
18
|
+
# its +inputs+.
|
19
|
+
attr_reader :resources
|
20
|
+
|
21
|
+
include IMW::Tools::ExtensionAnalyzer
|
22
|
+
|
14
23
|
# Initialize a new Summarizer with the given +inputs+.
|
15
24
|
#
|
16
25
|
# @param [Array<String, IMW::Resource>] inputs
|
@@ -19,151 +28,40 @@ module IMW
|
|
19
28
|
self.inputs = inputs.flatten
|
20
29
|
end
|
21
30
|
|
22
|
-
# Set new inputs for this summarizer.
|
23
|
-
#
|
24
|
-
# Clears any cached summary statistics
|
25
|
-
#
|
26
|
-
# @param [Array<String, IMW::Resource>] new_inputs
|
27
|
-
def inputs= new_inputs
|
28
|
-
@inputs = new_inputs.map do |input|
|
29
|
-
i = IMW.open(input)
|
30
|
-
raise PathError.new("Invalid input, #{i.path}") if i.is_local? && !i.exist? # don't check for remote files
|
31
|
-
i.is_directory? ? i.resources : i
|
32
|
-
end.compact.flatten
|
33
|
-
clear_cached_statistics!
|
34
|
-
end
|
35
|
-
|
36
|
-
# Reset all the cached statistics of this summarizer to +nil+.
|
37
|
-
def clear_cached_statistics!
|
38
|
-
[:num_files,
|
39
|
-
:num_direcories,
|
40
|
-
:total_size,
|
41
|
-
:extension_counts,
|
42
|
-
:most_common_extension_by_count,
|
43
|
-
:normalized_extension_counts,
|
44
|
-
:extension_sizes,
|
45
|
-
:most_common_extension_by_size,
|
46
|
-
:normalized_extension_sizes].each do |instance_variable|
|
47
|
-
self.instance_variable_set("@#{instance_variable}", nil)
|
48
|
-
end
|
49
|
-
end
|
50
|
-
|
51
|
-
# Return the number of files.
|
52
|
-
#
|
53
|
-
# @return [Integer]
|
54
|
-
def num_files
|
55
|
-
@num_files ||= inputs.size
|
56
|
-
end
|
57
|
-
|
58
|
-
# Return the number of directories.
|
59
|
-
#
|
60
|
-
# @return [Integer]
|
61
|
-
def num_directories
|
62
|
-
@num_directories ||= inputs.collect { |input| input.is_directory? }
|
63
|
-
end
|
64
|
-
|
65
31
|
# Return the total size.
|
66
32
|
#
|
67
33
|
# @return [Integer]
|
68
34
|
def total_size
|
69
|
-
@total_size ||=
|
35
|
+
@total_size ||= resources.map(&:size).inject(0) { |e, sum| sum += e }
|
70
36
|
end
|
71
37
|
|
72
|
-
# Return the
|
38
|
+
# Return a summary of the +inputs+ to this Summarizer.
|
73
39
|
#
|
74
|
-
#
|
75
|
-
|
76
|
-
@extension_counts ||= returning({}) do |counts|
|
77
|
-
inputs.each do |input|
|
78
|
-
next if input.is_directory?
|
79
|
-
counts[input.extension] = 0 unless counts.has_key?(input.extension)
|
80
|
-
counts[input.extension] += 1
|
81
|
-
end
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
# Return the most common extension by count of files.
|
86
|
-
def most_common_extension_by_count
|
87
|
-
return @most_common_extension_by_count if @most_common_extension_by_count
|
88
|
-
current_count, current_extension = 0, nil
|
89
|
-
extension_counts.each_pair do |extension, count|
|
90
|
-
current_extension = extension if count > current_count
|
91
|
-
end
|
92
|
-
if current_extension.strip.blank? then current_extension = 'flat' end
|
93
|
-
@most_common_extension_by_count = current_extension
|
94
|
-
end
|
95
|
-
|
96
|
-
# Return the file counts of each extension, normalized by the
|
97
|
-
# total number of files.
|
40
|
+
# Delegates to the +summary+ method of each constituent
|
41
|
+
# IMW::Resource in +inputs+.
|
98
42
|
#
|
99
|
-
# @return [Hash]
|
100
|
-
def
|
101
|
-
@
|
102
|
-
extension_counts.each_pair do |extension, count|
|
103
|
-
weighted[extension] = count.to_f / num_files.to_f
|
104
|
-
end
|
105
|
-
end
|
43
|
+
# @return [Array<Hash>]
|
44
|
+
def summary
|
45
|
+
@summary ||= inputs.map(&:summary)
|
106
46
|
end
|
107
47
|
|
108
|
-
|
109
|
-
#
|
110
|
-
# @return [Hash]
|
111
|
-
def extension_sizes
|
112
|
-
@extension_sizes ||= returning({}) do |sizes|
|
113
|
-
inputs.each do |input|
|
114
|
-
next if input.is_directory?
|
115
|
-
sizes[input.extension] = 0 unless sizes.has_key?(input.extension)
|
116
|
-
sizes[input.extension] += input.size
|
117
|
-
end
|
118
|
-
end
|
119
|
-
end
|
120
|
-
|
121
|
-
# Return the most common extension by amount of data.
|
48
|
+
protected
|
49
|
+
# Set new inputs for this summarizer.
|
122
50
|
#
|
123
|
-
#
|
124
|
-
|
125
|
-
return @most_common_extension_by_size if @most_common_extension_by_size
|
126
|
-
current_size, current_extension = 0, nil
|
127
|
-
extension_sizes.each_pair do |extension, size|
|
128
|
-
current_extension = extension if size > current_size
|
129
|
-
end
|
130
|
-
if current_extension.strip.blank? then current_extension = 'flat' end
|
131
|
-
@most_common_extension_by_size = current_extension
|
132
|
-
end
|
133
|
-
|
134
|
-
# Return the fractional share of each extension by file size.
|
51
|
+
# Summarizer statistics are cached as instance variables so be
|
52
|
+
# careful about changing inputs and then using old statistics...
|
135
53
|
#
|
136
|
-
# @
|
137
|
-
def
|
138
|
-
@
|
139
|
-
|
140
|
-
|
141
|
-
end
|
54
|
+
# @param [Array<String, IMW::Resource>] new_inputs
|
55
|
+
def inputs= new_inputs
|
56
|
+
@inputs = new_inputs.map do |path_or_resource|
|
57
|
+
input = IMW.open(path_or_resource)
|
58
|
+
input.should_exist!("Cannot summarize.")
|
142
59
|
end
|
60
|
+
@resources = inputs.map do |input|
|
61
|
+
input.is_directory? ? input.all_resources : input
|
62
|
+
end.compact.flatten
|
143
63
|
end
|
144
64
|
|
145
|
-
# Return a guess as to the most common extension format for this
|
146
|
-
# Summarizer's inputs.
|
147
|
-
#
|
148
|
-
# @return [String]
|
149
|
-
def most_common_extension
|
150
|
-
return most_common_extension_by_size if most_common_extension_by_size == most_common_extension_by_count # no contest
|
151
|
-
count_fraction = normalized_extension_counts[most_common_extension_by_count]
|
152
|
-
size_fraction = normalized_extension_sizes[most_common_extension_by_size]
|
153
|
-
return most_common_extension_by_count if count_fraction > 0.5 and size_fraction < 0.5 # choose the winner based on differential
|
154
|
-
return most_common_extension_by_size if count_fraction < 0.5 and size_fraction > 0.5
|
155
|
-
most_common_extension_by_size # default to size
|
156
|
-
end
|
157
|
-
|
158
|
-
# Returns a guess as to the most common data format for this
|
159
|
-
# Summarizer's inputs.
|
160
|
-
#
|
161
|
-
# @return [String]
|
162
|
-
def most_common_data_format
|
163
|
-
extension = most_common_extension
|
164
|
-
['tar', 'tar.bz2', 'tar.gz', 'tgz', 'tbz2', 'zip', 'rar'].include?(extension) ? 'archive' : extension
|
165
|
-
end
|
166
|
-
|
167
65
|
end
|
168
66
|
end
|
169
67
|
end
|
data/lib/imw/utils/log.rb
CHANGED
@@ -9,7 +9,7 @@ module IMW
|
|
9
9
|
LOG_TIMEFORMAT = "%Y-%m-%d %H:%M:%S " unless defined?(LOG_TIMEFORMAT)
|
10
10
|
|
11
11
|
# Default verbosity
|
12
|
-
VERBOSE = false
|
12
|
+
VERBOSE = false unless defined?(VERBOSE)
|
13
13
|
|
14
14
|
class << self; attr_accessor :log, :verbose end
|
15
15
|
|
@@ -36,7 +36,7 @@ module IMW
|
|
36
36
|
def self.announce *events
|
37
37
|
options = events.flatten.extract_options!
|
38
38
|
options.reverse_merge! :level => Logger::INFO
|
39
|
-
IMW.log.add options[:level], events.join("\n")
|
39
|
+
IMW.log.add options[:level], "IMW: " + events.join("\n")
|
40
40
|
end
|
41
41
|
def self.announce_if_verbose *events
|
42
42
|
announce(*events) if IMW.verbose?
|