imw 0.2.4 → 0.2.5
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +174 -86
- data/VERSION +1 -1
- data/lib/imw/formats/delimited.rb +5 -5
- data/lib/imw/formats/json.rb +10 -18
- data/lib/imw/formats/yaml.rb +11 -19
- data/lib/imw/resource.rb +26 -0
- data/lib/imw/schemes/local.rb +59 -10
- data/lib/imw/tools/extension_analyzer.rb +108 -0
- data/lib/imw/tools/summarizer.rb +31 -133
- data/lib/imw/utils/log.rb +2 -2
- data/spec/data/sample.json +782 -1
- data/spec/data/sample.yaml +650 -651
- data/spec/imw/formats/delimited_spec.rb +0 -12
- data/spec/imw/formats/json_spec.rb +1 -15
- data/spec/imw/formats/yaml_spec.rb +1 -23
- data/spec/imw/resource_spec.rb +26 -0
- data/spec/imw/schemes/local_spec.rb +1 -1
- metadata +3 -2
@@ -0,0 +1,108 @@
|
|
1
|
+
module IMW
|
2
|
+
module Tools
|
3
|
+
|
4
|
+
# Mixin with some heuristic methods for identifying common
|
5
|
+
# extensions and likely data formats for a collection of files.
|
6
|
+
#
|
7
|
+
# Requires the including class to define a method +resources+ which
|
8
|
+
# returns an array of IMW::Resource objects.
|
9
|
+
module ExtensionAnalyzer
|
10
|
+
|
11
|
+
# Return the file counts of each extension.
|
12
|
+
#
|
13
|
+
# @return [Hash]
|
14
|
+
def extension_counts
|
15
|
+
@extension_counts ||= returning({}) do |counts|
|
16
|
+
resources.each do |resource|
|
17
|
+
next if resource.is_directory?
|
18
|
+
counts[resource.extension] = 0 unless counts.has_key?(resource.extension)
|
19
|
+
counts[resource.extension] += 1
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
# Return the most common extension by count of files.
|
25
|
+
def most_common_extension_by_count
|
26
|
+
return @most_common_extension_by_count if @most_common_extension_by_count
|
27
|
+
current_count, current_extension = 0, nil
|
28
|
+
extension_counts.each_pair do |extension, count|
|
29
|
+
current_extension = extension if count > current_count
|
30
|
+
end
|
31
|
+
if current_extension.strip.blank? then current_extension = 'flat' end
|
32
|
+
@most_common_extension_by_count = current_extension
|
33
|
+
end
|
34
|
+
|
35
|
+
# Return the file counts of each extension, normalized by the
|
36
|
+
# total number of files.
|
37
|
+
#
|
38
|
+
# @return [Hash]
|
39
|
+
def normalized_extension_counts
|
40
|
+
@normalized_extension_counts ||= returning({}) do |weighted|
|
41
|
+
extension_counts.each_pair do |extension, count|
|
42
|
+
weighted[extension] = count.to_f / num_files.to_f
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
# Return the amount of data corresponding to each extension.
|
48
|
+
#
|
49
|
+
# @return [Hash]
|
50
|
+
def extension_sizes
|
51
|
+
@extension_sizes ||= returning({}) do |sizes|
|
52
|
+
resources.each do |resource|
|
53
|
+
next if resource.is_directory?
|
54
|
+
sizes[resource.extension] = 0 unless sizes.has_key?(resource.extension)
|
55
|
+
sizes[resource.extension] += resource.size
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
# Return the most common extension by amount of data.
|
61
|
+
#
|
62
|
+
# @return [String]
|
63
|
+
def most_common_extension_by_size
|
64
|
+
return @most_common_extension_by_size if @most_common_extension_by_size
|
65
|
+
current_size, current_extension = 0, nil
|
66
|
+
extension_sizes.each_pair do |extension, size|
|
67
|
+
current_extension = extension if size > current_size
|
68
|
+
end
|
69
|
+
if current_extension.strip.blank? then current_extension = 'flat' end
|
70
|
+
@most_common_extension_by_size = current_extension
|
71
|
+
end
|
72
|
+
|
73
|
+
# Return the fractional share of each extension by file size.
|
74
|
+
#
|
75
|
+
# @return [Hash]
|
76
|
+
def normalized_extension_sizes
|
77
|
+
@normalized_extension_sizes ||= returning({}) do |weighted|
|
78
|
+
extension_sizes.each_pair do |extension, size|
|
79
|
+
weighted[extension] = size.to_f / total_size.to_f
|
80
|
+
end
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
|
+
# Return a guess as to the most common extension format for this
|
85
|
+
# including object's resources.
|
86
|
+
#
|
87
|
+
# @return [String]
|
88
|
+
def most_common_extension
|
89
|
+
return most_common_extension_by_size if most_common_extension_by_size == most_common_extension_by_count # no contest
|
90
|
+
count_fraction = normalized_extension_counts[most_common_extension_by_count]
|
91
|
+
size_fraction = normalized_extension_sizes[most_common_extension_by_size]
|
92
|
+
return most_common_extension_by_count if count_fraction > 0.5 and size_fraction < 0.5 # FIXME arbitrary
|
93
|
+
return most_common_extension_by_size if count_fraction < 0.5 and size_fraction > 0.5
|
94
|
+
most_common_extension_by_size # default to size
|
95
|
+
end
|
96
|
+
|
97
|
+
# Returns a guess as to the most common data format for this
|
98
|
+
# including object's resources.
|
99
|
+
#
|
100
|
+
# @return [String]
|
101
|
+
def most_common_data_format
|
102
|
+
extension = most_common_extension
|
103
|
+
['tar', 'tar.bz2', 'tar.gz', 'tgz', 'tbz2', 'zip', 'rar'].include?(extension) ? 'archive' : extension
|
104
|
+
end
|
105
|
+
end
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
data/lib/imw/tools/summarizer.rb
CHANGED
@@ -1,16 +1,25 @@
|
|
1
|
+
require 'imw/tools/extension_analyzer'
|
2
|
+
|
1
3
|
module IMW
|
2
4
|
module Tools
|
3
5
|
|
4
6
|
# A class for producing summary data about a collection of
|
5
7
|
# resources.
|
6
8
|
#
|
7
|
-
#
|
8
|
-
#
|
9
|
+
# The Summarizer needs to recursively IMW.open all files and
|
10
|
+
# directories given so will be very cumbersome if given many
|
11
|
+
# files. A few large files will not cause a problem.
|
9
12
|
class Summarizer
|
10
13
|
|
11
|
-
# The inputs to this Summarizer.
|
14
|
+
# The inputs given to this Summarizer.
|
12
15
|
attr_reader :inputs
|
13
16
|
|
17
|
+
# The resources of this Summarizer, calculated recursively from
|
18
|
+
# its +inputs+.
|
19
|
+
attr_reader :resources
|
20
|
+
|
21
|
+
include IMW::Tools::ExtensionAnalyzer
|
22
|
+
|
14
23
|
# Initialize a new Summarizer with the given +inputs+.
|
15
24
|
#
|
16
25
|
# @param [Array<String, IMW::Resource>] inputs
|
@@ -19,151 +28,40 @@ module IMW
|
|
19
28
|
self.inputs = inputs.flatten
|
20
29
|
end
|
21
30
|
|
22
|
-
# Set new inputs for this summarizer.
|
23
|
-
#
|
24
|
-
# Clears any cached summary statistics
|
25
|
-
#
|
26
|
-
# @param [Array<String, IMW::Resource>] new_inputs
|
27
|
-
def inputs= new_inputs
|
28
|
-
@inputs = new_inputs.map do |input|
|
29
|
-
i = IMW.open(input)
|
30
|
-
raise PathError.new("Invalid input, #{i.path}") if i.is_local? && !i.exist? # don't check for remote files
|
31
|
-
i.is_directory? ? i.resources : i
|
32
|
-
end.compact.flatten
|
33
|
-
clear_cached_statistics!
|
34
|
-
end
|
35
|
-
|
36
|
-
# Reset all the cached statistics of this summarizer to +nil+.
|
37
|
-
def clear_cached_statistics!
|
38
|
-
[:num_files,
|
39
|
-
:num_direcories,
|
40
|
-
:total_size,
|
41
|
-
:extension_counts,
|
42
|
-
:most_common_extension_by_count,
|
43
|
-
:normalized_extension_counts,
|
44
|
-
:extension_sizes,
|
45
|
-
:most_common_extension_by_size,
|
46
|
-
:normalized_extension_sizes].each do |instance_variable|
|
47
|
-
self.instance_variable_set("@#{instance_variable}", nil)
|
48
|
-
end
|
49
|
-
end
|
50
|
-
|
51
|
-
# Return the number of files.
|
52
|
-
#
|
53
|
-
# @return [Integer]
|
54
|
-
def num_files
|
55
|
-
@num_files ||= inputs.size
|
56
|
-
end
|
57
|
-
|
58
|
-
# Return the number of directories.
|
59
|
-
#
|
60
|
-
# @return [Integer]
|
61
|
-
def num_directories
|
62
|
-
@num_directories ||= inputs.collect { |input| input.is_directory? }
|
63
|
-
end
|
64
|
-
|
65
31
|
# Return the total size.
|
66
32
|
#
|
67
33
|
# @return [Integer]
|
68
34
|
def total_size
|
69
|
-
@total_size ||=
|
35
|
+
@total_size ||= resources.map(&:size).inject(0) { |e, sum| sum += e }
|
70
36
|
end
|
71
37
|
|
72
|
-
# Return the
|
38
|
+
# Return a summary of the +inputs+ to this Summarizer.
|
73
39
|
#
|
74
|
-
#
|
75
|
-
|
76
|
-
@extension_counts ||= returning({}) do |counts|
|
77
|
-
inputs.each do |input|
|
78
|
-
next if input.is_directory?
|
79
|
-
counts[input.extension] = 0 unless counts.has_key?(input.extension)
|
80
|
-
counts[input.extension] += 1
|
81
|
-
end
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
# Return the most common extension by count of files.
|
86
|
-
def most_common_extension_by_count
|
87
|
-
return @most_common_extension_by_count if @most_common_extension_by_count
|
88
|
-
current_count, current_extension = 0, nil
|
89
|
-
extension_counts.each_pair do |extension, count|
|
90
|
-
current_extension = extension if count > current_count
|
91
|
-
end
|
92
|
-
if current_extension.strip.blank? then current_extension = 'flat' end
|
93
|
-
@most_common_extension_by_count = current_extension
|
94
|
-
end
|
95
|
-
|
96
|
-
# Return the file counts of each extension, normalized by the
|
97
|
-
# total number of files.
|
40
|
+
# Delegates to the +summary+ method of each constituent
|
41
|
+
# IMW::Resource in +inputs+.
|
98
42
|
#
|
99
|
-
# @return [Hash]
|
100
|
-
def
|
101
|
-
@
|
102
|
-
extension_counts.each_pair do |extension, count|
|
103
|
-
weighted[extension] = count.to_f / num_files.to_f
|
104
|
-
end
|
105
|
-
end
|
43
|
+
# @return [Array<Hash>]
|
44
|
+
def summary
|
45
|
+
@summary ||= inputs.map(&:summary)
|
106
46
|
end
|
107
47
|
|
108
|
-
|
109
|
-
#
|
110
|
-
# @return [Hash]
|
111
|
-
def extension_sizes
|
112
|
-
@extension_sizes ||= returning({}) do |sizes|
|
113
|
-
inputs.each do |input|
|
114
|
-
next if input.is_directory?
|
115
|
-
sizes[input.extension] = 0 unless sizes.has_key?(input.extension)
|
116
|
-
sizes[input.extension] += input.size
|
117
|
-
end
|
118
|
-
end
|
119
|
-
end
|
120
|
-
|
121
|
-
# Return the most common extension by amount of data.
|
48
|
+
protected
|
49
|
+
# Set new inputs for this summarizer.
|
122
50
|
#
|
123
|
-
#
|
124
|
-
|
125
|
-
return @most_common_extension_by_size if @most_common_extension_by_size
|
126
|
-
current_size, current_extension = 0, nil
|
127
|
-
extension_sizes.each_pair do |extension, size|
|
128
|
-
current_extension = extension if size > current_size
|
129
|
-
end
|
130
|
-
if current_extension.strip.blank? then current_extension = 'flat' end
|
131
|
-
@most_common_extension_by_size = current_extension
|
132
|
-
end
|
133
|
-
|
134
|
-
# Return the fractional share of each extension by file size.
|
51
|
+
# Summarizer statistics are cached as instance variables so be
|
52
|
+
# careful about changing inputs and then using old statistics...
|
135
53
|
#
|
136
|
-
# @
|
137
|
-
def
|
138
|
-
@
|
139
|
-
|
140
|
-
|
141
|
-
end
|
54
|
+
# @param [Array<String, IMW::Resource>] new_inputs
|
55
|
+
def inputs= new_inputs
|
56
|
+
@inputs = new_inputs.map do |path_or_resource|
|
57
|
+
input = IMW.open(path_or_resource)
|
58
|
+
input.should_exist!("Cannot summarize.")
|
142
59
|
end
|
60
|
+
@resources = inputs.map do |input|
|
61
|
+
input.is_directory? ? input.all_resources : input
|
62
|
+
end.compact.flatten
|
143
63
|
end
|
144
64
|
|
145
|
-
# Return a guess as to the most common extension format for this
|
146
|
-
# Summarizer's inputs.
|
147
|
-
#
|
148
|
-
# @return [String]
|
149
|
-
def most_common_extension
|
150
|
-
return most_common_extension_by_size if most_common_extension_by_size == most_common_extension_by_count # no contest
|
151
|
-
count_fraction = normalized_extension_counts[most_common_extension_by_count]
|
152
|
-
size_fraction = normalized_extension_sizes[most_common_extension_by_size]
|
153
|
-
return most_common_extension_by_count if count_fraction > 0.5 and size_fraction < 0.5 # choose the winner based on differential
|
154
|
-
return most_common_extension_by_size if count_fraction < 0.5 and size_fraction > 0.5
|
155
|
-
most_common_extension_by_size # default to size
|
156
|
-
end
|
157
|
-
|
158
|
-
# Returns a guess as to the most common data format for this
|
159
|
-
# Summarizer's inputs.
|
160
|
-
#
|
161
|
-
# @return [String]
|
162
|
-
def most_common_data_format
|
163
|
-
extension = most_common_extension
|
164
|
-
['tar', 'tar.bz2', 'tar.gz', 'tgz', 'tbz2', 'zip', 'rar'].include?(extension) ? 'archive' : extension
|
165
|
-
end
|
166
|
-
|
167
65
|
end
|
168
66
|
end
|
169
67
|
end
|
data/lib/imw/utils/log.rb
CHANGED
@@ -9,7 +9,7 @@ module IMW
|
|
9
9
|
LOG_TIMEFORMAT = "%Y-%m-%d %H:%M:%S " unless defined?(LOG_TIMEFORMAT)
|
10
10
|
|
11
11
|
# Default verbosity
|
12
|
-
VERBOSE = false
|
12
|
+
VERBOSE = false unless defined?(VERBOSE)
|
13
13
|
|
14
14
|
class << self; attr_accessor :log, :verbose end
|
15
15
|
|
@@ -36,7 +36,7 @@ module IMW
|
|
36
36
|
def self.announce *events
|
37
37
|
options = events.flatten.extract_options!
|
38
38
|
options.reverse_merge! :level => Logger::INFO
|
39
|
-
IMW.log.add options[:level], events.join("\n")
|
39
|
+
IMW.log.add options[:level], "IMW: " + events.join("\n")
|
40
40
|
end
|
41
41
|
def self.announce_if_verbose *events
|
42
42
|
announce(*events) if IMW.verbose?
|