imw 0.2.2 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/imw/tools/archiver.rb +1 -0
- data/lib/imw/tools/summarizer.rb +169 -0
- data/lib/imw/tools.rb +1 -0
- data/lib/imw/utils/extensions.rb +2 -1
- data/spec/imw/tools/summarizer_spec.rb +6 -0
- metadata +5 -2
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.2
|
1
|
+
0.2.3
|
data/lib/imw/tools/archiver.rb
CHANGED
data/lib/imw/tools/summarizer.rb
ADDED
@@ -0,0 +1,169 @@
|
|
1
|
+
module IMW
  module Tools

    # Produces summary statistics about a collection of resources:
    # the number of inputs, their total size, per-extension file
    # counts and sizes (raw and normalized), and a guess at the most
    # common extension / data format.
    #
    # Every statistic is computed lazily and cached; assigning new
    # inputs clears the cache.
    class Summarizer

      # Extensions reported as the generic 'archive' data format.
      ARCHIVE_EXTENSIONS = ['tar', 'tar.bz2', 'tar.gz', 'tgz', 'tbz2', 'zip', 'rar'].freeze

      # Names of the instance variables that cache computed statistics.
      CACHED_STATISTICS = [:num_files,
                           :num_directories,
                           :total_size,
                           :extension_counts,
                           :most_common_extension_by_count,
                           :normalized_extension_counts,
                           :extension_sizes,
                           :most_common_extension_by_size,
                           :normalized_extension_sizes].freeze

      # The inputs to this Summarizer.
      attr_reader :inputs

      # Initialize a new Summarizer with the given +inputs+.
      #
      # Nested arrays are flattened before being handed to #inputs=.
      #
      # @param [Array<String, IMW::Resource>] inputs
      # @return [IMW::Tools::Summarizer]
      def initialize(*inputs)
        self.inputs = inputs.flatten
      end

      # Set new inputs for this summarizer.
      #
      # Each input is opened with IMW.open. A directory input is
      # replaced by its +resources+. Clears any cached summary
      # statistics.
      #
      # @param [Array<String, IMW::Resource>] new_inputs
      # @raise [PathError] if a local input does not exist
      def inputs=(new_inputs)
        @inputs = new_inputs.map do |input|
          resource = IMW.open(input)
          # Existence is only checked for local resources, not remote ones.
          raise PathError.new("Invalid input, #{resource.path}") if resource.is_local? && !resource.exist?
          resource.is_directory? ? resource.resources : resource
        end.compact.flatten
        clear_cached_statistics!
      end

      # Reset all the cached statistics of this summarizer to +nil+.
      def clear_cached_statistics!
        CACHED_STATISTICS.each do |statistic|
          instance_variable_set("@#{statistic}", nil)
        end
      end

      # Return the number of inputs.
      #
      # NOTE(review): this is +inputs.size+, so any directory entries
      # present among the inputs are counted too -- confirm whether
      # that is intended.
      #
      # @return [Integer]
      def num_files
        @num_files ||= inputs.size
      end

      # Return the number of inputs that are directories.
      #
      # @return [Integer]
      def num_directories
        @num_directories ||= inputs.select { |input| input.is_directory? }.size
      end

      # Return the total size, i.e. the sum of +size+ over all inputs.
      #
      # @return [Integer]
      def total_size
        @total_size ||= inputs.inject(0) { |sum, input| sum + input.size }
      end

      # Return the file counts of each extension (directories are
      # skipped).
      #
      # @return [Hash]
      def extension_counts
        @extension_counts ||= tally_by_extension { |_input| 1 }
      end

      # Return the most common extension by count of files, or
      # 'flat' when that extension is blank or there are no files.
      #
      # @return [String]
      def most_common_extension_by_count
        @most_common_extension_by_count ||= extension_label(leading_extension(extension_counts))
      end

      # Return the file counts of each extension, normalized by the
      # total number of files.
      #
      # @return [Hash]
      def normalized_extension_counts
        @normalized_extension_counts ||= normalize(extension_counts, num_files)
      end

      # Return the amount of data corresponding to each extension
      # (directories are skipped).
      #
      # @return [Hash]
      def extension_sizes
        @extension_sizes ||= tally_by_extension { |input| input.size }
      end

      # Return the most common extension by amount of data, or
      # 'flat' when that extension is blank or no extension has a
      # positive size.
      #
      # @return [String]
      def most_common_extension_by_size
        @most_common_extension_by_size ||= extension_label(leading_extension(extension_sizes))
      end

      # Return the fractional share of each extension by file size.
      #
      # @return [Hash]
      def normalized_extension_sizes
        @normalized_extension_sizes ||= normalize(extension_sizes, total_size)
      end

      # Return a guess as to the most common extension format for this
      # Summarizer's inputs.
      #
      # When the leaders by count and by size disagree, the one that
      # holds a majority while the other does not wins; otherwise the
      # leader by size is returned.
      #
      # @return [String]
      def most_common_extension
        by_size  = most_common_extension_by_size
        by_count = most_common_extension_by_count
        return by_size if by_size == by_count # no contest

        # Look the fractions up by the raw extension keys: the public
        # labels may have been rewritten to 'flat', which is not a key
        # of the normalized hashes. A missing entry counts as 0.0.
        count_fraction = normalized_extension_counts[leading_extension(extension_counts)].to_f
        size_fraction  = normalized_extension_sizes[leading_extension(extension_sizes)].to_f

        return by_count if count_fraction > 0.5 && size_fraction < 0.5
        return by_size  if count_fraction < 0.5 && size_fraction > 0.5
        by_size # default to size
      end

      # Returns a guess as to the most common data format for this
      # Summarizer's inputs: 'archive' for known archive extensions,
      # otherwise the most common extension itself.
      #
      # @return [String]
      def most_common_data_format
        extension = most_common_extension
        ARCHIVE_EXTENSIONS.include?(extension) ? 'archive' : extension
      end

      private

      # Sum the block's value for every non-directory input, keyed by
      # the input's extension.
      def tally_by_extension
        tallies = {}
        inputs.each do |input|
          next if input.is_directory?
          extension = input.extension
          tallies[extension] = (tallies[extension] || 0) + yield(input)
        end
        tallies
      end

      # Divide every value of +tallies+ by +total+ (as floats).
      def normalize(tallies, total)
        fractions = {}
        tallies.each_pair do |extension, value|
          fractions[extension] = value.to_f / total.to_f
        end
        fractions
      end

      # Return the key of +tallies+ with the largest positive value,
      # or +nil+ when there is none. The first key wins a tie.
      def leading_extension(tallies)
        best_extension, best_value = nil, 0
        tallies.each_pair do |extension, value|
          best_extension, best_value = extension, value if value > best_value
        end
        best_extension
      end

      # Map a nil or blank extension to the label 'flat'.
      def extension_label(extension)
        extension.to_s.strip.empty? ? 'flat' : extension
      end

    end
  end
end
|
data/lib/imw/tools.rb
CHANGED
data/lib/imw/utils/extensions.rb
CHANGED
@@ -58,8 +58,9 @@ module IMW
|
|
58
58
|
# Run +commands+ as a single system call and return the result of
# Kernel.system.
#
# The arguments are flattened, blank entries are dropped and the
# remainder are converted to strings. The resulting command line is
# announced through IMW.announce_if_verbose before it is run.
#
# @param [Array] commands the command and its arguments
# @return [true] Kernel.system's result; only reached when <tt>$?</tt> reports success
# @raise [ArgumentError] if no non-blank command is given
# @raise [IMW::SystemCallError] if the command does not exit successfully
def self.system(*commands)
  stripped_commands = commands.flatten.reject { |command| command.blank? }.map { |command| command.to_s }
  # Kernel.system would raise an arity ArgumentError here; fail with a clearer message.
  raise ArgumentError, 'no command given' if stripped_commands.empty?

  command_line = stripped_commands.join(' ')
  IMW.announce_if_verbose(command_line)

  # Kernel.system returns true/false/nil, not a numeric exit code.
  succeeded = Kernel.system(*stripped_commands)
  # Report the command line that was actually run.
  raise IMW::SystemCallError.new($?.dup, command_line) unless $?.success?
  succeeded
end
|
64
65
|
end
|
65
66
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: imw
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2
|
4
|
+
version: 0.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dhruv Bansal
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2010-05-
|
13
|
+
date: 2010-05-20 00:00:00 -05:00
|
14
14
|
default_executable: imw
|
15
15
|
dependencies: []
|
16
16
|
|
@@ -72,6 +72,7 @@ files:
|
|
72
72
|
- lib/imw/schemes/s3.rb
|
73
73
|
- lib/imw/tools.rb
|
74
74
|
- lib/imw/tools/archiver.rb
|
75
|
+
- lib/imw/tools/summarizer.rb
|
75
76
|
- lib/imw/tools/transferer.rb
|
76
77
|
- lib/imw/utils.rb
|
77
78
|
- lib/imw/utils/error.rb
|
@@ -120,6 +121,7 @@ files:
|
|
120
121
|
- spec/imw/schemes/remote_spec.rb
|
121
122
|
- spec/imw/schemes/s3_spec.rb
|
122
123
|
- spec/imw/tools/archiver_spec.rb
|
124
|
+
- spec/imw/tools/summarizer_spec.rb
|
123
125
|
- spec/imw/tools/transferer_spec.rb
|
124
126
|
- spec/imw/utils/paths_spec.rb
|
125
127
|
- spec/imw/utils/shared_paths_spec.rb
|
@@ -167,6 +169,7 @@ test_files:
|
|
167
169
|
- spec/imw/archives/tarbz2_spec.rb
|
168
170
|
- spec/imw/archives/rar_spec.rb
|
169
171
|
- spec/imw/tools/archiver_spec.rb
|
172
|
+
- spec/imw/tools/summarizer_spec.rb
|
170
173
|
- spec/imw/tools/transferer_spec.rb
|
171
174
|
- spec/imw/compressed_files/compressible_spec.rb
|
172
175
|
- spec/imw/compressed_files/bz2_spec.rb
|