imw 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/imw/tools/archiver.rb +1 -0
- data/lib/imw/tools/summarizer.rb +169 -0
- data/lib/imw/tools.rb +1 -0
- data/lib/imw/utils/extensions.rb +2 -1
- data/spec/imw/tools/summarizer_spec.rb +6 -0
- metadata +5 -2
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.2
|
1
|
+
0.2.3
|
data/lib/imw/tools/archiver.rb
CHANGED
(+1 -0; hunk not captured in this extract)
data/lib/imw/tools/summarizer.rb
ADDED
@@ -0,0 +1,169 @@
|
|
1
|
+
module IMW
  module Tools

    # A class for producing summary data about a collection of
    # resources.
    #
    # The statistics computed here are: the number of inputs, the
    # number of directories among them, their total size, and
    # per-extension breakdowns by file count and by amount of data.
    # Every statistic is memoized and reset whenever the inputs are
    # reassigned.
    class Summarizer

      # Extensions that #most_common_data_format reports as 'archive'.
      ARCHIVE_EXTENSIONS = ['tar', 'tar.bz2', 'tar.gz', 'tgz', 'tbz2', 'zip', 'rar'].freeze

      # Names of the memoized statistics; each one is stored in the
      # instance variable of the same name.
      CACHED_STATISTICS = [:num_files,
                           :num_directories,
                           :total_size,
                           :extension_counts,
                           :most_common_extension_by_count,
                           :normalized_extension_counts,
                           :extension_sizes,
                           :most_common_extension_by_size,
                           :normalized_extension_sizes].freeze

      # The inputs to this Summarizer.
      attr_reader :inputs

      # Initialize a new Summarizer with the given +inputs+.
      #
      # @param [Array<String, IMW::Resource>] inputs
      # @return [IMW::Tools::Summarizer]
      def initialize *inputs
        self.inputs = inputs.flatten
      end

      # Set new inputs for this summarizer.
      #
      # Each input is opened with IMW.open.  A directory input is
      # replaced by whatever its +resources+ method returns (one
      # level; entries returned by +resources+ are not expanded
      # again).  Clears any cached summary statistics.
      #
      # @param [Array<String, IMW::Resource>] new_inputs
      # @raise [PathError] if a local input does not exist
      def inputs= new_inputs
        @inputs = new_inputs.map do |input|
          resource = IMW.open(input)
          # don't check for remote files
          raise PathError.new("Invalid input, #{resource.path}") if resource.is_local? && !resource.exist?
          resource.is_directory? ? resource.resources : resource
        end.compact.flatten
        clear_cached_statistics!
      end

      # Reset all the cached statistics of this summarizer to +nil+.
      def clear_cached_statistics!
        CACHED_STATISTICS.each do |statistic|
          instance_variable_set("@#{statistic}", nil)
        end
      end

      # Return the number of files.
      #
      # This is the size of +inputs+, so directories that appear in
      # +inputs+ are counted as well.
      #
      # @return [Integer]
      def num_files
        @num_files ||= inputs.size
      end

      # Return the number of directories among the inputs.
      #
      # @return [Integer]
      def num_directories
        @num_directories ||= inputs.select { |input| input.is_directory? }.size
      end

      # Return the total size of all inputs.
      #
      # @return [Integer]
      def total_size
        @total_size ||= inputs.map(&:size).inject(0) { |sum, size| sum + size }
      end

      # Return the file counts of each extension.  Directories are
      # skipped.
      #
      # @return [Hash]
      def extension_counts
        @extension_counts ||= accumulate_by_extension { |_input| 1 }
      end

      # Return the most common extension by count of files.
      #
      # A blank extension -- or no extension at all, when there are
      # no files -- is reported as 'flat'.
      #
      # @return [String]
      def most_common_extension_by_count
        @most_common_extension_by_count ||= extension_label(dominant_extension(extension_counts))
      end

      # Return the file counts of each extension, normalized by the
      # total number of files.
      #
      # The denominator is #num_files, which also counts directories,
      # so the fractions can sum to less than 1.
      #
      # @return [Hash]
      def normalized_extension_counts
        @normalized_extension_counts ||= begin
          weighted = {}
          extension_counts.each_pair do |extension, count|
            weighted[extension] = count.to_f / num_files.to_f
          end
          weighted
        end
      end

      # Return the amount of data corresponding to each extension.
      # Directories are skipped.
      #
      # @return [Hash]
      def extension_sizes
        @extension_sizes ||= accumulate_by_extension { |input| input.size }
      end

      # Return the most common extension by amount of data.
      #
      # A blank extension -- or no extension at all, when no
      # extension accounts for any data -- is reported as 'flat'.
      #
      # @return [String]
      def most_common_extension_by_size
        @most_common_extension_by_size ||= extension_label(dominant_extension(extension_sizes))
      end

      # Return the fractional share of each extension by file size.
      #
      # When the total size is zero every share is 0.0 (rather than
      # NaN from dividing zero by zero).
      #
      # @return [Hash]
      def normalized_extension_sizes
        @normalized_extension_sizes ||= begin
          total    = total_size.to_f
          weighted = {}
          extension_sizes.each_pair do |extension, size|
            weighted[extension] = total.zero? ? 0.0 : size.to_f / total
          end
          weighted
        end
      end

      # Return a guess as to the most common extension format for this
      # Summarizer's inputs.
      #
      # If the winners by count and by size agree, that is the answer.
      # Otherwise the winner by count is chosen only when it covers
      # more than half of the files while the winner by size covers
      # less than half of the data; in every other case the winner by
      # size is returned.
      #
      # @return [String]
      def most_common_extension
        by_size  = most_common_extension_by_size
        by_count = most_common_extension_by_count
        return by_size if by_size == by_count # no contest

        # Look the fractions up under the raw extension keys: the
        # 'flat' label is not a key of the normalized hashes.
        count_fraction = normalized_extension_counts.fetch(dominant_extension(extension_counts), 0.0)
        size_fraction  = normalized_extension_sizes.fetch(dominant_extension(extension_sizes), 0.0)

        count_fraction > 0.5 && size_fraction < 0.5 ? by_count : by_size
      end

      # Returns a guess as to the most common data format for this
      # Summarizer's inputs.
      #
      # Archive extensions (see ARCHIVE_EXTENSIONS) are collapsed to
      # 'archive'; any other extension is returned unchanged.
      #
      # @return [String]
      def most_common_data_format
        extension = most_common_extension
        ARCHIVE_EXTENSIONS.include?(extension) ? 'archive' : extension
      end

      private

      # Sum the value the block yields for each non-directory input,
      # grouped by the input's extension.
      #
      # @yield [input] the amount this input contributes
      # @return [Hash]
      def accumulate_by_extension
        totals = {}
        inputs.each do |input|
          next if input.is_directory?
          extension = input.extension
          totals[extension] = (totals[extension] || 0) + yield(input)
        end
        totals
      end

      # Return the key of +totals+ with the largest value above zero,
      # or +nil+ if there is none.  On a tie the key met first wins.
      #
      # @param [Hash] totals
      # @return [Object, nil]
      def dominant_extension totals
        best_extension, best_total = nil, 0
        totals.each_pair do |extension, total|
          best_extension, best_total = extension, total if total > best_total
        end
        best_extension
      end

      # Turn a raw extension into the value reported to callers:
      # +nil+ and whitespace-only extensions become 'flat'.
      #
      # @param [String, nil] extension
      # @return [String]
      def extension_label extension
        extension.to_s.strip.empty? ? 'flat' : extension
      end

    end
  end
end
|
data/lib/imw/tools.rb
CHANGED
data/lib/imw/utils/extensions.rb
CHANGED
@@ -58,8 +58,9 @@ module IMW
|
|
58
58
|
def self.system *commands
|
59
59
|
stripped_commands = commands.flatten.map { |command| command.to_s unless command.blank? }.compact
|
60
60
|
IMW.announce_if_verbose(stripped_commands.join(" "))
|
61
|
-
Kernel.system(*stripped_commands)
|
61
|
+
exit_code = Kernel.system(*stripped_commands)
|
62
62
|
raise IMW::SystemCallError.new($?.dup, commands.join(' ')) unless $?.success?
|
63
|
+
exit_code
|
63
64
|
end
|
64
65
|
end
|
65
66
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: imw
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2
|
4
|
+
version: 0.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dhruv Bansal
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2010-05-
|
13
|
+
date: 2010-05-20 00:00:00 -05:00
|
14
14
|
default_executable: imw
|
15
15
|
dependencies: []
|
16
16
|
|
@@ -72,6 +72,7 @@ files:
|
|
72
72
|
- lib/imw/schemes/s3.rb
|
73
73
|
- lib/imw/tools.rb
|
74
74
|
- lib/imw/tools/archiver.rb
|
75
|
+
- lib/imw/tools/summarizer.rb
|
75
76
|
- lib/imw/tools/transferer.rb
|
76
77
|
- lib/imw/utils.rb
|
77
78
|
- lib/imw/utils/error.rb
|
@@ -120,6 +121,7 @@ files:
|
|
120
121
|
- spec/imw/schemes/remote_spec.rb
|
121
122
|
- spec/imw/schemes/s3_spec.rb
|
122
123
|
- spec/imw/tools/archiver_spec.rb
|
124
|
+
- spec/imw/tools/summarizer_spec.rb
|
123
125
|
- spec/imw/tools/transferer_spec.rb
|
124
126
|
- spec/imw/utils/paths_spec.rb
|
125
127
|
- spec/imw/utils/shared_paths_spec.rb
|
@@ -167,6 +169,7 @@ test_files:
|
|
167
169
|
- spec/imw/archives/tarbz2_spec.rb
|
168
170
|
- spec/imw/archives/rar_spec.rb
|
169
171
|
- spec/imw/tools/archiver_spec.rb
|
172
|
+
- spec/imw/tools/summarizer_spec.rb
|
170
173
|
- spec/imw/tools/transferer_spec.rb
|
171
174
|
- spec/imw/compressed_files/compressible_spec.rb
|
172
175
|
- spec/imw/compressed_files/bz2_spec.rb
|