compare_compressors 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +132 -0
- data/bin/compare_compressors +6 -0
- data/lib/compare_compressors.rb +40 -0
- data/lib/compare_compressors/command_line_interface.rb +223 -0
- data/lib/compare_compressors/comparer.rb +70 -0
- data/lib/compare_compressors/compressor.rb +150 -0
- data/lib/compare_compressors/compressors/brotli_compressor.rb +43 -0
- data/lib/compare_compressors/compressors/bzip2_compressor.rb +37 -0
- data/lib/compare_compressors/compressors/gzip_compressor.rb +34 -0
- data/lib/compare_compressors/compressors/seven_zip_compressor.rb +43 -0
- data/lib/compare_compressors/compressors/xz_compressor.rb +37 -0
- data/lib/compare_compressors/compressors/zstd_compressor.rb +37 -0
- data/lib/compare_compressors/cost_model.rb +55 -0
- data/lib/compare_compressors/costed_group_result.rb +87 -0
- data/lib/compare_compressors/group_result.rb +62 -0
- data/lib/compare_compressors/plotter.rb +164 -0
- data/lib/compare_compressors/plotters/cost_plotter.rb +90 -0
- data/lib/compare_compressors/plotters/raw_plotter.rb +61 -0
- data/lib/compare_compressors/plotters/size_plotter.rb +76 -0
- data/lib/compare_compressors/result.rb +81 -0
- data/lib/compare_compressors/version.rb +8 -0
- data/test/compare_compressors/compare_compressors_test.rb +271 -0
- metadata +101 -0
@@ -0,0 +1,81 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'benchmark'
|
4
|
+
require 'fileutils'
|
5
|
+
require 'tmpdir'
|
6
|
+
|
7
|
+
module CompareCompressors
  #
  # A single compressor-level result: the measurements recorded for one
  # compressor, at one level, on one target file.
  #
  Result = Struct.new(
    :target,
    :compressor_name,
    :compressor_level,
    :compression_elapsed_time,
    :compression_cpu_time,
    :compression_max_rss,
    :size,
    :decompression_elapsed_time,
    :decompression_cpu_time,
    :decompression_max_rss
  ) do
    #
    # @return [Array] the compressor name and level as a two-element key
    #
    def group_key
      [compressor_name, compressor_level]
    end

    #
    # Reads the size of the target file from disk on first use.
    #
    # @return [Integer] in bytes; cached
    #
    def uncompressed_size
      @uncompressed_size ||= File.stat(target).size
    end

    #
    # @return [Float] positive; should be finite; larger is better
    #
    def compression_ratio
      uncompressed_size / size.to_f
    end

    #
    # @return [Integer] should be positive; larger is better
    #
    def compression_delta
      uncompressed_size - size
    end

    #
    # Read results from CSV data that has a header row.
    #
    # A fresh parser is built with CSV.new rather than the Kernel#CSV helper,
    # because the helper caches one parser per IO object for the life of the
    # process. NOTE(review): this file does not require 'csv' itself;
    # presumably it is loaded elsewhere in the gem -- confirm.
    #
    # @param io [IO, String] CSV data whose headers match the struct members
    # @return [Array<Result>] one result per data row, in file order
    #
    def self.read_csv(io)
      CSV.new(io, headers: true).map { |row| from_row(row) }
    end

    #
    # Build a result from one CSV row, converting the numeric columns.
    #
    # @param row [#[]] anything indexable by column name, e.g. a CSV::Row
    # @return [Result]
    #
    def self.from_row(row)
      new(
        row['target'],
        row['compressor_name'],
        row['compressor_level'].to_i,
        row['compression_elapsed_time'].to_f,
        row['compression_cpu_time'].to_f,
        row['compression_max_rss'].to_i,
        row['size'].to_i,
        row['decompression_elapsed_time'].to_f,
        row['decompression_cpu_time'].to_f,
        row['decompression_max_rss'].to_i
      )
    end

    #
    # Arithmetic mean of one attribute over the given results.
    #
    # @param results [Array<Result>] must not be empty
    # @param attribute [Symbol] name of the method to call on each result
    # @return [Float]
    # @raise [ArgumentError] if results is empty
    #
    def self.mean(results, attribute)
      values = results.map(&attribute)
      raise ArgumentError, 'cannot take the mean of no results' if values.empty?

      values.sum.fdiv(values.size)
    end

    #
    # Geometric mean of one attribute over the given results.
    #
    # @param results [Array<Result>] must not be empty
    # @param attribute [Symbol] name of the method to call on each result
    # @return [Float]
    # @raise [ArgumentError] if results is empty
    #
    def self.geomean(results, attribute)
      values = results.map(&attribute)
      if values.empty?
        raise ArgumentError, 'cannot take the geometric mean of no results'
      end

      # Logarithms are undefined for zero and negative values, so fall back to
      # the direct product form for those inputs.
      unless values.all?(&:positive?)
        return values.inject(:*)**(1 / values.size.to_f)
      end

      # Average the logarithms instead of multiplying everything together; the
      # raw product overflows to Infinity once there are many values.
      Math.exp(values.sum { |value| Math.log(value) } / values.size)
    end
  end
end
|
@@ -0,0 +1,271 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'minitest/autorun'
|
4
|
+
require 'tmpdir'
|
5
|
+
|
6
|
+
require 'compare_compressors'
|
7
|
+
|
8
|
+
#
# Tests for the gem: an integration check for each configured compressor, a
# check on reported compressor versions, and a check of grouping and costing
# on hand-built results.
#
# Inherits from Minitest::Test; the legacy MiniTest alias is no longer defined
# by default in recent Minitest releases.
#
class TestCompareCompressors < Minitest::Test
  include CompareCompressors

  # Run the integration check against every compressor in COMPRESSORS.
  def test_compressors
    COMPRESSORS.each do |compressor|
      check_compressor(compressor)
    end
  end

  # Every compressor except brotli is expected to report a version string
  # containing a digit; brotli is expected to report nil.
  def test_compressor_versions
    COMPRESSORS.each do |compressor|
      if compressor.name == 'brotli'
        assert_nil compressor.version
      else
        assert_match(/[0-9]/, compressor.version)
      end
    end
  end

  # Group hand-built results for two targets and check the aggregates, then
  # cost them with both a CPU-time and an elapsed-time cost model.
  def test_grouper_groups_over_targets
    with_fixed_test_targets(2, 10_000) do |targets|
      results = make_test_results(targets)
      group_results = GroupResult.group(results, scale: 10_000)
      assert_equal 2, group_results.size
      assert_equal 'fooz', group_results[0].compressor_name
      assert_equal 1, group_results[0].compressor_level
      assert_in_delta \
        10_000 * (1.1 + 1.3) / 2 / 3600,
        group_results[0].mean_compression_elapsed_hours
      assert_in_delta \
        10_000 * (10.1 + 10.3) / 2 / 3600,
        group_results[0].mean_compression_cpu_hours
      assert_equal 1002, group_results[0].max_compression_max_rss
      assert_in_delta \
        10_000 * (5000 + 4000) / 2.0 / 1024**3,
        group_results[0].mean_compressed_gibytes
      assert_in_delta \
        10_000 * (5000 + 6000) / 2.0 / 1024**3,
        group_results[0].mean_compression_delta_gibytes
      assert_in_delta \
        Math.sqrt((10_000.0 / 5000) * (10_000.0 / 4000)),
        group_results[0].geomean_compression_ratio
      assert_in_delta \
        10_000 * (3.1 + 3.3) / 2 / 3600,
        group_results[0].mean_decompression_elapsed_hours
      assert_in_delta \
        10_000 * (2.1 + 2.3) / 2 / 3600,
        group_results[0].mean_decompression_cpu_hours
      assert_equal 2002, group_results[0].max_decompression_max_rss

      cpu_time_cost_model = CostModel.new(
        gibyte_cost: 0.023,
        compression_hour_cost: 0.05,
        decompression_hour_cost: 0.10,
        use_cpu_time: true
      )
      costed_group_results =
        CostedGroupResult.from_group_results(cpu_time_cost_model, group_results)

      assert_equal 2, costed_group_results.size
      assert_equal 'fooz', costed_group_results[0].compressor_name
      assert_equal 1, costed_group_results[0].compressor_level
      assert_in_delta \
        0.023 * group_results[0].mean_compressed_gibytes,
        costed_group_results[0].gibyte_cost
      assert_in_delta \
        0.05 * group_results[0].mean_compression_cpu_hours,
        costed_group_results[0].compression_hour_cost
      assert_in_delta \
        0.10 * group_results[0].mean_decompression_cpu_hours,
        costed_group_results[0].decompression_hour_cost
      assert_in_delta \
        costed_group_results[0].total_cost,
        costed_group_results[0].hour_cost +
        costed_group_results[0].gibyte_cost

      elapsed_time_cost_model = CostModel.new(
        gibyte_cost: 0.023,
        compression_hour_cost: 0.05,
        decompression_hour_cost: 0.10,
        use_cpu_time: false
      )
      costed_group_results = CostedGroupResult.from_group_results(
        elapsed_time_cost_model, group_results
      )
      assert_in_delta \
        0.05 * group_results[0].mean_compression_elapsed_hours,
        costed_group_results[0].compression_hour_cost
      assert_in_delta \
        0.10 * group_results[0].mean_decompression_elapsed_hours,
        costed_group_results[0].decompression_hour_cost
    end
  end

  # Write num_reps lines to pathname; each line is a shuffled mix of up to 99
  # each of the characters 'a', 'b' and 'c'.
  def write_random_test_data(pathname, num_reps)
    File.open(pathname, 'w') do |f|
      num_reps.times do
        string = 'a' * rand(100) + 'b' * rand(100) + 'c' * rand(100)
        f.puts(string.chars.shuffle.join)
      end
    end
  end

  # Create num_targets pseudo-random files in a temporary directory and yield
  # their pathnames. The directory is removed when the block returns.
  def with_random_test_targets(num_targets, num_reps)
    # Fixed seed so the generated data is the same on every run.
    srand 42
    Dir.mktmpdir do |tmp|
      targets = Array.new(num_targets) do |i|
        pathname = File.join(tmp, "test_#{i}")
        write_random_test_data pathname, num_reps
        pathname
      end
      yield targets
    end
  end

  # Create num_targets files, each holding one line of target_size 'a'
  # characters, in a temporary directory and yield their pathnames.
  def with_fixed_test_targets(num_targets, target_size)
    Dir.mktmpdir do |tmp|
      targets = Array.new(num_targets) do |i|
        pathname = File.join(tmp, "test_#{i}")
        File.open(pathname, 'w') { |f| f.puts 'a' * target_size }
        pathname
      end
      yield targets
    end
  end

  # Test results for grouping: two levels of a made-up 'fooz' compressor on
  # each of the first two targets.
  def make_test_results(targets)
    [
      Result.new(
        targets[0], 'fooz', 1, 1.1, 10.1, 1000, 5000, 3.1, 2.1, 2000
      ),
      Result.new(
        targets[0], 'fooz', 2, 2.2, 20.2, 1001, 2500, 6.2, 4.2, 2001
      ),
      Result.new(
        targets[1], 'fooz', 1, 1.3, 10.3, 1002, 4000, 3.3, 2.3, 2002
      ),
      Result.new(
        targets[1], 'fooz', 2, 2.4, 20.4, 1003, 2000, 6.4, 4.4, 2003
      )
    ]
  end

  # An integration test for any compressor: run the comparer, read the CSV
  # back, group the results, and generate each kind of plot script.
  def check_compressor(compressor)
    num_levels = compressor.levels.size

    with_random_test_targets(3, 100) do |targets|
      csv_string_io = StringIO.new
      CSV(csv_string_io) do |csv|
        Comparer.new.run(csv, [compressor], targets)
      end

      csv_string_io.rewind
      results = Result.read_csv(csv_string_io)
      targets.each do |target|
        target_results = results.select { |r| r.target == target }
        assert_equal num_levels, target_results.size
        assert_equal \
          target_results.map(&:compressor_level).min,
          target_results.first.compressor_level
        refute target_results.first.compression_cpu_time.negative?
        assert target_results.first.size.positive?
        refute target_results.first.decompression_cpu_time.negative?
      end

      # Average out the targets.
      group_results = GroupResult.group(results, scale: 10)
      assert_equal num_levels, group_results.size

      # Run the 2D plotter.
      size_plotter = SizePlotter.new(
        terminal: Plotter::DEFAULT_TERMINAL,
        output: Plotter::DEFAULT_OUTPUT,
        logscale_size: true,
        autoscale_fix: true,
        show_labels: true,
        lmargin: 5,
        title: 'Test Plot',
        use_cpu_time: true
      )
      io = StringIO.new
      size_plotter.plot(group_results, pareto_only: false, io: io)
      script = io.string
      assert_match(/set terminal png/, script)
      assert_match(/#{compressor.name} << EOD/, script)

      # There's not much we can reliably say about the pareto results, because
      # they depend on time. We can make sure it runs, however.
      io = StringIO.new
      size_plotter.plot(group_results, pareto_only: true, io: io)
      assert_match(/#{compressor.name} << EOD/, io.string)
      assert size_plotter.group_results.size.positive?

      # Run the 3D plotter.
      raw_plotter = RawPlotter.new(
        terminal: Plotter::DEFAULT_TERMINAL,
        output: Plotter::DEFAULT_OUTPUT,
        logscale_size: true,
        autoscale_fix: true,
        show_labels: true,
        lmargin: 5,
        title: 'Test Plot',
        use_cpu_time: false
      )
      io = StringIO.new
      raw_plotter.plot(group_results, pareto_only: false, io: io)
      script = io.string
      assert_match(/set terminal png/, script)
      assert_match(/#{compressor.name} << EOD/, script)

      # Again, just make sure the Pareto only plot runs.
      io = StringIO.new
      raw_plotter.plot(group_results, pareto_only: true, io: io)
      assert_match(/#{compressor.name} << EOD/, io.string)
      assert raw_plotter.group_results.size.positive?

      # Make a cost plot.
      cost_model = CostModel.new(
        gibyte_cost: 0.023,
        compression_hour_cost: 0.05,
        decompression_hour_cost: 0.10,
        use_cpu_time: false
      )

      costed_group_results =
        CostedGroupResult.from_group_results(cost_model, group_results)

      # Summarise the results. Again there's not much we can reliably test here.
      summary_results = cost_model.summarize(costed_group_results)
      assert_equal 5, summary_results.size

      # Run the cost plotter.
      cost_plotter = CostPlotter.new(
        cost_model,
        terminal: Plotter::DEFAULT_TERMINAL,
        output: Plotter::DEFAULT_OUTPUT,
        logscale_size: true,
        autoscale_fix: true,
        show_cost_contours: true,
        show_labels: true,
        lmargin: 5,
        title: 'Test Plot',
        use_cpu_time: false
      )

      io = StringIO.new
      cost_plotter.plot(costed_group_results, pareto_only: false, io: io)
      script = io.string
      assert_match(/set terminal png/, script)
      assert_match(/#{compressor.name} << EOD/, script)
      assert_match(/set lmargin 5/, script)
      assert_match(/set logscale y/, script)
      assert_match(/set autoscale fix/, script)

      # Again, just make sure the Pareto only plot runs.
      io = StringIO.new
      cost_plotter.plot(costed_group_results, pareto_only: true, io: io)
      assert_match(/#{compressor.name} << EOD/, io.string)
      assert cost_plotter.group_results.size.positive?
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: compare_compressors
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- John Lees-Miller
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-05-02 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: thor
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.19.4
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.19.4
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: gemma
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 5.0.0
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 5.0.0
|
41
|
+
description: "\n Compare compression algorithms (gzip, bzip2, xz, etc.) for a sample
|
42
|
+
of\n documents.\n "
|
43
|
+
email:
|
44
|
+
- jdleesmiller@gmail.com
|
45
|
+
executables:
|
46
|
+
- compare_compressors
|
47
|
+
extensions: []
|
48
|
+
extra_rdoc_files:
|
49
|
+
- README.md
|
50
|
+
files:
|
51
|
+
- README.md
|
52
|
+
- bin/compare_compressors
|
53
|
+
- lib/compare_compressors.rb
|
54
|
+
- lib/compare_compressors/command_line_interface.rb
|
55
|
+
- lib/compare_compressors/comparer.rb
|
56
|
+
- lib/compare_compressors/compressor.rb
|
57
|
+
- lib/compare_compressors/compressors/brotli_compressor.rb
|
58
|
+
- lib/compare_compressors/compressors/bzip2_compressor.rb
|
59
|
+
- lib/compare_compressors/compressors/gzip_compressor.rb
|
60
|
+
- lib/compare_compressors/compressors/seven_zip_compressor.rb
|
61
|
+
- lib/compare_compressors/compressors/xz_compressor.rb
|
62
|
+
- lib/compare_compressors/compressors/zstd_compressor.rb
|
63
|
+
- lib/compare_compressors/cost_model.rb
|
64
|
+
- lib/compare_compressors/costed_group_result.rb
|
65
|
+
- lib/compare_compressors/group_result.rb
|
66
|
+
- lib/compare_compressors/plotter.rb
|
67
|
+
- lib/compare_compressors/plotters/cost_plotter.rb
|
68
|
+
- lib/compare_compressors/plotters/raw_plotter.rb
|
69
|
+
- lib/compare_compressors/plotters/size_plotter.rb
|
70
|
+
- lib/compare_compressors/result.rb
|
71
|
+
- lib/compare_compressors/version.rb
|
72
|
+
- test/compare_compressors/compare_compressors_test.rb
|
73
|
+
homepage: https://github.com/jdleesmiller/compare_compressors
|
74
|
+
licenses: []
|
75
|
+
metadata: {}
|
76
|
+
post_install_message:
|
77
|
+
rdoc_options:
|
78
|
+
- "--main"
|
79
|
+
- README.md
|
80
|
+
- "--title"
|
81
|
+
- compare_compressors-0.0.1 Documentation
|
82
|
+
require_paths:
|
83
|
+
- lib
|
84
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
86
|
+
- - ">="
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: '0'
|
89
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
90
|
+
requirements:
|
91
|
+
- - ">="
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
94
|
+
requirements: []
|
95
|
+
rubyforge_project:
|
96
|
+
rubygems_version: 2.6.8
|
97
|
+
signing_key:
|
98
|
+
specification_version: 4
|
99
|
+
summary: Compare compression algorithms (gzip, bzip2, xz, etc.) for a sample of documents.
|
100
|
+
test_files:
|
101
|
+
- test/compare_compressors/compare_compressors_test.rb
|