compare_compressors 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +132 -0
- data/bin/compare_compressors +6 -0
- data/lib/compare_compressors.rb +40 -0
- data/lib/compare_compressors/command_line_interface.rb +223 -0
- data/lib/compare_compressors/comparer.rb +70 -0
- data/lib/compare_compressors/compressor.rb +150 -0
- data/lib/compare_compressors/compressors/brotli_compressor.rb +43 -0
- data/lib/compare_compressors/compressors/bzip2_compressor.rb +37 -0
- data/lib/compare_compressors/compressors/gzip_compressor.rb +34 -0
- data/lib/compare_compressors/compressors/seven_zip_compressor.rb +43 -0
- data/lib/compare_compressors/compressors/xz_compressor.rb +37 -0
- data/lib/compare_compressors/compressors/zstd_compressor.rb +37 -0
- data/lib/compare_compressors/cost_model.rb +55 -0
- data/lib/compare_compressors/costed_group_result.rb +87 -0
- data/lib/compare_compressors/group_result.rb +62 -0
- data/lib/compare_compressors/plotter.rb +164 -0
- data/lib/compare_compressors/plotters/cost_plotter.rb +90 -0
- data/lib/compare_compressors/plotters/raw_plotter.rb +61 -0
- data/lib/compare_compressors/plotters/size_plotter.rb +76 -0
- data/lib/compare_compressors/result.rb +81 -0
- data/lib/compare_compressors/version.rb +8 -0
- data/test/compare_compressors/compare_compressors_test.rb +271 -0
- metadata +101 -0
@@ -0,0 +1,81 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'benchmark'
|
4
|
+
require 'fileutils'
|
5
|
+
require 'tmpdir'
|
6
|
+
|
7
|
+
module CompareCompressors
|
8
|
+
#
|
9
|
+
# A single compressor-level result.
|
10
|
+
#
|
11
|
+
Result = Struct.new(
  :target,
  :compressor_name,
  :compressor_level,
  :compression_elapsed_time,
  :compression_cpu_time,
  :compression_max_rss,
  :size,
  :decompression_elapsed_time,
  :decompression_cpu_time,
  :decompression_max_rss
) do
  #
  # Identifies the compressor and level that produced this result,
  # independent of the target it was measured on.
  #
  # @return [Array] the pair +[compressor_name, compressor_level]+
  #
  def group_key
    [compressor_name, compressor_level]
  end

  #
  # Size of the +target+ file on disk. The file is stat'ed on the first call
  # and the value is remembered on this instance afterwards.
  #
  # @return [Integer] in bytes; cached
  #
  def uncompressed_size
    @uncompressed_size ||= File.stat(target).size
  end

  #
  # Uncompressed size divided by compressed size.
  #
  # @return [Float] positive; should be finite; larger is better
  #
  def compression_ratio
    uncompressed_size / size.to_f
  end

  #
  # Number of bytes saved by compressing the target.
  #
  # @return [Integer] should be positive; larger is better
  #
  def compression_delta
    uncompressed_size - size
  end

  #
  # Read results from CSV data whose first line is a header row.
  #
  # @param io the CSV source, handed to +CSV()+ as is
  # @return [Array<Result>] one result per data row, in file order
  #
  def self.read_csv(io)
    CSV(io, headers: true) do |csv|
      csv.map { |row| Result.from_row(row) }
    end
  end

  #
  # Build a result from one row keyed by column name.
  #
  # Numeric columns are converted with +to_i+ / +to_f+, which are lenient:
  # missing or non-numeric values become zero rather than raising.
  #
  # @param row [#[]] maps column names (strings) to values
  # @return [Result]
  #
  def self.from_row(row)
    Result.new(
      row['target'],
      row['compressor_name'],
      row['compressor_level'].to_i,
      row['compression_elapsed_time'].to_f,
      row['compression_cpu_time'].to_f,
      row['compression_max_rss'].to_i,
      row['size'].to_i,
      row['decompression_elapsed_time'].to_f,
      row['decompression_cpu_time'].to_f,
      row['decompression_max_rss'].to_i
    )
  end

  #
  # Arithmetic mean of one attribute over a list of results.
  #
  # @param results [Array<Result>] must not be empty
  # @param attribute [Symbol] name of a numeric attribute or method
  # @return [Float]
  # @raise [ArgumentError] if +results+ is empty
  #
  def self.mean(results, attribute)
    if results.size.zero?
      raise ArgumentError, 'cannot take the mean of an empty list of results'
    end

    results.map(&attribute).inject(:+) / results.size.to_f
  end

  #
  # Geometric mean of one attribute over a list of results.
  #
  # The mean is computed as the exponential of the average logarithm rather
  # than as the n-th root of the product: the running product of a few
  # hundred ratios easily exceeds the largest Float and becomes Infinity.
  #
  # @param results [Array<Result>] must not be empty
  # @param attribute [Symbol] name of a non-negative numeric attribute or
  #   method
  # @return [Float]
  # @raise [ArgumentError] if +results+ is empty
  # @raise [Math::DomainError] if any value is negative
  #
  def self.geomean(results, attribute)
    if results.size.zero?
      raise ArgumentError,
            'cannot take the geometric mean of an empty list of results'
    end

    log_total = results.map(&attribute).inject(0.0) do |total, value|
      total + Math.log(value)
    end
    Math.exp(log_total / results.size)
  end
end
|
81
|
+
end
|
@@ -0,0 +1,271 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'minitest/autorun'
|
4
|
+
require 'tmpdir'
|
5
|
+
|
6
|
+
require 'compare_compressors'
|
7
|
+
|
8
|
+
class TestCompareCompressors < MiniTest::Test
|
9
|
+
include CompareCompressors
|
10
|
+
|
11
|
+
# Runs the integration check once for every entry in COMPRESSORS.
def test_compressors
  COMPRESSORS.each { |candidate| check_compressor(candidate) }
end
|
16
|
+
|
17
|
+
# Every compressor must report a version containing at least one digit,
# except the one named 'brotli', which is expected to report no version.
def test_compressor_versions
  COMPRESSORS.each do |candidate|
    unless candidate.name == 'brotli'
      assert_match(/[0-9]/, candidate.version)
      next
    end

    assert_nil candidate.version
  end
end
|
26
|
+
|
27
|
+
# Groups the four fixture results from make_test_results (two targets, two
# levels of the 'fooz' compressor) and checks the aggregates of the first
# group (level 1), then checks that both cost models price that group from
# the matching time measure. Expected values are derived by hand from the
# fixture numbers and assume each target is 10_000 bytes.
def test_grouper_groups_over_targets
  scale = 10_000

  with_fixed_test_targets(2, 10_000) do |targets|
    group_results = GroupResult.group(make_test_results(targets), scale: scale)
    assert_equal 2, group_results.size

    group = group_results[0]
    assert_equal 'fooz', group.compressor_name
    assert_equal 1, group.compressor_level

    # Compression: times are averaged, memory takes the maximum.
    assert_in_delta(
      scale * (1.1 + 1.3) / 2 / 3600,
      group.mean_compression_elapsed_hours
    )
    assert_in_delta(
      scale * (10.1 + 10.3) / 2 / 3600,
      group.mean_compression_cpu_hours
    )
    assert_equal 1002, group.max_compression_max_rss

    # Sizes: compressed size, bytes saved and compression ratio.
    assert_in_delta(
      scale * (5000 + 4000) / 2.0 / 1024**3,
      group.mean_compressed_gibytes
    )
    assert_in_delta(
      scale * (5000 + 6000) / 2.0 / 1024**3,
      group.mean_compression_delta_gibytes
    )
    assert_in_delta(
      Math.sqrt((10_000.0 / 5000) * (10_000.0 / 4000)),
      group.geomean_compression_ratio
    )

    # Decompression: same shape as compression.
    assert_in_delta(
      scale * (3.1 + 3.3) / 2 / 3600,
      group.mean_decompression_elapsed_hours
    )
    assert_in_delta(
      scale * (2.1 + 2.3) / 2 / 3600,
      group.mean_decompression_cpu_hours
    )
    assert_equal 2002, group.max_decompression_max_rss

    # Costing with CPU time.
    cpu_time_cost_model = CostModel.new(
      gibyte_cost: 0.023,
      compression_hour_cost: 0.05,
      decompression_hour_cost: 0.10,
      use_cpu_time: true
    )
    costed_group_results =
      CostedGroupResult.from_group_results(cpu_time_cost_model, group_results)
    assert_equal 2, costed_group_results.size

    costed = costed_group_results[0]
    assert_equal 'fooz', costed.compressor_name
    assert_equal 1, costed.compressor_level
    assert_in_delta(
      0.023 * group_results[0].mean_compressed_gibytes,
      costed.gibyte_cost
    )
    assert_in_delta(
      0.05 * group_results[0].mean_compression_cpu_hours,
      costed.compression_hour_cost
    )
    assert_in_delta(
      0.10 * group_results[0].mean_decompression_cpu_hours,
      costed.decompression_hour_cost
    )
    assert_in_delta(
      costed.total_cost,
      costed.hour_cost + costed.gibyte_cost
    )

    # Costing with elapsed time instead of CPU time.
    elapsed_time_cost_model = CostModel.new(
      gibyte_cost: 0.023,
      compression_hour_cost: 0.05,
      decompression_hour_cost: 0.10,
      use_cpu_time: false
    )
    costed_group_results = CostedGroupResult.from_group_results(
      elapsed_time_cost_model, group_results
    )
    costed = costed_group_results[0]
    assert_in_delta(
      0.05 * group_results[0].mean_compression_elapsed_hours,
      costed.compression_hour_cost
    )
    assert_in_delta(
      0.10 * group_results[0].mean_decompression_elapsed_hours,
      costed.decompression_hour_cost
    )
  end
end
|
101
|
+
|
102
|
+
# Writes +num_reps+ lines to +pathname+. Each line is a shuffled mix of
# between 0 and 99 each of the letters 'a', 'b' and 'c', drawn from Ruby's
# global random number generator.
def write_random_test_data(pathname, num_reps)
  File.open(pathname, 'w') do |file|
    num_reps.times do
      # One run per letter, generated in a, b, c order.
      runs = %w[a b c].map { |letter| letter * rand(100) }
      file.puts(runs.join.chars.shuffle.join)
    end
  end
end
|
110
|
+
|
111
|
+
# Seeds the global random number generator with 42, creates +num_targets+
# files named test_0, test_1, ... in a fresh temporary directory, fills each
# using write_random_test_data with +num_reps+, and yields the list of
# pathnames. The directory only exists for the duration of the block.
def with_random_test_targets(num_targets, num_reps)
  srand 42
  Dir.mktmpdir do |directory|
    pathnames = (0...num_targets).map do |index|
      File.join(directory, "test_#{index}").tap do |pathname|
        write_random_test_data pathname, num_reps
      end
    end
    yield pathnames
  end
end
|
122
|
+
|
123
|
+
# Creates +num_targets+ files named test_0, test_1, ... in a fresh temporary
# directory, each holding exactly +target_size+ bytes (the letter 'a'
# repeated), and yields the list of pathnames. The directory only exists for
# the duration of the block.
#
# @param num_targets [Integer] number of files to create
# @param target_size [Integer] size of each file in bytes
# @yieldparam targets [Array<String>] pathnames of the created files
# @return whatever the block returns
def with_fixed_test_targets(num_targets, target_size)
  Dir.mktmpdir do |tmp|
    targets = Array.new(num_targets) do |i|
      pathname = File.join(tmp, "test_#{i}")
      # Write the bytes verbatim: puts would append a newline and make the
      # file one byte larger than the size the caller asked for.
      File.write(pathname, 'a' * target_size)
      pathname
    end
    yield targets
  end
end
|
133
|
+
|
134
|
+
# Test results for grouping.
|
135
|
+
def make_test_results(targets)
  # One row per result, in Result's member order: target, compressor name,
  # level, compression elapsed / cpu / max rss, compressed size,
  # decompression elapsed / cpu / max rss. Two targets times two levels of
  # a made-up compressor.
  rows = [
    [targets[0], 'fooz', 1, 1.1, 10.1, 1000, 5000, 3.1, 2.1, 2000],
    [targets[0], 'fooz', 2, 2.2, 20.2, 1001, 2500, 6.2, 4.2, 2001],
    [targets[1], 'fooz', 1, 1.3, 10.3, 1002, 4000, 3.3, 2.3, 2002],
    [targets[1], 'fooz', 2, 2.4, 20.4, 1003, 2000, 6.4, 4.4, 2003]
  ]
  rows.map { |fields| Result.new(*fields) }
end
|
151
|
+
|
152
|
+
# An integration test for any compressor.
|
153
|
+
def check_compressor(compressor)
  expected_levels = compressor.levels.size

  # Renders a plot into a string instead of a file or stdout.
  render = lambda do |plotter, plottable, pareto_only|
    buffer = StringIO.new
    plotter.plot(plottable, pareto_only: pareto_only, io: buffer)
    buffer.string
  end

  # Every generated script must contain a data block for this compressor.
  assert_data_block = lambda do |script|
    assert_match(/#{compressor.name} << EOD/, script)
  end

  with_random_test_targets(3, 100) do |targets|
    # Run the comparison for real and read the results back from CSV.
    csv_buffer = StringIO.new
    CSV(csv_buffer) do |csv|
      Comparer.new.run(csv, [compressor], targets)
    end
    csv_buffer.rewind
    results = Result.read_csv(csv_buffer)

    # Each target gets one result per level, lowest level first, with sane
    # timings and a non-empty compressed size.
    targets.each do |target|
      per_target = results.select { |result| result.target == target }
      assert_equal expected_levels, per_target.size

      lowest_level = per_target.map(&:compressor_level).min
      leading = per_target.first
      assert_equal lowest_level, leading.compressor_level
      refute leading.compression_cpu_time.negative?
      assert leading.size.positive?
      refute leading.decompression_cpu_time.negative?
    end

    # Average out the targets.
    group_results = GroupResult.group(results, scale: 10)
    assert_equal expected_levels, group_results.size

    # Options common to all three plotters below.
    shared_options = {
      terminal: Plotter::DEFAULT_TERMINAL,
      output: Plotter::DEFAULT_OUTPUT,
      logscale_size: true,
      autoscale_fix: true,
      show_labels: true,
      lmargin: 5,
      title: 'Test Plot'
    }

    # Run the 2D plotter.
    size_plotter = SizePlotter.new(**shared_options, use_cpu_time: true)
    script = render.call(size_plotter, group_results, false)
    assert_match(/set terminal png/, script)
    assert_data_block.call(script)

    # There's not much we can reliably say about the pareto results, because
    # they depend on time. We can make sure it runs, however.
    assert_data_block.call(render.call(size_plotter, group_results, true))
    assert size_plotter.group_results.size.positive?

    # Run the 3D plotter.
    raw_plotter = RawPlotter.new(**shared_options, use_cpu_time: false)
    script = render.call(raw_plotter, group_results, false)
    assert_match(/set terminal png/, script)
    assert_data_block.call(script)

    # Again, just make sure the Pareto only plot runs.
    assert_data_block.call(render.call(raw_plotter, group_results, true))
    assert raw_plotter.group_results.size.positive?

    # Make a cost plot.
    cost_model = CostModel.new(
      gibyte_cost: 0.023,
      compression_hour_cost: 0.05,
      decompression_hour_cost: 0.10,
      use_cpu_time: false
    )
    costed_group_results =
      CostedGroupResult.from_group_results(cost_model, group_results)

    # Summarise the results. Again there's not much we can reliably test here.
    summary_results = cost_model.summarize(costed_group_results)
    assert_equal 5, summary_results.size

    # Run the cost plotter.
    cost_plotter = CostPlotter.new(
      cost_model,
      **shared_options,
      show_cost_contours: true,
      use_cpu_time: false
    )
    script = render.call(cost_plotter, costed_group_results, false)
    assert_match(/set terminal png/, script)
    assert_data_block.call(script)
    assert_match(/set lmargin 5/, script)
    assert_match(/set logscale y/, script)
    assert_match(/set autoscale fix/, script)

    # Again, just make sure the Pareto only plot runs.
    assert_data_block.call(render.call(cost_plotter, costed_group_results, true))
    assert cost_plotter.group_results.size.positive?
  end
end
|
271
|
+
end
|
metadata
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: compare_compressors
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- John Lees-Miller
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-05-02 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: thor
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.19.4
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.19.4
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: gemma
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 5.0.0
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 5.0.0
|
41
|
+
description: "\n Compare compression algorithms (gzip, bzip2, xz, etc.) for a sample
|
42
|
+
of\n documents.\n "
|
43
|
+
email:
|
44
|
+
- jdleesmiller@gmail.com
|
45
|
+
executables:
|
46
|
+
- compare_compressors
|
47
|
+
extensions: []
|
48
|
+
extra_rdoc_files:
|
49
|
+
- README.md
|
50
|
+
files:
|
51
|
+
- README.md
|
52
|
+
- bin/compare_compressors
|
53
|
+
- lib/compare_compressors.rb
|
54
|
+
- lib/compare_compressors/command_line_interface.rb
|
55
|
+
- lib/compare_compressors/comparer.rb
|
56
|
+
- lib/compare_compressors/compressor.rb
|
57
|
+
- lib/compare_compressors/compressors/brotli_compressor.rb
|
58
|
+
- lib/compare_compressors/compressors/bzip2_compressor.rb
|
59
|
+
- lib/compare_compressors/compressors/gzip_compressor.rb
|
60
|
+
- lib/compare_compressors/compressors/seven_zip_compressor.rb
|
61
|
+
- lib/compare_compressors/compressors/xz_compressor.rb
|
62
|
+
- lib/compare_compressors/compressors/zstd_compressor.rb
|
63
|
+
- lib/compare_compressors/cost_model.rb
|
64
|
+
- lib/compare_compressors/costed_group_result.rb
|
65
|
+
- lib/compare_compressors/group_result.rb
|
66
|
+
- lib/compare_compressors/plotter.rb
|
67
|
+
- lib/compare_compressors/plotters/cost_plotter.rb
|
68
|
+
- lib/compare_compressors/plotters/raw_plotter.rb
|
69
|
+
- lib/compare_compressors/plotters/size_plotter.rb
|
70
|
+
- lib/compare_compressors/result.rb
|
71
|
+
- lib/compare_compressors/version.rb
|
72
|
+
- test/compare_compressors/compare_compressors_test.rb
|
73
|
+
homepage: https://github.com/jdleesmiller/compare_compressors
|
74
|
+
licenses: []
|
75
|
+
metadata: {}
|
76
|
+
post_install_message:
|
77
|
+
rdoc_options:
|
78
|
+
- "--main"
|
79
|
+
- README.md
|
80
|
+
- "--title"
|
81
|
+
- compare_compressors-0.0.1 Documentation
|
82
|
+
require_paths:
|
83
|
+
- lib
|
84
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
86
|
+
- - ">="
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: '0'
|
89
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
90
|
+
requirements:
|
91
|
+
- - ">="
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
94
|
+
requirements: []
|
95
|
+
rubyforge_project:
|
96
|
+
rubygems_version: 2.6.8
|
97
|
+
signing_key:
|
98
|
+
specification_version: 4
|
99
|
+
summary: Compare compression algorithms (gzip, bzip2, xz, etc.) for a sample of documents.
|
100
|
+
test_files:
|
101
|
+
- test/compare_compressors/compare_compressors_test.rb
|