compare_compressors 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,81 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'benchmark'
4
+ require 'fileutils'
5
+ require 'tmpdir'
6
+
7
module CompareCompressors
  #
  # A single compressor-level result: the measurements recorded for running
  # one compressor at one level against one target file.
  #
  # Members, in positional order:
  # * target - path of the uncompressed input file
  # * compressor_name, compressor_level - identify the compressor run
  # * compression_elapsed_time, compression_cpu_time - Floats (presumably
  #   seconds; confirm against the code that produces the results)
  # * compression_max_rss - Integer (units not visible here)
  # * size - Integer, compressed size in bytes
  # * decompression_elapsed_time, decompression_cpu_time,
  #   decompression_max_rss - as above, for decompression
  #
  Result = Struct.new(
    :target,
    :compressor_name,
    :compressor_level,
    :compression_elapsed_time,
    :compression_cpu_time,
    :compression_max_rss,
    :size,
    :decompression_elapsed_time,
    :decompression_cpu_time,
    :decompression_max_rss
  ) do
    #
    # @return [Array] the (compressor name, compressor level) pair; results
    #   with equal keys come from the same compressor configuration
    #
    def group_key
      [compressor_name, compressor_level]
    end

    #
    # Size of the target file on disk. The file is stat'ed on first use and
    # the answer is memoized, so later changes to the file are not seen.
    #
    # @return [Integer] in bytes; cached
    #
    def uncompressed_size
      @uncompressed_size ||= File.stat(target).size
    end

    #
    # Uncompressed size divided by compressed size.
    #
    # @return [Float] positive; should be finite; larger is better
    #
    def compression_ratio
      uncompressed_size / size.to_f
    end

    #
    # Number of bytes saved by compressing the target.
    #
    # @return [Integer] should be positive; larger is better
    #
    def compression_delta
      uncompressed_size - size
    end

    #
    # Read results from CSV data that starts with a header row.
    #
    # NOTE(review): this file does not itself require 'csv'; the CSV library
    # is assumed to have been loaded already by the caller.
    #
    # @param io CSV source accepted by Kernel#CSV
    # @return [Array<Result>] one result per data row, in file order
    #
    def self.read_csv(io)
      results = []
      CSV(io, headers: true) do |csv|
        csv.each { |row| results << from_row(row) }
      end
      results
    end

    #
    # Build a result from a row keyed by member name. Values are coerced
    # with to_i / to_f, so missing or non-numeric values silently become 0.
    #
    # @param row [#[]] maps member names (Strings) to raw values
    # @return [Result]
    #
    def self.from_row(row)
      new(
        row['target'],
        row['compressor_name'],
        row['compressor_level'].to_i,
        row['compression_elapsed_time'].to_f,
        row['compression_cpu_time'].to_f,
        row['compression_max_rss'].to_i,
        row['size'].to_i,
        row['decompression_elapsed_time'].to_f,
        row['decompression_cpu_time'].to_f,
        row['decompression_max_rss'].to_i
      )
    end

    #
    # Arithmetic mean of one attribute over a collection of results.
    #
    # @param results [Array] non-empty collection
    # @param attribute [Symbol] name of the method to average
    # @return [Float]
    # @raise [ArgumentError] if results is empty
    #
    def self.mean(results, attribute)
      count = results.size
      raise ArgumentError, 'cannot take the mean of no results' if count.zero?
      results.map(&attribute).inject(:+) / count.to_f
    end

    #
    # Geometric mean of one attribute over a collection of results.
    #
    # The mean is computed as exp(mean(log(x))) rather than as the n-th root
    # of the product, because the running product of many values overflows
    # to Infinity (or underflows to zero) long before the mean itself does.
    #
    # @param results [Array] non-empty collection
    # @param attribute [Symbol] name of the method to average; its values
    #   must be non-negative
    # @return [Float]
    # @raise [ArgumentError] if results is empty
    #
    def self.geomean(results, attribute)
      count = results.size
      if count.zero?
        raise ArgumentError, 'cannot take the geometric mean of no results'
      end
      log_total = results.map(&attribute).inject(0.0) do |total, value|
        total + Math.log(value)
      end
      Math.exp(log_total / count)
    end
  end
end
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
module CompareCompressors
  # Components of the gem version, kept separate so each can be bumped on
  # its own.
  VERSION_MAJOR = 0
  VERSION_MINOR = 0
  VERSION_PATCH = 1

  # Dotted version string built from the components above. Array#join
  # returns a fresh, unfrozen String (the frozen_string_literal magic comment
  # only covers literals), so freeze it explicitly to keep the constant
  # immutable.
  VERSION = [VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH].join('.').freeze
end
@@ -0,0 +1,271 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'minitest/autorun'
4
+ require 'tmpdir'
5
+
6
+ require 'compare_compressors'
7
+
8
#
# Tests for the compressors, result grouping, cost model and plotters.
#
# Inherits from Minitest::Test: the legacy MiniTest constant is only defined
# by recent Minitest releases when MT_COMPAT is set.
#
class TestCompareCompressors < Minitest::Test
  include CompareCompressors

  # Run the full integration check against every registered compressor.
  def test_compressors
    COMPRESSORS.each do |compressor|
      check_compressor(compressor)
    end
  end

  # Every compressor reports a version containing a digit, except brotli,
  # which is expected to report no version at all.
  def test_compressor_versions
    COMPRESSORS.each do |compressor|
      if compressor.name == 'brotli'
        assert_nil compressor.version
      else
        assert_match(/[0-9]/, compressor.version)
      end
    end
  end

  # Group hand-written results for two 10,000 byte targets and check the
  # aggregate statistics and the costs derived from them under both the
  # CPU-time and the elapsed-time cost models.
  def test_grouper_groups_over_targets
    with_fixed_test_targets(2, 10_000) do |targets|
      results = make_test_results(targets)
      group_results = GroupResult.group(results, scale: 10_000)
      assert_equal 2, group_results.size
      assert_equal 'fooz', group_results[0].compressor_name
      assert_equal 1, group_results[0].compressor_level
      assert_in_delta \
        10_000 * (1.1 + 1.3) / 2 / 3600,
        group_results[0].mean_compression_elapsed_hours
      assert_in_delta \
        10_000 * (10.1 + 10.3) / 2 / 3600,
        group_results[0].mean_compression_cpu_hours
      assert_equal 1002, group_results[0].max_compression_max_rss
      assert_in_delta \
        10_000 * (5000 + 4000) / 2.0 / 1024**3,
        group_results[0].mean_compressed_gibytes
      assert_in_delta \
        10_000 * (5000 + 6000) / 2.0 / 1024**3,
        group_results[0].mean_compression_delta_gibytes
      assert_in_delta \
        Math.sqrt((10_000.0 / 5000) * (10_000.0 / 4000)),
        group_results[0].geomean_compression_ratio
      assert_in_delta \
        10_000 * (3.1 + 3.3) / 2 / 3600,
        group_results[0].mean_decompression_elapsed_hours
      assert_in_delta \
        10_000 * (2.1 + 2.3) / 2 / 3600,
        group_results[0].mean_decompression_cpu_hours
      assert_equal 2002, group_results[0].max_decompression_max_rss

      cpu_time_cost_model = CostModel.new(
        gibyte_cost: 0.023,
        compression_hour_cost: 0.05,
        decompression_hour_cost: 0.10,
        use_cpu_time: true
      )
      costed_group_results =
        CostedGroupResult.from_group_results(cpu_time_cost_model, group_results)

      assert_equal 2, costed_group_results.size
      assert_equal 'fooz', costed_group_results[0].compressor_name
      assert_equal 1, costed_group_results[0].compressor_level
      assert_in_delta \
        0.023 * group_results[0].mean_compressed_gibytes,
        costed_group_results[0].gibyte_cost
      assert_in_delta \
        0.05 * group_results[0].mean_compression_cpu_hours,
        costed_group_results[0].compression_hour_cost
      assert_in_delta \
        0.10 * group_results[0].mean_decompression_cpu_hours,
        costed_group_results[0].decompression_hour_cost
      assert_in_delta \
        costed_group_results[0].total_cost,
        costed_group_results[0].hour_cost +
        costed_group_results[0].gibyte_cost

      elapsed_time_cost_model = CostModel.new(
        gibyte_cost: 0.023,
        compression_hour_cost: 0.05,
        decompression_hour_cost: 0.10,
        use_cpu_time: false
      )
      costed_group_results = CostedGroupResult.from_group_results(
        elapsed_time_cost_model, group_results
      )
      assert_in_delta \
        0.05 * group_results[0].mean_compression_elapsed_hours,
        costed_group_results[0].compression_hour_cost
      assert_in_delta \
        0.10 * group_results[0].mean_decompression_elapsed_hours,
        costed_group_results[0].decompression_hour_cost
    end
  end

  # Write num_reps lines to pathname, each a random shuffle of up to 99 each
  # of the letters 'a', 'b' and 'c'.
  def write_random_test_data(pathname, num_reps)
    File.open(pathname, 'w') do |f|
      num_reps.times do
        string = 'a' * rand(100) + 'b' * rand(100) + 'c' * rand(100)
        f.puts(string.chars.shuffle.join)
      end
    end
  end

  # Yield the paths of num_targets random files in a temporary directory
  # that is removed when the block returns. The random number generator is
  # seeded with a fixed value so the generated data is the same on each run.
  def with_random_test_targets(num_targets, num_reps)
    srand 42
    Dir.mktmpdir do |tmp|
      targets = Array.new(num_targets) do |i|
        pathname = File.join(tmp, "test_#{i}")
        write_random_test_data pathname, num_reps
        pathname
      end
      yield targets
    end
  end

  # Yield the paths of num_targets files of exactly target_size bytes in a
  # temporary directory that is removed when the block returns.
  def with_fixed_test_targets(num_targets, target_size)
    Dir.mktmpdir do |tmp|
      targets = Array.new(num_targets) do |i|
        pathname = File.join(tmp, "test_#{i}")
        # Use write rather than puts: puts appends a newline, which would
        # make every file one byte larger than the size the assertions
        # assume.
        File.write(pathname, 'a' * target_size)
        pathname
      end
      yield targets
    end
  end

  # Test results for grouping: two levels of a made-up compressor, 'fooz',
  # for each of the first two targets.
  def make_test_results(targets)
    [
      Result.new(
        targets[0], 'fooz', 1, 1.1, 10.1, 1000, 5000, 3.1, 2.1, 2000
      ),
      Result.new(
        targets[0], 'fooz', 2, 2.2, 20.2, 1001, 2500, 6.2, 4.2, 2001
      ),
      Result.new(
        targets[1], 'fooz', 1, 1.3, 10.3, 1002, 4000, 3.3, 2.3, 2002
      ),
      Result.new(
        targets[1], 'fooz', 2, 2.4, 20.4, 1003, 2000, 6.4, 4.4, 2003
      )
    ]
  end

  # An integration test for any compressor: run it over random targets,
  # read the results back from CSV, group them, and feed the groups through
  # the size, raw and cost plotters.
  def check_compressor(compressor)
    num_levels = compressor.levels.size

    with_random_test_targets(3, 100) do |targets|
      csv_string_io = StringIO.new
      CSV(csv_string_io) do |csv|
        Comparer.new.run(csv, [compressor], targets)
      end

      csv_string_io.rewind
      results = Result.read_csv(csv_string_io)
      targets.each do |target|
        target_results = results.select { |r| r.target == target }
        assert_equal num_levels, target_results.size
        # The first result for each target should be the lowest level.
        assert_equal \
          target_results.map(&:compressor_level).min,
          target_results.first.compressor_level
        refute target_results.first.compression_cpu_time.negative?
        assert target_results.first.size.positive?
        refute target_results.first.decompression_cpu_time.negative?
      end

      # Average out the targets.
      group_results = GroupResult.group(results, scale: 10)
      assert_equal num_levels, group_results.size

      # Run the 2D plotter.
      size_plotter = SizePlotter.new(
        terminal: Plotter::DEFAULT_TERMINAL,
        output: Plotter::DEFAULT_OUTPUT,
        logscale_size: true,
        autoscale_fix: true,
        show_labels: true,
        lmargin: 5,
        title: 'Test Plot',
        use_cpu_time: true
      )
      io = StringIO.new
      size_plotter.plot(group_results, pareto_only: false, io: io)
      script = io.string
      assert_match(/set terminal png/, script)
      assert_match(/#{compressor.name} << EOD/, script)

      # There's not much we can reliably say about the pareto results, because
      # they depend on time. We can make sure it runs, however.
      io = StringIO.new
      size_plotter.plot(group_results, pareto_only: true, io: io)
      assert_match(/#{compressor.name} << EOD/, io.string)
      assert size_plotter.group_results.size.positive?

      # Run the 3D plotter.
      raw_plotter = RawPlotter.new(
        terminal: Plotter::DEFAULT_TERMINAL,
        output: Plotter::DEFAULT_OUTPUT,
        logscale_size: true,
        autoscale_fix: true,
        show_labels: true,
        lmargin: 5,
        title: 'Test Plot',
        use_cpu_time: false
      )
      io = StringIO.new
      raw_plotter.plot(group_results, pareto_only: false, io: io)
      script = io.string
      assert_match(/set terminal png/, script)
      assert_match(/#{compressor.name} << EOD/, script)

      # Again, just make sure the Pareto only plot runs.
      io = StringIO.new
      raw_plotter.plot(group_results, pareto_only: true, io: io)
      assert_match(/#{compressor.name} << EOD/, io.string)
      assert raw_plotter.group_results.size.positive?

      # Make a cost plot.
      cost_model = CostModel.new(
        gibyte_cost: 0.023,
        compression_hour_cost: 0.05,
        decompression_hour_cost: 0.10,
        use_cpu_time: false
      )

      costed_group_results =
        CostedGroupResult.from_group_results(cost_model, group_results)

      # Summarise the results. Again there's not much we can reliably test here.
      summary_results = cost_model.summarize(costed_group_results)
      assert_equal 5, summary_results.size

      # Run the cost plotter.
      cost_plotter = CostPlotter.new(
        cost_model,
        terminal: Plotter::DEFAULT_TERMINAL,
        output: Plotter::DEFAULT_OUTPUT,
        logscale_size: true,
        autoscale_fix: true,
        show_cost_contours: true,
        show_labels: true,
        lmargin: 5,
        title: 'Test Plot',
        use_cpu_time: false
      )

      io = StringIO.new
      cost_plotter.plot(costed_group_results, pareto_only: false, io: io)
      script = io.string
      assert_match(/set terminal png/, script)
      assert_match(/#{compressor.name} << EOD/, script)
      assert_match(/set lmargin 5/, script)
      assert_match(/set logscale y/, script)
      assert_match(/set autoscale fix/, script)

      # Again, just make sure the Pareto only plot runs.
      io = StringIO.new
      cost_plotter.plot(costed_group_results, pareto_only: true, io: io)
      assert_match(/#{compressor.name} << EOD/, io.string)
      assert cost_plotter.group_results.size.positive?
    end
  end
end
metadata ADDED
@@ -0,0 +1,101 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: compare_compressors
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - John Lees-Miller
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-05-02 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: thor
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 0.19.4
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 0.19.4
27
+ - !ruby/object:Gem::Dependency
28
+ name: gemma
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 5.0.0
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 5.0.0
41
+ description: "\n Compare compression algorithms (gzip, bzip2, xz, etc.) for a sample
42
+ of\n documents.\n "
43
+ email:
44
+ - jdleesmiller@gmail.com
45
+ executables:
46
+ - compare_compressors
47
+ extensions: []
48
+ extra_rdoc_files:
49
+ - README.md
50
+ files:
51
+ - README.md
52
+ - bin/compare_compressors
53
+ - lib/compare_compressors.rb
54
+ - lib/compare_compressors/command_line_interface.rb
55
+ - lib/compare_compressors/comparer.rb
56
+ - lib/compare_compressors/compressor.rb
57
+ - lib/compare_compressors/compressors/brotli_compressor.rb
58
+ - lib/compare_compressors/compressors/bzip2_compressor.rb
59
+ - lib/compare_compressors/compressors/gzip_compressor.rb
60
+ - lib/compare_compressors/compressors/seven_zip_compressor.rb
61
+ - lib/compare_compressors/compressors/xz_compressor.rb
62
+ - lib/compare_compressors/compressors/zstd_compressor.rb
63
+ - lib/compare_compressors/cost_model.rb
64
+ - lib/compare_compressors/costed_group_result.rb
65
+ - lib/compare_compressors/group_result.rb
66
+ - lib/compare_compressors/plotter.rb
67
+ - lib/compare_compressors/plotters/cost_plotter.rb
68
+ - lib/compare_compressors/plotters/raw_plotter.rb
69
+ - lib/compare_compressors/plotters/size_plotter.rb
70
+ - lib/compare_compressors/result.rb
71
+ - lib/compare_compressors/version.rb
72
+ - test/compare_compressors/compare_compressors_test.rb
73
+ homepage: https://github.com/jdleesmiller/compare_compressors
74
+ licenses: []
75
+ metadata: {}
76
+ post_install_message:
77
+ rdoc_options:
78
+ - "--main"
79
+ - README.md
80
+ - "--title"
81
+ - compare_compressors-0.0.1 Documentation
82
+ require_paths:
83
+ - lib
84
+ required_ruby_version: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ required_rubygems_version: !ruby/object:Gem::Requirement
90
+ requirements:
91
+ - - ">="
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ requirements: []
95
+ rubyforge_project:
96
+ rubygems_version: 2.6.8
97
+ signing_key:
98
+ specification_version: 4
99
+ summary: Compare compression algorithms (gzip, bzip2, xz, etc.) for a sample of documents.
100
+ test_files:
101
+ - test/compare_compressors/compare_compressors_test.rb