compare_compressors 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,81 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'benchmark'
4
+ require 'fileutils'
5
+ require 'tmpdir'
6
+
7
module CompareCompressors
  #
  # A single compressor-level result: the measurements recorded for one
  # target file processed by one compressor at one level.
  #
  Result = Struct.new(
    :target,
    :compressor_name,
    :compressor_level,
    :compression_elapsed_time,
    :compression_cpu_time,
    :compression_max_rss,
    :size,
    :decompression_elapsed_time,
    :decompression_cpu_time,
    :decompression_max_rss
  ) do
    #
    # @return [Array] the +[compressor_name, compressor_level]+ pair
    #
    def group_key
      [compressor_name, compressor_level]
    end

    #
    # Reads the size of the +target+ file from the file system on first use.
    #
    # @return [Integer] in bytes; cached
    #
    def uncompressed_size
      @uncompressed_size ||= File.stat(target).size
    end

    #
    # @return [Float] positive; should be finite; larger is better. Not finite
    #   when +size+ is zero, because the division is done in floating point.
    #
    def compression_ratio
      uncompressed_size / size.to_f
    end

    #
    # @return [Integer] should be positive; larger is better
    #
    def compression_delta
      uncompressed_size - size
    end

    #
    # Parse results from CSV data that has a header row.
    #
    # @param io the CSV data, as accepted by +CSV.new+
    # @return [Array<Result>] one result per data row, in file order
    #
    def self.read_csv(io)
      # CSV.new rather than CSV(): the latter caches parser instances by
      # object_id for the life of the process.
      CSV.new(io, headers: true).map { |row| from_row(row) }
    end

    #
    # Build a result from a row keyed by column name. Numeric columns are
    # converted leniently with +to_i+ / +to_f+.
    #
    # @param row [#[]] maps each member name (as a String) to its raw value
    # @return [Result]
    #
    def self.from_row(row)
      new(
        row['target'],
        row['compressor_name'],
        row['compressor_level'].to_i,
        row['compression_elapsed_time'].to_f,
        row['compression_cpu_time'].to_f,
        row['compression_max_rss'].to_i,
        row['size'].to_i,
        row['decompression_elapsed_time'].to_f,
        row['decompression_cpu_time'].to_f,
        row['decompression_max_rss'].to_i
      )
    end

    #
    # Arithmetic mean of an attribute over the given results.
    #
    # @param results [Array<Result>] must not be empty
    # @param attribute [Symbol, #to_proc] the reader to average
    # @return [Float]
    # @raise [ArgumentError] if +results+ is empty
    #
    def self.mean(results, attribute)
      values = checked_values(results, attribute)
      values.sum / values.size.to_f
    end

    #
    # Geometric mean of an attribute over the given results.
    #
    # @param results [Array<Result>] must not be empty
    # @param attribute [Symbol, #to_proc] the reader to average; its values
    #   must be non-negative
    # @return [Float]
    # @raise [ArgumentError] if +results+ is empty
    #
    def self.geomean(results, attribute)
      values = checked_values(results, attribute)
      # Average the logarithms instead of multiplying everything together:
      # the running product overflows to Infinity after a few hundred values.
      Math.exp(values.sum { |value| Math.log(value) } / values.size)
    end

    # Extracts the attribute from every result, rejecting an empty list so
    # that callers get a clear error instead of a failure on nil.
    def self.checked_values(results, attribute)
      raise ArgumentError, 'results must not be empty' if results.empty?

      results.map(&attribute)
    end
    private_class_method :checked_values
  end
end
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
module CompareCompressors
  # Individual components of the gem version.
  VERSION_MAJOR = 0
  VERSION_MINOR = 0
  VERSION_PATCH = 1

  # Dotted version string assembled from the components above. The result of
  # Array#join is not covered by the frozen_string_literal magic comment, so
  # it is frozen explicitly to keep the constant immutable.
  VERSION = [VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH].join('.').freeze
end
@@ -0,0 +1,271 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'minitest/autorun'
4
+ require 'tmpdir'
5
+
6
+ require 'compare_compressors'
7
+
8
#
# Tests for the library: one integration run per compressor in COMPRESSORS,
# plus a grouping / costing test over hand-written results.
#
# Inherits from Minitest::Test; the legacy MiniTest alias is only defined by
# recent Minitest releases when MT_COMPAT is set.
#
class TestCompareCompressors < Minitest::Test
  include CompareCompressors

  def test_compressors
    COMPRESSORS.each do |compressor|
      check_compressor(compressor)
    end
  end

  def test_compressor_versions
    COMPRESSORS.each do |compressor|
      if compressor.name == 'brotli'
        assert_nil compressor.version
      else
        assert_match(/[0-9]/, compressor.version)
      end
    end
  end

  def test_grouper_groups_over_targets
    with_fixed_test_targets(2, 10_000) do |targets|
      results = make_test_results(targets)
      group_results = GroupResult.group(results, scale: 10_000)
      assert_equal 2, group_results.size
      assert_equal 'fooz', group_results[0].compressor_name
      assert_equal 1, group_results[0].compressor_level
      assert_in_delta \
        10_000 * (1.1 + 1.3) / 2 / 3600,
        group_results[0].mean_compression_elapsed_hours
      assert_in_delta \
        10_000 * (10.1 + 10.3) / 2 / 3600,
        group_results[0].mean_compression_cpu_hours
      assert_equal 1002, group_results[0].max_compression_max_rss
      assert_in_delta \
        10_000 * (5000 + 4000) / 2.0 / 1024**3,
        group_results[0].mean_compressed_gibytes
      assert_in_delta \
        10_000 * (5000 + 6000) / 2.0 / 1024**3,
        group_results[0].mean_compression_delta_gibytes
      assert_in_delta \
        Math.sqrt((10_000.0 / 5000) * (10_000.0 / 4000)),
        group_results[0].geomean_compression_ratio
      assert_in_delta \
        10_000 * (3.1 + 3.3) / 2 / 3600,
        group_results[0].mean_decompression_elapsed_hours
      assert_in_delta \
        10_000 * (2.1 + 2.3) / 2 / 3600,
        group_results[0].mean_decompression_cpu_hours
      assert_equal 2002, group_results[0].max_decompression_max_rss

      cpu_time_cost_model = CostModel.new(
        gibyte_cost: 0.023,
        compression_hour_cost: 0.05,
        decompression_hour_cost: 0.10,
        use_cpu_time: true
      )
      costed_group_results =
        CostedGroupResult.from_group_results(cpu_time_cost_model, group_results)

      assert_equal 2, costed_group_results.size
      assert_equal 'fooz', costed_group_results[0].compressor_name
      assert_equal 1, costed_group_results[0].compressor_level
      assert_in_delta \
        0.023 * group_results[0].mean_compressed_gibytes,
        costed_group_results[0].gibyte_cost
      assert_in_delta \
        0.05 * group_results[0].mean_compression_cpu_hours,
        costed_group_results[0].compression_hour_cost
      assert_in_delta \
        0.10 * group_results[0].mean_decompression_cpu_hours,
        costed_group_results[0].decompression_hour_cost
      assert_in_delta \
        costed_group_results[0].total_cost,
        costed_group_results[0].hour_cost +
        costed_group_results[0].gibyte_cost

      elapsed_time_cost_model = CostModel.new(
        gibyte_cost: 0.023,
        compression_hour_cost: 0.05,
        decompression_hour_cost: 0.10,
        use_cpu_time: false
      )
      costed_group_results = CostedGroupResult.from_group_results(
        elapsed_time_cost_model, group_results
      )
      assert_in_delta \
        0.05 * group_results[0].mean_compression_elapsed_hours,
        costed_group_results[0].compression_hour_cost
      assert_in_delta \
        0.10 * group_results[0].mean_decompression_elapsed_hours,
        costed_group_results[0].decompression_hour_cost
    end
  end

  # Writes num_reps lines to pathname; each line is a shuffled mix of up to
  # 99 each of the characters 'a', 'b' and 'c'.
  def write_random_test_data(pathname, num_reps)
    File.open(pathname, 'w') do |f|
      num_reps.times do
        string = 'a' * rand(100) + 'b' * rand(100) + 'c' * rand(100)
        f.puts(string.chars.shuffle.join)
      end
    end
  end

  # Yields the pathnames of num_targets random files created in a temporary
  # directory. The global random seed is fixed first so that the generated
  # data is the same on every run.
  def with_random_test_targets(num_targets, num_reps)
    srand 42
    Dir.mktmpdir do |tmp|
      targets = Array.new(num_targets) do |i|
        pathname = File.join(tmp, "test_#{i}")
        write_random_test_data pathname, num_reps
        pathname
      end
      yield targets
    end
  end

  # Yields the pathnames of num_targets files created in a temporary
  # directory, each exactly target_size bytes long.
  def with_fixed_test_targets(num_targets, target_size)
    Dir.mktmpdir do |tmp|
      targets = Array.new(num_targets) do |i|
        pathname = File.join(tmp, "test_#{i}")
        # File.write rather than puts: puts appends a newline, which would
        # make the file one byte larger than target_size.
        File.write(pathname, 'a' * target_size)
        pathname
      end
      yield targets
    end
  end

  # Test results for grouping.
  def make_test_results(targets)
    [
      Result.new(
        targets[0], 'fooz', 1, 1.1, 10.1, 1000, 5000, 3.1, 2.1, 2000
      ),
      Result.new(
        targets[0], 'fooz', 2, 2.2, 20.2, 1001, 2500, 6.2, 4.2, 2001
      ),
      Result.new(
        targets[1], 'fooz', 1, 1.3, 10.3, 1002, 4000, 3.3, 2.3, 2002
      ),
      Result.new(
        targets[1], 'fooz', 2, 2.4, 20.4, 1003, 2000, 6.4, 4.4, 2003
      )
    ]
  end

  # An integration test for any compressor.
  def check_compressor(compressor)
    num_levels = compressor.levels.size

    with_random_test_targets(3, 100) do |targets|
      csv_string_io = StringIO.new
      CSV(csv_string_io) do |csv|
        Comparer.new.run(csv, [compressor], targets)
      end

      csv_string_io.rewind
      results = Result.read_csv(csv_string_io)
      targets.each do |target|
        target_results = results.select { |r| r.target == target }
        assert_equal num_levels, target_results.size
        assert_equal \
          target_results.map(&:compressor_level).min,
          target_results.first.compressor_level
        refute target_results.first.compression_cpu_time.negative?
        assert target_results.first.size.positive?
        refute target_results.first.decompression_cpu_time.negative?
      end

      # Average out the targets.
      group_results = GroupResult.group(results, scale: 10)
      assert_equal num_levels, group_results.size

      # Run the 2D plotter.
      size_plotter = SizePlotter.new(
        terminal: Plotter::DEFAULT_TERMINAL,
        output: Plotter::DEFAULT_OUTPUT,
        logscale_size: true,
        autoscale_fix: true,
        show_labels: true,
        lmargin: 5,
        title: 'Test Plot',
        use_cpu_time: true
      )
      io = StringIO.new
      size_plotter.plot(group_results, pareto_only: false, io: io)
      script = io.string
      assert_match(/set terminal png/, script)
      assert_match(/#{compressor.name} << EOD/, script)

      # There's not much we can reliably say about the pareto results, because
      # they depend on time. We can make sure it runs, however.
      io = StringIO.new
      size_plotter.plot(group_results, pareto_only: true, io: io)
      assert_match(/#{compressor.name} << EOD/, io.string)
      assert size_plotter.group_results.size.positive?

      # Run the 3D plotter.
      raw_plotter = RawPlotter.new(
        terminal: Plotter::DEFAULT_TERMINAL,
        output: Plotter::DEFAULT_OUTPUT,
        logscale_size: true,
        autoscale_fix: true,
        show_labels: true,
        lmargin: 5,
        title: 'Test Plot',
        use_cpu_time: false
      )
      io = StringIO.new
      raw_plotter.plot(group_results, pareto_only: false, io: io)
      script = io.string
      assert_match(/set terminal png/, script)
      assert_match(/#{compressor.name} << EOD/, script)

      # Again, just make sure the Pareto only plot runs.
      io = StringIO.new
      raw_plotter.plot(group_results, pareto_only: true, io: io)
      assert_match(/#{compressor.name} << EOD/, io.string)
      assert raw_plotter.group_results.size.positive?

      # Make a cost plot.
      cost_model = CostModel.new(
        gibyte_cost: 0.023,
        compression_hour_cost: 0.05,
        decompression_hour_cost: 0.10,
        use_cpu_time: false
      )

      costed_group_results =
        CostedGroupResult.from_group_results(cost_model, group_results)

      # Summarise the results. Again there's not much we can reliably test here.
      summary_results = cost_model.summarize(costed_group_results)
      assert_equal 5, summary_results.size

      # Run the cost plotter.
      cost_plotter = CostPlotter.new(
        cost_model,
        terminal: Plotter::DEFAULT_TERMINAL,
        output: Plotter::DEFAULT_OUTPUT,
        logscale_size: true,
        autoscale_fix: true,
        show_cost_contours: true,
        show_labels: true,
        lmargin: 5,
        title: 'Test Plot',
        use_cpu_time: false
      )

      io = StringIO.new
      cost_plotter.plot(costed_group_results, pareto_only: false, io: io)
      script = io.string
      assert_match(/set terminal png/, script)
      assert_match(/#{compressor.name} << EOD/, script)
      assert_match(/set lmargin 5/, script)
      assert_match(/set logscale y/, script)
      assert_match(/set autoscale fix/, script)

      # Again, just make sure the Pareto only plot runs.
      io = StringIO.new
      cost_plotter.plot(costed_group_results, pareto_only: true, io: io)
      assert_match(/#{compressor.name} << EOD/, io.string)
      assert cost_plotter.group_results.size.positive?
    end
  end
end
metadata ADDED
@@ -0,0 +1,101 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: compare_compressors
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - John Lees-Miller
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-05-02 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: thor
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 0.19.4
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 0.19.4
27
+ - !ruby/object:Gem::Dependency
28
+ name: gemma
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 5.0.0
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 5.0.0
41
+ description: "\n Compare compression algorithms (gzip, bzip2, xz, etc.) for a sample
42
+ of\n documents.\n "
43
+ email:
44
+ - jdleesmiller@gmail.com
45
+ executables:
46
+ - compare_compressors
47
+ extensions: []
48
+ extra_rdoc_files:
49
+ - README.md
50
+ files:
51
+ - README.md
52
+ - bin/compare_compressors
53
+ - lib/compare_compressors.rb
54
+ - lib/compare_compressors/command_line_interface.rb
55
+ - lib/compare_compressors/comparer.rb
56
+ - lib/compare_compressors/compressor.rb
57
+ - lib/compare_compressors/compressors/brotli_compressor.rb
58
+ - lib/compare_compressors/compressors/bzip2_compressor.rb
59
+ - lib/compare_compressors/compressors/gzip_compressor.rb
60
+ - lib/compare_compressors/compressors/seven_zip_compressor.rb
61
+ - lib/compare_compressors/compressors/xz_compressor.rb
62
+ - lib/compare_compressors/compressors/zstd_compressor.rb
63
+ - lib/compare_compressors/cost_model.rb
64
+ - lib/compare_compressors/costed_group_result.rb
65
+ - lib/compare_compressors/group_result.rb
66
+ - lib/compare_compressors/plotter.rb
67
+ - lib/compare_compressors/plotters/cost_plotter.rb
68
+ - lib/compare_compressors/plotters/raw_plotter.rb
69
+ - lib/compare_compressors/plotters/size_plotter.rb
70
+ - lib/compare_compressors/result.rb
71
+ - lib/compare_compressors/version.rb
72
+ - test/compare_compressors/compare_compressors_test.rb
73
+ homepage: https://github.com/jdleesmiller/compare_compressors
74
+ licenses: []
75
+ metadata: {}
76
+ post_install_message:
77
+ rdoc_options:
78
+ - "--main"
79
+ - README.md
80
+ - "--title"
81
+ - compare_compressors-0.0.1 Documentation
82
+ require_paths:
83
+ - lib
84
+ required_ruby_version: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ required_rubygems_version: !ruby/object:Gem::Requirement
90
+ requirements:
91
+ - - ">="
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ requirements: []
95
+ rubyforge_project:
96
+ rubygems_version: 2.6.8
97
+ signing_key:
98
+ specification_version: 4
99
+ summary: Compare compression algorithms (gzip, bzip2, xz, etc.) for a sample of documents.
100
+ test_files:
101
+ - test/compare_compressors/compare_compressors_test.rb