compare_compressors 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +132 -0
- data/bin/compare_compressors +6 -0
- data/lib/compare_compressors.rb +40 -0
- data/lib/compare_compressors/command_line_interface.rb +223 -0
- data/lib/compare_compressors/comparer.rb +70 -0
- data/lib/compare_compressors/compressor.rb +150 -0
- data/lib/compare_compressors/compressors/brotli_compressor.rb +43 -0
- data/lib/compare_compressors/compressors/bzip2_compressor.rb +37 -0
- data/lib/compare_compressors/compressors/gzip_compressor.rb +34 -0
- data/lib/compare_compressors/compressors/seven_zip_compressor.rb +43 -0
- data/lib/compare_compressors/compressors/xz_compressor.rb +37 -0
- data/lib/compare_compressors/compressors/zstd_compressor.rb +37 -0
- data/lib/compare_compressors/cost_model.rb +55 -0
- data/lib/compare_compressors/costed_group_result.rb +87 -0
- data/lib/compare_compressors/group_result.rb +62 -0
- data/lib/compare_compressors/plotter.rb +164 -0
- data/lib/compare_compressors/plotters/cost_plotter.rb +90 -0
- data/lib/compare_compressors/plotters/raw_plotter.rb +61 -0
- data/lib/compare_compressors/plotters/size_plotter.rb +76 -0
- data/lib/compare_compressors/result.rb +81 -0
- data/lib/compare_compressors/version.rb +8 -0
- data/test/compare_compressors/compare_compressors_test.rb +271 -0
- metadata +101 -0
@@ -0,0 +1,150 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'fileutils'
|
4
|
+
require 'tmpdir'
|
5
|
+
|
6
|
+
module CompareCompressors
  #
  # Base class for compressors. Subclasses provide compressor-specific
  # configuration and logic.
  #
  class Compressor
    #
    # Run the compressor at the given level on the given target and measure
    # its running time and memory usage, then do the same for decompression.
    #
    # The work target is removed (if it is still there) after compression, and
    # the compressed output is removed (if it is still there) after
    # decompression.
    #
    # @param [String] target original pathname of the target (read only)
    # @param [String] work_target temporary path of the target (read/write)
    # @param [Numeric] level the compression level
    # @return [Result]
    # @raise [RuntimeError] if either command exits with a non-zero status
    #
    def evaluate(target, work_target, level)
      compression_times = time(compression_command(work_target, level))
      size = output_size(work_target)
      remove_if_exists(work_target)

      decompression_times = time(decompression_command(work_target))
      remove_if_exists(output_name(work_target))

      Result.new(
        target, name, level, *compression_times, size, *decompression_times
      )
    end

    #
    # @abstract
    # @return [String] name that can be a ruby symbol
    #
    def name
      raise NotImplementedError
    end

    #
    # @abstract
    # @return [String] extension added to the compressed file
    #
    def extension
      raise NotImplementedError
    end

    #
    # @abstract
    # @return [Array<Integer>] the levels supported by the compressor
    #
    def levels
      raise NotImplementedError
    end

    #
    # @abstract
    # @return [String?] version string (for information only)
    #
    def version
      nil
    end

    #
    # @return [String] display name (need not be safe to intern as a symbol)
    #
    def display_name
      name
    end

    #
    # The parameters are optional here only so that this abstract stub raises
    # NotImplementedError however it is called; {#evaluate} always passes both.
    #
    # @abstract
    # @param [String] _target path of the file to compress
    # @param [Numeric] _level the compression level
    # @return [Array<String>] command to run the compressor
    #
    def compression_command(_target = nil, _level = nil)
      raise NotImplementedError
    end

    #
    # The parameter is optional here only so that this abstract stub raises
    # NotImplementedError however it is called; {#evaluate} always passes it.
    #
    # @abstract
    # @param [String] _target path of the original (uncompressed) file
    # @return [Array<String>] command to run the compressor in decompress mode
    #
    def decompression_command(_target = nil)
      raise NotImplementedError
    end

    private

    # Size in bytes of the compressed file produced for +target+.
    def output_size(target)
      File.stat(output_name(target)).size
    end

    # Pathname of the compressed file: the target plus the extension.
    def output_name(target)
      "#{target}.#{extension}"
    end

    # Run +command+ and return its measurements, or raise with the captured
    # stdout and stderr if it exits with a non-zero status.
    def time(command)
      status, times, out, err = run(*command)
      return times if status.zero?
      raise format(
        "%s: %s exited with %d:\n%s\n%s",
        name, command.join(' '), status, out, err
      )
    end

    # Run +command+ under `time`, with stdin from /dev/null and stdout/stderr
    # captured to temporary files.
    #
    # @return [Array] exit status, parsed times (see {#parse_time}), captured
    #   stdout and captured stderr
    def run(*command, **options)
      Dir.mktmpdir do |tmp|
        out_pathname = File.join(tmp, 'out')
        err_pathname = File.join(tmp, 'err')
        spawn_options = options.merge(
          out: out_pathname, err: err_pathname, in: '/dev/null'
        )

        # Note: this is not the shell builtin but rather /usr/bin/time; at least
        # on Ubuntu, the latter reports both time and max RSS (memory usage)
        # metrics, which is what we want here. Write the time output to a
        # temporary file to avoid conflicting with the child's stderr output.
        time_pathname = File.join(tmp, 'time')
        timed_command = [
          'time', '--format=%e %S %U %M', "--output=#{time_pathname}"
        ] + command

        # wait2 hands back the status directly, so this does not depend on
        # `require 'English'` having been loaded to define $CHILD_STATUS.
        _pid, child_status = Process.wait2(
          Process.spawn(*timed_command, **spawn_options)
        )

        [
          child_status.exitstatus,
          parse_time(time_pathname),
          File.read(out_pathname),
          File.read(err_pathname)
        ]
      end
    end

    # Returns elapsed time in seconds, total (system plus user) CPU time in
    # seconds, and maximum resident set size (memory usage) in Kilobytes, which
    # I think means KiB.
    #
    # Only the last line of the file is parsed: GNU time may write a
    # "Command exited with non-zero status N" line before the formatted line.
    def parse_time(time_pathname)
      formatted_line = File.read(time_pathname).lines.last.to_s
      elapsed, sys, user, max_rss = formatted_line.split
      [elapsed.to_f, sys.to_f + user.to_f, max_rss.to_i]
    end

    # Delete +pathname+, ignoring the case where it is already gone.
    def remove_if_exists(pathname)
      FileUtils.rm pathname
    rescue Errno::ENOENT
      nil # not a problem
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CompareCompressors
  #
  # Brotli compressor, driven through the `brotli` command line tool.
  #
  # Note: the tool does not appear to offer a way to print its version, so
  # `version` is not overridden here.
  #
  class BrotliCompressor < Compressor
    # @return [String] compressor name
    def name
      'brotli'
    end

    # @return [String] extension of the compressed file
    def extension
      'bro'
    end

    # No documentation was found for the quality range, so it is taken from
    # https://github.com/google/brotli/blob/cdca91b6f59dd7632985667d2cd585ab68937b48/enc/quality.h
    #
    # @return [Array<Integer>] supported quality levels
    def levels
      [*0..11]
    end

    # @param [String] target path of the file to compress
    # @param [Numeric] level brotli quality
    # @return [Array<String>] command that writes the compressed file
    def compression_command(target, level)
      compressed = output_name(target)
      quality = level.to_s
      ['brotli', '--input', target, '--output', compressed, '--quality', quality]
    end

    # @param [String] target path the decompressed file is written to
    # @return [Array<String>] command that decompresses the compressed file
    def decompression_command(target)
      compressed = output_name(target)
      ['brotli', '--decompress', '--input', compressed, '--output', target]
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CompareCompressors
  #
  # Compress with bzip2.
  #
  class Bzip2Compressor < Compressor
    # @return [String] compressor name, also used as the binary name
    def name
      'bzip2'
    end

    # @return [String] extension of the compressed file
    def extension
      'bz2'
    end

    # @return [Array<Integer>] supported compression levels
    def levels
      (1..9).to_a
    end

    # Read the version from the first line that `bzip2 --version` writes to
    # stderr.
    #
    # @return [String?] version, or nil if the command exits non-zero
    # @raise [RuntimeError] if the first line is missing or has an unexpected
    #   format
    def version
      status, _times, _out, err = run(name, '--version')
      return nil unless status.zero?

      # `first` is nil when nothing was written; coerce to an empty string so
      # that case reaches the descriptive error below instead of NoMethodError.
      version_line = err.lines.first.to_s.chomp
      match = /Version (.+)\.\z/.match(version_line)
      raise "bad #{name} version line: #{version_line.inspect}" unless match
      match[1]
    end

    # @param [String] target path of the file to compress
    # @param [Numeric] level the compression level
    # @return [Array<String>] command to run the compressor
    def compression_command(target, level)
      ['bzip2', "-#{level}", target]
    end

    # @param [String] target path of the original (uncompressed) file
    # @return [Array<String>] command to decompress the compressed file
    def decompression_command(target)
      ['bunzip2', output_name(target)]
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CompareCompressors
  #
  # Compress with gzip.
  #
  class GzipCompressor < Compressor
    # @return [String] compressor name, also used as the binary name
    def name
      'gzip'
    end

    # @return [String] extension of the compressed file
    def extension
      'gz'
    end

    # @return [Array<Integer>] supported compression levels
    def levels
      (1..9).to_a
    end

    # Report the first line that `gzip --version` writes to stdout, verbatim.
    #
    # @return [String?] version line, or nil if the command exits non-zero or
    #   prints nothing
    def version
      status, _times, out, _err = run(name, '--version')
      return nil unless status.zero?

      # `first` is nil when stdout is empty; the version is informational
      # only, so report "unknown" (nil) rather than raising NoMethodError.
      out.lines.first&.chomp
    end

    # @param [String] target path of the file to compress
    # @param [Numeric] level the compression level
    # @return [Array<String>] command to run the compressor
    def compression_command(target, level)
      ['gzip', "-#{level}", target]
    end

    # @param [String] target path of the original (uncompressed) file
    # @return [Array<String>] command to decompress the compressed file
    def decompression_command(target)
      ['gunzip', output_name(target)]
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CompareCompressors
  #
  # Compress with `7z`.
  #
  class SevenZipCompressor < Compressor
    # @return [String] symbol-safe compressor name
    def name
      'seven_zip'
    end

    # @return [String] name shown to the user
    def display_name
      '7z'
    end

    # @return [String] extension of the compressed file
    def extension
      '7z'
    end

    # Based on share/doc/p7zip/DOC/MANUAL/cmdline/switches/method.htm
    # Level 0 is no compression, so we exclude it.
    #
    # @return [Array<Integer>] supported compression levels
    def levels
      [1, 3, 5, 7, 9]
    end

    # Read the version from the first non-blank line of `7zr --help`.
    #
    # @return [String?] version, or nil if the command exits non-zero
    # @raise [RuntimeError] if the first line is missing or has an unexpected
    #   format
    def version
      status, _times, out, _err = run('7zr', '--help')
      return nil unless status.zero?

      # `first` is nil when nothing was printed; coerce to an empty string so
      # that case reaches the descriptive error below instead of NoMethodError.
      version_line = out.strip.lines.first.to_s.chomp
      match = /([0-9.]+)[\s:]+Copyright/.match(version_line)
      raise "bad version line #{version_line}" unless match
      match[1]
    end

    # @param [String] target path of the file to compress
    # @param [Numeric] level the compression level
    # @return [Array<String>] command to add the target to a new archive
    def compression_command(target, level)
      ['7zr', 'a', "-mx=#{level}", output_name(target), target]
    end

    # @param [String] target path of the original (uncompressed) file
    # @return [Array<String>] command to extract the archive into the
    #   target's directory
    def decompression_command(target)
      ['7zr', 'x', "-o#{File.dirname(target)}", output_name(target)]
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CompareCompressors
  #
  # Compress with `xz` (LZMA).
  #
  class XzCompressor < Compressor
    # @return [String] compressor name, also used as the binary name
    def name
      'xz'
    end

    # @return [String] extension of the compressed file
    def extension
      'xz'
    end

    # @return [Array<Integer>] supported compression levels
    def levels
      (0..9).to_a
    end

    # Read the version from the end of the first line of `xz --version`.
    #
    # @return [String?] version, or nil if the command exits non-zero
    # @raise [RuntimeError] if the first line is missing or has an unexpected
    #   format
    def version
      status, _times, out, _err = run(name, '--version')
      return nil unless status.zero?

      # `first` is nil when nothing was printed; coerce to an empty string so
      # that case reaches the descriptive error below instead of NoMethodError.
      version_line = out.lines.first.to_s.chomp
      match = /([0-9.a-z]+)$/.match(version_line)
      raise "bad version line #{version_line}" unless match
      match[1]
    end

    # @param [String] target path of the file to compress
    # @param [Numeric] level the compression level
    # @return [Array<String>] command to run the compressor
    def compression_command(target, level)
      ['xz', "-#{level}", target]
    end

    # @param [String] target path of the original (uncompressed) file
    # @return [Array<String>] command to decompress the compressed file
    def decompression_command(target)
      ['unxz', output_name(target)]
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CompareCompressors
  #
  # Compress with Zstandard.
  #
  class ZstdCompressor < Compressor
    # @return [String] compressor name, also used as the binary name
    def name
      'zstd'
    end

    # @return [String] extension of the compressed file
    def extension
      'zst'
    end

    # @return [Array<Integer>] supported compression levels
    def levels
      (1..19).to_a
    end

    # Read the version from the first line of `zstd -V`: the run of digits
    # and dots immediately before a comma.
    #
    # @return [String?] version, or nil if the command exits non-zero
    # @raise [RuntimeError] if the first line is missing or has an unexpected
    #   format
    def version
      status, _times, out, _err = run(name, '-V')
      return nil unless status.zero?

      # `first` is nil when nothing was printed; coerce to an empty string so
      # that case reaches the descriptive error below instead of NoMethodError.
      version_line = out.lines.first.to_s.chomp
      match = /([0-9.]+),/.match(version_line)
      raise "bad version line #{version_line}" unless match
      match[1]
    end

    # @param [String] target path of the file to compress
    # @param [Numeric] level the compression level
    # @return [Array<String>] command to run the compressor
    def compression_command(target, level)
      ['zstd', "-#{level}", target]
    end

    # @param [String] target path of the original (uncompressed) file
    # @return [Array<String>] command to decompress the compressed file
    def decompression_command(target)
      ['unzstd', output_name(target)]
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CompareCompressors
  #
  # Define costs for comparing grouped results.
  #
  class CostModel
    # Default to current Amazon S3 storage cost per GiB*month ($).
    DEFAULT_GIBYTE_COST = 0.023

    # Default to on-demand cost for an Amazon EC2 m3.medium ($).
    DEFAULT_HOUR_COST = 0.073

    # Default to elapsed time rather than CPU time.
    DEFAULT_USE_CPU_TIME = false

    # Default to dollars.
    DEFAULT_CURRENCY = '$'

    # Default number of lowest-cost results returned by {#summarize}.
    DEFAULT_SUMMARIZE_TOP = 5

    #
    # @param [Numeric] gibyte_cost cost of storing one GiB
    # @param [Numeric] compression_hour_cost cost of one hour of compression
    # @param [Numeric] decompression_hour_cost cost of one hour of
    #   decompression
    # @param [Boolean] use_cpu_time charge for CPU time instead of elapsed time
    # @param [String] currency symbol used when formatting costs
    #
    def initialize(
      gibyte_cost: DEFAULT_GIBYTE_COST,
      compression_hour_cost: DEFAULT_HOUR_COST,
      decompression_hour_cost: DEFAULT_HOUR_COST,
      use_cpu_time: DEFAULT_USE_CPU_TIME,
      currency: DEFAULT_CURRENCY
    )
      @gibyte_cost = gibyte_cost
      @compression_hour_cost = compression_hour_cost
      @decompression_hour_cost = decompression_hour_cost
      @use_cpu_time = use_cpu_time
      @currency = currency
    end

    attr_reader :gibyte_cost
    attr_reader :compression_hour_cost
    attr_reader :decompression_hour_cost
    attr_reader :use_cpu_time
    attr_reader :currency

    #
    # Apply this cost model to each grouped result.
    #
    # This goes through the CostedGroupResult factory: CostedGroupResult is a
    # Struct, so calling its plain `new` with (model, group_result) would only
    # fill the first two members with the wrong objects.
    #
    # @param [Array<GroupResult>] grouped_results
    # @return [Array<CostedGroupResult>]
    #
    def cost(grouped_results)
      CostedGroupResult.from_group_results(self, grouped_results)
    end

    #
    # Describe the cheapest results, lowest total cost first.
    #
    # @param [Array<CostedGroupResult>] costed_grouped_results
    # @param [Integer] top maximum number of results to describe
    # @return [Array<String>] one description per result, formatted with this
    #   model's currency
    #
    def summarize(costed_grouped_results, top = DEFAULT_SUMMARIZE_TOP)
      costed_grouped_results.sort_by(&:total_cost).take(top).map do |result|
        result.to_s(currency)
      end
    end
  end
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CompareCompressors
  #
  # Grouped result with costs calculated.
  #
  CostedGroupResult = Struct.new(
    :compressor_name,
    :compressor_level,
    :mean_compression_elapsed_hours,
    :mean_compression_cpu_hours,
    :max_compression_max_rss,
    :mean_compressed_gibytes,
    :mean_compression_delta_gibytes,
    :geomean_compression_ratio,
    :mean_decompression_elapsed_hours,
    :mean_decompression_cpu_hours,
    :max_decompression_max_rss,
    :compression_hour_cost,
    :decompression_hour_cost,
    :hour_cost,
    :gibyte_cost,
    :total_cost
  ) do
    #
    # Build a costed result from one grouped result.
    #
    # Compute cost is the model's hourly rates multiplied by either CPU hours
    # or elapsed hours (per `cost_model.use_cpu_time`); storage cost is the
    # model's per-GiB rate multiplied by the mean compressed size.
    #
    # @param [CostModel] cost_model
    # @param [GroupResult] group_result
    # @return [CostedGroupResult]
    #
    def self.new_from_group_result(cost_model, group_result)
      if cost_model.use_cpu_time
        compression_hours = group_result.mean_compression_cpu_hours
        decompression_hours = group_result.mean_decompression_cpu_hours
      else
        compression_hours = group_result.mean_compression_elapsed_hours
        decompression_hours = group_result.mean_decompression_elapsed_hours
      end
      compression_hour_cost =
        cost_model.compression_hour_cost * compression_hours
      decompression_hour_cost =
        cost_model.decompression_hour_cost * decompression_hours
      hour_cost = compression_hour_cost + decompression_hour_cost
      gibyte_cost =
        cost_model.gibyte_cost *
        group_result.mean_compressed_gibytes

      # Arguments are positional and must follow the member order declared
      # above: elapsed hours come before CPU hours.
      new(
        group_result.compressor_name,
        group_result.compressor_level,
        group_result.mean_compression_elapsed_hours,
        group_result.mean_compression_cpu_hours,
        group_result.max_compression_max_rss,
        group_result.mean_compressed_gibytes,
        group_result.mean_compression_delta_gibytes,
        group_result.geomean_compression_ratio,
        group_result.mean_decompression_elapsed_hours,
        group_result.mean_decompression_cpu_hours,
        group_result.max_decompression_max_rss,
        compression_hour_cost,
        decompression_hour_cost,
        hour_cost,
        gibyte_cost,
        hour_cost + gibyte_cost
      )
    end

    #
    # Build costed results for a list of grouped results.
    #
    # @param [CostModel] cost_model
    # @param [Array<GroupResult>] group_results
    # @return [Array<CostedGroupResult>]
    #
    def self.from_group_results(cost_model, group_results)
      group_results.map do |group_result|
        new_from_group_result(cost_model, group_result)
      end
    end

    #
    # Multi-line, human-readable summary of the measurements and costs.
    #
    # NOTE(review): the heredoc text is reproduced as it appears in the
    # reviewed copy; confirm leading and alignment whitespace against the
    # upstream file.
    #
    # @param [String] currency symbol prefixed to each cost
    # @return [String]
    #
    def to_s(currency = CostModel::DEFAULT_CURRENCY)
      gib_saved = mean_compression_delta_gibytes
      <<-STRING
#{compressor_name} level #{compressor_level}:
compression ratio : #{format('%.2f', geomean_compression_ratio)}
compression elapsed hours : #{format('%.4f', mean_compression_elapsed_hours)}
compression CPU hours : #{format('%.4f', mean_compression_cpu_hours)}
compression max RSS (KiB) : #{format('%d', max_compression_max_rss)}
compressed GiB : #{format('%.4f', mean_compressed_gibytes)}
GiB saved : #{format('%.2f', gib_saved)}
decompression elapsed hours : #{format('%.4f', mean_decompression_elapsed_hours)}
decompression CPU hours : #{format('%.4f', mean_decompression_cpu_hours)}
decompression max RSS (KiB) : #{format('%d', max_decompression_max_rss)}
------------------
storage cost : #{format('%s%0.02f', currency, gibyte_cost)}
compute cost : #{format('%s%0.02f', currency, hour_cost)}
total cost : #{format('%s%0.02f', currency, total_cost)}
      STRING
    end
  end
end
|