compare_compressors 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +132 -0
- data/bin/compare_compressors +6 -0
- data/lib/compare_compressors.rb +40 -0
- data/lib/compare_compressors/command_line_interface.rb +223 -0
- data/lib/compare_compressors/comparer.rb +70 -0
- data/lib/compare_compressors/compressor.rb +150 -0
- data/lib/compare_compressors/compressors/brotli_compressor.rb +43 -0
- data/lib/compare_compressors/compressors/bzip2_compressor.rb +37 -0
- data/lib/compare_compressors/compressors/gzip_compressor.rb +34 -0
- data/lib/compare_compressors/compressors/seven_zip_compressor.rb +43 -0
- data/lib/compare_compressors/compressors/xz_compressor.rb +37 -0
- data/lib/compare_compressors/compressors/zstd_compressor.rb +37 -0
- data/lib/compare_compressors/cost_model.rb +55 -0
- data/lib/compare_compressors/costed_group_result.rb +87 -0
- data/lib/compare_compressors/group_result.rb +62 -0
- data/lib/compare_compressors/plotter.rb +164 -0
- data/lib/compare_compressors/plotters/cost_plotter.rb +90 -0
- data/lib/compare_compressors/plotters/raw_plotter.rb +61 -0
- data/lib/compare_compressors/plotters/size_plotter.rb +76 -0
- data/lib/compare_compressors/result.rb +81 -0
- data/lib/compare_compressors/version.rb +8 -0
- data/test/compare_compressors/compare_compressors_test.rb +271 -0
- metadata +101 -0
@@ -0,0 +1,150 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'fileutils'
|
4
|
+
require 'tmpdir'
|
5
|
+
|
6
|
+
module CompareCompressors
|
7
|
+
#
|
8
|
+
# Base class for compressors. Subclasses provide compressor-specific
|
9
|
+
# configuration and logic.
|
10
|
+
#
|
11
|
+
class Compressor
  #
  # Run the compressor at the given level on the given target and measure
  # its running time and memory usage, for both compression and
  # decompression.
  #
  # @param [String] target original pathname of the target (read only)
  # @param [String] work_target temporary path of the target (read/write)
  # @param [Numeric] level the compression level
  # @return [Result]
  # @raise [RuntimeError] if either command exits with a non-zero status
  #
  def evaluate(target, work_target, level)
    compression_times = time(compression_command(work_target, level))
    size = output_size(work_target)
    # Remove the uncompressed copy if it is still there, so that the
    # decompression step below has to recreate it.
    remove_if_exists(work_target)

    decompression_times = time(decompression_command(work_target))
    # Likewise remove the compressed file if the decompressor left it behind.
    remove_if_exists(output_name(work_target))

    Result.new(
      target, name, level, *compression_times, size, *decompression_times
    )
  end

  #
  # @abstract
  # @return [String] name that can be a ruby symbol
  #
  def name
    raise NotImplementedError
  end

  #
  # @abstract
  # @return [String] extension added to the compressed file
  #
  def extension
    raise NotImplementedError
  end

  #
  # @abstract
  # @return [Array<Integer>] the levels supported by the compressor
  #
  def levels
    raise NotImplementedError
  end

  #
  # Subclasses may override this; the base implementation reports no version.
  #
  # @return [String?] version string (for information only)
  #
  def version
    nil
  end

  #
  # @return [String] display name (need not be safe to intern as a symbol)
  #
  def display_name
    name
  end

  #
  # The parameters mirror how {#evaluate} calls this method, so that an
  # unimplemented subclass fails with NotImplementedError rather than an
  # ArgumentError about the argument count.
  #
  # @abstract
  # @param [String] _target path of the file to compress
  # @param [Numeric] _level the compression level
  # @return [Array<String>] command to run the compressor
  #
  def compression_command(_target, _level)
    raise NotImplementedError
  end

  #
  # @abstract
  # @param [String] _target path of the (uncompressed) file to restore
  # @return [Array<String>] command to run the compressor in decompress mode
  #
  def decompression_command(_target)
    raise NotImplementedError
  end

  private

  # Size in bytes of the compressed file produced for +target+.
  def output_size(target)
    File.stat(output_name(target)).size
  end

  # Path of the compressed file: the target path plus the compressor's
  # extension.
  def output_name(target)
    "#{target}.#{extension}"
  end

  #
  # Run +command+ and return its timing measurements.
  #
  # @param [Array<String>] command the command and its arguments
  # @return [Array] elapsed seconds, CPU seconds and max RSS (see #parse_time)
  # @raise [RuntimeError] if the command exits with a non-zero status; the
  #   message includes the command's captured stdout and stderr
  #
  def time(command)
    status, times, out, err = run(*command)
    return times if status.zero?
    raise format(
      "%s: %s exited with %d:\n%s\n%s",
      name, command.join(' '), status, out, err
    )
  end

  #
  # Run +command+ under `time`, with stdin from /dev/null and stdout/stderr
  # captured in temporary files.
  #
  # @param [Array<String>] command the command and its arguments
  # @param [Hash] options extra options for Process.spawn; :in, :out and
  #   :err are overwritten
  # @return [Array] exit status (Integer), parsed times, stdout and stderr
  #
  def run(*command, **options)
    Dir.mktmpdir do |tmp|
      out_pathname = File.join(tmp, 'out')
      err_pathname = File.join(tmp, 'err')
      options[:out] = out_pathname
      options[:err] = err_pathname
      options[:in] = '/dev/null'

      # Note: this is not the shell builtin but rather /usr/bin/time; at least
      # on Ubuntu, the latter reports both time and max RSS (memory usage)
      # metrics, which is what we want here. Write the time output to a
      # temporary file to avoid conflicting with the child's stderr output.
      time_pathname = File.join(tmp, 'time')
      timed_command = [
        'time', '--format=%e %S %U %M', "--output=#{time_pathname}"
      ] + command

      # Take the status straight from wait2 instead of $CHILD_STATUS, which
      # is only defined once the English library has been required.
      _pid, status = Process.wait2(Process.spawn(*timed_command, **options))

      # exitstatus is nil when the process was killed by a signal; report
      # that as 128 + signal so callers always get an Integer.
      exit_code = status.exitstatus || (128 + status.termsig.to_i)

      [
        exit_code,
        parse_time(time_pathname),
        File.read(out_pathname),
        File.read(err_pathname)
      ]
    end
  end

  # Returns elapsed time in seconds, total (system plus user) CPU time in
  # seconds, and maximum resident set size (memory usage) in Kilobytes, which
  # I think means KiB.
  #
  # Only the last non-blank line is parsed: when the child fails, GNU time
  # writes a "Command exited with non-zero status N" line before the
  # formatted statistics. Missing fields are read as zero.
  def parse_time(time_pathname)
    stats_line =
      File.readlines(time_pathname).map(&:strip).reject(&:empty?).last.to_s
    elapsed, sys, user, max_rss = stats_line.split
    [elapsed.to_f, sys.to_f + user.to_f, max_rss.to_i]
  end

  # Delete +pathname+, ignoring the case where it does not exist.
  def remove_if_exists(pathname)
    FileUtils.rm pathname
  rescue Errno::ENOENT
    nil # not a problem
  end
end
|
150
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CompareCompressors
|
4
|
+
#
|
5
|
+
# Compress with Brotli.
|
6
|
+
#
|
7
|
+
# Note: At present, the command does not seem to have anything that prints a
|
8
|
+
# version, so we can't implement `version`.
|
9
|
+
#
|
10
|
+
class BrotliCompressor < Compressor
  # @return [String] symbol-safe identifier for this compressor
  def name
    'brotli'
  end

  # @return [String] suffix appended to the target path for the output file
  def extension
    'bro'
  end

  # Can't find any documentation about this, so this is based on
  # https://github.com/google/brotli/blob/cdca91b6f59dd7632985667d2cd585ab68937b48/enc/quality.h
  #
  # @return [Array<Integer>] quality settings 0 through 11
  def levels
    Array(0..11)
  end

  # @param [String] target path of the file to compress
  # @param [Numeric] level value passed as --quality
  # @return [Array<String>] argv for compressing target into its output file
  def compression_command(target, level)
    compressed = output_name(target)
    quality = level.to_s
    ['brotli', '--input', target, '--output', compressed, '--quality', quality]
  end

  # @param [String] target path the decompressed data is written to
  # @return [Array<String>] argv for decompressing the output file into target
  def decompression_command(target)
    compressed = output_name(target)
    ['brotli', '--decompress', '--input', compressed, '--output', target]
  end
end
|
43
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CompareCompressors
|
4
|
+
#
|
5
|
+
# Compress with bzip2.
|
6
|
+
#
|
7
|
+
class Bzip2Compressor < Compressor
  # @return [String] symbol-safe identifier, also used as the binary name
  #   for the version probe
  def name
    'bzip2'
  end

  # @return [String] suffix appended to the target path for the output file
  def extension
    'bz2'
  end

  # @return [Array<Integer>] levels 1 through 9
  def levels
    Array(1..9)
  end

  # Runs the tool with --version and extracts the version from the first line
  # of its stderr.
  #
  # @return [String?] the version, or nil if the command exits non-zero
  # @raise [RuntimeError] if the first stderr line is not in the expected form
  def version
    exit_code, _times, _out, err = run(name, '--version')
    return unless exit_code.zero?

    version_line = err.lines.first.chomp
    found = /Version (.+)\.\z/.match(version_line)
    raise "bad #{name} version line: #{version_line.inspect}" unless found

    found[1]
  end

  # @param [String] target path of the file to compress
  # @param [Numeric] level compression level, passed as -<level>
  # @return [Array<String>] argv for the compression run
  def compression_command(target, level)
    level_flag = "-#{level}"
    ['bzip2', level_flag, target]
  end

  # @param [String] target path of the original (uncompressed) file
  # @return [Array<String>] argv for the decompression run
  def decompression_command(target)
    compressed = output_name(target)
    ['bunzip2', compressed]
  end
end
|
37
|
+
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CompareCompressors
|
4
|
+
#
|
5
|
+
# Compress with gzip.
|
6
|
+
#
|
7
|
+
class GzipCompressor < Compressor
  # @return [String] symbol-safe identifier, also used as the binary name
  #   for the version probe
  def name
    'gzip'
  end

  # @return [String] suffix appended to the target path for the output file
  def extension
    'gz'
  end

  # @return [Array<Integer>] levels 1 through 9
  def levels
    Array(1..9)
  end

  # Runs the tool with --version and reports the first line of its stdout
  # verbatim.
  #
  # @return [String?] the first output line, or nil if the command exits
  #   non-zero
  def version
    exit_code, _times, out, _err = run(name, '--version')
    return unless exit_code.zero?

    first_line = out.lines.first
    first_line.chomp
  end

  # @param [String] target path of the file to compress
  # @param [Numeric] level compression level, passed as -<level>
  # @return [Array<String>] argv for the compression run
  def compression_command(target, level)
    level_flag = "-#{level}"
    ['gzip', level_flag, target]
  end

  # @param [String] target path of the original (uncompressed) file
  # @return [Array<String>] argv for the decompression run
  def decompression_command(target)
    compressed = output_name(target)
    ['gunzip', compressed]
  end
end
|
34
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CompareCompressors
|
4
|
+
#
|
5
|
+
# Compress with `7z`.
|
6
|
+
#
|
7
|
+
class SevenZipCompressor < Compressor
  # @return [String] symbol-safe identifier for this compressor
  def name
    'seven_zip'
  end

  # @return [String] human-readable name
  def display_name
    '7z'
  end

  # @return [String] suffix appended to the target path for the archive
  def extension
    '7z'
  end

  # Based on share/doc/p7zip/DOC/MANUAL/cmdline/switches/method.htm
  # Level 0 is no compression, so we exclude it.
  #
  # @return [Array<Integer>] the odd levels 1 through 9
  def levels
    1.step(9, 2).to_a
  end

  # Runs `7zr --help` and extracts the version from the first line of its
  # (stripped) stdout.
  #
  # @return [String?] the version, or nil if the command exits non-zero
  # @raise [RuntimeError] if the first line is not in the expected form
  def version
    exit_code, _times, out, _err = run('7zr', '--help')
    return unless exit_code.zero?

    version_line = out.strip.lines.first.chomp
    found = /([0-9.]+)[\s:]+Copyright/.match(version_line)
    raise "bad version line #{version_line}" unless found

    found[1]
  end

  # @param [String] target path of the file to add to the archive
  # @param [Numeric] level compression level, passed as -mx=<level>
  # @return [Array<String>] argv for the compression run
  def compression_command(target, level)
    level_switch = "-mx=#{level}"
    archive = output_name(target)
    ['7zr', 'a', level_switch, archive, target]
  end

  # @param [String] target path of the original file; the archive is
  #   extracted into this path's directory
  # @return [Array<String>] argv for the extraction run
  def decompression_command(target)
    output_dir_switch = "-o#{File.dirname(target)}"
    archive = output_name(target)
    ['7zr', 'x', output_dir_switch, archive]
  end
end
|
43
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CompareCompressors
|
4
|
+
#
|
5
|
+
# Compress with `xz` (LZMA).
|
6
|
+
#
|
7
|
+
class XzCompressor < Compressor
  # @return [String] symbol-safe identifier, also used as the binary name
  #   for the version probe
  def name
    'xz'
  end

  # @return [String] suffix appended to the target path for the output file
  def extension
    'xz'
  end

  # @return [Array<Integer>] levels 0 through 9
  def levels
    Array(0..9)
  end

  # Runs the tool with --version and extracts the trailing version token from
  # the first line of its stdout.
  #
  # @return [String?] the version, or nil if the command exits non-zero
  # @raise [RuntimeError] if the first line is not in the expected form
  def version
    exit_code, _times, out, _err = run(name, '--version')
    return unless exit_code.zero?

    version_line = out.lines.first.chomp
    found = /([0-9.a-z]+)$/.match(version_line)
    raise "bad version line #{version_line}" unless found

    found[1]
  end

  # @param [String] target path of the file to compress
  # @param [Numeric] level compression level, passed as -<level>
  # @return [Array<String>] argv for the compression run
  def compression_command(target, level)
    level_flag = "-#{level}"
    ['xz', level_flag, target]
  end

  # @param [String] target path of the original (uncompressed) file
  # @return [Array<String>] argv for the decompression run
  def decompression_command(target)
    compressed = output_name(target)
    ['unxz', compressed]
  end
end
|
37
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CompareCompressors
|
4
|
+
#
|
5
|
+
# Compress with Zstandard.
|
6
|
+
#
|
7
|
+
class ZstdCompressor < Compressor
  # @return [String] symbol-safe identifier, also used as the binary name
  #   for the version probe
  def name
    'zstd'
  end

  # @return [String] suffix appended to the target path for the output file
  def extension
    'zst'
  end

  # @return [Array<Integer>] levels 1 through 19
  def levels
    Array(1..19)
  end

  # Runs the tool with -V and extracts the version (the digits-and-dots token
  # followed by a comma) from the first line of its stdout.
  #
  # @return [String?] the version, or nil if the command exits non-zero
  # @raise [RuntimeError] if the first line is not in the expected form
  def version
    exit_code, _times, out, _err = run(name, '-V')
    return unless exit_code.zero?

    version_line = out.lines.first.chomp
    found = /([0-9.]+),/.match(version_line)
    raise "bad version line #{version_line}" unless found

    found[1]
  end

  # @param [String] target path of the file to compress
  # @param [Numeric] level compression level, passed as -<level>
  # @return [Array<String>] argv for the compression run
  def compression_command(target, level)
    level_flag = "-#{level}"
    ['zstd', level_flag, target]
  end

  # @param [String] target path of the original (uncompressed) file
  # @return [Array<String>] argv for the decompression run
  def decompression_command(target)
    compressed = output_name(target)
    ['unzstd', compressed]
  end
end
|
37
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CompareCompressors
|
4
|
+
#
|
5
|
+
# Define costs for comparing grouped results.
|
6
|
+
#
|
7
|
+
class CostModel
  # Default to current Amazon S3 storage cost per GiB*month ($).
  DEFAULT_GIBYTE_COST = 0.023

  # Default to on-demand cost for an Amazon EC2 m3.medium ($).
  DEFAULT_HOUR_COST = 0.073

  # Default to elapsed time rather than CPU time.
  DEFAULT_USE_CPU_TIME = false

  # Default to dollars.
  DEFAULT_CURRENCY = '$'

  # Default number of lowest-cost results returned by #summarize.
  DEFAULT_SUMMARIZE_TOP = 5

  #
  # @param [Numeric] gibyte_cost cost of storing one GiB
  # @param [Numeric] compression_hour_cost cost of one hour of compression
  # @param [Numeric] decompression_hour_cost cost of one hour of
  #   decompression
  # @param [Boolean] use_cpu_time cost CPU time instead of elapsed time
  # @param [String] currency symbol used when formatting costs
  #
  def initialize(
    gibyte_cost: DEFAULT_GIBYTE_COST,
    compression_hour_cost: DEFAULT_HOUR_COST,
    decompression_hour_cost: DEFAULT_HOUR_COST,
    use_cpu_time: DEFAULT_USE_CPU_TIME,
    currency: DEFAULT_CURRENCY
  )
    @gibyte_cost = gibyte_cost
    @compression_hour_cost = compression_hour_cost
    @decompression_hour_cost = decompression_hour_cost
    @use_cpu_time = use_cpu_time
    @currency = currency
  end

  attr_reader :gibyte_cost
  attr_reader :compression_hour_cost
  attr_reader :decompression_hour_cost
  attr_reader :use_cpu_time
  attr_reader :currency

  #
  # Apply this cost model to each grouped result.
  #
  # CostedGroupResult is a plain Struct, so calling its `new` with the model
  # and the group would only fill the first two members and leave every cost
  # nil; the costs are computed by its factory method instead.
  #
  # @param [Array] grouped_results the group results to cost
  # @return [Array<CostedGroupResult>] one costed result per input, in order
  #
  def cost(grouped_results)
    CostedGroupResult.from_group_results(self, grouped_results)
  end

  #
  # Describe the cheapest results.
  #
  # @param [Array<CostedGroupResult>] costed_grouped_results results from
  #   {#cost}
  # @param [Integer] top how many results to include
  # @return [Array<String>] descriptions of the +top+ results with the lowest
  #   total cost, cheapest first, formatted with this model's currency
  #
  def summarize(costed_grouped_results, top = DEFAULT_SUMMARIZE_TOP)
    cheapest = costed_grouped_results.sort_by(&:total_cost).take(top)
    cheapest.map { |result| result.to_s(currency) }
  end
end
|
55
|
+
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CompareCompressors
|
4
|
+
#
|
5
|
+
# Grouped result with costs calculated.
|
6
|
+
#
|
7
|
+
CostedGroupResult = Struct.new(
  :compressor_name,
  :compressor_level,
  :mean_compression_elapsed_hours,
  :mean_compression_cpu_hours,
  :max_compression_max_rss,
  :mean_compressed_gibytes,
  :mean_compression_delta_gibytes,
  :geomean_compression_ratio,
  :mean_decompression_elapsed_hours,
  :mean_decompression_cpu_hours,
  :max_decompression_max_rss,
  :compression_hour_cost,
  :decompression_hour_cost,
  :hour_cost,
  :gibyte_cost,
  :total_cost
) do
  #
  # Build a costed result from a group result by applying a cost model.
  #
  # Time is costed using CPU hours when the model's use_cpu_time is set and
  # elapsed hours otherwise; storage is costed on the mean compressed size.
  #
  # @param [CostModel] cost_model supplies the per-hour and per-GiB costs
  # @param [GroupResult] group_result the measurements to cost
  # @return [CostedGroupResult]
  #
  def self.new_from_group_result(cost_model, group_result)
    if cost_model.use_cpu_time
      compression_hours = group_result.mean_compression_cpu_hours
      decompression_hours = group_result.mean_decompression_cpu_hours
    else
      compression_hours = group_result.mean_compression_elapsed_hours
      decompression_hours = group_result.mean_decompression_elapsed_hours
    end
    compression_hour_cost =
      cost_model.compression_hour_cost * compression_hours
    decompression_hour_cost =
      cost_model.decompression_hour_cost * decompression_hours
    hour_cost = compression_hour_cost + decompression_hour_cost
    gibyte_cost =
      cost_model.gibyte_cost *
      group_result.mean_compressed_gibytes
    # Arguments are positional, so they must follow the member order declared
    # above: elapsed hours come before CPU hours.
    new(
      group_result.compressor_name,
      group_result.compressor_level,
      group_result.mean_compression_elapsed_hours,
      group_result.mean_compression_cpu_hours,
      group_result.max_compression_max_rss,
      group_result.mean_compressed_gibytes,
      group_result.mean_compression_delta_gibytes,
      group_result.geomean_compression_ratio,
      group_result.mean_decompression_elapsed_hours,
      group_result.mean_decompression_cpu_hours,
      group_result.max_decompression_max_rss,
      compression_hour_cost,
      decompression_hour_cost,
      hour_cost,
      gibyte_cost,
      hour_cost + gibyte_cost
    )
  end

  #
  # Cost every group result in a collection.
  #
  # @param [CostModel] cost_model supplies the per-hour and per-GiB costs
  # @param [Array<GroupResult>] group_results the measurements to cost
  # @return [Array<CostedGroupResult>] one costed result per input, in order
  #
  def self.from_group_results(cost_model, group_results)
    group_results.map do |group_result|
      new_from_group_result(cost_model, group_result)
    end
  end

  #
  # Multi-line, human-readable description of the measurements and costs.
  #
  # @param [String] currency symbol prefixed to each cost
  # @return [String]
  #
  def to_s(currency = CostModel::DEFAULT_CURRENCY)
    gib_saved = mean_compression_delta_gibytes
    <<~STRING
      #{compressor_name} level #{compressor_level}:
      compression ratio : #{format('%.2f', geomean_compression_ratio)}
      compression elapsed hours : #{format('%.4f', mean_compression_elapsed_hours)}
      compression CPU hours : #{format('%.4f', mean_compression_cpu_hours)}
      compression max RSS (KiB) : #{format('%d', max_compression_max_rss)}
      compressed GiB : #{format('%.4f', mean_compressed_gibytes)}
      GiB saved : #{format('%.2f', gib_saved)}
      decompression elapsed hours : #{format('%.4f', mean_decompression_elapsed_hours)}
      decompression CPU hours : #{format('%.4f', mean_decompression_cpu_hours)}
      decompression max RSS (KiB) : #{format('%d', max_decompression_max_rss)}
      ------------------
      storage cost : #{format('%s%0.02f', currency, gibyte_cost)}
      compute cost : #{format('%s%0.02f', currency, hour_cost)}
      total cost : #{format('%s%0.02f', currency, total_cost)}
    STRING
  end
end
|
87
|
+
end
|