compare_compressors 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,150 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'fileutils'
4
+ require 'tmpdir'
5
+
6
+ module CompareCompressors
7
+ #
8
+ # Base class for compressors. Subclasses provide compressor-specific
9
+ # configuration and logic.
10
+ #
11
+ class Compressor
12
+ #
13
+ # Run the compressor at the given level on the given target and measure
14
+ # its running time and memory usage.
15
+ #
16
+ # @param [String] target original pathname of the target (read only)
17
+ # @param [String] work_target temporary path of the target (read/write)
18
+ # @param [Numeric] level the compression level
19
+ # @return [Result]
20
+ #
21
+ def evaluate(target, work_target, level)
22
+ compression_times = time(compression_command(work_target, level))
23
+ size = output_size(work_target)
24
+ remove_if_exists(work_target)
25
+
26
+ decompression_times = time(decompression_command(work_target))
27
+ remove_if_exists(output_name(work_target))
28
+
29
+ Result.new(
30
+ target, name, level, *compression_times, size, *decompression_times
31
+ )
32
+ end
33
+
34
+ #
35
+ # @abstract
36
+ # @return [String] name that can be a ruby symbol
37
+ #
38
+ def name
39
+ raise NotImplementedError
40
+ end
41
+
42
+ #
43
+ # @abstract
44
+ # @return [String] extension added to the compressed file
45
+ #
46
+ def extension
47
+ raise NotImplementedError
48
+ end
49
+
50
+ #
51
+ # @abstract
52
+ # @return [Array<Integer>] the levels supported by the compressor
53
+ #
54
+ def levels
55
+ raise NotImplementedError
56
+ end
57
+
58
+ #
59
+ # @abstract
60
+ # @return [String?] version string (for information only)
61
+ #
62
+ def version
63
+ nil
64
+ end
65
+
66
+ #
67
+ # @return [String] display name (need not be safe to intern as a symbol)
68
+ #
69
+ def display_name
70
+ name
71
+ end
72
+
73
+ #
74
+ # @abstract
75
+ # @return [Array<String>] command to run the compressor
76
+ #
77
+ def compression_command
78
+ raise NotImplementedError
79
+ end
80
+
81
+ #
82
+ # @abstract
83
+ # @return [Array<String>] command to run the compressor in decompress mode
84
+ #
85
+ def decompression_command
86
+ raise NotImplementedError
87
+ end
88
+
89
+ private
90
+
91
+ def output_size(target)
92
+ File.stat(output_name(target)).size
93
+ end
94
+
95
+ def output_name(target)
96
+ "#{target}.#{extension}"
97
+ end
98
+
99
+ def time(command)
100
+ status, times, out, err = run(*command)
101
+ return times if status.zero?
102
+ raise format(
103
+ "%s: %s exited with %d:\n%s\n%s",
104
+ name, command.join(' '), status, out, err
105
+ )
106
+ end
107
+
108
+ def run(*command, **options)
109
+ Dir.mktmpdir do |tmp|
110
+ out_pathname = File.join(tmp, 'out')
111
+ err_pathname = File.join(tmp, 'err')
112
+ options[:out] = out_pathname
113
+ options[:err] = err_pathname
114
+ options[:in] = '/dev/null'
115
+
116
+ # Note: this is not the shell builtin but rather /usr/bin/time; at least
117
+ # on Ubuntu, the latter reports both time and max RSS (memory usage)
118
+ # metrics, which is what we want here. Write the time output to a
119
+ # temporary file to avoid conflicting with the child's stderr output.
120
+ time_pathname = File.join(tmp, 'time')
121
+ timed_command = [
122
+ 'time', '--format=%e %S %U %M', "--output=#{time_pathname}"
123
+ ] + command
124
+
125
+ Process.waitpid(Process.spawn(*timed_command, **options))
126
+
127
+ [
128
+ $CHILD_STATUS.exitstatus,
129
+ parse_time(time_pathname),
130
+ File.read(out_pathname),
131
+ File.read(err_pathname)
132
+ ]
133
+ end
134
+ end
135
+
136
+ # Returns elapsed time in seconds, total (system plus user) CPU time in
137
+ # seconds, and maximum resident set size (memory usage) in Kilobytes, which
138
+ # I think means KiB.
139
+ def parse_time(time_pathname)
140
+ elapsed, sys, user, max_rss = File.read(time_pathname).split
141
+ [elapsed.to_f, sys.to_f + user.to_f, max_rss.to_i]
142
+ end
143
+
144
+ def remove_if_exists(pathname)
145
+ FileUtils.rm pathname
146
+ rescue Errno::ENOENT
147
+ nil # not a problem
148
+ end
149
+ end
150
+ end
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CompareCompressors
4
+ #
5
+ # Compress with Brotli.
6
+ #
7
+ # Note: At present, the command does not seem to have anything that prints a
8
+ # version, so we can't implement `version`.
9
+ #
10
+ class BrotliCompressor < Compressor
11
+ def name
12
+ 'brotli'
13
+ end
14
+
15
+ def extension
16
+ 'bro'
17
+ end
18
+
19
+ # Can't find any documentation about this, so this is based on
20
+ # https://github.com/google/brotli/blob/cdca91b6f59dd7632985667d2cd585ab68937b48/enc/quality.h
21
+ def levels
22
+ (0..11).to_a
23
+ end
24
+
25
+ def compression_command(target, level)
26
+ [
27
+ 'brotli',
28
+ '--input', target,
29
+ '--output', output_name(target),
30
+ '--quality', level.to_s
31
+ ]
32
+ end
33
+
34
+ def decompression_command(target)
35
+ [
36
+ 'brotli',
37
+ '--decompress',
38
+ '--input', output_name(target),
39
+ '--output', target
40
+ ]
41
+ end
42
+ end
43
+ end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CompareCompressors
4
+ #
5
+ # Compress with bzip2.
6
+ #
7
+ class Bzip2Compressor < Compressor
8
+ def name
9
+ 'bzip2'
10
+ end
11
+
12
+ def extension
13
+ 'bz2'
14
+ end
15
+
16
+ def levels
17
+ (1..9).to_a
18
+ end
19
+
20
+ def version
21
+ status, _times, _out, err = run(name, '--version')
22
+ return nil unless status.zero?
23
+ version_line = err.lines.first.chomp
24
+ raise "bad #{name} version line: #{version_line.inspect}" unless
25
+ version_line =~ /Version (.+)\.\z/
26
+ Regexp.last_match(1)
27
+ end
28
+
29
+ def compression_command(target, level)
30
+ ['bzip2', "-#{level}", target]
31
+ end
32
+
33
+ def decompression_command(target)
34
+ ['bunzip2', output_name(target)]
35
+ end
36
+ end
37
+ end
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CompareCompressors
4
+ #
5
+ # Compress with gzip.
6
+ #
7
+ class GzipCompressor < Compressor
8
+ def name
9
+ 'gzip'
10
+ end
11
+
12
+ def extension
13
+ 'gz'
14
+ end
15
+
16
+ def levels
17
+ (1..9).to_a
18
+ end
19
+
20
+ def version
21
+ status, _times, out, _err = run(name, '--version')
22
+ return nil unless status.zero?
23
+ out.lines.first.chomp
24
+ end
25
+
26
+ def compression_command(target, level)
27
+ ['gzip', "-#{level}", target]
28
+ end
29
+
30
+ def decompression_command(target)
31
+ ['gunzip', output_name(target)]
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CompareCompressors
4
+ #
5
+ # Compress with `7z`.
6
+ #
7
+ class SevenZipCompressor < Compressor
8
+ def name
9
+ 'seven_zip'
10
+ end
11
+
12
+ def display_name
13
+ '7z'
14
+ end
15
+
16
+ def extension
17
+ '7z'
18
+ end
19
+
20
+ # Based on share/doc/p7zip/DOC/MANUAL/cmdline/switches/method.htm
21
+ # Level 0 is no compression, so we exclude it.
22
+ def levels
23
+ [1, 3, 5, 7, 9]
24
+ end
25
+
26
+ def version
27
+ status, _times, out, _err = run('7zr', '--help')
28
+ return nil unless status.zero?
29
+ version_line = out.strip.lines.first.chomp
30
+ raise "bad version line #{version_line}" unless
31
+ version_line =~ /([0-9.]+)[\s:]+Copyright/
32
+ Regexp.last_match(1)
33
+ end
34
+
35
+ def compression_command(target, level)
36
+ ['7zr', 'a', "-mx=#{level}", output_name(target), target]
37
+ end
38
+
39
+ def decompression_command(target)
40
+ ['7zr', 'x', "-o#{File.dirname(target)}", output_name(target)]
41
+ end
42
+ end
43
+ end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CompareCompressors
4
+ #
5
+ # Compress with `xz` (LZMA).
6
+ #
7
+ class XzCompressor < Compressor
8
+ def name
9
+ 'xz'
10
+ end
11
+
12
+ def extension
13
+ 'xz'
14
+ end
15
+
16
+ def levels
17
+ (0..9).to_a
18
+ end
19
+
20
+ def version
21
+ status, _times, out, _err = run(name, '--version')
22
+ return nil unless status.zero?
23
+ version_line = out.lines.first.chomp
24
+ raise "bad version line #{version_line}" unless
25
+ version_line =~ /([0-9.a-z]+)$/
26
+ Regexp.last_match(1)
27
+ end
28
+
29
+ def compression_command(target, level)
30
+ ['xz', "-#{level}", target]
31
+ end
32
+
33
+ def decompression_command(target)
34
+ ['unxz', output_name(target)]
35
+ end
36
+ end
37
+ end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CompareCompressors
4
+ #
5
+ # Compress with Zstandard.
6
+ #
7
+ class ZstdCompressor < Compressor
8
+ def name
9
+ 'zstd'
10
+ end
11
+
12
+ def extension
13
+ 'zst'
14
+ end
15
+
16
+ def levels
17
+ (1..19).to_a
18
+ end
19
+
20
+ def version
21
+ status, _times, out, _err = run(name, '-V')
22
+ return nil unless status.zero?
23
+ version_line = out.lines.first.chomp
24
+ raise "bad version line #{version_line}" unless
25
+ version_line =~ /([0-9.]+),/
26
+ Regexp.last_match(1)
27
+ end
28
+
29
+ def compression_command(target, level)
30
+ ['zstd', "-#{level}", target]
31
+ end
32
+
33
+ def decompression_command(target)
34
+ ['unzstd', output_name(target)]
35
+ end
36
+ end
37
+ end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CompareCompressors
4
+ #
5
+ # Define costs for comparing grouped results.
6
+ #
7
+ class CostModel
8
+ # Default to current Amazon S3 storage cost per GiB*month ($).
9
+ DEFAULT_GIBYTE_COST = 0.023
10
+
11
+ # Default to on-demand cost for an Amazon EC2 m3.medium ($).
12
+ DEFAULT_HOUR_COST = 0.073
13
+
14
+ # Default to elapsed time rather than CPU time.
15
+ DEFAULT_USE_CPU_TIME = false
16
+
17
+ # Default to dollars.
18
+ DEFAULT_CURRENCY = '$'
19
+
20
+ # Default to dollars.
21
+ DEFAULT_SUMMARIZE_TOP = 5
22
+
23
+ def initialize(
24
+ gibyte_cost: DEFAULT_GIBYTE_COST,
25
+ compression_hour_cost: DEFAULT_HOUR_COST,
26
+ decompression_hour_cost: DEFAULT_HOUR_COST,
27
+ use_cpu_time: DEFAULT_USE_CPU_TIME,
28
+ currency: DEFAULT_CURRENCY
29
+ )
30
+ @gibyte_cost = gibyte_cost
31
+ @compression_hour_cost = compression_hour_cost
32
+ @decompression_hour_cost = decompression_hour_cost
33
+ @use_cpu_time = use_cpu_time
34
+ @currency = currency
35
+ end
36
+
37
+ attr_reader :gibyte_cost
38
+ attr_reader :compression_hour_cost
39
+ attr_reader :decompression_hour_cost
40
+ attr_reader :use_cpu_time
41
+ attr_reader :currency
42
+
43
+ def cost(grouped_results)
44
+ grouped_results.map do |group_result|
45
+ CostedGroupResult.new(self, group_result)
46
+ end
47
+ end
48
+
49
+ def summarize(costed_grouped_results, top = DEFAULT_SUMMARIZE_TOP)
50
+ costed_grouped_results.sort_by(&:total_cost).take(top).map do |result|
51
+ result.to_s(currency)
52
+ end
53
+ end
54
+ end
55
+ end
@@ -0,0 +1,87 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CompareCompressors
4
+ #
5
+ # Grouped result with costs calculated.
6
+ #
7
+ CostedGroupResult = Struct.new(
8
+ :compressor_name,
9
+ :compressor_level,
10
+ :mean_compression_elapsed_hours,
11
+ :mean_compression_cpu_hours,
12
+ :max_compression_max_rss,
13
+ :mean_compressed_gibytes,
14
+ :mean_compression_delta_gibytes,
15
+ :geomean_compression_ratio,
16
+ :mean_decompression_elapsed_hours,
17
+ :mean_decompression_cpu_hours,
18
+ :max_decompression_max_rss,
19
+ :compression_hour_cost,
20
+ :decompression_hour_cost,
21
+ :hour_cost,
22
+ :gibyte_cost,
23
+ :total_cost
24
+ ) do
25
+ def self.new_from_group_result(cost_model, group_result)
26
+ if cost_model.use_cpu_time
27
+ compression_hours = group_result.mean_compression_cpu_hours
28
+ decompression_hours = group_result.mean_decompression_cpu_hours
29
+ else
30
+ compression_hours = group_result.mean_compression_elapsed_hours
31
+ decompression_hours = group_result.mean_decompression_elapsed_hours
32
+ end
33
+ compression_hour_cost =
34
+ cost_model.compression_hour_cost * compression_hours
35
+ decompression_hour_cost =
36
+ cost_model.decompression_hour_cost * decompression_hours
37
+ hour_cost = compression_hour_cost + decompression_hour_cost
38
+ gibyte_cost =
39
+ cost_model.gibyte_cost *
40
+ group_result.mean_compressed_gibytes
41
+ new(
42
+ group_result.compressor_name,
43
+ group_result.compressor_level,
44
+ group_result.mean_compression_cpu_hours,
45
+ group_result.mean_compression_elapsed_hours,
46
+ group_result.max_compression_max_rss,
47
+ group_result.mean_compressed_gibytes,
48
+ group_result.mean_compression_delta_gibytes,
49
+ group_result.geomean_compression_ratio,
50
+ group_result.mean_decompression_elapsed_hours,
51
+ group_result.mean_decompression_cpu_hours,
52
+ group_result.max_decompression_max_rss,
53
+ compression_hour_cost,
54
+ decompression_hour_cost,
55
+ hour_cost,
56
+ gibyte_cost,
57
+ hour_cost + gibyte_cost
58
+ )
59
+ end
60
+
61
+ def self.from_group_results(cost_model, group_results)
62
+ group_results.map do |group_result|
63
+ new_from_group_result(cost_model, group_result)
64
+ end
65
+ end
66
+
67
+ def to_s(currency = CostModel::DEFAULT_CURRENCY)
68
+ gib_saved = mean_compression_delta_gibytes
69
+ <<-STRING
70
+ #{compressor_name} level #{compressor_level}:
71
+ compression ratio : #{format('%.2f', geomean_compression_ratio)}
72
+ compression elapsed hours : #{format('%.4f', mean_compression_elapsed_hours)}
73
+ compression CPU hours : #{format('%.4f', mean_compression_cpu_hours)}
74
+ compression max RSS (KiB) : #{format('%d', max_compression_max_rss)}
75
+ compressed GiB : #{format('%.4f', mean_compressed_gibytes)}
76
+ GiB saved : #{format('%.2f', gib_saved)}
77
+ decompression elapsed hours : #{format('%.4f', mean_decompression_elapsed_hours)}
78
+ decompression CPU hours : #{format('%.4f', mean_decompression_cpu_hours)}
79
+ decompression max RSS (KiB) : #{format('%d', max_decompression_max_rss)}
80
+ ------------------
81
+ storage cost : #{format('%s%0.02f', currency, gibyte_cost)}
82
+ compute cost : #{format('%s%0.02f', currency, hour_cost)}
83
+ total cost : #{format('%s%0.02f', currency, total_cost)}
84
+ STRING
85
+ end
86
+ end
87
+ end