compare_compressors 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,150 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'fileutils'
4
+ require 'tmpdir'
5
+
6
module CompareCompressors
  #
  # Base class for compressors. Subclasses provide compressor-specific
  # configuration and logic.
  #
  class Compressor
    #
    # Run the compressor at the given level on the given target and measure
    # its running time and memory usage.
    #
    # @param [String] target original pathname of the target (read only)
    # @param [String] work_target temporary path of the target (read/write)
    # @param [Numeric] level the compression level
    # @return [Result]
    # @raise [RuntimeError] if a command exits with a non-zero status
    #
    def evaluate(target, work_target, level)
      compression_times = time(compression_command(work_target, level))
      size = output_size(work_target)
      # The compressor may or may not have removed its input; make sure it is
      # gone before decompression writes it back.
      remove_if_exists(work_target)

      decompression_times = time(decompression_command(work_target))
      remove_if_exists(output_name(work_target))

      Result.new(
        target, name, level, *compression_times, size, *decompression_times
      )
    end

    #
    # @abstract
    # @return [String] name that can be a ruby symbol
    #
    def name
      raise NotImplementedError
    end

    #
    # @abstract
    # @return [String] extension added to the compressed file
    #
    def extension
      raise NotImplementedError
    end

    #
    # @abstract
    # @return [Array<Integer>] the levels supported by the compressor
    #
    def levels
      raise NotImplementedError
    end

    #
    # @abstract
    # @return [String?] version string (for information only); nil unless a
    #   subclass overrides this method
    #
    def version
      nil
    end

    #
    # @return [String] display name (need not be safe to intern as a symbol)
    #
    def display_name
      name
    end

    #
    # The parameters are optional here only so that calling the base
    # implementation, with or without arguments, always reports
    # NotImplementedError rather than an arity error.
    #
    # @abstract
    # @param [String] _target path of the file to compress
    # @param [Numeric] _level the compression level
    # @return [Array<String>] command to run the compressor
    #
    def compression_command(_target = nil, _level = nil)
      raise NotImplementedError
    end

    #
    # @abstract
    # @param [String] _target path of the uncompressed file (the compressed
    #   file is at `output_name(target)`)
    # @return [Array<String>] command to run the compressor in decompress mode
    #
    def decompression_command(_target = nil)
      raise NotImplementedError
    end

    private

    # Size in bytes of the compressed file produced for `target`.
    def output_size(target)
      File.stat(output_name(target)).size
    end

    # Path of the compressed file that corresponds to `target`.
    def output_name(target)
      "#{target}.#{extension}"
    end

    # Run `command` and return its measurements (see `parse_time`).
    #
    # @raise [RuntimeError] if the command exits with a non-zero status; the
    #   message includes the captured stdout and stderr
    def time(command)
      status, times, out, err = run(*command)
      return times if status.zero?
      raise format(
        "%s: %s exited with %d:\n%s\n%s",
        name, command.join(' '), status, out, err
      )
    end

    # Run `command` under `time`, with stdin from /dev/null and stdout/stderr
    # captured in temporary files.
    #
    # @return [Array] exit status, parsed times, stdout text, stderr text
    def run(*command, **options)
      Dir.mktmpdir do |tmp|
        out_pathname = File.join(tmp, 'out')
        err_pathname = File.join(tmp, 'err')
        options[:out] = out_pathname
        options[:err] = err_pathname
        options[:in] = '/dev/null'

        # Note: this is not the shell builtin but rather /usr/bin/time; at least
        # on Ubuntu, the latter reports both time and max RSS (memory usage)
        # metrics, which is what we want here. Write the time output to a
        # temporary file to avoid conflicting with the child's stderr output.
        time_pathname = File.join(tmp, 'time')
        timed_command = [
          'time', '--format=%e %S %U %M', "--output=#{time_pathname}"
        ] + command

        # wait2 hands back the status directly, so this does not depend on
        # the `English` library having been loaded for $CHILD_STATUS.
        _pid, status = Process.wait2(Process.spawn(*timed_command, **options))

        [
          status.exitstatus,
          parse_time(time_pathname),
          File.read(out_pathname),
          File.read(err_pathname)
        ]
      end
    end

    # Returns elapsed time in seconds, total (system plus user) CPU time in
    # seconds, and maximum resident set size (memory usage) in Kilobytes, which
    # I think means KiB.
    #
    # Only the last line of the file is parsed: when the child fails, GNU time
    # writes a "Command exited with non-zero status N" line ahead of the
    # formatted measurements.
    def parse_time(time_pathname)
      measurements = File.read(time_pathname).lines.last.to_s
      elapsed, sys, user, max_rss = measurements.split
      [elapsed.to_f, sys.to_f + user.to_f, max_rss.to_i]
    end

    # Delete `pathname`, ignoring the case where it is already absent.
    def remove_if_exists(pathname)
      FileUtils.rm pathname
    rescue Errno::ENOENT
      nil # not a problem
    end
  end
end
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
module CompareCompressors
  #
  # Brotli compressor, driven through the `brotli` command line tool.
  #
  # No `version` override is provided: at present the command does not seem
  # to offer anything that prints a version.
  #
  class BrotliCompressor < Compressor
    def name
      'brotli'
    end

    def extension
      'bro'
    end

    # Quality levels 0 through 11. No documentation was found for these, so
    # the range is based on
    # https://github.com/google/brotli/blob/cdca91b6f59dd7632985667d2cd585ab68937b48/enc/quality.h
    def levels
      [*0..11]
    end

    # @param [String] target path of the file to compress
    # @param [Numeric] level the quality level
    # @return [Array<String>] argv that writes the compressed file
    def compression_command(target, level)
      compressed = output_name(target)
      ['brotli', '--input', target, '--output', compressed, '--quality', level.to_s]
    end

    # @param [String] target path at which to write the decompressed file
    # @return [Array<String>] argv that decompresses the compressed file
    def decompression_command(target)
      compressed = output_name(target)
      ['brotli', '--decompress', '--input', compressed, '--output', target]
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
module CompareCompressors
  #
  # bzip2 compressor, driven through the `bzip2` / `bunzip2` commands.
  #
  class Bzip2Compressor < Compressor
    def name
      'bzip2'
    end

    def extension
      'bz2'
    end

    def levels
      [*1..9]
    end

    # Version taken from the first line the command writes to stderr.
    #
    # @return [String?] the version, or nil if the command fails
    # @raise [RuntimeError] if the first line has an unexpected format
    def version
      exit_code, _times, _stdout, stderr = run(name, '--version')
      return unless exit_code.zero?

      first_line = stderr.lines.first.chomp
      match = /Version (.+)\.\z/.match(first_line)
      raise "bad #{name} version line: #{first_line.inspect}" unless match

      match[1]
    end

    # @return [Array<String>] argv that compresses `target` at `level`
    def compression_command(target, level)
      level_flag = "-#{level}"
      ['bzip2', level_flag, target]
    end

    # @return [Array<String>] argv that decompresses the compressed file
    def decompression_command(target)
      compressed = output_name(target)
      ['bunzip2', compressed]
    end
  end
end
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
module CompareCompressors
  #
  # gzip compressor, driven through the `gzip` / `gunzip` commands.
  #
  class GzipCompressor < Compressor
    def name
      'gzip'
    end

    def extension
      'gz'
    end

    def levels
      [*1..9]
    end

    # First line of `gzip --version`, taken from stdout.
    #
    # @return [String?] the version line, or nil if the command fails
    def version
      exit_code, _times, stdout, _stderr = run(name, '--version')
      exit_code.zero? ? stdout.lines.first.chomp : nil
    end

    # @return [Array<String>] argv that compresses `target` at `level`
    def compression_command(target, level)
      level_flag = "-#{level}"
      ['gzip', level_flag, target]
    end

    # @return [Array<String>] argv that decompresses the compressed file
    def decompression_command(target)
      compressed = output_name(target)
      ['gunzip', compressed]
    end
  end
end
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
module CompareCompressors
  #
  # 7-Zip compressor, driven through the `7zr` command.
  #
  class SevenZipCompressor < Compressor
    # Symbol-safe name; see `display_name` for the human-readable form.
    def name
      'seven_zip'
    end

    def display_name
      '7z'
    end

    def extension
      '7z'
    end

    # Based on share/doc/p7zip/DOC/MANUAL/cmdline/switches/method.htm
    # Level 0 means no compression, so it is left out.
    def levels
      [1, 3, 5, 7, 9]
    end

    # Version taken from the first non-blank line of the `7zr --help` output.
    #
    # @return [String?] the version, or nil if the command fails
    # @raise [RuntimeError] if that line has an unexpected format
    def version
      exit_code, _times, stdout, _stderr = run('7zr', '--help')
      return unless exit_code.zero?

      banner = stdout.strip.lines.first.chomp
      match = /([0-9.]+)[\s:]+Copyright/.match(banner)
      raise "bad version line #{banner}" unless match

      match[1]
    end

    # @return [Array<String>] argv that adds `target` to a new archive
    def compression_command(target, level)
      archive = output_name(target)
      ['7zr', 'a', "-mx=#{level}", archive, target]
    end

    # @return [Array<String>] argv that extracts the archive into the
    #   directory containing `target`
    def decompression_command(target)
      destination = File.dirname(target)
      ['7zr', 'x', "-o#{destination}", output_name(target)]
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
module CompareCompressors
  #
  # xz (LZMA) compressor, driven through the `xz` / `unxz` commands.
  #
  class XzCompressor < Compressor
    def name
      'xz'
    end

    def extension
      'xz'
    end

    def levels
      [*0..9]
    end

    # Version taken from the trailing token of the first `xz --version` line.
    #
    # @return [String?] the version, or nil if the command fails
    # @raise [RuntimeError] if the first line has an unexpected format
    def version
      exit_code, _times, stdout, _stderr = run(name, '--version')
      return unless exit_code.zero?

      first_line = stdout.lines.first.chomp
      match = /([0-9.a-z]+)$/.match(first_line)
      raise "bad version line #{first_line}" unless match

      match[1]
    end

    # @return [Array<String>] argv that compresses `target` at `level`
    def compression_command(target, level)
      level_flag = "-#{level}"
      ['xz', level_flag, target]
    end

    # @return [Array<String>] argv that decompresses the compressed file
    def decompression_command(target)
      compressed = output_name(target)
      ['unxz', compressed]
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
module CompareCompressors
  #
  # Zstandard compressor, driven through the `zstd` / `unzstd` commands.
  #
  class ZstdCompressor < Compressor
    def name
      'zstd'
    end

    def extension
      'zst'
    end

    def levels
      [*1..19]
    end

    # Version taken from the first line of `zstd -V`: the run of digits and
    # dots that precedes a comma.
    #
    # @return [String?] the version, or nil if the command fails
    # @raise [RuntimeError] if the first line has an unexpected format
    def version
      exit_code, _times, stdout, _stderr = run(name, '-V')
      return unless exit_code.zero?

      first_line = stdout.lines.first.chomp
      match = /([0-9.]+),/.match(first_line)
      raise "bad version line #{first_line}" unless match

      match[1]
    end

    # @return [Array<String>] argv that compresses `target` at `level`
    def compression_command(target, level)
      level_flag = "-#{level}"
      ['zstd', level_flag, target]
    end

    # @return [Array<String>] argv that decompresses the compressed file
    def decompression_command(target)
      compressed = output_name(target)
      ['unzstd', compressed]
    end
  end
end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
module CompareCompressors
  #
  # Define costs for comparing grouped results.
  #
  class CostModel
    # Default to current Amazon S3 storage cost per GiB*month ($).
    DEFAULT_GIBYTE_COST = 0.023

    # Default to on-demand cost for an Amazon EC2 m3.medium ($).
    DEFAULT_HOUR_COST = 0.073

    # Default to elapsed time rather than CPU time.
    DEFAULT_USE_CPU_TIME = false

    # Default to dollars.
    DEFAULT_CURRENCY = '$'

    # Default number of lowest-cost results included in a summary.
    DEFAULT_SUMMARIZE_TOP = 5

    #
    # @param [Numeric] gibyte_cost cost per GiB of compressed data
    # @param [Numeric] compression_hour_cost cost per hour spent compressing
    # @param [Numeric] decompression_hour_cost cost per hour spent
    #   decompressing
    # @param [Boolean] use_cpu_time cost CPU hours rather than elapsed hours
    # @param [String] currency symbol prefixed to costs in summaries
    #
    def initialize(
      gibyte_cost: DEFAULT_GIBYTE_COST,
      compression_hour_cost: DEFAULT_HOUR_COST,
      decompression_hour_cost: DEFAULT_HOUR_COST,
      use_cpu_time: DEFAULT_USE_CPU_TIME,
      currency: DEFAULT_CURRENCY
    )
      @gibyte_cost = gibyte_cost
      @compression_hour_cost = compression_hour_cost
      @decompression_hour_cost = decompression_hour_cost
      @use_cpu_time = use_cpu_time
      @currency = currency
    end

    attr_reader :gibyte_cost
    attr_reader :compression_hour_cost
    attr_reader :decompression_hour_cost
    attr_reader :use_cpu_time
    attr_reader :currency

    #
    # Apply this cost model to each grouped result.
    #
    # @param [Enumerable] grouped_results the group results to cost
    # @return [Array<CostedGroupResult>] one costed result per input, in order
    #
    def cost(grouped_results)
      grouped_results.map do |group_result|
        # Struct's own `new` would just store the model and the result in the
        # first two members; the factory is what computes the costs.
        CostedGroupResult.new_from_group_result(self, group_result)
      end
    end

    #
    # Describe the lowest-cost results.
    #
    # @param [Enumerable<CostedGroupResult>] costed_grouped_results
    # @param [Integer] top how many of the cheapest results to include
    # @return [Array<String>] descriptions, cheapest first
    #
    def summarize(costed_grouped_results, top = DEFAULT_SUMMARIZE_TOP)
      costed_grouped_results.sort_by(&:total_cost).take(top).map do |result|
        result.to_s(currency)
      end
    end
  end
end
@@ -0,0 +1,87 @@
1
+ # frozen_string_literal: true
2
+
3
module CompareCompressors
  #
  # Grouped result with costs calculated.
  #
  CostedGroupResult = Struct.new(
    :compressor_name,
    :compressor_level,
    :mean_compression_elapsed_hours,
    :mean_compression_cpu_hours,
    :max_compression_max_rss,
    :mean_compressed_gibytes,
    :mean_compression_delta_gibytes,
    :geomean_compression_ratio,
    :mean_decompression_elapsed_hours,
    :mean_decompression_cpu_hours,
    :max_decompression_max_rss,
    :compression_hour_cost,
    :decompression_hour_cost,
    :hour_cost,
    :gibyte_cost,
    :total_cost
  ) do
    #
    # Build a costed result from one group result.
    #
    # Hours are CPU hours when `cost_model.use_cpu_time` is set and elapsed
    # hours otherwise. The total is the compute cost (compression plus
    # decompression) plus the storage cost of the mean compressed size.
    #
    # @param [CostModel] cost_model supplies the unit costs and time basis
    # @param [Object] group_result supplies the measurements copied into the
    #   first eleven members
    # @return [CostedGroupResult]
    #
    def self.new_from_group_result(cost_model, group_result)
      if cost_model.use_cpu_time
        compression_hours = group_result.mean_compression_cpu_hours
        decompression_hours = group_result.mean_decompression_cpu_hours
      else
        compression_hours = group_result.mean_compression_elapsed_hours
        decompression_hours = group_result.mean_decompression_elapsed_hours
      end
      compression_hour_cost =
        cost_model.compression_hour_cost * compression_hours
      decompression_hour_cost =
        cost_model.decompression_hour_cost * decompression_hours
      hour_cost = compression_hour_cost + decompression_hour_cost
      gibyte_cost =
        cost_model.gibyte_cost *
        group_result.mean_compressed_gibytes
      # Arguments are positional, so they must follow the member order
      # declared above: elapsed hours come before CPU hours.
      new(
        group_result.compressor_name,
        group_result.compressor_level,
        group_result.mean_compression_elapsed_hours,
        group_result.mean_compression_cpu_hours,
        group_result.max_compression_max_rss,
        group_result.mean_compressed_gibytes,
        group_result.mean_compression_delta_gibytes,
        group_result.geomean_compression_ratio,
        group_result.mean_decompression_elapsed_hours,
        group_result.mean_decompression_cpu_hours,
        group_result.max_decompression_max_rss,
        compression_hour_cost,
        decompression_hour_cost,
        hour_cost,
        gibyte_cost,
        hour_cost + gibyte_cost
      )
    end

    #
    # Cost every group result in a collection.
    #
    # @return [Array<CostedGroupResult>] one costed result per input, in order
    #
    def self.from_group_results(cost_model, group_results)
      group_results.map do |group_result|
        new_from_group_result(cost_model, group_result)
      end
    end

    #
    # Multi-line, human-readable report of the measurements and costs.
    #
    # @param [String] currency symbol prefixed to each cost
    # @return [String]
    #
    def to_s(currency = CostModel::DEFAULT_CURRENCY)
      gib_saved = mean_compression_delta_gibytes
      <<~STRING
        #{compressor_name} level #{compressor_level}:
        compression ratio : #{format('%.2f', geomean_compression_ratio)}
        compression elapsed hours : #{format('%.4f', mean_compression_elapsed_hours)}
        compression CPU hours : #{format('%.4f', mean_compression_cpu_hours)}
        compression max RSS (KiB) : #{format('%d', max_compression_max_rss)}
        compressed GiB : #{format('%.4f', mean_compressed_gibytes)}
        GiB saved : #{format('%.2f', gib_saved)}
        decompression elapsed hours : #{format('%.4f', mean_decompression_elapsed_hours)}
        decompression CPU hours : #{format('%.4f', mean_decompression_cpu_hours)}
        decompression max RSS (KiB) : #{format('%d', max_decompression_max_rss)}
        ------------------
        storage cost : #{format('%s%0.02f', currency, gibyte_cost)}
        compute cost : #{format('%s%0.02f', currency, hour_cost)}
        total cost : #{format('%s%0.02f', currency, total_cost)}
      STRING
    end
  end
end