vibe_zstd 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.standard.yml +3 -0
- data/CHANGELOG.md +22 -0
- data/LICENSE.txt +21 -0
- data/README.md +978 -0
- data/Rakefile +20 -0
- data/benchmark/README.md +198 -0
- data/benchmark/compression_levels.rb +99 -0
- data/benchmark/context_reuse.rb +174 -0
- data/benchmark/decompression_speed_by_level.rb +65 -0
- data/benchmark/dictionary_training.rb +182 -0
- data/benchmark/dictionary_usage.rb +121 -0
- data/benchmark/for_readme.rb +157 -0
- data/benchmark/generate_fixture.rb +82 -0
- data/benchmark/helpers.rb +237 -0
- data/benchmark/multithreading.rb +105 -0
- data/benchmark/run_all.rb +150 -0
- data/benchmark/streaming.rb +154 -0
- data/ext/vibe_zstd/Makefile +270 -0
- data/ext/vibe_zstd/cctx.c +565 -0
- data/ext/vibe_zstd/dctx.c +493 -0
- data/ext/vibe_zstd/dict.c +587 -0
- data/ext/vibe_zstd/extconf.rb +52 -0
- data/ext/vibe_zstd/frames.c +132 -0
- data/ext/vibe_zstd/libzstd/LICENSE +30 -0
- data/ext/vibe_zstd/libzstd/common/allocations.h +55 -0
- data/ext/vibe_zstd/libzstd/common/bits.h +205 -0
- data/ext/vibe_zstd/libzstd/common/bitstream.h +454 -0
- data/ext/vibe_zstd/libzstd/common/compiler.h +464 -0
- data/ext/vibe_zstd/libzstd/common/cpu.h +249 -0
- data/ext/vibe_zstd/libzstd/common/debug.c +30 -0
- data/ext/vibe_zstd/libzstd/common/debug.h +107 -0
- data/ext/vibe_zstd/libzstd/common/entropy_common.c +340 -0
- data/ext/vibe_zstd/libzstd/common/error_private.c +64 -0
- data/ext/vibe_zstd/libzstd/common/error_private.h +158 -0
- data/ext/vibe_zstd/libzstd/common/fse.h +625 -0
- data/ext/vibe_zstd/libzstd/common/fse_decompress.c +315 -0
- data/ext/vibe_zstd/libzstd/common/huf.h +277 -0
- data/ext/vibe_zstd/libzstd/common/mem.h +422 -0
- data/ext/vibe_zstd/libzstd/common/pool.c +371 -0
- data/ext/vibe_zstd/libzstd/common/pool.h +81 -0
- data/ext/vibe_zstd/libzstd/common/portability_macros.h +171 -0
- data/ext/vibe_zstd/libzstd/common/threading.c +182 -0
- data/ext/vibe_zstd/libzstd/common/threading.h +142 -0
- data/ext/vibe_zstd/libzstd/common/xxhash.c +18 -0
- data/ext/vibe_zstd/libzstd/common/xxhash.h +7094 -0
- data/ext/vibe_zstd/libzstd/common/zstd_common.c +48 -0
- data/ext/vibe_zstd/libzstd/common/zstd_deps.h +123 -0
- data/ext/vibe_zstd/libzstd/common/zstd_internal.h +324 -0
- data/ext/vibe_zstd/libzstd/common/zstd_trace.h +156 -0
- data/ext/vibe_zstd/libzstd/compress/clevels.h +134 -0
- data/ext/vibe_zstd/libzstd/compress/fse_compress.c +625 -0
- data/ext/vibe_zstd/libzstd/compress/hist.c +191 -0
- data/ext/vibe_zstd/libzstd/compress/hist.h +82 -0
- data/ext/vibe_zstd/libzstd/compress/huf_compress.c +1464 -0
- data/ext/vibe_zstd/libzstd/compress/zstd_compress.c +7843 -0
- data/ext/vibe_zstd/libzstd/compress/zstd_compress_internal.h +1636 -0
- data/ext/vibe_zstd/libzstd/compress/zstd_compress_literals.c +235 -0
- data/ext/vibe_zstd/libzstd/compress/zstd_compress_literals.h +39 -0
- data/ext/vibe_zstd/libzstd/compress/zstd_compress_sequences.c +442 -0
- data/ext/vibe_zstd/libzstd/compress/zstd_compress_sequences.h +55 -0
- data/ext/vibe_zstd/libzstd/compress/zstd_compress_superblock.c +688 -0
- data/ext/vibe_zstd/libzstd/compress/zstd_compress_superblock.h +32 -0
- data/ext/vibe_zstd/libzstd/compress/zstd_cwksp.h +765 -0
- data/ext/vibe_zstd/libzstd/compress/zstd_double_fast.c +778 -0
- data/ext/vibe_zstd/libzstd/compress/zstd_double_fast.h +42 -0
- data/ext/vibe_zstd/libzstd/compress/zstd_fast.c +985 -0
- data/ext/vibe_zstd/libzstd/compress/zstd_fast.h +30 -0
- data/ext/vibe_zstd/libzstd/compress/zstd_lazy.c +2199 -0
- data/ext/vibe_zstd/libzstd/compress/zstd_lazy.h +193 -0
- data/ext/vibe_zstd/libzstd/compress/zstd_ldm.c +745 -0
- data/ext/vibe_zstd/libzstd/compress/zstd_ldm.h +109 -0
- data/ext/vibe_zstd/libzstd/compress/zstd_ldm_geartab.h +106 -0
- data/ext/vibe_zstd/libzstd/compress/zstd_opt.c +1580 -0
- data/ext/vibe_zstd/libzstd/compress/zstd_opt.h +72 -0
- data/ext/vibe_zstd/libzstd/compress/zstd_preSplit.c +238 -0
- data/ext/vibe_zstd/libzstd/compress/zstd_preSplit.h +33 -0
- data/ext/vibe_zstd/libzstd/compress/zstdmt_compress.c +1923 -0
- data/ext/vibe_zstd/libzstd/compress/zstdmt_compress.h +102 -0
- data/ext/vibe_zstd/libzstd/decompress/huf_decompress.c +1944 -0
- data/ext/vibe_zstd/libzstd/decompress/huf_decompress_amd64.S +602 -0
- data/ext/vibe_zstd/libzstd/decompress/zstd_ddict.c +244 -0
- data/ext/vibe_zstd/libzstd/decompress/zstd_ddict.h +44 -0
- data/ext/vibe_zstd/libzstd/decompress/zstd_decompress.c +2410 -0
- data/ext/vibe_zstd/libzstd/decompress/zstd_decompress_block.c +2209 -0
- data/ext/vibe_zstd/libzstd/decompress/zstd_decompress_block.h +73 -0
- data/ext/vibe_zstd/libzstd/decompress/zstd_decompress_internal.h +240 -0
- data/ext/vibe_zstd/libzstd/deprecated/zbuff.h +214 -0
- data/ext/vibe_zstd/libzstd/deprecated/zbuff_common.c +26 -0
- data/ext/vibe_zstd/libzstd/deprecated/zbuff_compress.c +167 -0
- data/ext/vibe_zstd/libzstd/deprecated/zbuff_decompress.c +77 -0
- data/ext/vibe_zstd/libzstd/dictBuilder/cover.c +1302 -0
- data/ext/vibe_zstd/libzstd/dictBuilder/cover.h +152 -0
- data/ext/vibe_zstd/libzstd/dictBuilder/divsufsort.c +1913 -0
- data/ext/vibe_zstd/libzstd/dictBuilder/divsufsort.h +57 -0
- data/ext/vibe_zstd/libzstd/dictBuilder/fastcover.c +766 -0
- data/ext/vibe_zstd/libzstd/dictBuilder/zdict.c +1133 -0
- data/ext/vibe_zstd/libzstd/zdict.h +481 -0
- data/ext/vibe_zstd/libzstd/zstd.h +3198 -0
- data/ext/vibe_zstd/libzstd/zstd_errors.h +107 -0
- data/ext/vibe_zstd/streaming.c +410 -0
- data/ext/vibe_zstd/vibe_zstd.c +293 -0
- data/ext/vibe_zstd/vibe_zstd.h +56 -0
- data/ext/vibe_zstd/vibe_zstd_internal.h +27 -0
- data/lib/vibe_zstd/constants.rb +67 -0
- data/lib/vibe_zstd/version.rb +5 -0
- data/lib/vibe_zstd.rb +255 -0
- data/sig/vibe_zstd.rbs +76 -0
- metadata +179 -0
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
require_relative "helpers"
|
|
5
|
+
|
|
6
|
+
# Benchmark: Dictionary Training Algorithms
# Compares train_dict, train_dict_cover, and train_dict_fast_cover
#
# Each trainer is run once on the same 200 generated JSON samples. For every
# trainer the script prints training time, dictionary size and dictionary ID,
# compresses one sample with the resulting dictionary and appends a
# BenchmarkResult row to `results`. It then prints the aggregate ratio over ten
# samples per dictionary and a table showing the effect of max_dict_size.

BenchmarkHelpers.run_comparison(title: "Dictionary Training Algorithm Comparison") do |results|
  # Generate training samples
  puts "Generating training samples..."
  samples = 200.times.map do |i|
    {
      id: i,
      name: "User #{i}",
      email: "user#{i}@example.com",
      created_at: Time.now.to_i - rand(100000),
      status: %w[active pending inactive verified][rand(4)],
      preferences: {
        theme: %w[light dark auto][rand(3)],
        notifications: rand(2) == 1
      }
    }.to_json
  end

  # Sum the sample sizes once; both the average and the total are derived from it.
  total_bytes = samples.sum(&:bytesize)
  puts "Training samples: #{samples.size}"
  puts "Average sample size: #{Formatter.format_bytes(total_bytes / samples.size)}"
  puts "Total training data: #{Formatter.format_bytes(total_bytes)}\n\n"

  dict_size = 16 * 1024 # 16KB

  # NOTE(review): the evaluation samples (index 100 here, 101..110 further down)
  # are also part of the training set, so the reported ratios are likely
  # optimistic compared with unseen data.
  test_sample = samples[100]

  # Runs one training algorithm and reports on it.
  #
  # name    - label stored in the BenchmarkResult row
  # heading - section title printed before the run
  # trainer - callable returning the trained dictionary bytes
  #
  # Returns the CDict built from the trained dictionary so the comparison
  # further down can reuse it instead of constructing a second one.
  benchmark_trainer = lambda do |name, heading, trainer|
    Formatter.section(heading)

    dict = nil
    timing = Benchmark.measure { dict = trainer.call }
    elapsed = "#{timing.real.round(3)}s"

    puts "Training time: #{elapsed}"
    puts "Dictionary size: #{Formatter.format_bytes(dict.bytesize)}"
    puts "Dictionary ID: #{VibeZstd.get_dict_id(dict)}"

    # Test compression effectiveness
    cdict = VibeZstd::CDict.new(dict)
    compressed = VibeZstd.compress(test_sample, dict: cdict)

    results << BenchmarkResult.new(
      :name => name,
      :compression_ratio => test_sample.bytesize.to_f / compressed.bytesize,
      :memory_bytes => dict.bytesize,
      "Training time" => elapsed,
      "Compressed" => Formatter.format_bytes(compressed.bytesize)
    )

    cdict
  end

  cdicts = {}

  # Benchmark 1: train_dict (default algorithm - fast)
  cdicts[:fast] = benchmark_trainer.call(
    "train_dict",
    "Testing: train_dict (default/fast algorithm)",
    -> { VibeZstd.train_dict(samples, max_dict_size: dict_size) }
  )

  # Benchmark 2: train_dict_cover (COVER algorithm - better quality)
  cdicts[:cover] = benchmark_trainer.call(
    "train_dict_cover",
    "Testing: train_dict_cover (COVER algorithm)",
    lambda do
      VibeZstd.train_dict_cover(
        samples,
        max_dict_size: dict_size,
        k: 200, # Segment size
        d: 6 # Dmer size
      )
    end
  )

  # Benchmark 3: train_dict_fast_cover (fast COVER - balanced)
  cdicts[:fast_cover] = benchmark_trainer.call(
    "train_dict_fast_cover",
    "Testing: train_dict_fast_cover (fast COVER)",
    lambda do
      VibeZstd.train_dict_fast_cover(
        samples,
        max_dict_size: dict_size,
        k: 200,
        d: 6,
        # zstd's zdict.h documents `f` as the log of the frequency array size
        # (not the size itself); presumably the binding forwards it unchanged.
        f: 20,
        accel: 5 # Acceleration (1-10, higher = faster)
      )
    end
  )

  # Compare compression across multiple samples
  puts "\n"
  Formatter.section("Compression effectiveness across test samples")

  test_samples = samples[101..110]
  total_original = test_samples.sum(&:bytesize)

  cdicts.each do |dict_type, cdict|
    total_compressed = test_samples.sum { |sample| VibeZstd.compress(sample, dict: cdict).bytesize }
    avg_ratio = total_original.to_f / total_compressed
    puts " #{dict_type}: #{Formatter.format_ratio(avg_ratio)} average ratio"
  end

  # Test dictionary sizes
  puts "\n"
  Formatter.section("Dictionary size impact")

  size_results = [4096, 8192, 16384, 32768].map do |size|
    dict = VibeZstd.train_dict(samples, max_dict_size: size)
    compressed = VibeZstd.compress(test_sample, dict: VibeZstd::CDict.new(dict))

    {
      "Dict Size" => Formatter.format_bytes(size),
      "Actual Size" => Formatter.format_bytes(dict.bytesize),
      "Ratio" => Formatter.format_ratio(test_sample.bytesize.to_f / compressed.bytesize),
      "Compressed" => Formatter.format_bytes(compressed.bytesize)
    }
  end

  Formatter.table(size_results)
end
|
|
165
|
+
|
|
166
|
+
# Closing guidance printed after the comparison: when to pick each training
# algorithm and how to think about dictionary size.
puts <<~RECOMMENDATIONS

  💡 Dictionary Training Recommendations:
   train_dict (default):
   ✓ Fastest training
   ✓ Good enough for most use cases
   ✓ Use when training time matters

   train_dict_cover:
   ✓ Best compression ratios
   ✓ Slower training (2-10x slower)
   ✓ Use for production dictionaries

   train_dict_fast_cover:
   ✓ Balanced speed/quality
   ✓ Configurable with accel parameter
   ✓ Good default for most users

   Dictionary size:
   - Larger = better compression (diminishing returns > 64KB)
   - Typical: 16KB-64KB for small messages
   - Memory overhead: ~2x dictionary size in memory
RECOMMENDATIONS
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
require_relative "helpers"
|
|
5
|
+
|
|
6
|
+
# Benchmark: Dictionary Usage Performance
# Demonstrates compression ratio and speed improvements when using trained dictionaries
#
# The same 100 generated JSON samples are round-tripped (compress, then
# decompress) twice with one shared CCtx/DCtx pair: first without a dictionary,
# then with the fixture dictionary. Throughput, ratio and estimated memory for
# both runs are appended to `results`.

BenchmarkHelpers.run_comparison(title: "Dictionary Usage Performance Comparison") do |results|
  # Load the trained dictionary; bail out early when the fixture is missing.
  fixture_path = File.join(__dir__, "..", "test", "fixtures", "sample.dict")
  unless File.exist?(fixture_path)
    puts "⚠️ Dictionary fixture not found. Run: ruby benchmark/generate_fixture.rb"
    exit 1
  end

  dict_data = File.binread(fixture_path)
  cdict = VibeZstd::CDict.new(dict_data)
  ddict = VibeZstd::DDict.new(dict_data)

  puts "Dictionary size: #{Formatter.format_bytes(dict_data.bytesize)}"
  puts "Dictionary ID: #{VibeZstd.get_dict_id(dict_data)}\n\n"

  # Generate test data (similar to training data for best dictionary performance)
  test_samples = Array.new(100) do |i|
    user_number = i + 1000
    {
      id: user_number,
      name: "User #{user_number}",
      email: "user#{user_number}@example.com",
      created_at: Time.now.to_i,
      status: %w[active pending inactive verified][rand(4)],
      preferences: {
        theme: %w[light dark auto][rand(3)],
        notifications: rand(2) == 1,
        language: %w[en es fr de][rand(4)]
      },
      metadata: {
        login_count: rand(1000),
        last_ip: "10.0.#{rand(255)}.#{rand(255)}",
        session_duration: rand(3600)
      }
    }.to_json
  end
  sample_count = test_samples.size

  # Benchmark without dictionary
  Formatter.section("Testing: Compression without dictionary")
  cctx = VibeZstd::CCtx.new
  dctx = VibeZstd::DCtx.new

  plain_sizes = []
  plain_timing = Benchmark.measure do
    test_samples.each do |sample|
      frame = cctx.compress(sample)
      plain_sizes << frame.bytesize
      dctx.decompress(frame)
    end
  end

  plain_ops_per_sec = sample_count / plain_timing.real
  plain_avg_bytes = plain_sizes.sum / plain_sizes.size.to_f
  puts "Completed #{sample_count} operations in #{plain_timing.real.round(3)}s"

  # Benchmark with dictionary
  Formatter.section("Testing: Compression with dictionary")

  dict_sizes = []
  dict_timing = Benchmark.measure do
    test_samples.each do |sample|
      frame = cctx.compress(sample, dict: cdict)
      dict_sizes << frame.bytesize
      dctx.decompress(frame, dict: ddict)
    end
  end

  dict_ops_per_sec = sample_count / dict_timing.real
  dict_avg_bytes = dict_sizes.sum / dict_sizes.size.to_f
  puts "Completed #{sample_count} operations in #{dict_timing.real.round(3)}s"

  # Calculate compression ratios
  avg_original_size = test_samples.map(&:bytesize).sum / sample_count.to_f
  plain_ratio = avg_original_size / plain_avg_bytes
  dict_ratio = avg_original_size / dict_avg_bytes

  # Memory estimates: the dictionary costs a CDict plus a DDict on top of the contexts.
  dict_memory_overhead = Memory.estimate_cdict(dict_data.bytesize) + Memory.estimate_ddict(dict_data.bytesize)

  # Collect results
  results << BenchmarkResult.new(
    :name => "Without dictionary",
    :iterations_per_sec => plain_ops_per_sec,
    :compression_ratio => plain_ratio,
    :memory_bytes => Memory.estimate_cctx + Memory.estimate_dctx,
    "Avg compressed size" => Formatter.format_bytes(plain_avg_bytes.to_i)
  )

  results << BenchmarkResult.new(
    :name => "With dictionary",
    :iterations_per_sec => dict_ops_per_sec,
    :compression_ratio => dict_ratio,
    :memory_bytes => Memory.estimate_cctx + Memory.estimate_dctx + dict_memory_overhead,
    "Avg compressed size" => Formatter.format_bytes(dict_avg_bytes.to_i)
  )

  # Percentage by which the dictionary shrinks the average compressed size.
  improvement_pct = ((plain_avg_bytes - dict_avg_bytes) / plain_avg_bytes * 100).round(1)

  puts "\n📊 Detailed Statistics:"
  puts " Average original size: #{Formatter.format_bytes(avg_original_size.to_i)}"
  puts " Average compressed (no dict): #{Formatter.format_bytes(plain_avg_bytes.to_i)}"
  puts " Average compressed (with dict): #{Formatter.format_bytes(dict_avg_bytes.to_i)}"
  puts " Compression improvement: #{improvement_pct}% smaller"
  puts "\n💾 Memory Overhead:"
  puts " Dictionary in memory: #{Formatter.format_bytes(dict_memory_overhead)}"
  puts " Dictionary on disk: #{Formatter.format_bytes(dict_data.bytesize)}"
end
|
|
115
|
+
|
|
116
|
+
# Closing guidance: situations where a trained dictionary helps (✓) or does not (✗).
puts <<~GUIDANCE

  💡 When to use dictionaries:
   ✓ Small, similar data (JSON, logs, etc.)
   ✓ Many small messages with repeated patterns
   ✓ When compression ratio is more important than speed
   ✗ Large files (> 1MB each)
   ✗ Highly variable data with no patterns
GUIDANCE
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# Quick benchmark script to generate results for README
|
|
5
|
+
|
|
6
|
+
require_relative "helpers"
|
|
7
|
+
|
|
8
|
+
puts "# Performance Benchmarks\n\n"
puts "Results from Ruby #{RUBY_VERSION} on #{RUBY_PLATFORM}, Zstd #{VibeZstd.version_string}\n\n"

# 1. Context Reuse
# Compares creating a fresh CCtx/DCtx pair for every round trip against
# creating the pair once and reusing it, for three payload sizes.

# Number of compress+decompress round trips per strategy. Every figure quoted
# in the generated prose (iteration count, contexts saved, percentage) is
# derived from this value so the text cannot drift from what was measured.
reuse_iterations = 5000

puts "## Context Reuse Performance\n\n"
puts "Reusing compression/decompression contexts vs creating new ones (#{reuse_iterations} iterations each):\n\n"

# Test with different data sizes
test_cases = {
  "1KB" => DataGenerator.json_data(count: 5),
  "10KB" => DataGenerator.json_data(count: 50),
  "100KB" => DataGenerator.json_data(count: 500)
}

puts "| Data Size | New Context | Reused Context | Speedup |"
puts "|-----------|-------------|----------------|---------|"

test_cases.each do |size_label, test_data|
  # Strategy A: allocate both contexts inside the loop.
  new_ctx_time = Benchmark.measure do
    reuse_iterations.times do
      cctx = VibeZstd::CCtx.new
      dctx = VibeZstd::DCtx.new
      compressed = cctx.compress(test_data)
      dctx.decompress(compressed)
    end
  end

  # Strategy B: allocate once, reuse for every round trip.
  reused_time = Benchmark.measure do
    cctx = VibeZstd::CCtx.new
    dctx = VibeZstd::DCtx.new
    reuse_iterations.times do
      compressed = cctx.compress(test_data)
      dctx.decompress(compressed)
    end
  end

  new_ops = reuse_iterations / new_ctx_time.real
  reused_ops = reuse_iterations / reused_time.real
  speedup = reused_ops / new_ops

  puts "| #{size_label} | #{Formatter.format_number(new_ops.to_i)} ops/s | #{Formatter.format_number(reused_ops.to_i)} ops/s | #{speedup.round(2)}x |"
end

cctx_mem = Memory.estimate_cctx(3)
dctx_mem = Memory.estimate_dctx
total_mem = cctx_mem + dctx_mem

# Reuse needs one context pair instead of one per operation, so it avoids
# (reuse_iterations - 1) allocations of the pair.
pairs_avoided = reuse_iterations - 1
reduction_pct = (pairs_avoided * 100.0 / reuse_iterations).round(2)

puts "\n**Memory savings:** Reusing contexts saves #{Formatter.format_bytes(total_mem * pairs_avoided)} for #{reuse_iterations} operations (#{reduction_pct}% reduction)\n"
puts "**Recommendation:** Always reuse CCtx/DCtx instances for multiple operations.\n\n"
|
|
59
|
+
|
|
60
|
+
# 2. Dictionary Performance
# Compresses one small JSON document with and without the fixture dictionary
# and prints the two results as a Markdown table.

dict_path = File.join(__dir__, "..", "test", "fixtures", "sample.dict")

# Stop with an actionable message instead of a raw Errno::ENOENT when the
# fixture has not been generated yet. `abort` writes to stderr, which keeps the
# message out of the Markdown being emitted on stdout, and exits with status 1.
abort "Dictionary fixture not found. Run: ruby benchmark/generate_fixture.rb" unless File.exist?(dict_path)

# NOTE(review): the heading below mentions 100 JSON samples, but this section
# compresses a single sample; confirm the intended wording.
puts "## Dictionary Compression\n\n"
puts "Compression with vs without trained dictionaries (100 JSON samples):\n\n"

dict_data = File.binread(dict_path)
# Only compression is measured here, so no DDict is built.
cdict = VibeZstd::CDict.new(dict_data)

test_sample = {
  id: 999,
  name: "Test User",
  email: "test@example.com",
  status: "active",
  preferences: {theme: "dark", notifications: true}
}.to_json

compressed_no_dict = VibeZstd.compress(test_sample)
compressed_with_dict = VibeZstd.compress(test_sample, dict: cdict)

ratio_no_dict = test_sample.bytesize.to_f / compressed_no_dict.bytesize
ratio_with_dict = test_sample.bytesize.to_f / compressed_with_dict.bytesize

# Share of the dictionary-less output that the dictionary saves, in percent.
improvement_pct = ((compressed_no_dict.bytesize - compressed_with_dict.bytesize).to_f / compressed_no_dict.bytesize * 100).round(1)

puts "| Method | Compressed Size | Ratio | Improvement |"
puts "|--------|----------------|-------|-------------|"
puts "| Without dictionary | #{compressed_no_dict.bytesize}B | #{ratio_no_dict.round(2)}x | - |"
# NOTE(review): "(16KB)" is the fixture's max_dict_size; the trained file may be smaller.
puts "| With dictionary (16KB) | #{compressed_with_dict.bytesize}B | #{ratio_with_dict.round(2)}x | #{improvement_pct}% smaller |"
puts "\nOriginal size: #{test_sample.bytesize} bytes\n\n"
|
|
88
|
+
|
|
89
|
+
# 3. Compression Levels
# Compresses the same 50,000-unit mixed payload ten times at each level and
# prints ratio, throughput and estimated context memory as a Markdown table.
puts "## Compression Levels\n\n"
puts "Speed vs compression ratio trade-offs:\n\n"

large_data = DataGenerator.mixed_data(size: 50_000)

# Levels to benchmark, in table order, each with the label for its row.
use_case_by_level = {
  -1 => "Ultra-fast, real-time",
  1 => "Fast, high-throughput",
  3 => "Balanced (default)",
  9 => "Better compression",
  19 => "Maximum compression"
}

puts "| Level | Ratio | Speed (ops/sec) | Memory | Use Case |"
puts "|-------|-------|-----------------|--------|----------|"

use_case_by_level.each do |level, use_case|
  cctx = VibeZstd::CCtx.new

  # Keep the last compressed output so the ratio can be computed after timing.
  compressed = nil
  timing = Benchmark.measure do
    10.times { compressed = cctx.compress(large_data, level: level) }
  end

  ops_per_sec = 10 / timing.real
  ratio = large_data.bytesize.to_f / compressed.bytesize
  memory = Memory.estimate_cctx(level)

  cells = [
    level,
    "#{ratio.round(2)}x",
    Formatter.format_number(ops_per_sec.to_i),
    Formatter.format_bytes(memory),
    use_case
  ]
  puts "| #{cells.join(" | ")} |"
end

puts "\n"
|
|
123
|
+
|
|
124
|
+
# 4. Multi-threading
# Measures compression throughput for 0, 2 and 4 workers on one 500,000-unit
# payload. The 0-worker run is the baseline for the speedup and efficiency columns.
puts "## Multi-threading Performance\n\n"
puts "Compression speedup with multiple workers (500KB data):\n\n"

mt_data = DataGenerator.mixed_data(size: 500_000)

puts "| Workers | Throughput | Speedup | Efficiency |"
puts "|---------|------------|---------|------------|"

# Set by the first (0-worker) iteration; later rows divide by it.
baseline_throughput = nil

[0, 2, 4].each do |workers|
  cctx = VibeZstd::CCtx.new
  cctx.nb_workers = workers if workers > 0

  cctx.compress(mt_data) # warmup

  timing = Benchmark.measure do
    5.times { cctx.compress(mt_data) }
  end

  # Bytes compressed per second across the five timed runs.
  throughput = mt_data.bytesize * 5 / timing.real
  throughput_cell = "#{Formatter.format_bytes(throughput.to_i)}/s"

  row =
    if workers == 0
      baseline_throughput = throughput
      "| #{workers} (single) | #{throughput_cell} | 1.0x | 100% |"
    else
      speedup = throughput / baseline_throughput
      efficiency = (speedup / workers * 100).round(0)
      "| #{workers} | #{throughput_cell} | #{speedup.round(2)}x | #{efficiency}% |"
    end

  puts row
end

puts "\n**Note:** Multi-threading is most effective for data > 256KB. Diminishing returns after 4 workers.\n"
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# Script to generate a trained dictionary fixture for testing and benchmarking
|
|
5
|
+
|
|
6
|
+
$LOAD_PATH.unshift File.expand_path("../lib", __dir__)
|
|
7
|
+
require "vibe_zstd"
|
|
8
|
+
require "json"
|
|
9
|
+
require "fileutils"
|
|
10
|
+
|
|
11
|
+
# Create fixtures directory (no-op when it already exists)
FIXTURES_DIR = File.join(__dir__, "..", "test", "fixtures")
FileUtils.mkdir_p(FIXTURES_DIR)

# Generate training samples (JSON-like data similar to web application data)
puts "Generating training samples..."
samples = Array.new(200) do |i|
  {
    id: i,
    name: "User #{i}",
    email: "user#{i}@example.com",
    created_at: Time.now.to_i - rand(100000),
    status: %w[active pending inactive verified][rand(4)],
    preferences: {
      theme: %w[light dark auto][rand(3)],
      notifications: rand(2) == 1,
      language: %w[en es fr de][rand(4)]
    },
    metadata: {
      login_count: rand(1000),
      last_ip: "192.168.#{rand(255)}.#{rand(255)}",
      user_agent: "Mozilla/5.0 (compatible; Bot/1.0)",
      session_duration: rand(3600)
    },
    # Between zero and four tags: "tag0", "tag1", ...
    tags: Array.new(rand(5)) { |t| "tag#{t}" }
  }.to_json
end

puts "Training dictionary with #{samples.size} samples..."
max_dict_bytes = 16 * 1024 # 16KB dictionary
dict_data = VibeZstd.train_dict(samples, max_dict_size: max_dict_bytes)

# Save the dictionary (binary write: the dictionary is raw bytes, not text)
dict_path = File.join(FIXTURES_DIR, "sample.dict")
File.binwrite(dict_path, dict_data)

puts(
  "\nDictionary saved to: #{dict_path}",
  "Dictionary size: #{dict_data.bytesize} bytes",
  "Dictionary ID: #{VibeZstd.get_dict_id(dict_data)}"
)
|
|
49
|
+
|
|
50
|
+
# Test the dictionary: compress one representative document with and without
# it, print the comparison, then confirm the dictionary round-trips correctly.
puts "\nTesting dictionary effectiveness..."
test_data = {
  id: 999,
  name: "Test User",
  email: "test@example.com",
  status: "active",
  preferences: {theme: "dark", notifications: true, language: "en"},
  metadata: {login_count: 42, last_ip: "192.168.1.1"}
}.to_json

# Compress without dictionary
compressed_no_dict = VibeZstd.compress(test_data)

# Compress with dictionary
cdict = VibeZstd::CDict.new(dict_data)
compressed_with_dict = VibeZstd.compress(test_data, dict: cdict)

original_bytes = test_data.bytesize
plain_bytes = compressed_no_dict.bytesize
dict_bytes = compressed_with_dict.bytesize

plain_ratio = (original_bytes.to_f / plain_bytes).round(2)
dict_ratio = (original_bytes.to_f / dict_bytes).round(2)
# Share of the dictionary-less output that the dictionary saves, in percent.
saved_pct = ((plain_bytes - dict_bytes).to_f / plain_bytes * 100).round(1)

puts "\nCompression comparison:"
puts " Original size: #{original_bytes} bytes"
puts " Without dictionary: #{plain_bytes} bytes (#{plain_ratio}x)"
puts " With dictionary: #{dict_bytes} bytes (#{dict_ratio}x)"
puts " Dictionary improvement: #{saved_pct}% smaller"

# Verify decompression works: the round trip must return the original document.
ddict = VibeZstd::DDict.new(dict_data)
decompressed = VibeZstd.decompress(compressed_with_dict, dict: ddict)

unless decompressed == test_data
  puts "\n✗ Dictionary verification FAILED!"
  exit 1
end

puts "\n✓ Dictionary verification successful!"
|