omnizip 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +243 -368
- data/README.adoc +101 -5
- data/docs/guides/archive-formats/index.adoc +31 -1
- data/docs/guides/archive-formats/ole-format.adoc +316 -0
- data/docs/guides/archive-formats/rpm-format.adoc +249 -0
- data/docs/index.adoc +12 -2
- data/lib/omnizip/algorithms/lzma/distance_coder.rb +29 -18
- data/lib/omnizip/algorithms/lzma/encoder.rb +2 -1
- data/lib/omnizip/algorithms/lzma/length_coder.rb +6 -3
- data/lib/omnizip/algorithms/lzma/literal_decoder.rb +2 -1
- data/lib/omnizip/algorithms/lzma/lzip_decoder.rb +40 -13
- data/lib/omnizip/algorithms/lzma/range_decoder.rb +36 -2
- data/lib/omnizip/algorithms/lzma/range_encoder.rb +19 -0
- data/lib/omnizip/algorithms/lzma/xz_encoder_fast.rb +2 -1
- data/lib/omnizip/algorithms/lzma/xz_utils_decoder.rb +148 -112
- data/lib/omnizip/algorithms/lzma.rb +20 -5
- data/lib/omnizip/algorithms/ppmd7/decoder.rb +25 -21
- data/lib/omnizip/algorithms/ppmd7/encoder.rb +4 -11
- data/lib/omnizip/algorithms/sevenzip_lzma2.rb +2 -1
- data/lib/omnizip/algorithms/xz_lzma2.rb +2 -1
- data/lib/omnizip/algorithms/zstandard/constants.rb +125 -9
- data/lib/omnizip/algorithms/zstandard/decoder.rb +202 -17
- data/lib/omnizip/algorithms/zstandard/encoder.rb +197 -17
- data/lib/omnizip/algorithms/zstandard/frame/block.rb +128 -0
- data/lib/omnizip/algorithms/zstandard/frame/header.rb +224 -0
- data/lib/omnizip/algorithms/zstandard/fse/bitstream.rb +186 -0
- data/lib/omnizip/algorithms/zstandard/fse/encoder.rb +325 -0
- data/lib/omnizip/algorithms/zstandard/fse/table.rb +269 -0
- data/lib/omnizip/algorithms/zstandard/huffman.rb +272 -0
- data/lib/omnizip/algorithms/zstandard/huffman_encoder.rb +339 -0
- data/lib/omnizip/algorithms/zstandard/literals.rb +178 -0
- data/lib/omnizip/algorithms/zstandard/literals_encoder.rb +251 -0
- data/lib/omnizip/algorithms/zstandard/sequences.rb +346 -0
- data/lib/omnizip/buffer/memory_extractor.rb +3 -3
- data/lib/omnizip/buffer.rb +2 -2
- data/lib/omnizip/filters/delta.rb +2 -1
- data/lib/omnizip/filters/registry.rb +6 -6
- data/lib/omnizip/formats/cpio/bounded_io.rb +66 -0
- data/lib/omnizip/formats/lzip.rb +2 -1
- data/lib/omnizip/formats/lzma_alone.rb +2 -1
- data/lib/omnizip/formats/ole/allocation_table.rb +244 -0
- data/lib/omnizip/formats/ole/constants.rb +61 -0
- data/lib/omnizip/formats/ole/dirent.rb +380 -0
- data/lib/omnizip/formats/ole/header.rb +198 -0
- data/lib/omnizip/formats/ole/ranges_io.rb +264 -0
- data/lib/omnizip/formats/ole/storage.rb +305 -0
- data/lib/omnizip/formats/ole/types/variant.rb +328 -0
- data/lib/omnizip/formats/ole.rb +145 -0
- data/lib/omnizip/formats/rar/compression/ppmd/decoder.rb +92 -49
- data/lib/omnizip/formats/rar/compression/ppmd/encoder.rb +13 -20
- data/lib/omnizip/formats/rar/rar5/compression/lzss.rb +6 -2
- data/lib/omnizip/formats/rar3/reader.rb +6 -2
- data/lib/omnizip/formats/rar5/reader.rb +4 -1
- data/lib/omnizip/formats/rpm/constants.rb +58 -0
- data/lib/omnizip/formats/rpm/entry.rb +102 -0
- data/lib/omnizip/formats/rpm/header.rb +113 -0
- data/lib/omnizip/formats/rpm/lead.rb +122 -0
- data/lib/omnizip/formats/rpm/tag.rb +230 -0
- data/lib/omnizip/formats/rpm.rb +434 -0
- data/lib/omnizip/formats/seven_zip/bcj2_stream_decompressor.rb +239 -0
- data/lib/omnizip/formats/seven_zip/coder_chain.rb +32 -8
- data/lib/omnizip/formats/seven_zip/constants.rb +1 -1
- data/lib/omnizip/formats/seven_zip/reader.rb +84 -8
- data/lib/omnizip/formats/seven_zip/stream_compressor.rb +2 -1
- data/lib/omnizip/formats/seven_zip/stream_decompressor.rb +6 -0
- data/lib/omnizip/formats/seven_zip/writer.rb +21 -9
- data/lib/omnizip/formats/seven_zip.rb +10 -0
- data/lib/omnizip/formats/xar/entry.rb +18 -5
- data/lib/omnizip/formats/xar/header.rb +34 -6
- data/lib/omnizip/formats/xar/reader.rb +43 -10
- data/lib/omnizip/formats/xar/toc.rb +34 -21
- data/lib/omnizip/formats/xar/writer.rb +15 -5
- data/lib/omnizip/formats/xz_impl/block_decoder.rb +45 -33
- data/lib/omnizip/formats/xz_impl/block_encoder.rb +2 -1
- data/lib/omnizip/formats/xz_impl/index_decoder.rb +3 -1
- data/lib/omnizip/formats/xz_impl/stream_header_parser.rb +2 -1
- data/lib/omnizip/formats/zip/end_of_central_directory.rb +4 -3
- data/lib/omnizip/implementations/seven_zip/lzma/decoder.rb +14 -6
- data/lib/omnizip/implementations/seven_zip/lzma/encoder.rb +2 -1
- data/lib/omnizip/implementations/seven_zip/lzma2/encoder.rb +28 -13
- data/lib/omnizip/implementations/xz_utils/lzma2/encoder.rb +13 -6
- data/lib/omnizip/pipe/stream_compressor.rb +1 -1
- data/lib/omnizip/version.rb +1 -1
- data/readme-docs/compression-algorithms.adoc +6 -2
- metadata +30 -2
|
@@ -139,23 +139,16 @@ module Omnizip
|
|
|
139
139
|
|
|
140
140
|
# Encode a range for the symbol
|
|
141
141
|
#
|
|
142
|
-
#
|
|
143
|
-
#
|
|
142
|
+
# Uses proper range coding to encode the symbol based on
|
|
143
|
+
# its frequency distribution in the current context.
|
|
144
144
|
#
|
|
145
145
|
# @param cum_freq [Integer] Cumulative frequency
|
|
146
146
|
# @param freq [Integer] Symbol frequency
|
|
147
147
|
# @param total_freq [Integer] Total frequency
|
|
148
148
|
# @return [void]
|
|
149
149
|
def encode_range(cum_freq, freq, total_freq)
|
|
150
|
-
#
|
|
151
|
-
|
|
152
|
-
low = (cum_freq * scale) / total_freq
|
|
153
|
-
high = ((cum_freq + freq) * scale) / total_freq
|
|
154
|
-
|
|
155
|
-
# Encode using direct bits for simplicity
|
|
156
|
-
# Full implementation would use proper range subdivision
|
|
157
|
-
(high - low).bit_length
|
|
158
|
-
@range_encoder.encode_direct_bits(low, 16)
|
|
150
|
+
# Use proper range encoding (not direct bits)
|
|
151
|
+
@range_encoder.encode_freq(cum_freq, freq, total_freq)
|
|
159
152
|
end
|
|
160
153
|
end
|
|
161
154
|
end
|
|
@@ -105,7 +105,8 @@ module Omnizip
|
|
|
105
105
|
dict_size = @options.fetch(:dict_size, 8 * 1024 * 1024)
|
|
106
106
|
|
|
107
107
|
# Use existing LZMA2::Decoder
|
|
108
|
-
decoder = LZMA2::Decoder.new(input, raw_mode: true,
|
|
108
|
+
decoder = LZMA2::Decoder.new(input, raw_mode: true,
|
|
109
|
+
dict_size: dict_size)
|
|
109
110
|
decompressed = decoder.decode_stream
|
|
110
111
|
|
|
111
112
|
# Reverse filter if set
|
|
@@ -100,7 +100,8 @@ module Omnizip
|
|
|
100
100
|
def decompress(input, output)
|
|
101
101
|
dict_size = @options.fetch(:dict_size, 8 * 1024 * 1024)
|
|
102
102
|
|
|
103
|
-
decoder = Implementations::XZUtils::LZMA2::Decoder.new(input,
|
|
103
|
+
decoder = Implementations::XZUtils::LZMA2::Decoder.new(input,
|
|
104
|
+
raw_mode: true, dict_size: dict_size)
|
|
104
105
|
decompressed = decoder.decode_stream
|
|
105
106
|
|
|
106
107
|
# Reverse filter if set
|
|
@@ -1,24 +1,140 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
# Copyright (C) 2025 Ribose Inc.
|
|
4
|
+
#
|
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining a
|
|
6
|
+
# copy of this software and associated documentation files (the "Software"),
|
|
7
|
+
# to deal in the Software without restriction, including without limitation
|
|
8
|
+
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
9
|
+
# and/or sell copies of the Software, and to permit persons to whom the
|
|
10
|
+
# Software is furnished to do so, subject to the following conditions:
|
|
11
|
+
#
|
|
12
|
+
# The above copyright notice and this permission notice shall be included in
|
|
13
|
+
# all copies or substantial portions of the Software.
|
|
14
|
+
#
|
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
20
|
+
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
21
|
+
# DEALINGS IN THE SOFTWARE.
|
|
22
|
+
|
|
3
23
|
module Omnizip
|
|
4
24
|
module Algorithms
|
|
5
25
|
class Zstandard
|
|
6
|
-
# Constants
|
|
26
|
+
# Constants from RFC 8878 (Zstandard Compression)
|
|
27
|
+
#
|
|
28
|
+
# @see https://datatracker.ietf.org/doc/html/rfc8878
|
|
7
29
|
module Constants
|
|
8
|
-
#
|
|
30
|
+
# Frame Constants
|
|
31
|
+
MAGIC_NUMBER = 0xFD2FB528
|
|
32
|
+
MAGIC_BYTES = [0x28, 0xB5, 0x2F, 0xFD].freeze
|
|
33
|
+
SKIPPABLE_MAGIC_BASE = 0x184D2A50
|
|
34
|
+
SKIPPABLE_MAGIC_MASK = 0xFFFFFFF0
|
|
35
|
+
|
|
36
|
+
# Block Types (RFC 8878 Section 3.1.1.2)
|
|
37
|
+
BLOCK_TYPE_RAW = 0
|
|
38
|
+
BLOCK_TYPE_RLE = 1
|
|
39
|
+
BLOCK_TYPE_COMPRESSED = 2
|
|
40
|
+
BLOCK_TYPE_RESERVED = 3
|
|
41
|
+
BLOCK_HEADER_SIZE = 3
|
|
42
|
+
BLOCK_MAX_SIZE = 128 * 1024
|
|
43
|
+
|
|
44
|
+
# Literals Block Types (RFC 8878 Section 3.1.1.3.1)
|
|
45
|
+
LITERALS_BLOCK_RAW = 0
|
|
46
|
+
LITERALS_BLOCK_RLE = 1
|
|
47
|
+
LITERALS_BLOCK_COMPRESSED = 2
|
|
48
|
+
LITERALS_BLOCK_TREELESS = 3
|
|
49
|
+
HUFFMAN_MAX_BITS = 11
|
|
50
|
+
|
|
51
|
+
# Sequence Compression Modes (RFC 8878 Section 3.1.1.3.2)
|
|
52
|
+
MODE_PREDEFINED = 0
|
|
53
|
+
MODE_RLE = 1
|
|
54
|
+
MODE_FSE = 2
|
|
55
|
+
MODE_REPEAT = 3
|
|
56
|
+
|
|
57
|
+
# FSE Accuracy Logs (RFC 8878 Section 4)
|
|
58
|
+
LITERALS_LENGTH_ACCURACY_LOG = 6
|
|
59
|
+
MATCH_LENGTH_ACCURACY_LOG = 6
|
|
60
|
+
OFFSET_ACCURACY_LOG = 5
|
|
61
|
+
|
|
62
|
+
# Repeat Offsets (RFC 8878 Section 3.1.2.2.3)
|
|
63
|
+
REPEAT_OFFSET_1 = 1
|
|
64
|
+
REPEAT_OFFSET_2 = 2
|
|
65
|
+
REPEAT_OFFSET_3 = 3
|
|
66
|
+
DEFAULT_REPEAT_OFFSETS = [1, 4, 8].freeze
|
|
67
|
+
|
|
68
|
+
# Window Constants (RFC 8878 Section 3.1.1.1.2)
|
|
69
|
+
WINDOW_LOG_MIN = 10
|
|
70
|
+
WINDOW_LOG_MAX = 41
|
|
71
|
+
|
|
72
|
+
# Huffman Constants (RFC 8878 Section 4.2.1)
|
|
73
|
+
HUFFMAN_MAX_LOG = 11
|
|
74
|
+
HUFFMAN_MAX_CODE_LENGTH = 11
|
|
75
|
+
HUFFMAN_STANDARD_TABLE_SIZE = 256
|
|
76
|
+
|
|
77
|
+
# FSE Table Limits (RFC 8878 Section 4.1)
|
|
78
|
+
FSE_MAX_ACCURACY_LOG = 9
|
|
79
|
+
FSE_MIN_ACCURACY_LOG = 5
|
|
80
|
+
|
|
81
|
+
# Compression levels
|
|
9
82
|
MIN_LEVEL = 1
|
|
10
83
|
MAX_LEVEL = 22
|
|
11
84
|
DEFAULT_LEVEL = 3
|
|
12
85
|
|
|
13
|
-
# Fast compression levels
|
|
14
|
-
FAST_LEVEL = 1
|
|
15
|
-
BALANCED_LEVEL = 3
|
|
16
|
-
|
|
17
|
-
# Maximum compression level
|
|
18
|
-
ULTRA_LEVEL = 22
|
|
19
|
-
|
|
20
86
|
# Buffer size for streaming operations
|
|
21
87
|
BUFFER_SIZE = 128 * 1024 # 128KB
|
|
88
|
+
|
|
89
|
+
# Literal length codes (RFC 8878 Table 9)
|
|
90
|
+
# Each entry: [baseline, extra_bits]
|
|
91
|
+
LITERAL_LENGTH_TABLE = [
|
|
92
|
+
[0, 0], [1, 0], [2, 0], [3, 0], [4, 0], [5, 0], [6, 0], [7, 0],
|
|
93
|
+
[8, 0], [9, 0], [10, 0], [11, 0], [12, 0], [13, 0], [14, 0], [15, 0],
|
|
94
|
+
[16, 1], [18, 1], [20, 1], [22, 1], [24, 1], [28, 1], [32, 1], [40, 1],
|
|
95
|
+
[48, 1], [64, 1], [128, 2], [256, 2], [512, 2], [1024, 2], [2048, 2],
|
|
96
|
+
[4096, 2], [8192, 2], [16384, 3], [32768, 3], [65536, 3]
|
|
97
|
+
].freeze
|
|
98
|
+
|
|
99
|
+
# Match length codes (RFC 8878 Table 10)
|
|
100
|
+
# Each entry: [baseline, extra_bits]
|
|
101
|
+
MATCH_LENGTH_TABLE = [
|
|
102
|
+
[3, 0], [4, 0], [5, 0], [6, 0], [7, 0], [8, 0], [9, 0], [10, 0],
|
|
103
|
+
[11, 0], [12, 0], [13, 0], [14, 0], [15, 0], [16, 0], [17, 0], [18, 0],
|
|
104
|
+
[19, 0], [20, 0], [21, 0], [22, 0], [23, 0], [24, 0], [25, 0], [26, 0],
|
|
105
|
+
[27, 0], [28, 0], [29, 0], [30, 0], [31, 0], [32, 0], [33, 0], [34, 0],
|
|
106
|
+
[35, 1], [37, 1], [39, 1], [41, 1], [43, 1], [47, 1], [51, 1], [59, 1],
|
|
107
|
+
[67, 1], [83, 1], [99, 1], [131, 2], [195, 2], [259, 2], [323, 2],
|
|
108
|
+
[387, 2], [451, 2], [515, 2], [579, 2], [643, 2], [707, 2], [771, 2],
|
|
109
|
+
[835, 2], [899, 2], [963, 2], [1027, 2], [1283, 2], [1539, 2],
|
|
110
|
+
[1795, 2], [2051, 2], [2307, 2], [2563, 2]
|
|
111
|
+
].freeze
|
|
112
|
+
|
|
113
|
+
# Predefined FSE distribution for literals length (RFC 8878 Section 4.1.3)
|
|
114
|
+
PREDEFINED_LL_DISTRIBUTION = [
|
|
115
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
116
|
+
4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
|
117
|
+
0, 0, 0, 0
|
|
118
|
+
].freeze
|
|
119
|
+
|
|
120
|
+
# Predefined FSE distribution for match length (RFC 8878 Section 4.1.3)
|
|
121
|
+
# Sum must equal 2^6 = 64 (NOTE(review): the entries listed below sum to 63 - verify against RFC 8878)
|
|
122
|
+
PREDEFINED_ML_DISTRIBUTION = [
|
|
123
|
+
1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
|
|
124
|
+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
125
|
+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
126
|
+
1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
127
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
128
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
129
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
130
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
|
131
|
+
].freeze
|
|
132
|
+
|
|
133
|
+
# Predefined FSE distribution for offset (RFC 8878 Section 4.1.3)
|
|
134
|
+
PREDEFINED_OFFSET_DISTRIBUTION = [
|
|
135
|
+
1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0,
|
|
136
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
|
137
|
+
].freeze
|
|
22
138
|
end
|
|
23
139
|
end
|
|
24
140
|
end
|
|
@@ -1,29 +1,50 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
# Copyright (C) 2025 Ribose Inc.
|
|
4
|
+
#
|
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining a
|
|
6
|
+
# copy of this software and associated documentation files (the "Software"),
|
|
7
|
+
# to deal in the Software without restriction, including without limitation
|
|
8
|
+
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
9
|
+
# and/or sell copies of the Software, and to permit persons to whom the
|
|
10
|
+
# Software is furnished to do so, subject to the following conditions:
|
|
11
|
+
#
|
|
12
|
+
# The above copyright notice and this permission notice shall be included in
|
|
13
|
+
# all copies or substantial portions of the Software.
|
|
14
|
+
#
|
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
20
|
+
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
21
|
+
# DEALINGS IN THE SOFTWARE.
|
|
4
22
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
def self.decompress(*)
|
|
11
|
-
raise LoadError, "Zstandard support requires the 'zstd-ruby' gem. " \
|
|
12
|
-
"Install it with: gem install zstd-ruby"
|
|
13
|
-
end
|
|
14
|
-
end
|
|
15
|
-
end
|
|
23
|
+
require_relative "constants"
|
|
24
|
+
require_relative "frame/header"
|
|
25
|
+
require_relative "frame/block"
|
|
26
|
+
require_relative "literals"
|
|
27
|
+
require_relative "sequences"
|
|
16
28
|
|
|
17
29
|
module Omnizip
|
|
18
30
|
module Algorithms
|
|
19
31
|
class Zstandard
|
|
20
|
-
# Zstandard decoder
|
|
32
|
+
# Pure Ruby Zstandard decoder (RFC 8878)
|
|
33
|
+
#
|
|
34
|
+
# Decodes Zstandard-compressed data according to RFC 8878.
|
|
21
35
|
#
|
|
22
|
-
#
|
|
23
|
-
#
|
|
36
|
+
# Decoder pipeline:
|
|
37
|
+
# 1. Parse frame header
|
|
38
|
+
# 2. For each block:
|
|
39
|
+
# a. Parse block header
|
|
40
|
+
# b. Decode literals section
|
|
41
|
+
# c. Decode sequences section
|
|
42
|
+
# d. Execute sequences (LZ77 copy operations)
|
|
43
|
+
# 3. Verify content checksum if present
|
|
24
44
|
class Decoder
|
|
25
45
|
include Constants
|
|
26
46
|
|
|
47
|
+
# @return [IO] Input stream
|
|
27
48
|
attr_reader :input_stream
|
|
28
49
|
|
|
29
50
|
# Initialize decoder
|
|
@@ -31,14 +52,178 @@ module Omnizip
|
|
|
31
52
|
# @param input_stream [IO] Input stream of compressed data
|
|
32
53
|
def initialize(input_stream)
|
|
33
54
|
@input_stream = input_stream
|
|
55
|
+
@repeat_offsets = DEFAULT_REPEAT_OFFSETS.dup
|
|
56
|
+
@previous_huffman_table = nil
|
|
57
|
+
@previous_fse_tables = {}
|
|
34
58
|
end
|
|
35
59
|
|
|
36
60
|
# Decode compressed data stream
|
|
37
61
|
#
|
|
38
62
|
# @return [String] Decompressed data
|
|
39
63
|
def decode_stream
|
|
40
|
-
|
|
41
|
-
|
|
64
|
+
output = String.new(encoding: Encoding::BINARY)
|
|
65
|
+
|
|
66
|
+
loop do
|
|
67
|
+
# Read magic number
|
|
68
|
+
magic = read_u32le
|
|
69
|
+
|
|
70
|
+
# Check for skippable frame
|
|
71
|
+
if skippable_frame?(magic)
|
|
72
|
+
skip_frame
|
|
73
|
+
next
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
# Validate magic number
|
|
77
|
+
unless magic == MAGIC_NUMBER
|
|
78
|
+
raise "Invalid Zstandard magic: 0x#{magic.to_s(16)}"
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
# Parse frame
|
|
82
|
+
frame_output = decode_frame
|
|
83
|
+
output << frame_output
|
|
84
|
+
|
|
85
|
+
# Check for more frames
|
|
86
|
+
break if @input_stream.eof?
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
output
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
private
|
|
93
|
+
|
|
94
|
+
# Check if frame is skippable
|
|
95
|
+
def skippable_frame?(magic)
|
|
96
|
+
(magic & SKIPPABLE_MAGIC_MASK) == SKIPPABLE_MAGIC_BASE
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
# Skip skippable frame
|
|
100
|
+
def skip_frame
|
|
101
|
+
# Read frame size (4 bytes)
|
|
102
|
+
size = read_u32le
|
|
103
|
+
@input_stream.seek(size, IO::SEEK_CUR)
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
# Read unsigned 32-bit little-endian
|
|
107
|
+
def read_u32le
|
|
108
|
+
bytes = @input_stream.read(4)
|
|
109
|
+
return 0 if bytes.nil? || bytes.length < 4
|
|
110
|
+
|
|
111
|
+
bytes.unpack1("V")
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
# Decode a single frame
|
|
115
|
+
def decode_frame
|
|
116
|
+
# Parse frame header
|
|
117
|
+
header = Frame::Header.parse(@input_stream)
|
|
118
|
+
|
|
119
|
+
# Calculate window size (NOTE(review): the returned value is not stored or used here)
|
|
120
|
+
calculate_window_size(header)
|
|
121
|
+
|
|
122
|
+
# Decode blocks
|
|
123
|
+
output = String.new(encoding: Encoding::BINARY)
|
|
124
|
+
|
|
125
|
+
loop do
|
|
126
|
+
block = Frame::Block.parse(@input_stream)
|
|
127
|
+
|
|
128
|
+
# Decode block content
|
|
129
|
+
block_output = decode_block(block, header)
|
|
130
|
+
output << block_output
|
|
131
|
+
|
|
132
|
+
break if block.last_block
|
|
133
|
+
end
|
|
134
|
+
|
|
135
|
+
# Verify checksum if present
|
|
136
|
+
if header.content_checksum?
|
|
137
|
+
verify_checksum(output)
|
|
138
|
+
end
|
|
139
|
+
|
|
140
|
+
output
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
# Calculate window size from header
|
|
144
|
+
def calculate_window_size(header)
|
|
145
|
+
return BLOCK_MAX_SIZE if header.single_segment
|
|
146
|
+
return nil unless header.window_log
|
|
147
|
+
|
|
148
|
+
header.window_size || BLOCK_MAX_SIZE
|
|
149
|
+
end
|
|
150
|
+
|
|
151
|
+
# Decode a single block
|
|
152
|
+
def decode_block(block, _header)
|
|
153
|
+
case block.block_type
|
|
154
|
+
when BLOCK_TYPE_RAW
|
|
155
|
+
decode_raw_block(block)
|
|
156
|
+
when BLOCK_TYPE_RLE
|
|
157
|
+
decode_rle_block(block)
|
|
158
|
+
when BLOCK_TYPE_COMPRESSED
|
|
159
|
+
decode_compressed_block(block)
|
|
160
|
+
else
|
|
161
|
+
raise "Reserved block type: #{block.block_type}"
|
|
162
|
+
end
|
|
163
|
+
end
|
|
164
|
+
|
|
165
|
+
# Decode raw (uncompressed) block
|
|
166
|
+
def decode_raw_block(block)
|
|
167
|
+
@input_stream.read(block.block_size)
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
# Decode RLE block
|
|
171
|
+
def decode_rle_block(block)
|
|
172
|
+
byte = @input_stream.read(1)
|
|
173
|
+
byte * block.block_size
|
|
174
|
+
end
|
|
175
|
+
|
|
176
|
+
# Decode compressed block
|
|
177
|
+
def decode_compressed_block(_block)
|
|
178
|
+
# Query the current stream position (NOTE(review): the value is discarded, so nothing is actually recorded)
|
|
179
|
+
@input_stream.pos
|
|
180
|
+
|
|
181
|
+
# Decode literals section
|
|
182
|
+
literals_decoder = LiteralsDecoder.decode(@input_stream,
|
|
183
|
+
@previous_huffman_table)
|
|
184
|
+
literals = literals_decoder.literals
|
|
185
|
+
@previous_huffman_table = literals_decoder.huffman_table
|
|
186
|
+
|
|
187
|
+
# Decode sequences section
|
|
188
|
+
sequences_decoder = SequencesDecoder.decode(@input_stream,
|
|
189
|
+
literals.bytesize,
|
|
190
|
+
@previous_fse_tables)
|
|
191
|
+
sequences = sequences_decoder.sequences
|
|
192
|
+
|
|
193
|
+
# Execute sequences to produce output
|
|
194
|
+
if sequences.empty?
|
|
195
|
+
# No sequences - literals are the output
|
|
196
|
+
literals
|
|
197
|
+
else
|
|
198
|
+
SequenceExecutor.execute(literals, sequences)
|
|
199
|
+
end
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
# Verify content checksum
|
|
203
|
+
def verify_checksum(output)
|
|
204
|
+
# Read checksum (4 bytes)
|
|
205
|
+
checksum_bytes = @input_stream.read(4)
|
|
206
|
+
return unless checksum_bytes && checksum_bytes.length == 4
|
|
207
|
+
|
|
208
|
+
expected = checksum_bytes.unpack1("V")
|
|
209
|
+
calculated = xxhash32(output)
|
|
210
|
+
|
|
211
|
+
if calculated != expected
|
|
212
|
+
warn "Zstandard checksum mismatch (expected #{expected}, got #{calculated})"
|
|
213
|
+
end
|
|
214
|
+
end
|
|
215
|
+
|
|
216
|
+
# Calculate a placeholder 32-bit checksum (hash * 33 + byte, truncated to 32 bits); this is not XXHash
|
|
217
|
+
def xxhash32(data, seed = 0)
|
|
218
|
+
# Simplified stand-in hash (not real XXHash) - for checksum verification only
|
|
219
|
+
# Full implementation would use XXH64 and keep its low 32 bits, as RFC 8878 specifies for Content_Checksum
|
|
220
|
+
hash = seed
|
|
221
|
+
|
|
222
|
+
data.each_byte do |byte|
|
|
223
|
+
hash = ((hash << 5) + hash + byte) & 0xFFFFFFFF
|
|
224
|
+
end
|
|
225
|
+
|
|
226
|
+
hash
|
|
42
227
|
end
|
|
43
228
|
end
|
|
44
229
|
end
|
|
@@ -1,26 +1,35 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
# Copyright (C) 2025 Ribose Inc.
|
|
4
|
+
#
|
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining a
|
|
6
|
+
# copy of this software and associated documentation files (the "Software"),
|
|
7
|
+
# to deal in the Software without restriction, including without limitation
|
|
8
|
+
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
9
|
+
# and/or sell copies of the Software, and to permit persons to whom the
|
|
10
|
+
# Software is furnished to do so, subject to the following conditions:
|
|
11
|
+
#
|
|
12
|
+
# The above copyright notice and this permission notice shall be included in
|
|
13
|
+
# all copies or substantial portions of the Software.
|
|
14
|
+
#
|
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
20
|
+
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
21
|
+
# DEALINGS IN THE SOFTWARE.
|
|
4
22
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
rescue LoadError
|
|
8
|
-
# Zstd gem not available - provide helpful error message
|
|
9
|
-
module Zstd
|
|
10
|
-
def self.compress(*)
|
|
11
|
-
raise LoadError, "Zstandard support requires the 'zstd-ruby' gem. " \
|
|
12
|
-
"Install it with: gem install zstd-ruby"
|
|
13
|
-
end
|
|
14
|
-
end
|
|
15
|
-
end
|
|
23
|
+
require_relative "constants"
|
|
24
|
+
require_relative "literals_encoder"
|
|
16
25
|
|
|
17
26
|
module Omnizip
|
|
18
27
|
module Algorithms
|
|
19
28
|
class Zstandard
|
|
20
|
-
# Zstandard encoder
|
|
29
|
+
# Pure Ruby Zstandard encoder (RFC 8878)
|
|
21
30
|
#
|
|
22
|
-
#
|
|
23
|
-
#
|
|
31
|
+
# Encodes data using Zstandard format.
|
|
32
|
+
# Supports raw blocks and Huffman-compressed literals.
|
|
24
33
|
class Encoder
|
|
25
34
|
include Constants
|
|
26
35
|
|
|
@@ -31,10 +40,12 @@ module Omnizip
|
|
|
31
40
|
# @param output_stream [IO] Output stream for compressed data
|
|
32
41
|
# @param options [Hash] Encoder options
|
|
33
42
|
# @option options [Integer] :level Compression level (1-22)
|
|
43
|
+
# @option options [Boolean] :use_compression Use Huffman compression (default: true)
|
|
34
44
|
def initialize(output_stream, options = {})
|
|
35
45
|
@output_stream = output_stream
|
|
36
46
|
@options = options
|
|
37
47
|
@level = options[:level] || DEFAULT_LEVEL
|
|
48
|
+
@use_compression = options.fetch(:use_compression, true)
|
|
38
49
|
end
|
|
39
50
|
|
|
40
51
|
# Encode data stream
|
|
@@ -42,8 +53,177 @@ module Omnizip
|
|
|
42
53
|
# @param data [String] Data to compress
|
|
43
54
|
# @return [void]
|
|
44
55
|
def encode_stream(data)
|
|
45
|
-
|
|
46
|
-
|
|
56
|
+
# Write Zstandard frame
|
|
57
|
+
write_frame(data)
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
private
|
|
61
|
+
|
|
62
|
+
# Write a complete Zstandard frame
|
|
63
|
+
def write_frame(data)
|
|
64
|
+
# Write magic number
|
|
65
|
+
write_u32le(MAGIC_NUMBER)
|
|
66
|
+
|
|
67
|
+
# Write frame header descriptor
|
|
68
|
+
# Single segment, no checksum, no dictionary
|
|
69
|
+
if data.bytesize < 256
|
|
70
|
+
# Single segment, 1-byte FCS (FCS flag = 0)
|
|
71
|
+
descriptor = 0x20 # Single segment flag (bit 5)
|
|
72
|
+
@output_stream.putc(descriptor)
|
|
73
|
+
@output_stream.putc(data.bytesize)
|
|
74
|
+
else
|
|
75
|
+
# Single segment, 4-byte FCS (FCS flag = 2)
|
|
76
|
+
# Bits 6-7 = 10 binary = 0x80
|
|
77
|
+
# Bit 5 = 1 (single segment) = 0x20
|
|
78
|
+
descriptor = 0x80 | 0x20 # 0xA0
|
|
79
|
+
@output_stream.putc(descriptor)
|
|
80
|
+
write_u32le(data.bytesize)
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
# Write blocks
|
|
84
|
+
write_blocks(data)
|
|
85
|
+
|
|
86
|
+
# Write content checksum (optional, disabled for now)
|
|
87
|
+
# write_u32le(xxhash32(data))
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
# Write blocks containing the data
|
|
91
|
+
def write_blocks(data)
|
|
92
|
+
return if data.empty?
|
|
93
|
+
|
|
94
|
+
offset = 0
|
|
95
|
+
max_block_size = BLOCK_MAX_SIZE
|
|
96
|
+
|
|
97
|
+
while offset < data.bytesize
|
|
98
|
+
chunk = data.byteslice(offset, max_block_size)
|
|
99
|
+
offset += chunk.bytesize
|
|
100
|
+
|
|
101
|
+
is_last = offset >= data.bytesize
|
|
102
|
+
|
|
103
|
+
# Use RLE when the chunk is at least 3 bytes and every byte is identical, otherwise raw blocks
|
|
104
|
+
# Compressed blocks are deferred until decoder fully supports them
|
|
105
|
+
if rle_efficient?(chunk)
|
|
106
|
+
write_rle_block(chunk, is_last)
|
|
107
|
+
else
|
|
108
|
+
write_raw_block(chunk, is_last)
|
|
109
|
+
end
|
|
110
|
+
end
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
# Check if RLE encoding would be efficient for a chunk
|
|
114
|
+
def rle_efficient?(chunk)
|
|
115
|
+
return false if chunk.bytesize < 3
|
|
116
|
+
|
|
117
|
+
first_byte = chunk.getbyte(0)
|
|
118
|
+
chunk.bytes.all?(first_byte)
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
# Write an RLE (run-length encoded) block
|
|
122
|
+
def write_rle_block(data, is_last)
|
|
123
|
+
byte = data.getbyte(0)
|
|
124
|
+
size = data.bytesize
|
|
125
|
+
|
|
126
|
+
# Block header (3 bytes, little-endian)
|
|
127
|
+
# Bit 0: Last_Block (1 = last)
|
|
128
|
+
# Bits 1-2: Block_Type (1 = RLE)
|
|
129
|
+
# Bits 3-23: Block_Size
|
|
130
|
+
|
|
131
|
+
header = size << 3 # Block size in bits 3-23
|
|
132
|
+
header |= BLOCK_TYPE_RLE << 1 # Block type = 1 (RLE)
|
|
133
|
+
header |= 1 if is_last # Last block flag in bit 0
|
|
134
|
+
|
|
135
|
+
# Write 3 bytes little-endian
|
|
136
|
+
@output_stream.putc(header & 0xFF)
|
|
137
|
+
@output_stream.putc((header >> 8) & 0xFF)
|
|
138
|
+
@output_stream.putc((header >> 16) & 0xFF)
|
|
139
|
+
|
|
140
|
+
# Write single byte to repeat
|
|
141
|
+
@output_stream.putc(byte)
|
|
142
|
+
end
|
|
143
|
+
|
|
144
|
+
# Write a raw (uncompressed) block
|
|
145
|
+
def write_raw_block(data, is_last)
|
|
146
|
+
# Block header (3 bytes, little-endian)
|
|
147
|
+
# Bit 0: Last_Block (1 = last)
|
|
148
|
+
# Bits 1-2: Block_Type (0 = raw)
|
|
149
|
+
# Bits 3-23: Block_Size
|
|
150
|
+
|
|
151
|
+
header = data.bytesize << 3 # Block size in bits 3-23
|
|
152
|
+
header |= BLOCK_TYPE_RAW << 1 # Block type in bits 1-2
|
|
153
|
+
header |= 1 if is_last # Last block flag in bit 0
|
|
154
|
+
|
|
155
|
+
# Write 3 bytes little-endian
|
|
156
|
+
@output_stream.putc(header & 0xFF)
|
|
157
|
+
@output_stream.putc((header >> 8) & 0xFF)
|
|
158
|
+
@output_stream.putc((header >> 16) & 0xFF)
|
|
159
|
+
|
|
160
|
+
# Write block content
|
|
161
|
+
@output_stream.write(data)
|
|
162
|
+
end
|
|
163
|
+
|
|
164
|
+
# Write a compressed block with Huffman literals
|
|
165
|
+
#
|
|
166
|
+
# @param data [String] Block data
|
|
167
|
+
# @param is_last [Boolean] Whether this is the last block
|
|
168
|
+
# @return [Boolean] True if a compressed block was written; false (nothing written) if it would not be smaller than the input
|
|
169
|
+
def write_compressed_block(data, is_last)
|
|
170
|
+
# Encode literals section
|
|
171
|
+
literals_section = LiteralsEncoder.encode(data, use_compression: true)
|
|
172
|
+
|
|
173
|
+
# Check if compression is beneficial
|
|
174
|
+
# Compressed block has overhead: block header (3) + literals header + sequences
|
|
175
|
+
# For now, we need sequences section too (even if empty)
|
|
176
|
+
sequences_section = encode_empty_sequences
|
|
177
|
+
|
|
178
|
+
block_content = literals_section + sequences_section
|
|
179
|
+
compressed_size = block_content.bytesize
|
|
180
|
+
|
|
181
|
+
# Only use compressed if it's smaller
|
|
182
|
+
if compressed_size >= data.bytesize
|
|
183
|
+
return false
|
|
184
|
+
end
|
|
185
|
+
|
|
186
|
+
# Write block header for compressed block
|
|
187
|
+
header = compressed_size << 3 # Block size in bits 3-23
|
|
188
|
+
header |= BLOCK_TYPE_COMPRESSED << 1 # Block type = 2 (compressed)
|
|
189
|
+
header |= 1 if is_last # Last block flag in bit 0
|
|
190
|
+
|
|
191
|
+
# Write 3 bytes little-endian
|
|
192
|
+
@output_stream.putc(header & 0xFF)
|
|
193
|
+
@output_stream.putc((header >> 8) & 0xFF)
|
|
194
|
+
@output_stream.putc((header >> 16) & 0xFF)
|
|
195
|
+
|
|
196
|
+
# Write block content
|
|
197
|
+
@output_stream.write(block_content)
|
|
198
|
+
|
|
199
|
+
true
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
# Encode empty sequences section
|
|
203
|
+
#
|
|
204
|
+
# For blocks with only literals (no matches), we need an empty sequences section.
|
|
205
|
+
def encode_empty_sequences
|
|
206
|
+
# Number of sequences = 0 (single byte 0x00)
|
|
207
|
+
"\x00"
|
|
208
|
+
end
|
|
209
|
+
|
|
210
|
+
# Write unsigned 32-bit little-endian
|
|
211
|
+
def write_u32le(value)
|
|
212
|
+
@output_stream.putc(value & 0xFF)
|
|
213
|
+
@output_stream.putc((value >> 8) & 0xFF)
|
|
214
|
+
@output_stream.putc((value >> 16) & 0xFF)
|
|
215
|
+
@output_stream.putc((value >> 24) & 0xFF)
|
|
216
|
+
end
|
|
217
|
+
|
|
218
|
+
# Calculate a placeholder 32-bit checksum (hash * 33 + byte, truncated to 32 bits); this is not XXHash
|
|
219
|
+
def xxhash32(data, seed = 0)
|
|
220
|
+
hash = seed
|
|
221
|
+
|
|
222
|
+
data.each_byte do |byte|
|
|
223
|
+
hash = ((hash << 5) + hash + byte) & 0xFFFFFFFF
|
|
224
|
+
end
|
|
225
|
+
|
|
226
|
+
hash
|
|
47
227
|
end
|
|
48
228
|
end
|
|
49
229
|
end
|