cabriolet 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. checksums.yaml +4 -4
  2. data/README.adoc +703 -38
  3. data/lib/cabriolet/algorithm_factory.rb +250 -0
  4. data/lib/cabriolet/base_compressor.rb +206 -0
  5. data/lib/cabriolet/binary/bitstream.rb +167 -16
  6. data/lib/cabriolet/binary/bitstream_writer.rb +150 -21
  7. data/lib/cabriolet/binary/chm_structures.rb +2 -2
  8. data/lib/cabriolet/binary/hlp_structures.rb +258 -37
  9. data/lib/cabriolet/binary/lit_structures.rb +231 -65
  10. data/lib/cabriolet/binary/oab_structures.rb +17 -1
  11. data/lib/cabriolet/cab/command_handler.rb +226 -0
  12. data/lib/cabriolet/cab/compressor.rb +108 -84
  13. data/lib/cabriolet/cab/decompressor.rb +16 -20
  14. data/lib/cabriolet/cab/extractor.rb +142 -66
  15. data/lib/cabriolet/cab/file_compression_work.rb +52 -0
  16. data/lib/cabriolet/cab/file_compression_worker.rb +89 -0
  17. data/lib/cabriolet/checksum.rb +49 -0
  18. data/lib/cabriolet/chm/command_handler.rb +227 -0
  19. data/lib/cabriolet/chm/compressor.rb +7 -3
  20. data/lib/cabriolet/chm/decompressor.rb +39 -21
  21. data/lib/cabriolet/chm/parser.rb +5 -2
  22. data/lib/cabriolet/cli/base_command_handler.rb +127 -0
  23. data/lib/cabriolet/cli/command_dispatcher.rb +140 -0
  24. data/lib/cabriolet/cli/command_registry.rb +83 -0
  25. data/lib/cabriolet/cli.rb +356 -607
  26. data/lib/cabriolet/collections/file_collection.rb +175 -0
  27. data/lib/cabriolet/compressors/base.rb +1 -1
  28. data/lib/cabriolet/compressors/lzx.rb +241 -54
  29. data/lib/cabriolet/compressors/mszip.rb +35 -3
  30. data/lib/cabriolet/compressors/quantum.rb +36 -95
  31. data/lib/cabriolet/decompressors/base.rb +1 -1
  32. data/lib/cabriolet/decompressors/lzss.rb +13 -3
  33. data/lib/cabriolet/decompressors/lzx.rb +70 -33
  34. data/lib/cabriolet/decompressors/mszip.rb +126 -39
  35. data/lib/cabriolet/decompressors/quantum.rb +83 -53
  36. data/lib/cabriolet/errors.rb +3 -0
  37. data/lib/cabriolet/extraction/base_extractor.rb +88 -0
  38. data/lib/cabriolet/extraction/extractor.rb +171 -0
  39. data/lib/cabriolet/extraction/file_extraction_work.rb +60 -0
  40. data/lib/cabriolet/extraction/file_extraction_worker.rb +106 -0
  41. data/lib/cabriolet/file_entry.rb +156 -0
  42. data/lib/cabriolet/file_manager.rb +144 -0
  43. data/lib/cabriolet/format_base.rb +79 -0
  44. data/lib/cabriolet/hlp/command_handler.rb +282 -0
  45. data/lib/cabriolet/hlp/compressor.rb +28 -238
  46. data/lib/cabriolet/hlp/decompressor.rb +107 -147
  47. data/lib/cabriolet/hlp/parser.rb +52 -101
  48. data/lib/cabriolet/hlp/quickhelp/compression_stream.rb +138 -0
  49. data/lib/cabriolet/hlp/quickhelp/compressor.rb +151 -0
  50. data/lib/cabriolet/hlp/quickhelp/decompressor.rb +558 -0
  51. data/lib/cabriolet/hlp/quickhelp/file_writer.rb +125 -0
  52. data/lib/cabriolet/hlp/quickhelp/huffman_stream.rb +74 -0
  53. data/lib/cabriolet/hlp/quickhelp/huffman_tree.rb +167 -0
  54. data/lib/cabriolet/hlp/quickhelp/offset_calculator.rb +61 -0
  55. data/lib/cabriolet/hlp/quickhelp/parser.rb +274 -0
  56. data/lib/cabriolet/hlp/quickhelp/structure_builder.rb +93 -0
  57. data/lib/cabriolet/hlp/quickhelp/topic_builder.rb +52 -0
  58. data/lib/cabriolet/hlp/quickhelp/topic_compressor.rb +83 -0
  59. data/lib/cabriolet/hlp/winhelp/btree_builder.rb +289 -0
  60. data/lib/cabriolet/hlp/winhelp/compressor.rb +400 -0
  61. data/lib/cabriolet/hlp/winhelp/decompressor.rb +192 -0
  62. data/lib/cabriolet/hlp/winhelp/parser.rb +484 -0
  63. data/lib/cabriolet/hlp/winhelp/zeck_lz77.rb +271 -0
  64. data/lib/cabriolet/huffman/encoder.rb +15 -12
  65. data/lib/cabriolet/huffman/tree.rb +85 -1
  66. data/lib/cabriolet/kwaj/command_handler.rb +213 -0
  67. data/lib/cabriolet/kwaj/compressor.rb +7 -3
  68. data/lib/cabriolet/kwaj/decompressor.rb +18 -12
  69. data/lib/cabriolet/lit/command_handler.rb +221 -0
  70. data/lib/cabriolet/lit/compressor.rb +119 -168
  71. data/lib/cabriolet/lit/content_encoder.rb +76 -0
  72. data/lib/cabriolet/lit/content_type_detector.rb +50 -0
  73. data/lib/cabriolet/lit/decompressor.rb +518 -152
  74. data/lib/cabriolet/lit/directory_builder.rb +153 -0
  75. data/lib/cabriolet/lit/guid_generator.rb +16 -0
  76. data/lib/cabriolet/lit/header_writer.rb +124 -0
  77. data/lib/cabriolet/lit/parser.rb +670 -0
  78. data/lib/cabriolet/lit/piece_builder.rb +74 -0
  79. data/lib/cabriolet/lit/structure_builder.rb +252 -0
  80. data/lib/cabriolet/models/hlp_file.rb +130 -29
  81. data/lib/cabriolet/models/hlp_header.rb +105 -17
  82. data/lib/cabriolet/models/lit_header.rb +212 -25
  83. data/lib/cabriolet/models/szdd_header.rb +10 -2
  84. data/lib/cabriolet/models/winhelp_header.rb +127 -0
  85. data/lib/cabriolet/oab/command_handler.rb +257 -0
  86. data/lib/cabriolet/oab/compressor.rb +17 -8
  87. data/lib/cabriolet/oab/decompressor.rb +41 -10
  88. data/lib/cabriolet/offset_calculator.rb +81 -0
  89. data/lib/cabriolet/plugin.rb +233 -0
  90. data/lib/cabriolet/plugin_manager.rb +453 -0
  91. data/lib/cabriolet/plugin_validator.rb +422 -0
  92. data/lib/cabriolet/quantum_shared.rb +105 -0
  93. data/lib/cabriolet/system/io_system.rb +3 -0
  94. data/lib/cabriolet/system/memory_handle.rb +17 -4
  95. data/lib/cabriolet/szdd/command_handler.rb +217 -0
  96. data/lib/cabriolet/szdd/compressor.rb +15 -11
  97. data/lib/cabriolet/szdd/decompressor.rb +18 -9
  98. data/lib/cabriolet/version.rb +1 -1
  99. data/lib/cabriolet.rb +181 -20
  100. metadata +69 -4
  101. data/lib/cabriolet/auto.rb +0 -173
  102. data/lib/cabriolet/parallel.rb +0 -333
@@ -1,75 +1,24 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require_relative "../quantum_shared"
4
+
3
5
  module Cabriolet
4
6
  module Compressors
5
7
  # Quantum compresses data using arithmetic coding and LZ77-based matching
6
8
  # Based on the Quantum decompressor and libmspack qtmd.c implementation
7
9
  #
8
- # STATUS: Functional with known limitations
9
- # - Literals: WORKING ✓
10
- # - Short matches (3-13 bytes): WORKING ✓
11
- # - Longer matches (14+ bytes): Limited support (known issue)
12
- # - Simple data round-trips successfully
13
- # - Complex repeated patterns may have issues
14
- #
15
10
  # The Quantum method was created by David Stafford, adapted by Microsoft
16
11
  # Corporation.
12
+ #
13
+ # NOTE: This compressor is a work-in-progress. The arithmetic coding
14
+ # implementation needs refinement to match the decoder exactly.
15
+ # For now, this implementation focuses on correct structure.
17
16
  # rubocop:disable Metrics/ClassLength
18
17
  class Quantum < Base
19
- # Frame size (32KB per frame)
20
- FRAME_SIZE = 32_768
21
-
22
- # Match constants
23
- MIN_MATCH = 3
24
- MAX_MATCH = 1028
25
-
26
- # Position slot tables (same as decompressor)
27
- POSITION_BASE = [
28
- 0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384,
29
- 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12_288, 16_384,
30
- 24_576, 32_768, 49_152, 65_536, 98_304, 131_072, 196_608, 262_144,
31
- 393_216, 524_288, 786_432, 1_048_576, 1_572_864
32
- ].freeze
33
-
34
- EXTRA_BITS = [
35
- 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
36
- 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16,
37
- 17, 17, 18, 18, 19, 19
38
- ].freeze
39
-
40
- LENGTH_BASE = [
41
- 0, 1, 2, 3, 4, 5, 6, 8, 10, 12, 14, 18, 22, 26,
42
- 30, 38, 46, 54, 62, 78, 94, 110, 126, 158, 190, 222, 254
43
- ].freeze
44
-
45
- LENGTH_EXTRA = [
46
- 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
47
- 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0
48
- ].freeze
18
+ include QuantumShared
49
19
 
50
20
  attr_reader :window_bits, :window_size
51
21
 
52
- # Represents a symbol in an arithmetic coding model
53
- class ModelSymbol
54
- attr_accessor :sym, :cumfreq
55
-
56
- def initialize(sym, cumfreq)
57
- @sym = sym
58
- @cumfreq = cumfreq
59
- end
60
- end
61
-
62
- # Represents an arithmetic coding model
63
- class Model
64
- attr_accessor :shiftsleft, :entries, :syms
65
-
66
- def initialize(syms, entries)
67
- @syms = syms
68
- @entries = entries
69
- @shiftsleft = 4
70
- end
71
- end
72
-
73
22
  # Initialize Quantum compressor
74
23
  #
75
24
  # @param io_system [System::IOSystem] I/O system for reading/writing
@@ -77,7 +26,8 @@ module Cabriolet
77
26
  # @param output [System::FileHandle, System::MemoryHandle] Output handle
78
27
  # @param buffer_size [Integer] Buffer size for I/O operations
79
28
  # @param window_bits [Integer] Window size parameter (10-21)
80
- def initialize(io_system, input, output, buffer_size, window_bits: 10)
29
+ def initialize(io_system, input, output, buffer_size, window_bits: 10,
30
+ **_kwargs)
81
31
  super(io_system, input, output, buffer_size)
82
32
 
83
33
  # Validate window_bits
@@ -179,7 +129,6 @@ module Cabriolet
179
129
 
180
130
  # Compress a single frame
181
131
  def compress_frame(data)
182
- # No header needed - the first 16 bits of encoded data will be read as C
183
132
  pos = 0
184
133
 
185
134
  while pos < data.bytesize
@@ -198,27 +147,25 @@ module Cabriolet
198
147
  end
199
148
  end
200
149
 
201
- # Finish arithmetic coding - output final range
202
- # We need to output enough bits to disambiguate the final range
150
+ # Finish arithmetic coding
203
151
  finish_arithmetic_coding
204
152
  end
205
153
 
206
- # Finish arithmetic coding by outputting the final state
154
+ # Finish arithmetic coding
207
155
  def finish_arithmetic_coding
208
- # Output enough bits to ensure decoder can decode correctly
209
- # We need to output a value that falls within [L, H)
210
- # A common approach is to output L plus half the range
211
- @underflow_bits += 1
212
- bit = if @l.anybits?(0x4000)
213
- 1
214
- else
215
- 0
216
- end
217
- @bitstream.write_bits_msb(bit, 1)
218
- @underflow_bits.times do
219
- @bitstream.write_bits_msb(bit ^ 1, 1)
156
+ # Output pending underflow bits
157
+ if @underflow_bits.positive?
158
+ bit = if @l.anybits?(0x4000)
159
+ 1
160
+ else
161
+ 0
162
+ end
163
+ @bitstream.write_bits_msb(bit, 1)
164
+ @underflow_bits.times do
165
+ @bitstream.write_bits_msb(bit ^ 1, 1)
166
+ end
167
+ @underflow_bits = 0
220
168
  end
221
- @underflow_bits = 0
222
169
  end
223
170
 
224
171
  # Find best match in the sliding window
@@ -335,7 +282,6 @@ module Cabriolet
335
282
  end
336
283
 
337
284
  # Encode a symbol using arithmetic coding
338
- # This is the inverse of GET_SYMBOL macro in qtmd.c
339
285
  def encode_symbol(model, sym)
340
286
  # Find symbol index in model
341
287
  i = 0
@@ -346,33 +292,29 @@ module Cabriolet
346
292
  "Symbol #{sym} not found in model"
347
293
  end
348
294
 
349
- # Calculate range (matching decoder line 93, 101-102)
350
- range = (@h - @l) + 1
295
+ # Calculate range - use decoder's formula
296
+ range = ((@h - @l) & 0xFFFF) + 1
351
297
  symf = model.syms[0].cumfreq
352
298
 
353
- # Update H and L (matching decoder lines 103-104)
354
- # Decoder uses syms[i-1] and syms[i], so encoder at index j
355
- # should use syms[j] and syms[j+1] to make decoder land at i=j+1
356
- # But decoder returns syms[i-1].sym, so it will return syms[j].sym ✓
299
+ # Update H and L
357
300
  @h = @l + ((model.syms[i].cumfreq * range) / symf) - 1
358
301
  @l += ((model.syms[i + 1].cumfreq * range) / symf)
359
302
 
360
- # Update model frequencies (matching decoder line 106)
303
+ # Update model frequencies
361
304
  j = i
362
305
  while j >= 0
363
306
  model.syms[j].cumfreq += 8
364
307
  j -= 1
365
308
  end
366
309
 
367
- # Check if model needs updating (matching decoder line 107)
310
+ # Check if model needs updating
368
311
  update_model(model) if model.syms[0].cumfreq > 3800
369
312
 
370
- # Normalize range (matching decoder lines 109-121)
313
+ # Normalize range
371
314
  normalize_range
372
315
  end
373
316
 
374
317
  # Normalize arithmetic coding range and output bits
375
- # This implements the encoder equivalent of the decoder's normalization (lines 109-121)
376
318
  def normalize_range
377
319
  loop do
378
320
  if (@l & 0x8000) == (@h & 0x8000)
@@ -395,37 +337,36 @@ module Cabriolet
395
337
  @h |= 0x4000
396
338
 
397
339
  # Can't normalize further
398
-
399
340
  end
400
341
 
401
- # Shift range (both for underflow and MSB match cases)
342
+ # Shift range
402
343
  @l = (@l << 1) & 0xFFFF
403
344
  @h = ((@h << 1) | 1) & 0xFFFF
404
345
  end
405
346
  end
406
347
 
407
- # Update model statistics (matching qtmd_update_model exactly)
348
+ # Update model statistics
408
349
  def update_model(model)
409
350
  model.shiftsleft -= 1
410
351
 
411
352
  if model.shiftsleft.positive?
412
- # Simple shift (matching decoder lines 129-135)
353
+ # Simple shift
413
354
  (model.entries - 1).downto(0) do |i|
414
355
  model.syms[i].cumfreq >>= 1
415
356
  model.syms[i].cumfreq = model.syms[i + 1].cumfreq + 1 if model.syms[i].cumfreq <= model.syms[i + 1].cumfreq
416
357
  end
417
358
  else
418
- # Full rebuild (matching decoder lines 137-163)
359
+ # Full rebuild
419
360
  model.shiftsleft = 50
420
361
 
421
- # Convert cumfreq to frequencies (lines 139-145)
362
+ # Convert cumfreq to frequencies
422
363
  (0...model.entries).each do |i|
423
364
  model.syms[i].cumfreq -= model.syms[i + 1].cumfreq
424
365
  model.syms[i].cumfreq += 1
425
366
  model.syms[i].cumfreq >>= 1
426
367
  end
427
368
 
428
- # Sort by frequency (selection sort for stability, lines 150-158)
369
+ # Sort by frequency
429
370
  (0...(model.entries - 1)).each do |i|
430
371
  ((i + 1)...model.entries).each do |j|
431
372
  if model.syms[i].cumfreq < model.syms[j].cumfreq
@@ -434,7 +375,7 @@ module Cabriolet
434
375
  end
435
376
  end
436
377
 
437
- # Convert back to cumulative frequencies (lines 161-163)
378
+ # Convert back to cumulative frequencies
438
379
  (model.entries - 1).downto(0) do |i|
439
380
  model.syms[i].cumfreq += model.syms[i + 1].cumfreq
440
381
  end
@@ -12,7 +12,7 @@ module Cabriolet
12
12
  # @param input [System::FileHandle, System::MemoryHandle] Input handle
13
13
  # @param output [System::FileHandle, System::MemoryHandle] Output handle
14
14
  # @param buffer_size [Integer] Buffer size for I/O operations
15
- def initialize(io_system, input, output, buffer_size)
15
+ def initialize(io_system, input, output, buffer_size, **_kwargs)
16
16
  @io_system = io_system
17
17
  @input = input
18
18
  @output = output
@@ -40,13 +40,17 @@ module Cabriolet
40
40
 
41
41
  # Decompress LZSS data
42
42
  #
43
- # @param bytes [Integer] Number of bytes to decompress (unused, reads
44
- # until EOF)
43
+ # @param bytes [Integer, nil] Maximum number of output bytes to write (nil or 0 = until EOF)
45
44
  # @return [Integer] Number of bytes decompressed
46
- def decompress(_bytes)
45
+ def decompress(bytes = nil)
47
46
  bytes_written = 0
47
+ # Only enforce limit if bytes is a positive integer
48
+ enforce_limit = bytes&.positive?
48
49
 
49
50
  loop do
51
+ # Check if we've reached the output byte limit (only when limit is enforced)
52
+ break if enforce_limit && bytes_written >= bytes
53
+
50
54
  # Read control byte
51
55
  control_byte = read_input_byte
52
56
  break if control_byte.nil?
@@ -55,6 +59,9 @@ module Cabriolet
55
59
 
56
60
  # Process each bit in the control byte
57
61
  8.times do |bit_index|
62
+ # Check output limit before each operation (only when limit is enforced)
63
+ break if enforce_limit && bytes_written >= bytes
64
+
58
65
  mask = 1 << bit_index
59
66
 
60
67
  if control_byte.anybits?(mask)
@@ -81,6 +88,9 @@ module Cabriolet
81
88
 
82
89
  # Copy from window
83
90
  length.times do
91
+ # Check if we've reached the limit mid-match
92
+ break if enforce_limit && bytes_written >= bytes
93
+
84
94
  byte = @window[match_pos]
85
95
  @window[@window_pos] = byte
86
96
  write_output_byte(byte)
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require_relative "base"
4
+
3
5
  module Cabriolet
4
6
  module Decompressors
5
7
  # LZX handles LZX compressed data
@@ -100,7 +102,7 @@ module Cabriolet
100
102
  # @param output_length [Integer] Expected output length for E8 processing
101
103
  # @param is_delta [Boolean] Whether this is LZX DELTA format
102
104
  def initialize(io_system, input, output, buffer_size, window_bits:,
103
- reset_interval: 0, output_length: 0, is_delta: false)
105
+ reset_interval: 0, output_length: 0, is_delta: false, salvage: false, **_kwargs)
104
106
  super(io_system, input, output, buffer_size)
105
107
 
106
108
  # Validate window_bits
@@ -146,8 +148,9 @@ module Cabriolet
146
148
  @intel_started = false
147
149
  @e8_buf = "\0" * FRAME_SIZE
148
150
 
149
- # Initialize bitstream
150
- @bitstream = Binary::Bitstream.new(io_system, input, buffer_size)
151
+ # Initialize bitstream (LZX uses MSB-first bit ordering per libmspack lzxd.c)
152
+ @bitstream = Binary::Bitstream.new(io_system, input, buffer_size,
153
+ bit_order: :msb, salvage: salvage)
151
154
 
152
155
  # Initialize Huffman trees
153
156
  initialize_trees
@@ -173,19 +176,21 @@ module Cabriolet
173
176
  def decompress(bytes)
174
177
  return 0 if bytes <= 0
175
178
 
179
+ # Read Intel filesize header if not already read (once per stream)
180
+ read_intel_header unless @header_read
181
+
176
182
  total_written = 0
177
183
  end_frame = ((@offset + bytes) / FRAME_SIZE) + 1
178
184
 
179
185
  while @frame < end_frame
180
- # Check reset interval
181
- reset_state if @reset_interval.positive? && (@frame % @reset_interval).zero?
186
+ # Check reset interval - reset offset registers at frame boundaries
187
+ if @reset_interval.positive? && (@frame % @reset_interval).zero? && @frame.positive?
188
+ @r0 = @r1 = @r2 = 1
189
+ end
182
190
 
183
191
  # Read DELTA chunk size if needed
184
192
  @bitstream.read_bits(16) if @is_delta
185
193
 
186
- # Read Intel filesize header if needed
187
- read_intel_header unless @header_read
188
-
189
194
  # Calculate frame size
190
195
  frame_size = calculate_frame_size
191
196
 
@@ -238,6 +243,10 @@ module Cabriolet
238
243
 
239
244
  # Reset LZX state (called at reset intervals)
240
245
  #
246
+ # Per libmspack: Only reset state variables, NOT Huffman code lengths.
247
+ # Lengths persist across blocks and are updated via delta encoding.
248
+ # They are only zeroed at initialization (in initialize_trees).
249
+ #
241
250
  # @return [void]
242
251
  def reset_state
243
252
  @r0 = 1
@@ -247,12 +256,17 @@ module Cabriolet
247
256
  @block_remaining = 0
248
257
  @block_type = BLOCKTYPE_INVALID
249
258
 
250
- # Reset tree lengths to 0
251
- @maintree_lengths.fill(0)
252
- @length_lengths.fill(0)
259
+ # NOTE: Do NOT reset @maintree_lengths or @length_lengths here!
260
+ # Per libmspack lzxd.c line 267-269, lengths are initialized to 0
261
+ # only once (at start) "because deltas will be applied to them".
262
+ # Resetting them here breaks delta encoding between blocks.
253
263
  end
254
264
 
255
- # Read Intel filesize header
265
+ # Read Intel filesize header (once per stream, before any frames)
266
+ #
267
+ # Format per libmspack:
268
+ # - 1 bit: Intel flag (if 0, filesize = 0; if 1, read 32-bit filesize)
269
+ # - If flag is 1: 32 bits for filesize (16 bits high, 16 bits low)
256
270
  #
257
271
  # @return [void]
258
272
  def read_intel_header
@@ -304,13 +318,20 @@ module Cabriolet
304
318
 
305
319
  # Read block header
306
320
  #
321
+ # LZX block header format (per libmspack):
322
+ # - 3 bits: block_type
323
+ # - 24 bits: block_length (16 bits high, 8 bits low, combined as (high << 8) | low)
324
+ #
307
325
  # @return [void]
308
326
  def read_block_header
309
- # Align for uncompressed blocks
327
+ # Align for uncompressed blocks - this ensures correct byte alignment
328
+ # when reading the R0, R1, R2 values from the block header
310
329
  @bitstream.byte_align if @block_type == BLOCKTYPE_UNCOMPRESSED && @block_length.allbits?(1)
311
330
 
312
- # Read block type and length
331
+ # Read block type (3 bits)
313
332
  @block_type = @bitstream.read_bits(3)
333
+
334
+ # Read block length (24 bits: 16 bits high, then 8 bits low)
314
335
  high = @bitstream.read_bits(16)
315
336
  low = @bitstream.read_bits(8)
316
337
  @block_length = (high << 8) | low
@@ -324,6 +345,8 @@ module Cabriolet
324
345
  when BLOCKTYPE_UNCOMPRESSED
325
346
  read_uncompressed_block_header
326
347
  else
348
+ # Per libmspack lzxd.c line 519-521, BLOCKTYPE_INVALID (0) and
349
+ # blocktypes 4-7 are all invalid and should raise an error
327
350
  raise DecompressionError, "Invalid block type: #{@block_type}"
328
351
  end
329
352
  end
@@ -338,11 +361,11 @@ module Cabriolet
338
361
  end
339
362
 
340
363
  # Build aligned tree
341
- @aligned_tree = Huffman::Tree.new(@aligned_lengths, ALIGNED_MAXSYMBOLS)
342
- unless @aligned_tree.build_table(ALIGNED_TABLEBITS)
343
- raise DecompressionError,
344
- "Failed to build aligned tree"
345
- end
364
+ # Note: Aligned tree may be incomplete (Kraft sum < 1.0), which is valid
365
+ # as long as the unused codes are never encountered in the bitstream
366
+ @aligned_tree = Huffman::Tree.new(@aligned_lengths, ALIGNED_MAXSYMBOLS,
367
+ bit_order: :msb)
368
+ @aligned_tree.build_table(ALIGNED_TABLEBITS)
346
369
 
347
370
  # Read main and length trees (same as verbatim)
348
371
  read_main_and_length_trees
@@ -359,15 +382,14 @@ module Cabriolet
359
382
  #
360
383
  # @return [void]
361
384
  def read_main_and_length_trees
362
- # Read and build pretree
363
- read_pretree
364
-
365
385
  # Read main tree lengths using pretree
386
+ # Note: Each call to read_lengths reads its own pretree (per libmspack lzxd_read_lens)
366
387
  read_lengths(@maintree_lengths, 0, 256)
367
388
  read_lengths(@maintree_lengths, 256, @maintree_maxsymbols)
368
389
 
369
390
  # Build main tree
370
- @maintree = Huffman::Tree.new(@maintree_lengths, @maintree_maxsymbols)
391
+ @maintree = Huffman::Tree.new(@maintree_lengths, @maintree_maxsymbols,
392
+ bit_order: :msb)
371
393
  unless @maintree.build_table(LENGTH_TABLEBITS)
372
394
  raise DecompressionError,
373
395
  "Failed to build main tree"
@@ -380,7 +402,8 @@ module Cabriolet
380
402
  read_lengths(@length_lengths, 0, NUM_SECONDARY_LENGTHS)
381
403
 
382
404
  # Build length tree (may be empty)
383
- @length_tree = Huffman::Tree.new(@length_lengths, LENGTH_MAXSYMBOLS)
405
+ @length_tree = Huffman::Tree.new(@length_lengths, LENGTH_MAXSYMBOLS,
406
+ bit_order: :msb)
384
407
  if @length_tree.build_table(LENGTH_TABLEBITS)
385
408
  @length_empty = false
386
409
  else
@@ -401,7 +424,8 @@ module Cabriolet
401
424
  @pretree_lengths[i] = @bitstream.read_bits(4)
402
425
  end
403
426
 
404
- @pretree = Huffman::Tree.new(@pretree_lengths, PRETREE_MAXSYMBOLS)
427
+ @pretree = Huffman::Tree.new(@pretree_lengths, PRETREE_MAXSYMBOLS,
428
+ bit_order: :msb)
405
429
  return if @pretree.build_table(PRETREE_TABLEBITS)
406
430
 
407
431
  raise DecompressionError, "Failed to build pretree"
@@ -409,11 +433,16 @@ module Cabriolet
409
433
 
410
434
  # Read code lengths using pretree
411
435
  #
436
+ # Per libmspack's lzxd_read_lens, each call reads its own pretree first
437
+ #
412
438
  # @param lengths [Array<Integer>] Target length array
413
439
  # @param first [Integer] First symbol index
414
440
  # @param last [Integer] Last symbol index (exclusive)
415
441
  # @return [void]
416
442
  def read_lengths(lengths, first, last)
443
+ # Read and build pretree (20 elements, 4 bits each)
444
+ read_pretree
445
+
417
446
  x = first
418
447
 
419
448
  while x < last
@@ -494,9 +523,9 @@ module Cabriolet
494
523
  @window_posn += 1
495
524
  run_length -= 1
496
525
  else
497
- # Match: decode length and offset
498
- decode_match(main_element, run_length)
499
- run_length = 0 # Match decoding handles run_length internally
526
+ # Match: decode length and offset, then decrement run_length by match_length
527
+ match_length = decode_match(main_element, run_length)
528
+ run_length -= match_length
500
529
  end
501
530
  end
502
531
  end
@@ -504,8 +533,8 @@ module Cabriolet
504
533
  # Decode and copy a match
505
534
  #
506
535
  # @param main_element [Integer] Main tree symbol
507
- # @param run_length [Integer] Remaining run length
508
- # @return [void]
536
+ # @param run_length [Integer] Remaining run length (unused, kept for compatibility)
537
+ # @return [Integer] Match length (bytes consumed)
509
538
  def decode_match(main_element, _run_length)
510
539
  main_element -= NUM_CHARS
511
540
 
@@ -533,8 +562,10 @@ module Cabriolet
533
562
  match_offset = @r0
534
563
  when 1
535
564
  @r1, @r0 = @r0, @r1
565
+ match_offset = @r0
536
566
  when 2
537
567
  @r2, @r0 = @r0, @r2
568
+ match_offset = @r0
538
569
  else
539
570
  # Calculate offset from position slot
540
571
  extra = position_slot >= 36 ? 17 : EXTRA_BITS[position_slot]
@@ -573,6 +604,9 @@ module Cabriolet
573
604
 
574
605
  # Copy match
575
606
  copy_match(match_offset, match_length)
607
+
608
+ # Return match length so caller can decrement run_length
609
+ match_length
576
610
  end
577
611
 
578
612
  # Decode extended match length for LZX DELTA
@@ -608,9 +642,12 @@ module Cabriolet
608
642
  # @return [void]
609
643
  def copy_match(offset, length)
610
644
  if offset > @window_posn
611
- # Match wraps around window
612
- if offset > @offset && (offset - @window_posn).positive?
613
- raise DecompressionError, "Match offset beyond stream"
645
+ # Match wraps around window - validate it doesn't read beyond available data
646
+ # Per libmspack lzxd.c lines 622-628: check if match offset goes beyond
647
+ # what has been decompressed so far (accounting for any reference data)
648
+ ref_data_size = 0 # We don't support reference data yet (LZX DELTA feature)
649
+ if offset > @offset && (offset - @window_posn) > ref_data_size
650
+ raise DecompressionError, "Match offset beyond LZX stream"
614
651
  end
615
652
 
616
653
  # Copy from end of window