cabriolet 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. checksums.yaml +7 -0
  2. data/ARCHITECTURE.md +799 -0
  3. data/CHANGELOG.md +44 -0
  4. data/LICENSE +29 -0
  5. data/README.adoc +1207 -0
  6. data/exe/cabriolet +6 -0
  7. data/lib/cabriolet/auto.rb +173 -0
  8. data/lib/cabriolet/binary/bitstream.rb +148 -0
  9. data/lib/cabriolet/binary/bitstream_writer.rb +180 -0
  10. data/lib/cabriolet/binary/chm_structures.rb +213 -0
  11. data/lib/cabriolet/binary/hlp_structures.rb +66 -0
  12. data/lib/cabriolet/binary/kwaj_structures.rb +74 -0
  13. data/lib/cabriolet/binary/lit_structures.rb +107 -0
  14. data/lib/cabriolet/binary/oab_structures.rb +112 -0
  15. data/lib/cabriolet/binary/structures.rb +56 -0
  16. data/lib/cabriolet/binary/szdd_structures.rb +60 -0
  17. data/lib/cabriolet/cab/compressor.rb +382 -0
  18. data/lib/cabriolet/cab/decompressor.rb +510 -0
  19. data/lib/cabriolet/cab/extractor.rb +357 -0
  20. data/lib/cabriolet/cab/parser.rb +264 -0
  21. data/lib/cabriolet/chm/compressor.rb +513 -0
  22. data/lib/cabriolet/chm/decompressor.rb +436 -0
  23. data/lib/cabriolet/chm/parser.rb +254 -0
  24. data/lib/cabriolet/cli.rb +776 -0
  25. data/lib/cabriolet/compressors/base.rb +34 -0
  26. data/lib/cabriolet/compressors/lzss.rb +250 -0
  27. data/lib/cabriolet/compressors/lzx.rb +581 -0
  28. data/lib/cabriolet/compressors/mszip.rb +315 -0
  29. data/lib/cabriolet/compressors/quantum.rb +446 -0
  30. data/lib/cabriolet/constants.rb +75 -0
  31. data/lib/cabriolet/decompressors/base.rb +39 -0
  32. data/lib/cabriolet/decompressors/lzss.rb +138 -0
  33. data/lib/cabriolet/decompressors/lzx.rb +726 -0
  34. data/lib/cabriolet/decompressors/mszip.rb +390 -0
  35. data/lib/cabriolet/decompressors/none.rb +27 -0
  36. data/lib/cabriolet/decompressors/quantum.rb +456 -0
  37. data/lib/cabriolet/errors.rb +39 -0
  38. data/lib/cabriolet/format_detector.rb +156 -0
  39. data/lib/cabriolet/hlp/compressor.rb +272 -0
  40. data/lib/cabriolet/hlp/decompressor.rb +198 -0
  41. data/lib/cabriolet/hlp/parser.rb +131 -0
  42. data/lib/cabriolet/huffman/decoder.rb +79 -0
  43. data/lib/cabriolet/huffman/encoder.rb +108 -0
  44. data/lib/cabriolet/huffman/tree.rb +138 -0
  45. data/lib/cabriolet/kwaj/compressor.rb +479 -0
  46. data/lib/cabriolet/kwaj/decompressor.rb +237 -0
  47. data/lib/cabriolet/kwaj/parser.rb +183 -0
  48. data/lib/cabriolet/lit/compressor.rb +255 -0
  49. data/lib/cabriolet/lit/decompressor.rb +250 -0
  50. data/lib/cabriolet/models/cabinet.rb +81 -0
  51. data/lib/cabriolet/models/chm_file.rb +28 -0
  52. data/lib/cabriolet/models/chm_header.rb +67 -0
  53. data/lib/cabriolet/models/chm_section.rb +38 -0
  54. data/lib/cabriolet/models/file.rb +119 -0
  55. data/lib/cabriolet/models/folder.rb +102 -0
  56. data/lib/cabriolet/models/folder_data.rb +21 -0
  57. data/lib/cabriolet/models/hlp_file.rb +45 -0
  58. data/lib/cabriolet/models/hlp_header.rb +37 -0
  59. data/lib/cabriolet/models/kwaj_header.rb +98 -0
  60. data/lib/cabriolet/models/lit_header.rb +55 -0
  61. data/lib/cabriolet/models/oab_header.rb +95 -0
  62. data/lib/cabriolet/models/szdd_header.rb +72 -0
  63. data/lib/cabriolet/modifier.rb +326 -0
  64. data/lib/cabriolet/oab/compressor.rb +353 -0
  65. data/lib/cabriolet/oab/decompressor.rb +315 -0
  66. data/lib/cabriolet/parallel.rb +333 -0
  67. data/lib/cabriolet/repairer.rb +288 -0
  68. data/lib/cabriolet/streaming.rb +221 -0
  69. data/lib/cabriolet/system/file_handle.rb +107 -0
  70. data/lib/cabriolet/system/io_system.rb +87 -0
  71. data/lib/cabriolet/system/memory_handle.rb +105 -0
  72. data/lib/cabriolet/szdd/compressor.rb +217 -0
  73. data/lib/cabriolet/szdd/decompressor.rb +184 -0
  74. data/lib/cabriolet/szdd/parser.rb +127 -0
  75. data/lib/cabriolet/validator.rb +332 -0
  76. data/lib/cabriolet/version.rb +5 -0
  77. data/lib/cabriolet.rb +104 -0
  78. metadata +157 -0
@@ -0,0 +1,436 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "parser"
4
+ require_relative "../decompressors/lzx"
5
+ require_relative "../system/file_handle"
6
+ require_relative "../system/memory_handle"
7
+
8
+ module Cabriolet
9
+ module CHM
10
+ # Decompressor for CHM (Compiled HTML Help) files
11
+ class Decompressor
12
+ LZX_FRAME_SIZE = 32_768
13
+
14
+ attr_reader :io_system, :chm
15
+
16
# Create a CHM decompressor with no archive attached.
#
# All LZX bookkeeping starts out empty and is filled in by init_lzx the
# first time a compressed file is extracted.
#
# @param io_system [System::IOSystem, nil] I/O system used to open files;
#   a default System::IOSystem is created when nil
def initialize(io_system = nil)
  @io_system = io_system || System::IOSystem.new
  @chm = nil
  @input_handle = nil
  @lzx_state = nil
  @lzx_offset = 0
  @lzx_length = 0
  # Assigned by init_lzx / extract_compressed; declared here so that every
  # instance variable the class uses exists from construction onwards.
  @lzx_input_offset = nil
end
24
+
25
# Open a CHM file and parse its headers.
#
# If this decompressor already has an archive open it is closed first, so
# reusing one instance for several files neither leaks the previous input
# handle nor carries LZX state over from the previous archive.
#
# @param filename [String] Path to CHM file
# @param entire [Boolean] If true, parse all file entries
# @return [Models::CHMHeader] CHM header
# @raise [StandardError] re-raises whatever the I/O system or parser raised,
#   after closing the input handle
def open(filename, entire: true)
  close if @input_handle

  @input_handle = @io_system.open(filename, Constants::MODE_READ)
  @chm = Parser.new(@input_handle).parse(entire: entire)
  @chm.filename = filename
  @chm
rescue StandardError
  # Leave the object in the "nothing open" state before propagating.
  @input_handle&.close
  @input_handle = nil
  @chm = nil
  raise
end
39
+
40
# Open a CHM file in header-only mode: the file entries are not parsed.
#
# @param filename [String] Path to CHM file
# @return [Models::CHMHeader] CHM header
def fast_open(filename)
  header_only = { entire: false }
  open(filename, **header_only)
end
46
+
47
# Release the current archive: reset the LZX state, close the input handle
# when one is open, and forget the parsed header.
#
# @return [nil]
def close
  cleanup_lzx
  @input_handle.close unless @input_handle.nil?
  @input_handle = @chm = nil
end
54
+
55
# Extract a file from the CHM archive.
#
# Empty files are created directly. Section 0 files are copied verbatim and
# section 1 files are LZX-decompressed.
#
# @param file [Models::CHMFile] File to extract
# @param output_path [String] Output path for extracted file
# @return [void]
# @raise [ArgumentError] if file or its section is nil, or if a non-empty
#   file is requested while no CHM is open
# @raise [Errors::FormatError] if the file's section id is neither 0 nor 1
def extract(file, output_path)
  raise ArgumentError, "File is nil" if file.nil?
  raise ArgumentError, "File section is nil" if file.section.nil?

  # An empty file needs no archive data: just create the output.
  if file.empty?
    @io_system.open(output_path, Constants::MODE_WRITE).close
    return
  end

  # Everything below reads from the archive. Fail with a clear error (the
  # same one fast_find uses) rather than a NoMethodError on nil.
  raise ArgumentError, "CHM not opened" unless @chm && @input_handle

  case file.section.id
  when 0
    extract_uncompressed(file, output_path)
  when 1
    extract_compressed(file, output_path)
  else
    raise Errors::FormatError, "Invalid section ID: #{file.section.id}"
  end
end
78
+
79
# Look a file up by name directly in the directory chunks.
#
# When the header's index root refers to a chunk inside the directory the
# PMGI path is taken; otherwise the PMGL chunks are scanned linearly.
#
# @param filename [String] Name of the file to find
# @return [Models::CHMFile, nil] The file if found, nil otherwise
# @raise [ArgumentError] if no CHM is open
def fast_find(filename)
  raise ArgumentError, "CHM not opened" unless @chm

  index_available = @chm.index_root < @chm.num_chunks
  return fast_search_pmgi(filename) if index_available

  fast_search_pmgl(filename)
end
93
+
94
+ private
95
+
96
# Extract an uncompressed file (section 0) by copying its bytes from the
# archive to the output in 4096-byte pieces.
#
# The output handle is closed on every exit path, including when the
# archive turns out to be truncated.
#
# @param file [Models::CHMFile] file whose offset/length describe the data
# @param output_path [String] destination path
# @return [void]
# @raise [Errors::ReadError] if the archive ends before file.length bytes
#   have been read
def extract_uncompressed(file, output_path)
  output_handle = @io_system.open(output_path, Constants::MODE_WRITE)

  # File offsets are relative to the start of content section 0.
  @input_handle.seek(@chm.sec0.offset + file.offset, Constants::SEEK_START)

  buffer_size = 4096
  remaining = file.length

  while remaining.positive?
    chunk_size = [buffer_size, remaining].min
    data = @input_handle.read(chunk_size)
    raise Errors::ReadError, "Unexpected end of file" if data.nil? || data.length < chunk_size

    output_handle.write(data)
    remaining -= chunk_size
  end
ensure
  output_handle&.close
end
122
+
123
# Extract a compressed file (section 1, MSCompressed/LZX).
#
# The LZX stream is decoded sequentially: the decoder is (re)initialized
# when it is missing or already past the file, any bytes between the
# decoder's position and the file are decoded and discarded, and the file
# itself is decoded into memory before being written out.
#
# If decoding fails part-way the LZX state is discarded, so the next
# extraction starts from a freshly initialized decoder instead of one whose
# position no longer matches @lzx_offset / @lzx_input_offset.
#
# @param file [Models::CHMFile] file to extract
# @param output_path [String] destination path
# @return [void]
def extract_compressed(file, output_path)
  init_lzx(file) unless lzx_ready?(file)

  # Resume reading the compressed stream where the decoder last stopped.
  @input_handle.seek(@lzx_input_offset, Constants::SEEK_START)

  begin
    skip_amount = file.offset - @lzx_offset
    if skip_amount.positive?
      # Decode the gap into a throw-away buffer.
      # NOTE(review): this reaches into the decoder's @output ivar;
      # presumably Decompressors::LZX has no public way to retarget its
      # output - confirm and replace with a public API if one exists.
      discard = System::MemoryHandle.new("", Constants::MODE_WRITE)
      @lzx_state.instance_variable_set(:@output, discard)
      @lzx_state.decompress(skip_amount)
      @lzx_offset += skip_amount
    end

    memory_output = System::MemoryHandle.new("", Constants::MODE_WRITE)
    @lzx_state.instance_variable_set(:@output, memory_output)
    @lzx_state.decompress(file.length)
    @lzx_offset += file.length

    # Remember the compressed-stream position for the next extraction.
    @lzx_input_offset = @input_handle.tell
  rescue StandardError
    cleanup_lzx
    raise
  end

  output_handle = @io_system.open(output_path, Constants::MODE_WRITE)
  begin
    output_handle.write(memory_output.buffer)
  ensure
    output_handle.close
  end
end
157
+
158
# Tell whether the current LZX decoder can be reused for +file+: a decoder
# must exist and must not have advanced beyond the file's offset.
#
# @param file [Models::CHMFile] file about to be extracted
# @return [Boolean]
def lzx_ready?(file)
  @lzx_state ? file.offset >= @lzx_offset : false
end
165
+
166
# Initialize the LZX decompressor for section 1 so that decoding can start
# at the reset point that precedes +file+.
#
# Reads ControlData for the window size and reset interval, asks the reset
# table for the entry covering file.offset, and records where decoding
# begins in the compressed stream (@lzx_input_offset) and in the
# uncompressed stream (@lzx_offset).
#
# @param file [Models::CHMFile] file about to be extracted
# @return [Decompressors::LZX] the new decoder (also stored in @lzx_state)
# @raise [Errors::FormatError] if a required system file is missing, the
#   control data has the wrong size, or the window size / reset interval
#   is invalid
def init_lzx(file)
  cleanup_lzx

  sec = @chm.sec1

  # Prefer the entries linked by the parser, fall back to a list search.
  content = sec.content || find_system_file(Parser::CONTENT_NAME)
  control = sec.control || find_system_file(Parser::CONTROL_NAME)
  raise Errors::FormatError, "MSCompressed Content file not found" unless content
  raise Errors::FormatError, "ControlData file not found" unless control

  control_data = read_system_file(control)
  # A read at end-of-file may hand back nil; treat that as a size error too.
  unless control_data && control_data.length == 28
    raise Errors::FormatError, "ControlData wrong size"
  end

  window_size, reset_interval = parse_control_data(control_data)

  # Only power-of-two windows from 32 KiB (2**15) to 2 MiB (2**21) are valid.
  window_bits = (15..21).find { |bits| window_size == (1 << bits) }
  raise Errors::FormatError, "Invalid window size: #{window_size}" unless window_bits

  if reset_interval.zero? || (reset_interval % LZX_FRAME_SIZE) != 0
    raise Errors::FormatError, "Invalid reset interval: #{reset_interval}"
  end

  # Reset point at or before the file, expressed as a frame index.
  entry = (file.offset / reset_interval) * (reset_interval / LZX_FRAME_SIZE)

  length, offset = read_reset_table(sec, entry, reset_interval)

  # read_reset_table answers with compressed offset 0 whenever it could not
  # use the requested entry (SpanInfo fallback or entry out of range).
  # Offset 0 is the very start of the compressed stream, i.e. uncompressed
  # position 0, so the frame index has to go back to 0 as well. Otherwise
  # @lzx_offset would claim that data had already been skipped and the
  # wrong bytes would be extracted.
  entry = 0 if offset.zero?

  @lzx_input_offset = @chm.sec0.offset + content.offset + offset
  @lzx_offset = entry * LZX_FRAME_SIZE
  @lzx_length = length

  @input_handle.seek(@lzx_input_offset, Constants::SEEK_START)

  # Placeholder output; extract_compressed installs the real one per file.
  output_handle = System::MemoryHandle.new("")

  @lzx_state = Decompressors::LZX.new(
    @io_system,
    @input_handle,
    output_handle,
    4096,
    window_bits: window_bits,
    reset_interval: reset_interval / LZX_FRAME_SIZE,
    output_length: length - @lzx_offset,
  )
end
240
+
241
# Decode the LZXC ControlData block.
#
# Layout as read here: signature at byte 4, then little-endian 32-bit
# version (byte 8), reset interval (byte 12) and window size (byte 16).
# Version 2 stores both sizes in units of LZX frames; version 1 stores
# them in bytes.
#
# @param data [String] raw ControlData bytes
# @return [Array(Integer, Integer)] [window_size, reset_interval] in bytes
# @raise [Errors::SignatureError] if the signature is not "LZXC"
# @raise [Errors::FormatError] if the version is neither 1 nor 2
def parse_control_data(data)
  raise Errors::SignatureError, "Invalid LZXC signature" unless data[4, 4] == "LZXC"

  version, reset_interval, window_size =
    [8, 12, 16].map { |field_offset| data[field_offset, 4].unpack1("V") }

  unit = case version
         when 1 then 1
         when 2 then LZX_FRAME_SIZE
         else raise Errors::FormatError, "Unknown ControlData version: #{version}"
         end

  [window_size * unit, reset_interval * unit]
end
263
+
264
# Work out the uncompressed length and compressed offset to start decoding
# from, using the ResetTable when the archive has one and SpanInfo
# otherwise (in which case the offset is always 0).
#
# @param sec [Models::CHMSection] compressed section (section 1)
# @param entry [Integer] reset table entry index
# @param reset_interval [Integer] reset interval in bytes
# @return [Array(Integer, Integer)] [length, offset]
# @raise [Errors::FormatError] if neither system file exists
def read_reset_table(sec, entry, reset_interval)
  rtable = sec.rtable || find_system_file(Parser::RTABLE_NAME)
  return read_reset_table_entry(rtable, entry, reset_interval) if rtable

  spaninfo = sec.spaninfo || find_system_file(Parser::SPANINFO_NAME)
  raise Errors::FormatError, "Neither ResetTable nor SpanInfo found" unless spaninfo

  [read_spaninfo(spaninfo), 0]
end
283
+
284
# Read one entry from the reset table.
#
# Header fields as read here (all little-endian): entry count at byte 4,
# entry size at byte 8, table offset at byte 12, uncompressed length at
# byte 16 (64-bit) and frame length at byte 32 (64-bit).
#
# @param rtable [Models::CHMFile] the ResetTable system file
# @param entry [Integer] index of the wanted entry
# @param reset_interval [Integer] reset interval in bytes
# @return [Array(Integer, Integer)] [length, offset]; when the entry is out
#   of range or lies outside the table data this is
#   [uncompressed_length, 0], meaning "start from the beginning"
# @raise [Errors::FormatError] if the table is missing/too short, the frame
#   length is not LZX_FRAME_SIZE, or the entry size is neither 4 nor 8
def read_reset_table_entry(rtable, entry, reset_interval)
  data = read_system_file(rtable)
  # A read at end-of-file can return nil; report it as a short table
  # instead of failing with NoMethodError.
  raise Errors::FormatError, "ResetTable too short" if data.nil? || data.length < 40

  # Skip 4 bytes, three 32-bit fields, the uncompressed length, skip the
  # next 8 bytes, then the frame length.
  num_entries, entry_size, table_offset, uncomp_len, frame_len =
    data.unpack("x4 V V V Q< x8 Q<")

  raise Errors::FormatError, "Invalid frame length" unless frame_len == LZX_FRAME_SIZE

  pos = table_offset + (entry * entry_size)
  in_table = entry < num_entries && pos + entry_size <= data.length
  return [uncomp_len, 0] unless in_table

  offset = case entry_size
           when 4 then data[pos, 4].unpack1("V")
           when 8 then data[pos, 8].unpack1("Q<")
           else
             raise Errors::FormatError, "Invalid entry size: #{entry_size}"
           end

  # Pad the stated length up to the next reset interval boundary.
  length = (uncomp_len + reset_interval - 1) & -reset_interval

  [length, offset]
end
324
+
325
# Read SpanInfo, which holds the uncompressed length of the compressed
# section as a single little-endian 64-bit integer.
#
# @param spaninfo [Models::CHMFile] the SpanInfo system file
# @return [Integer] uncompressed length (always positive)
# @raise [Errors::FormatError] if the data is missing, is not exactly
#   8 bytes, or encodes a length of zero
def read_spaninfo(spaninfo)
  data = read_system_file(spaninfo)
  # A read at end-of-file can return nil; that is a size error, not a crash.
  raise Errors::FormatError, "SpanInfo wrong size" unless data && data.length == 8

  length = data.unpack1("Q<")
  raise Errors::FormatError, "Invalid SpanInfo length" unless length.positive?

  length
end
338
+
339
# Walk the linked list of system files (starting at @chm.sysfiles) and
# return the first one whose filename equals +name+.
#
# @param name [String] exact system file name
# @return [Models::CHMFile, nil] matching file, or nil when none matches
def find_system_file(name)
  node = @chm.sysfiles
  node = node.next_file until node.nil? || node.filename == name
  node
end
349
+
350
# Read the raw contents of a system file stored in section 0.
#
# @param file [Models::CHMFile] system file to read
# @return [String] the bytes read; may be shorter than file.length (or
#   empty) when the archive is truncated, so callers' size checks report a
#   format error instead of crashing on nil
# @raise [Errors::FormatError] if the file is not stored in section 0
def read_system_file(file)
  raise Errors::FormatError, "System file must be in section 0" unless file.section.id.zero?

  # File offsets are relative to the start of content section 0.
  @input_handle.seek(@chm.sec0.offset + file.offset, Constants::SEEK_START)

  # read returns nil at end-of-file; normalize to an empty binary string.
  @input_handle.read(file.length) || String.new
end
361
+
362
# Look a file up via the PMGI index.
#
# TODO: Implement PMGI-based binary search. Until then this simply performs
# the same linear PMGL scan as fast_search_pmgl.
#
# @param filename [String] name to look for
# @return [Models::CHMFile, nil]
def fast_search_pmgi(filename)
  linear_result = fast_search_pmgl(filename)
  linear_result
end
368
+
369
# Scan the PMGL chunks first_pmgl..last_pmgl for +filename+.
#
# The input handle's position is restored on every exit path (hit, miss or
# exception), so a search never disturbs an extraction in progress.
#
# @param filename [String] name to look for
# @return [Models::CHMFile, nil] the entry if found, nil otherwise
def fast_search_pmgl(filename)
  original_pos = @input_handle.tell

  (@chm.first_pmgl..@chm.last_pmgl).each do |chunk_num|
    @input_handle.seek(@chm.dir_offset + (chunk_num * @chm.chunk_size), Constants::SEEK_START)
    chunk = @input_handle.read(@chm.chunk_size)

    # A short read means the archive ends here. Every later chunk lies at a
    # higher offset and would be short as well, so stop rather than keep
    # iterating up to a header-supplied last_pmgl.
    break unless chunk && chunk.length == @chm.chunk_size
    next unless chunk[0, 4] == "PMGL"

    file = search_chunk(chunk, filename)
    return file if file
  end

  nil
ensure
  @input_handle.seek(original_pos, Constants::SEEK_START) if original_pos
end
391
+
392
# Search one PMGL chunk for an entry named +filename+.
#
# Entries start after the 20-byte chunk header; the entry count is the
# little-endian 16-bit value in the chunk's last two bytes. Each entry is
# an ENCINT name length, the name bytes, then ENCINT section, offset and
# length.
#
# Names are compared byte-for-byte, so the lookup does not depend on the
# encoding tag of +filename+. Entries whose section number is greater
# than 1 are ignored, as Parser#parse_pmgl_chunk does.
#
# @param chunk [String] raw chunk bytes
# @param filename [String] name to look for
# @return [Models::CHMFile, nil] the matching entry, or nil if the chunk
#   has no such entry or is malformed
def search_chunk(chunk, filename)
  wanted = filename.b
  num_entries = chunk[-2, 2].unpack1("v")
  pos = 20
  chunk_end = chunk.length - 2

  num_entries.times do
    break if pos >= chunk_end

    name_len, pos = Binary::ENCINTReader.read_from_string(chunk, pos)
    break if pos + name_len > chunk_end

    raw_name = chunk[pos, name_len]
    pos += name_len

    section, pos = Binary::ENCINTReader.read_from_string(chunk, pos)
    offset, pos = Binary::ENCINTReader.read_from_string(chunk, pos)
    length, pos = Binary::ENCINTReader.read_from_string(chunk, pos)

    next unless raw_name.b == wanted
    # Only content sections 0 and 1 exist; anything else is not extractable.
    next if section > 1

    file = Models::CHMFile.new
    file.filename = raw_name.force_encoding("UTF-8")
    file.section = (section.zero? ? @chm.sec0 : @chm.sec1)
    file.offset = offset
    file.length = length
    return file
  end

  nil
rescue Errors::FormatError
  # Malformed ENCINT: stop scanning this chunk and report "not found".
  nil
end
427
+
428
# Forget the current LZX decoder and zero the offset/length bookkeeping so
# that the next compressed extraction starts from a fresh init_lzx.
def cleanup_lzx
  @lzx_state = nil
  @lzx_offset = @lzx_length = 0
end
434
+ end
435
+ end
436
+ end
@@ -0,0 +1,254 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../binary/chm_structures"
4
+ require_relative "../models/chm_header"
5
+ require_relative "../models/chm_file"
6
+ require_relative "../errors"
7
+
8
+ module Cabriolet
9
+ module CHM
10
+ # Parser for CHM (Compiled HTML Help) files
11
+ class Parser
12
+ # Expected GUID values in CHM headers
13
+ GUID1 = [0x10, 0xFD, 0x01, 0x7C, 0xAA, 0x7B, 0xD0, 0x11,
14
+ 0x9E, 0x0C, 0x00, 0xA0, 0xC9, 0x22, 0xE6, 0xEC].pack("C*")
15
+ GUID2 = [0x11, 0xFD, 0x01, 0x7C, 0xAA, 0x7B, 0xD0, 0x11,
16
+ 0x9E, 0x0C, 0x00, 0xA0, 0xC9, 0x22, 0xE6, 0xEC].pack("C*")
17
+
18
+ # System file names
19
+ CONTENT_NAME = "::DataSpace/Storage/MSCompressed/Content"
20
+ CONTROL_NAME = "::DataSpace/Storage/MSCompressed/ControlData"
21
+ SPANINFO_NAME = "::DataSpace/Storage/MSCompressed/SpanInfo"
22
+ RTABLE_NAME = "::DataSpace/Storage/MSCompressed/Transform/" \
23
+ "{7FC28940-9D31-11D0-9B27-00A0C91E9C7C}/InstanceData/ResetTable"
24
+
25
+ attr_reader :io, :chm
26
+
27
# Create a parser that reads from +io+ and fills a fresh Models::CHMHeader.
#
# @param io [Object] handle the parser calls seek/read/tell on
def initialize(io)
  @chm = Models::CHMHeader.new
  @io = io
end
31
+
32
# Parse the CHM file: ITSF header, header sections and directory header,
# then optionally every file entry.
#
# @param entire [Boolean] If true, parse all file entries. If false, only headers.
# @return [Models::CHMHeader]
def parse(entire: true)
  read_itsf_header
  read_header_sections
  read_directory_header
  return @chm unless entire

  read_file_entries
  @chm
end
44
+
45
+ private
46
+
47
# Read the ITSF header from the start of the file, verify its signature and
# both GUIDs, and copy version, timestamp and language into @chm.
#
# @raise [SignatureError] if the signature is not "ITSF" or a GUID differs
#   from the expected value
def read_itsf_header
  @io.seek(0, Constants::SEEK_START)
  itsf = Binary::CHMITSFHeader.read(@io)

  raise SignatureError, "Invalid ITSF signature" unless itsf.signature == "ITSF"

  guids_match = itsf.guid1 == GUID1 && itsf.guid2 == GUID2
  raise SignatureError, "Invalid CHM GUIDs" unless guids_match

  @chm.version = itsf.version
  @chm.timestamp = itsf.timestamp
  @chm.language = itsf.language_id
end
68
+
69
# Read the header section table from the current position, record the
# directory offset and content section 0 offset, then seek to header
# section 0 and take the file length from it.
def read_header_sections
  table = Binary::CHMHeaderSectionTable.read(@io)

  @chm.dir_offset = table.offset_hs1
  @chm.sec0.offset = table.offset_cs0

  @io.seek(table.offset_hs0, Constants::SEEK_START)
  @chm.length = Binary::CHMHeaderSection0.read(@io).file_len
end
82
+
83
# Read header section 1 (the ITSP directory header) at @chm.dir_offset.
#
# After the header is read, @chm.dir_offset is moved to the position just
# past it. For CHM versions below 3 the content section 0 offset is
# computed as the end of the directory chunks. Finishes by validating the
# chunk parameters.
#
# @raise [SignatureError] if the signature is not "ITSP"
def read_directory_header
  @io.seek(@chm.dir_offset, Constants::SEEK_START)
  itsp = Binary::CHMHeaderSection1.read(@io)

  raise SignatureError, "Invalid ITSP signature" if itsp.signature != "ITSP"

  # The chunks begin right after the header that was just consumed.
  @chm.dir_offset = @io.tell
  @chm.chunk_size = itsp.chunk_size
  @chm.density = itsp.density
  @chm.depth = itsp.depth
  @chm.index_root = itsp.index_root
  @chm.num_chunks = itsp.num_chunks
  @chm.first_pmgl = itsp.first_pmgl
  @chm.last_pmgl = itsp.last_pmgl

  if @chm.version < 3
    directory_bytes = @chm.chunk_size * @chm.num_chunks
    @chm.sec0.offset = @chm.dir_offset + directory_bytes
  end

  validate_chunk_parameters
end
108
+
109
# Sanity-check the directory parameters collected in @chm. Checks run in
# this order and the first failing one raises:
# content offset within file length, chunk size >= 20, at least one chunk,
# at most 100,000 chunks, chunk size <= 8192, first PMGL <= last PMGL, and
# index root either 0xFFFFFFFF or a valid chunk number.
#
# @return [nil]
# @raise [FormatError] describing the first violated constraint
def validate_chunk_parameters
  chm = @chm

  raise FormatError, "Content section offset beyond file length" if chm.sec0.offset > chm.length
  raise FormatError, "Chunk size too small" if chm.chunk_size < 20
  raise FormatError, "No chunks in CHM file" if chm.num_chunks.zero?
  raise FormatError, "Too many chunks (> 100,000)" if chm.num_chunks > 100_000
  raise FormatError, "Chunk size too large (> 8192)" if chm.chunk_size > 8192
  raise FormatError, "First PMGL > Last PMGL" if chm.first_pmgl > chm.last_pmgl

  has_index = chm.index_root != 0xFFFFFFFF
  raise FormatError, "Index root out of range" if has_index && chm.index_root >= chm.num_chunks
end
144
+
145
# Read every file entry from the PMGL chunks first_pmgl..last_pmgl.
#
# Chunks are read sequentially from the handle's current position; a seek
# is only issued when first_pmgl is not 0.
# NOTE(review): this relies on the handle being positioned at
# @chm.dir_offset when first_pmgl is 0 - parse calls this right after
# read_directory_header; confirm no other caller exists.
#
# System files are pushed onto the front of @chm.sysfiles (so that list is
# in reverse discovery order) and passed to identify_system_file; regular
# files are appended to @chm.files in discovery order.
#
# Reading stops at the first short read: once the archive has ended no
# later chunk can be read either, so iterating on would only burn time,
# and last_pmgl comes from the file header.
#
# @return [void]
def read_file_entries
  if @chm.first_pmgl != 0
    @io.seek(@chm.dir_offset + (@chm.first_pmgl * @chm.chunk_size), Constants::SEEK_START)
  end

  last_file = nil

  (@chm.last_pmgl - @chm.first_pmgl + 1).times do
    chunk = @io.read(@chm.chunk_size)
    break unless chunk && chunk.length == @chm.chunk_size

    # Index (non-PMGL) chunks carry no file entries.
    next unless chunk[0, 4] == "PMGL"

    parse_pmgl_chunk(chunk).each do |file|
      if file.system_file?
        file.next_file = @chm.sysfiles
        @chm.sysfiles = file
        identify_system_file(file)
      else
        if last_file
          last_file.next_file = file
        else
          @chm.files = file
        end
        last_file = file
      end
    end
  end
end
182
+
183
# Parse a PMGL chunk to extract file entries.
#
# Entries start after the 20-byte PMGL header; the entry count is the
# little-endian 16-bit value in the chunk's last two bytes. Each entry is
# an ENCINT name length, the name bytes, then ENCINT section, offset and
# length. Parsing stops early at the first entry that would run past the
# chunk or that contains a malformed ENCINT; entries read so far are kept.
#
# Skipped entries: names shorter than two bytes or with a NUL in their
# first two bytes, directory entries (offset 0, length 0, trailing "/"),
# and entries whose section number is greater than 1.
#
# @param chunk [String] The chunk data
# @return [Array<Models::CHMFile>] The files found in this chunk
def parse_pmgl_chunk(chunk)
  files = []

  num_entries = chunk[-2, 2].unpack1("v")
  pos = 20 # PMGL header is 20 bytes
  chunk_end = chunk.length - 2

  num_entries.times do
    break if pos >= chunk_end

    name_len, pos = Binary::ENCINTReader.read_from_string(chunk, pos)
    break if pos + name_len > chunk_end

    name = chunk[pos, name_len]
    pos += name_len

    section, pos = Binary::ENCINTReader.read_from_string(chunk, pos)
    offset, pos = Binary::ENCINTReader.read_from_string(chunk, pos)
    length, pos = Binary::ENCINTReader.read_from_string(chunk, pos)

    # Blank / one-char names, or names starting with NUL bytes. A nil test
    # on name[0] / name[1] can never succeed once name_len >= 2, so the
    # bytes themselves are inspected.
    next if name_len < 2 || name.getbyte(0).zero? || name.getbyte(1).zero?

    # Skip directory entries (end with '/')
    next if offset.zero? && length.zero? && name[-1] == "/"

    # Only content sections 0 and 1 exist.
    next if section > 1

    file = Models::CHMFile.new
    file.filename = name.force_encoding("UTF-8")
    file.section = (section.zero? ? @chm.sec0 : @chm.sec1)
    file.offset = offset
    file.length = length

    files << file
  end

  files
rescue Cabriolet::FormatError
  # Malformed entry: keep what was parsed before it.
  files
end
238
+
239
# Link a system file into the compressed section (@chm.sec1) when its name
# is one of the four MSCompressed system files: Content, ControlData,
# SpanInfo or ResetTable. Any other name is ignored.
#
# @param file [Models::CHMFile] system file just read from the directory
def identify_system_file(file)
  name = file.filename

  if name == CONTENT_NAME
    @chm.sec1.content = file
  elsif name == CONTROL_NAME
    @chm.sec1.control = file
  elsif name == SPANINFO_NAME
    @chm.sec1.spaninfo = file
  elsif name == RTABLE_NAME
    @chm.sec1.rtable = file
  end
end
252
+ end
253
+ end
254
+ end