cabriolet 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/ARCHITECTURE.md +799 -0
- data/CHANGELOG.md +44 -0
- data/LICENSE +29 -0
- data/README.adoc +1207 -0
- data/exe/cabriolet +6 -0
- data/lib/cabriolet/auto.rb +173 -0
- data/lib/cabriolet/binary/bitstream.rb +148 -0
- data/lib/cabriolet/binary/bitstream_writer.rb +180 -0
- data/lib/cabriolet/binary/chm_structures.rb +213 -0
- data/lib/cabriolet/binary/hlp_structures.rb +66 -0
- data/lib/cabriolet/binary/kwaj_structures.rb +74 -0
- data/lib/cabriolet/binary/lit_structures.rb +107 -0
- data/lib/cabriolet/binary/oab_structures.rb +112 -0
- data/lib/cabriolet/binary/structures.rb +56 -0
- data/lib/cabriolet/binary/szdd_structures.rb +60 -0
- data/lib/cabriolet/cab/compressor.rb +382 -0
- data/lib/cabriolet/cab/decompressor.rb +510 -0
- data/lib/cabriolet/cab/extractor.rb +357 -0
- data/lib/cabriolet/cab/parser.rb +264 -0
- data/lib/cabriolet/chm/compressor.rb +513 -0
- data/lib/cabriolet/chm/decompressor.rb +436 -0
- data/lib/cabriolet/chm/parser.rb +254 -0
- data/lib/cabriolet/cli.rb +776 -0
- data/lib/cabriolet/compressors/base.rb +34 -0
- data/lib/cabriolet/compressors/lzss.rb +250 -0
- data/lib/cabriolet/compressors/lzx.rb +581 -0
- data/lib/cabriolet/compressors/mszip.rb +315 -0
- data/lib/cabriolet/compressors/quantum.rb +446 -0
- data/lib/cabriolet/constants.rb +75 -0
- data/lib/cabriolet/decompressors/base.rb +39 -0
- data/lib/cabriolet/decompressors/lzss.rb +138 -0
- data/lib/cabriolet/decompressors/lzx.rb +726 -0
- data/lib/cabriolet/decompressors/mszip.rb +390 -0
- data/lib/cabriolet/decompressors/none.rb +27 -0
- data/lib/cabriolet/decompressors/quantum.rb +456 -0
- data/lib/cabriolet/errors.rb +39 -0
- data/lib/cabriolet/format_detector.rb +156 -0
- data/lib/cabriolet/hlp/compressor.rb +272 -0
- data/lib/cabriolet/hlp/decompressor.rb +198 -0
- data/lib/cabriolet/hlp/parser.rb +131 -0
- data/lib/cabriolet/huffman/decoder.rb +79 -0
- data/lib/cabriolet/huffman/encoder.rb +108 -0
- data/lib/cabriolet/huffman/tree.rb +138 -0
- data/lib/cabriolet/kwaj/compressor.rb +479 -0
- data/lib/cabriolet/kwaj/decompressor.rb +237 -0
- data/lib/cabriolet/kwaj/parser.rb +183 -0
- data/lib/cabriolet/lit/compressor.rb +255 -0
- data/lib/cabriolet/lit/decompressor.rb +250 -0
- data/lib/cabriolet/models/cabinet.rb +81 -0
- data/lib/cabriolet/models/chm_file.rb +28 -0
- data/lib/cabriolet/models/chm_header.rb +67 -0
- data/lib/cabriolet/models/chm_section.rb +38 -0
- data/lib/cabriolet/models/file.rb +119 -0
- data/lib/cabriolet/models/folder.rb +102 -0
- data/lib/cabriolet/models/folder_data.rb +21 -0
- data/lib/cabriolet/models/hlp_file.rb +45 -0
- data/lib/cabriolet/models/hlp_header.rb +37 -0
- data/lib/cabriolet/models/kwaj_header.rb +98 -0
- data/lib/cabriolet/models/lit_header.rb +55 -0
- data/lib/cabriolet/models/oab_header.rb +95 -0
- data/lib/cabriolet/models/szdd_header.rb +72 -0
- data/lib/cabriolet/modifier.rb +326 -0
- data/lib/cabriolet/oab/compressor.rb +353 -0
- data/lib/cabriolet/oab/decompressor.rb +315 -0
- data/lib/cabriolet/parallel.rb +333 -0
- data/lib/cabriolet/repairer.rb +288 -0
- data/lib/cabriolet/streaming.rb +221 -0
- data/lib/cabriolet/system/file_handle.rb +107 -0
- data/lib/cabriolet/system/io_system.rb +87 -0
- data/lib/cabriolet/system/memory_handle.rb +105 -0
- data/lib/cabriolet/szdd/compressor.rb +217 -0
- data/lib/cabriolet/szdd/decompressor.rb +184 -0
- data/lib/cabriolet/szdd/parser.rb +127 -0
- data/lib/cabriolet/validator.rb +332 -0
- data/lib/cabriolet/version.rb +5 -0
- data/lib/cabriolet.rb +104 -0
- metadata +157 -0
|
@@ -0,0 +1,436 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "parser"
|
|
4
|
+
require_relative "../decompressors/lzx"
|
|
5
|
+
require_relative "../system/file_handle"
|
|
6
|
+
require_relative "../system/memory_handle"
|
|
7
|
+
|
|
8
|
+
module Cabriolet
|
|
9
|
+
module CHM
|
|
10
|
+
# Decompressor for CHM (Compiled HTML Help) files
|
|
11
|
+
class Decompressor
|
|
12
|
+
LZX_FRAME_SIZE = 32_768
|
|
13
|
+
|
|
14
|
+
attr_reader :io_system, :chm
|
|
15
|
+
|
|
16
|
+
def initialize(io_system = nil)
  # Use the injected I/O layer, or build the default one when none is given.
  @io_system = io_system || System::IOSystem.new

  # Nothing is open yet: no parsed header, no input handle, no LZX decoder.
  @chm = @input_handle = @lzx_state = nil

  # Uncompressed position and length tracked for the current LZX stream.
  @lzx_offset = @lzx_length = 0
end
|
|
24
|
+
|
|
25
|
+
# Open a CHM file and parse its headers.
#
# @param filename [String] Path to the CHM file
# @param entire [Boolean] When true, the file entries are parsed as well
# @return [Models::CHMHeader] The parsed CHM header
# @raise [StandardError] Any open/parse error, re-raised after the input
#   handle has been closed and cleared
def open(filename, entire: true)
  begin
    @input_handle = @io_system.open(filename, Constants::MODE_READ)
    @chm = Parser.new(@input_handle).parse(entire: entire)
    @chm.filename = filename
  rescue StandardError => e
    # Do not keep a handle around for an archive that failed to parse.
    @input_handle&.close
    @input_handle = nil
    raise e
  end

  @chm
end
|
|
39
|
+
|
|
40
|
+
# Open a CHM file without parsing its file entries (headers only).
#
# @param filename [String] Path to the CHM file
# @return [Models::CHMHeader] The parsed CHM header
def fast_open(filename)
  parse_entries = false
  open(filename, entire: parse_entries)
end
|
|
46
|
+
|
|
47
|
+
# Close the CHM file: drop the LZX state, close the input handle (if any)
# and forget the parsed header.
#
# @return [nil]
def close
  cleanup_lzx
  @input_handle.close if @input_handle
  @input_handle = @chm = nil
end
|
|
54
|
+
|
|
55
|
+
# Extract a single file from the CHM archive to disk.
#
# @param file [Models::CHMFile] File to extract
# @param output_path [String] Output path for the extracted file
# @return [void]
# @raise [ArgumentError] If the file or its section is nil
# @raise [Errors::FormatError] If the section ID is neither 0 nor 1
def extract(file, output_path)
  raise ArgumentError, "File is nil" if file.nil?

  section = file.section
  raise ArgumentError, "File section is nil" if section.nil?

  if file.empty?
    # A zero-length entry only needs an empty output file.
    @io_system.open(output_path, Constants::MODE_WRITE).close
    return
  end

  # Section 0 holds uncompressed data, section 1 the LZX-compressed stream.
  case section.id
  when 0 then extract_uncompressed(file, output_path)
  when 1 then extract_compressed(file, output_path)
  else raise Errors::FormatError, "Invalid section ID: #{section.id}"
  end
end
|
|
78
|
+
|
|
79
|
+
# Look up a single file by name without relying on a parsed file list.
#
# @param filename [String] Name of the file to find
# @return [Models::CHMFile, nil] The file if found, nil otherwise
# @raise [ArgumentError] If no CHM is currently open
def fast_find(filename)
  raise ArgumentError, "CHM not opened" unless @chm

  # An index root that points at a real chunk selects the index-based search;
  # otherwise the PMGL chunks are scanned linearly.
  indexed = @chm.index_root < @chm.num_chunks
  return fast_search_pmgi(filename) if indexed

  fast_search_pmgl(filename)
end
|
|
93
|
+
|
|
94
|
+
private
|
|
95
|
+
|
|
96
|
+
# Extract an uncompressed file (section 0) by copying its bytes straight
# from the input to the output file.
#
# The output handle is closed in an +ensure+ so that it is not leaked when
# the copy fails part-way (for example on a truncated archive).
#
# @param file [Models::CHMFile] Entry to copy; its offset is relative to section 0
# @param output_path [String] Destination path
# @return [void]
# @raise [Errors::ReadError] If the input ends before +file.length+ bytes were read
def extract_uncompressed(file, output_path)
  output_handle = @io_system.open(output_path, Constants::MODE_WRITE)

  begin
    # File offsets are relative to the start of content section 0.
    @input_handle.seek(@chm.sec0.offset + file.offset, Constants::SEEK_START)

    buffer_size = 4096
    remaining = file.length

    while remaining.positive?
      chunk_size = [buffer_size, remaining].min
      data = @input_handle.read(chunk_size)
      if data.nil? || data.length < chunk_size
        raise Errors::ReadError, "Unexpected end of file"
      end

      output_handle.write(data)
      remaining -= chunk_size
    end
  ensure
    output_handle.close
  end
end
|
|
122
|
+
|
|
123
|
+
# Extract a compressed file (section 1, MSCompressed/LZX).
#
# The LZX stream is decoded sequentially: data between the decoder's current
# position and the file's offset is decoded into a throw-away buffer, then
# the file itself is decoded into memory and written to +output_path+.
#
# If decoding fails, the LZX state is discarded so that the next extraction
# re-initialises the decoder instead of continuing from a half-consumed
# stream. The output handle is always closed, even when writing fails.
#
# @param file [Models::CHMFile] Entry to extract; its offset is an uncompressed offset
# @param output_path [String] Destination path
# @return [void]
def extract_compressed(file, output_path)
  init_lzx(file) unless lzx_ready?(file)

  begin
    # Resume reading compressed input where the previous extraction stopped.
    @input_handle.seek(@lzx_input_offset, Constants::SEEK_START)

    skip_amount = file.offset - @lzx_offset
    if skip_amount.positive?
      # Decode and discard everything that precedes the requested file.
      # NOTE(review): the decoder's output is swapped via its @output ivar
      # because no public setter is visible from here.
      discard = System::MemoryHandle.new("", Constants::MODE_WRITE)
      @lzx_state.instance_variable_set(:@output, discard)
      @lzx_state.decompress(skip_amount)
      @lzx_offset += skip_amount
    end

    memory_output = System::MemoryHandle.new("", Constants::MODE_WRITE)
    @lzx_state.instance_variable_set(:@output, memory_output)
    @lzx_state.decompress(file.length)
    @lzx_offset += file.length

    # Remember the input position for the next sequential extraction.
    @lzx_input_offset = @input_handle.tell
  rescue StandardError
    # The decoder and @lzx_offset are no longer in sync; force a re-init.
    cleanup_lzx
    raise
  end

  output_handle = @io_system.open(output_path, Constants::MODE_WRITE)
  begin
    output_handle.write(memory_output.buffer)
  ensure
    output_handle.close
  end
end
|
|
157
|
+
|
|
158
|
+
# Tell whether the current LZX decoder can be reused for +file+.
#
# Reuse is possible only when a decoder exists and its uncompressed position
# has not already moved past the start of the file.
#
# @param file [Models::CHMFile] File about to be extracted
# @return [Boolean]
def lzx_ready?(file)
  if @lzx_state
    !(file.offset < @lzx_offset)
  else
    false
  end
end
|
|
165
|
+
|
|
166
|
+
# Initialize the LZX decompressor for section 1 so that +file+ can be decoded.
#
# Reads the ControlData system file for the window size and reset interval,
# looks up the reset point preceding +file.offset+, positions the input
# handle there and creates a fresh decoder.
#
# @param file [Models::CHMFile] File that is about to be extracted
# @return [Decompressors::LZX] The new decoder (also stored in @lzx_state)
# @raise [Errors::FormatError] If required system files are missing or the
#   control data is malformed
def init_lzx(file)
  cleanup_lzx

  sec = @chm.sec1

  # Prefer the entries cached on the section; otherwise search the system files.
  content = sec.content || find_system_file(Parser::CONTENT_NAME)
  control = sec.control || find_system_file(Parser::CONTROL_NAME)
  raise Errors::FormatError, "MSCompressed Content file not found" unless content
  raise Errors::FormatError, "ControlData file not found" unless control

  # A failed read may yield nil; treat that like any other wrong-size record.
  control_data = read_system_file(control)
  unless control_data && control_data.length == 28
    raise Errors::FormatError, "ControlData wrong size"
  end

  window_size, reset_interval = parse_control_data(control_data)

  # Only power-of-two windows from 2^15 (32 KiB) to 2^21 (2 MiB) are accepted.
  window_bits = (15..21).find { |bits| window_size == (1 << bits) }
  raise Errors::FormatError, "Invalid window size: #{window_size}" unless window_bits

  if reset_interval.zero? || (reset_interval % LZX_FRAME_SIZE) != 0
    raise Errors::FormatError, "Invalid reset interval: #{reset_interval}"
  end

  # Index (in 32 KiB frames) of the reset point at or before the file's start.
  frames_per_reset = reset_interval / LZX_FRAME_SIZE
  entry = (file.offset / reset_interval) * frames_per_reset

  length, offset = read_reset_table(sec, entry, reset_interval)

  # A compressed offset of 0 means decoding starts at the very beginning of
  # the stream. This is also what the lookup returns when it falls back
  # (entry out of range, or SpanInfo used instead of a ResetTable), so the
  # uncompressed position must be 0 as well; otherwise @lzx_offset would
  # claim data had already been skipped.
  entry = 0 if offset.zero?

  @lzx_input_offset = @chm.sec0.offset + content.offset + offset
  @lzx_offset = entry * LZX_FRAME_SIZE
  @lzx_length = length

  @input_handle.seek(@lzx_input_offset, Constants::SEEK_START)

  # Placeholder output; extract_compressed swaps in a real one per extraction.
  output_handle = System::MemoryHandle.new("")

  @lzx_state = Decompressors::LZX.new(
    @io_system,
    @input_handle,
    output_handle,
    4096,
    window_bits: window_bits,
    reset_interval: frames_per_reset,
    output_length: length - @lzx_offset,
  )
end
|
|
240
|
+
|
|
241
|
+
# Decode the LZXC ControlData record.
#
# Fields read here: signature at byte 4, version at byte 8, reset interval
# at byte 12 and window size at byte 16 (little-endian 32-bit values).
#
# @param data [String] Raw ControlData bytes
# @return [Array(Integer, Integer)] +[window_size, reset_interval]+ in bytes
# @raise [Errors::SignatureError] If the signature is not "LZXC"
# @raise [Errors::FormatError] If the version is neither 1 nor 2
def parse_control_data(data)
  raise Errors::SignatureError, "Invalid LZXC signature" unless data[4, 4] == "LZXC"

  version = data[8, 4].unpack1("V")
  reset_interval = data[12, 4].unpack1("V")
  window_size = data[16, 4].unpack1("V")

  case version
  when 2
    # Version 2 stores both values as a number of LZX frames.
    reset_interval *= LZX_FRAME_SIZE
    window_size *= LZX_FRAME_SIZE
  when 1
    # Version 1 values are used as read.
  else
    raise Errors::FormatError, "Unknown ControlData version: #{version}"
  end

  [window_size, reset_interval]
end
|
|
263
|
+
|
|
264
|
+
# Resolve the uncompressed length and the compressed offset for a reset entry.
#
# Uses the ResetTable when one exists; otherwise falls back to SpanInfo,
# which only provides the length, so the offset is reported as 0.
#
# @param sec [Object] Section 1 model (provides +rtable+ and +spaninfo+)
# @param entry [Integer] Reset table entry index
# @param reset_interval [Integer] Reset interval in bytes
# @return [Array(Integer, Integer)] +[length, offset]+
# @raise [Errors::FormatError] If neither system file can be found
def read_reset_table(sec, entry, reset_interval)
  rtable = sec.rtable || find_system_file(Parser::RTABLE_NAME)
  return read_reset_table_entry(rtable, entry, reset_interval) if rtable

  spaninfo = sec.spaninfo || find_system_file(Parser::SPANINFO_NAME)
  raise Errors::FormatError, "Neither ResetTable nor SpanInfo found" unless spaninfo

  [read_spaninfo(spaninfo), 0]
end
|
|
283
|
+
|
|
284
|
+
# Read one entry from the ResetTable system file.
#
# Header fields used (little-endian): entry count at byte 4, entry size at
# byte 8, table offset at byte 12, uncompressed length at byte 16 (64-bit)
# and frame length at byte 32 (64-bit).
#
# @param rtable [Models::CHMFile] The ResetTable system file
# @param entry [Integer] Index of the entry to read
# @param reset_interval [Integer] Reset interval in bytes
# @return [Array(Integer, Integer)] +[length, offset]+. For a valid entry the
#   length is the uncompressed length rounded up to the reset interval and
#   the offset is the entry's value; otherwise +[uncompressed_length, 0]+.
# @raise [Errors::FormatError] If the table is unreadable, too short, has an
#   unexpected frame length or an unsupported entry size
def read_reset_table_entry(rtable, entry, reset_interval)
  data = read_system_file(rtable)
  # A failed read may yield nil; report it as a short table rather than
  # crashing with NoMethodError.
  raise Errors::FormatError, "ResetTable too short" if data.nil? || data.length < 40

  frame_len = data[32, 8].unpack1("Q<")
  raise Errors::FormatError, "Invalid frame length" unless frame_len == LZX_FRAME_SIZE

  uncomp_len = data[16, 8].unpack1("Q<")
  num_entries, entry_size, table_offset = data[4, 12].unpack("VVV")

  # Entries outside the table fall back to "start of stream".
  pos = table_offset + (entry * entry_size)
  return [uncomp_len, 0] unless entry < num_entries && pos + entry_size <= data.length

  offset = case entry_size
           when 4 then data[pos, 4].unpack1("V")
           when 8 then data[pos, 8].unpack1("Q<")
           else
             raise Errors::FormatError, "Invalid entry size: #{entry_size}"
           end

  # Pad the length up to the next reset interval boundary.
  padded_length = (uncomp_len + reset_interval - 1) & -reset_interval

  [padded_length, offset]
end
|
|
324
|
+
|
|
325
|
+
# Read the SpanInfo system file, an 8-byte little-endian integer holding
# the uncompressed length.
#
# @param spaninfo [Models::CHMFile] The SpanInfo system file
# @return [Integer] Uncompressed length (always positive)
# @raise [Errors::FormatError] If the record is unreadable, not exactly
#   8 bytes long, or holds a non-positive length
def read_spaninfo(spaninfo)
  data = read_system_file(spaninfo)
  # A failed read may yield nil; treat it like a wrong-size record instead of
  # crashing with NoMethodError.
  raise Errors::FormatError, "SpanInfo wrong size" unless data && data.length == 8

  length = data.unpack1("Q<")
  raise Errors::FormatError, "Invalid SpanInfo length" unless length.positive?

  length
end
|
|
338
|
+
|
|
339
|
+
# Walk the linked list of system files and return the one named +name+.
#
# @param name [String] Exact system file name
# @return [Models::CHMFile, nil] The matching entry, or nil when absent
def find_system_file(name)
  node = @chm.sysfiles
  node = node.next_file while node && node.filename != name
  node || nil
end
|
|
349
|
+
|
|
350
|
+
# Read the raw bytes of a system file stored in section 0.
#
# @param file [Models::CHMFile] System file entry
# @return [String, nil] Whatever the input handle's +read+ returns for
#   +file.length+ bytes
# @raise [Errors::FormatError] If the file is not in section 0
def read_system_file(file)
  raise Errors::FormatError, "System file must be in section 0" unless file.section.id.zero?

  # Entry offsets are relative to the start of content section 0.
  @input_handle.seek(@chm.sec0.offset + file.offset, Constants::SEEK_START)
  @input_handle.read(file.length)
end
|
|
361
|
+
|
|
362
|
+
# Search for a file using the PMGI index.
#
# TODO: Implement PMGI-based binary search. Until then this simply
# delegates to the linear PMGL scan.
#
# @param filename [String] Name of the file to find
# @return [Models::CHMFile, nil]
def fast_search_pmgi(filename)
  fast_search_pmgl(filename)
end
|
|
368
|
+
|
|
369
|
+
# Scan the PMGL chunks in order and return the first entry named +filename+.
#
# The input handle's position is saved up front and restored before
# returning, whether or not a match was found.
#
# @param filename [String] Name of the file to find
# @return [Models::CHMFile, nil]
def fast_search_pmgl(filename)
  saved_pos = @input_handle.tell
  match = nil

  @chm.first_pmgl.upto(@chm.last_pmgl) do |chunk_num|
    @input_handle.seek(@chm.dir_offset + (chunk_num * @chm.chunk_size), Constants::SEEK_START)
    chunk = @input_handle.read(@chm.chunk_size)

    # Ignore short reads and chunks that are not PMGL listing chunks.
    next if !chunk || chunk.length != @chm.chunk_size
    next if chunk[0, 4] != "PMGL"

    match = search_chunk(chunk, filename)
    break if match
  end

  @input_handle.seek(saved_pos, Constants::SEEK_START)
  match
end
|
|
391
|
+
|
|
392
|
+
# Search one PMGL chunk for an entry whose name equals +filename+.
#
# Entries start after the 20-byte chunk header; the entry count is stored in
# the chunk's last two bytes. Each entry is an encoded-integer name length,
# the name bytes, then encoded section, offset and length values.
#
# @param chunk [String] Raw chunk bytes
# @param filename [String] Name to look for
# @return [Models::CHMFile, nil] A new file model for the match, or nil.
#   Scanning stops quietly at the first malformed entry.
def search_chunk(chunk, filename)
  entry_count = chunk[-2, 2].unpack1("v")
  cursor = 20
  limit = chunk.length - 2

  entry_count.times do
    break if cursor >= limit

    begin
      name_len, cursor = Binary::ENCINTReader.read_from_string(chunk, cursor)
      break if cursor + name_len > limit

      name = chunk[cursor, name_len].force_encoding("UTF-8")
      cursor += name_len

      section, cursor = Binary::ENCINTReader.read_from_string(chunk, cursor)
      offset, cursor = Binary::ENCINTReader.read_from_string(chunk, cursor)
      length, cursor = Binary::ENCINTReader.read_from_string(chunk, cursor)

      next unless name == filename

      found = Models::CHMFile.new
      found.filename = name
      # Section 0 maps to the uncompressed section, anything else to section 1.
      found.section = section.zero? ? @chm.sec0 : @chm.sec1
      found.offset = offset
      found.length = length
      return found
    rescue Errors::FormatError
      break
    end
  end

  nil
end
|
|
427
|
+
|
|
428
|
+
# Drop the LZX decoder and reset the tracked offset and length to zero.
def cleanup_lzx
  @lzx_state = nil
  @lzx_offset = @lzx_length = 0
end
|
|
434
|
+
end
|
|
435
|
+
end
|
|
436
|
+
end
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "../binary/chm_structures"
|
|
4
|
+
require_relative "../models/chm_header"
|
|
5
|
+
require_relative "../models/chm_file"
|
|
6
|
+
require_relative "../errors"
|
|
7
|
+
|
|
8
|
+
module Cabriolet
|
|
9
|
+
module CHM
|
|
10
|
+
# Parser for CHM (Compiled HTML Help) files
|
|
11
|
+
class Parser
|
|
12
|
+
# Expected GUID values in CHM headers (16 raw bytes each; they differ only
# in the first byte). Frozen explicitly: these strings are built at runtime
# by +pack+, so the frozen_string_literal magic comment does not cover them.
GUID1 = [0x10, 0xFD, 0x01, 0x7C, 0xAA, 0x7B, 0xD0, 0x11,
         0x9E, 0x0C, 0x00, 0xA0, 0xC9, 0x22, 0xE6, 0xEC].pack("C*").freeze
GUID2 = [0x11, 0xFD, 0x01, 0x7C, 0xAA, 0x7B, 0xD0, 0x11,
         0x9E, 0x0C, 0x00, 0xA0, 0xC9, 0x22, 0xE6, 0xEC].pack("C*").freeze

# Names of the system files that describe the MSCompressed (LZX) section
CONTENT_NAME = "::DataSpace/Storage/MSCompressed/Content"
CONTROL_NAME = "::DataSpace/Storage/MSCompressed/ControlData"
SPANINFO_NAME = "::DataSpace/Storage/MSCompressed/SpanInfo"
RTABLE_NAME = "::DataSpace/Storage/MSCompressed/Transform/" \
              "{7FC28940-9D31-11D0-9B27-00A0C91E9C7C}/InstanceData/ResetTable"
|
|
24
|
+
|
|
25
|
+
attr_reader :io, :chm
|
|
26
|
+
|
|
27
|
+
def initialize(io)
  # Header model that the parsing steps fill in.
  @chm = Models::CHMHeader.new
  # Handle the CHM data is read from.
  @io = io
end
|
|
31
|
+
|
|
32
|
+
# Parse the CHM file.
#
# The three header stages always run, in order; file entries are read only
# on request.
#
# @param entire [Boolean] If true, parse all file entries. If false, only headers.
# @return [Models::CHMHeader]
def parse(entire: true)
  %i[read_itsf_header read_header_sections read_directory_header].each do |stage|
    send(stage)
  end
  read_file_entries if entire

  @chm
end
|
|
44
|
+
|
|
45
|
+
private
|
|
46
|
+
|
|
47
|
+
# Read the ITSF header (main file header) from the start of the file and
# copy version, timestamp and language into the header model.
#
# @raise [SignatureError] If the signature is not "ITSF" or either GUID
#   differs from the expected value
def read_itsf_header
  @io.seek(0, Constants::SEEK_START)
  header = Binary::CHMITSFHeader.read(@io)

  raise SignatureError, "Invalid ITSF signature" unless header.signature == "ITSF"

  guids_match = header.guid1 == GUID1 && header.guid2 == GUID2
  raise SignatureError, "Invalid CHM GUIDs" unless guids_match

  @chm.version = header.version
  @chm.timestamp = header.timestamp
  @chm.language = header.language_id
end
|
|
68
|
+
|
|
69
|
+
# Read the header section table, then header section 0.
#
# Records the directory offset (header section 1) and the content section 0
# offset from the table, and the file length from header section 0.
def read_header_sections
  table = Binary::CHMHeaderSectionTable.read(@io)

  @chm.dir_offset = table.offset_hs1
  @chm.sec0.offset = table.offset_cs0

  @io.seek(table.offset_hs0, Constants::SEEK_START)
  @chm.length = Binary::CHMHeaderSection0.read(@io).file_len
end
|
|
82
|
+
|
|
83
|
+
# Read header section 1 (the ITSP directory header) and validate it.
#
# After the header is read, +dir_offset+ is moved to the current read
# position, so it points just past the header.
#
# @raise [SignatureError] If the signature is not "ITSP"
def read_directory_header
  @io.seek(@chm.dir_offset, Constants::SEEK_START)
  hs1 = Binary::CHMHeaderSection1.read(@io)

  raise SignatureError, "Invalid ITSP signature" unless hs1.signature == "ITSP"

  @chm.dir_offset = @io.tell

  # Copy the directory geometry straight across to the header model.
  %i[chunk_size density depth index_root num_chunks first_pmgl last_pmgl].each do |field|
    @chm.public_send("#{field}=", hs1.public_send(field))
  end

  # For CHM versions < 3 the content section starts right after the chunks.
  if @chm.version < 3
    @chm.sec0.offset = @chm.dir_offset + (@chm.chunk_size * @chm.num_chunks)
  end

  validate_chunk_parameters
end
|
|
108
|
+
|
|
109
|
+
# Sanity-check the directory parameters read from the headers.
#
# @return [nil]
# @raise [FormatError] On the first check that fails
def validate_chunk_parameters
  raise FormatError, "Content section offset beyond file length" if @chm.sec0.offset > @chm.length

  raise FormatError, "Chunk size too small" if @chm.chunk_size < 20
  raise FormatError, "No chunks in CHM file" if @chm.num_chunks.zero?

  # Upper bounds that keep a corrupt header from driving huge reads.
  raise FormatError, "Too many chunks (> 100,000)" if @chm.num_chunks > 100_000
  raise FormatError, "Chunk size too large (> 8192)" if @chm.chunk_size > 8192

  raise FormatError, "First PMGL > Last PMGL" if @chm.first_pmgl > @chm.last_pmgl

  # 0xFFFFFFFF is accepted as-is; any other root must name an existing chunk.
  if @chm.index_root != 0xFFFFFFFF && @chm.index_root >= @chm.num_chunks
    raise FormatError, "Index root out of range"
  end

  nil
end
|
|
144
|
+
|
|
145
|
+
# Read every file entry from the PMGL chunks and link them into two lists:
# system files are pushed onto the front of +@chm.sysfiles+, regular files
# are appended to +@chm.files+ in directory order.
def read_file_entries
  # When the first PMGL chunk is chunk 0, reading continues from the current
  # position; otherwise jump to the chunk explicitly.
  unless @chm.first_pmgl.zero?
    @io.seek(@chm.dir_offset + (@chm.first_pmgl * @chm.chunk_size), Constants::SEEK_START)
  end

  tail = nil
  chunk_count = @chm.last_pmgl - @chm.first_pmgl + 1

  chunk_count.times do
    chunk = @io.read(@chm.chunk_size)

    # Skip short reads and anything that is not a PMGL listing chunk.
    next if !chunk || chunk.length != @chm.chunk_size
    next if chunk[0, 4] != "PMGL"

    parse_pmgl_chunk(chunk).each do |file|
      if file.system_file?
        file.next_file = @chm.sysfiles
        @chm.sysfiles = file
        identify_system_file(file)
        next
      end

      if tail
        tail.next_file = file
      else
        @chm.files = file
      end
      tail = file
    end
  end
end
|
|
182
|
+
|
|
183
|
+
# Parse a PMGL chunk into file entries.
#
# Entries start after the 20-byte PMGL header; the entry count is stored in
# the chunk's last two bytes. Blank or single-character names, directory
# entries and entries with a section number above 1 are skipped. Parsing
# stops quietly at the first malformed entry.
#
# @param chunk [String] The chunk data
# @return [Array<Models::CHMFile>] The files found in this chunk
def parse_pmgl_chunk(chunk)
  entries = []
  entry_count = chunk[-2, 2].unpack1("v")
  cursor = 20
  limit = chunk.length - 2

  entry_count.times do
    break if cursor >= limit

    begin
      name_len, cursor = Binary::ENCINTReader.read_from_string(chunk, cursor)
      break if cursor + name_len > limit

      name = chunk[cursor, name_len]
      cursor += name_len

      section, cursor = Binary::ENCINTReader.read_from_string(chunk, cursor)
      offset, cursor = Binary::ENCINTReader.read_from_string(chunk, cursor)
      length, cursor = Binary::ENCINTReader.read_from_string(chunk, cursor)

      # Blank or single-character names
      next if name_len < 2 || name[0].nil? || name[1].nil?
      # Directory entries: zero offset and length, name ending in '/'
      next if name[-1] == "/" && offset.zero? && length.zero?
      # Only sections 0 and 1 are valid
      next if section > 1

      file = Models::CHMFile.new
      file.filename = name.force_encoding("UTF-8")
      file.section = section.zero? ? @chm.sec0 : @chm.sec1
      file.offset = offset
      file.length = length
      entries << file
    rescue Cabriolet::FormatError
      # Malformed entry: give up on the rest of this chunk.
      break
    end
  end

  entries
end
|
|
238
|
+
|
|
239
|
+
# Attach a well-known MSCompressed system file to section 1.
#
# Files whose name matches none of the four known names are ignored.
#
# @param file [Models::CHMFile] A system file entry
def identify_system_file(file)
  name = file.filename

  if name == CONTENT_NAME
    @chm.sec1.content = file
  elsif name == CONTROL_NAME
    @chm.sec1.control = file
  elsif name == SPANINFO_NAME
    @chm.sec1.spaninfo = file
  elsif name == RTABLE_NAME
    @chm.sec1.rtable = file
  end
end
|
|
252
|
+
end
|
|
253
|
+
end
|
|
254
|
+
end
|