omnizip 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +243 -368
- data/README.adoc +101 -5
- data/docs/guides/archive-formats/index.adoc +31 -1
- data/docs/guides/archive-formats/ole-format.adoc +316 -0
- data/docs/guides/archive-formats/rpm-format.adoc +249 -0
- data/docs/index.adoc +12 -2
- data/lib/omnizip/algorithms/lzma/distance_coder.rb +29 -18
- data/lib/omnizip/algorithms/lzma/encoder.rb +2 -1
- data/lib/omnizip/algorithms/lzma/length_coder.rb +6 -3
- data/lib/omnizip/algorithms/lzma/literal_decoder.rb +2 -1
- data/lib/omnizip/algorithms/lzma/lzip_decoder.rb +40 -13
- data/lib/omnizip/algorithms/lzma/range_decoder.rb +36 -2
- data/lib/omnizip/algorithms/lzma/range_encoder.rb +19 -0
- data/lib/omnizip/algorithms/lzma/xz_encoder_fast.rb +2 -1
- data/lib/omnizip/algorithms/lzma/xz_utils_decoder.rb +148 -112
- data/lib/omnizip/algorithms/lzma.rb +20 -5
- data/lib/omnizip/algorithms/ppmd7/decoder.rb +25 -21
- data/lib/omnizip/algorithms/ppmd7/encoder.rb +4 -11
- data/lib/omnizip/algorithms/sevenzip_lzma2.rb +2 -1
- data/lib/omnizip/algorithms/xz_lzma2.rb +2 -1
- data/lib/omnizip/algorithms/zstandard/constants.rb +125 -9
- data/lib/omnizip/algorithms/zstandard/decoder.rb +202 -17
- data/lib/omnizip/algorithms/zstandard/encoder.rb +197 -17
- data/lib/omnizip/algorithms/zstandard/frame/block.rb +128 -0
- data/lib/omnizip/algorithms/zstandard/frame/header.rb +224 -0
- data/lib/omnizip/algorithms/zstandard/fse/bitstream.rb +186 -0
- data/lib/omnizip/algorithms/zstandard/fse/encoder.rb +325 -0
- data/lib/omnizip/algorithms/zstandard/fse/table.rb +269 -0
- data/lib/omnizip/algorithms/zstandard/huffman.rb +272 -0
- data/lib/omnizip/algorithms/zstandard/huffman_encoder.rb +339 -0
- data/lib/omnizip/algorithms/zstandard/literals.rb +178 -0
- data/lib/omnizip/algorithms/zstandard/literals_encoder.rb +251 -0
- data/lib/omnizip/algorithms/zstandard/sequences.rb +346 -0
- data/lib/omnizip/buffer/memory_extractor.rb +3 -3
- data/lib/omnizip/buffer.rb +2 -2
- data/lib/omnizip/filters/delta.rb +2 -1
- data/lib/omnizip/filters/registry.rb +6 -6
- data/lib/omnizip/formats/cpio/bounded_io.rb +66 -0
- data/lib/omnizip/formats/lzip.rb +2 -1
- data/lib/omnizip/formats/lzma_alone.rb +2 -1
- data/lib/omnizip/formats/ole/allocation_table.rb +244 -0
- data/lib/omnizip/formats/ole/constants.rb +61 -0
- data/lib/omnizip/formats/ole/dirent.rb +380 -0
- data/lib/omnizip/formats/ole/header.rb +198 -0
- data/lib/omnizip/formats/ole/ranges_io.rb +264 -0
- data/lib/omnizip/formats/ole/storage.rb +305 -0
- data/lib/omnizip/formats/ole/types/variant.rb +328 -0
- data/lib/omnizip/formats/ole.rb +145 -0
- data/lib/omnizip/formats/rar/compression/ppmd/decoder.rb +92 -49
- data/lib/omnizip/formats/rar/compression/ppmd/encoder.rb +13 -20
- data/lib/omnizip/formats/rar/rar5/compression/lzss.rb +6 -2
- data/lib/omnizip/formats/rar3/reader.rb +6 -2
- data/lib/omnizip/formats/rar5/reader.rb +4 -1
- data/lib/omnizip/formats/rpm/constants.rb +58 -0
- data/lib/omnizip/formats/rpm/entry.rb +102 -0
- data/lib/omnizip/formats/rpm/header.rb +113 -0
- data/lib/omnizip/formats/rpm/lead.rb +122 -0
- data/lib/omnizip/formats/rpm/tag.rb +230 -0
- data/lib/omnizip/formats/rpm.rb +434 -0
- data/lib/omnizip/formats/seven_zip/bcj2_stream_decompressor.rb +239 -0
- data/lib/omnizip/formats/seven_zip/coder_chain.rb +32 -8
- data/lib/omnizip/formats/seven_zip/constants.rb +1 -1
- data/lib/omnizip/formats/seven_zip/reader.rb +84 -8
- data/lib/omnizip/formats/seven_zip/stream_compressor.rb +2 -1
- data/lib/omnizip/formats/seven_zip/stream_decompressor.rb +6 -0
- data/lib/omnizip/formats/seven_zip/writer.rb +21 -9
- data/lib/omnizip/formats/seven_zip.rb +10 -0
- data/lib/omnizip/formats/xar/entry.rb +18 -5
- data/lib/omnizip/formats/xar/header.rb +34 -6
- data/lib/omnizip/formats/xar/reader.rb +43 -10
- data/lib/omnizip/formats/xar/toc.rb +34 -21
- data/lib/omnizip/formats/xar/writer.rb +15 -5
- data/lib/omnizip/formats/xz_impl/block_decoder.rb +45 -33
- data/lib/omnizip/formats/xz_impl/block_encoder.rb +2 -1
- data/lib/omnizip/formats/xz_impl/index_decoder.rb +3 -1
- data/lib/omnizip/formats/xz_impl/stream_header_parser.rb +2 -1
- data/lib/omnizip/formats/zip/end_of_central_directory.rb +4 -3
- data/lib/omnizip/implementations/seven_zip/lzma/decoder.rb +14 -6
- data/lib/omnizip/implementations/seven_zip/lzma/encoder.rb +2 -1
- data/lib/omnizip/implementations/seven_zip/lzma2/encoder.rb +28 -13
- data/lib/omnizip/implementations/xz_utils/lzma2/encoder.rb +13 -6
- data/lib/omnizip/pipe/stream_compressor.rb +1 -1
- data/lib/omnizip/version.rb +1 -1
- data/readme-docs/compression-algorithms.adoc +6 -2
- metadata +30 -2
|
@@ -22,17 +22,20 @@ module Omnizip
|
|
|
22
22
|
def self.build_from_folder(folder)
|
|
23
23
|
return nil if folder.coders.empty?
|
|
24
24
|
|
|
25
|
-
#
|
|
26
|
-
|
|
25
|
+
# Find the compression method (not a filter) among coders
|
|
26
|
+
# Filters like BCJ, BCJ2 have specific method IDs
|
|
27
|
+
main_coder = find_compression_coder(folder.coders)
|
|
28
|
+
raise "No compression method found in folder" unless main_coder
|
|
29
|
+
|
|
27
30
|
algorithm = algorithm_for_method(main_coder.method_id)
|
|
28
31
|
|
|
29
|
-
# Check for filters
|
|
32
|
+
# Check for filters (all coders except the compression method)
|
|
30
33
|
filters = []
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
34
|
+
folder.coders.each do |coder|
|
|
35
|
+
next if coder == main_coder
|
|
36
|
+
|
|
37
|
+
filter = filter_for_method(coder.method_id)
|
|
38
|
+
filters << filter if filter
|
|
36
39
|
end
|
|
37
40
|
|
|
38
41
|
{
|
|
@@ -43,6 +46,25 @@ module Omnizip
|
|
|
43
46
|
}
|
|
44
47
|
end
|
|
45
48
|
|
|
49
|
+
# Find the compression coder among all coders
|
|
50
|
+
#
|
|
51
|
+
# @param coders [Array<Models::CoderInfo>] All coders in the folder
|
|
52
|
+
# @return [Models::CoderInfo, nil] The compression coder or nil
|
|
53
|
+
def self.find_compression_coder(coders)
|
|
54
|
+
# Try to find a known compression method
|
|
55
|
+
coders.each do |coder|
|
|
56
|
+
case coder.method_id
|
|
57
|
+
when MethodId::LZMA, MethodId::LZMA2, MethodId::BZIP2,
|
|
58
|
+
MethodId::DEFLATE, MethodId::DEFLATE64, MethodId::PPMD,
|
|
59
|
+
MethodId::COPY
|
|
60
|
+
return coder
|
|
61
|
+
end
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
# Fall back to last coder if no known compression method found
|
|
65
|
+
coders.last
|
|
66
|
+
end
|
|
67
|
+
|
|
46
68
|
# Map method ID to algorithm
|
|
47
69
|
#
|
|
48
70
|
# @param method_id [Integer] Method ID from .7z file
|
|
@@ -92,6 +114,8 @@ module Omnizip
|
|
|
92
114
|
:bcj_arm64
|
|
93
115
|
when FilterId::DELTA
|
|
94
116
|
:delta
|
|
117
|
+
when FilterId::BCJ2
|
|
118
|
+
:bcj2
|
|
95
119
|
end
|
|
96
120
|
end
|
|
97
121
|
|
|
@@ -6,6 +6,7 @@ require_relative "parser"
|
|
|
6
6
|
require_relative "models/stream_info"
|
|
7
7
|
require_relative "models/file_entry"
|
|
8
8
|
require_relative "stream_decompressor"
|
|
9
|
+
require_relative "bcj2_stream_decompressor"
|
|
9
10
|
require_relative "split_archive_reader"
|
|
10
11
|
require_relative "header_encryptor"
|
|
11
12
|
require_relative "encrypted_header"
|
|
@@ -26,12 +27,14 @@ module Omnizip
|
|
|
26
27
|
# @param file_path [String] Path to .7z file
|
|
27
28
|
# @param options [Hash] Reader options
|
|
28
29
|
# @option options [String] :password Password for encrypted headers
|
|
30
|
+
# @option options [Integer] :offset Offset for embedded archives
|
|
29
31
|
def initialize(file_path, options = {})
|
|
30
32
|
@file_path = file_path
|
|
31
33
|
@entries = []
|
|
32
34
|
@stream_info = nil
|
|
33
35
|
@split_reader = nil
|
|
34
36
|
@password = options[:password]
|
|
37
|
+
@offset = options[:offset] || 0
|
|
35
38
|
end
|
|
36
39
|
|
|
37
40
|
# Open and parse .7z archive
|
|
@@ -47,6 +50,8 @@ module Omnizip
|
|
|
47
50
|
@stream_info = @split_reader.stream_info
|
|
48
51
|
else
|
|
49
52
|
File.open(@file_path, "rb") do |io|
|
|
53
|
+
# Seek to offset for embedded archives
|
|
54
|
+
io.seek(@offset) if @offset.positive?
|
|
50
55
|
parse_archive(io)
|
|
51
56
|
end
|
|
52
57
|
end
|
|
@@ -111,6 +116,8 @@ module Omnizip
|
|
|
111
116
|
FileUtils.mkdir_p(output_path)
|
|
112
117
|
elsif entry.has_stream?
|
|
113
118
|
File.open(@file_path, "rb") do |io|
|
|
119
|
+
# Seek to offset for embedded archives
|
|
120
|
+
io.seek(@offset) if @offset.positive?
|
|
114
121
|
data = extract_entry_data(io, entry)
|
|
115
122
|
File.binwrite(output_path, data)
|
|
116
123
|
end
|
|
@@ -173,7 +180,8 @@ module Omnizip
|
|
|
173
180
|
# Read next header metadata
|
|
174
181
|
# NOTE: next_header_offset is from the END of the Start Header (byte 32)
|
|
175
182
|
# NOT from the end of the file
|
|
176
|
-
|
|
183
|
+
# For embedded archives, add offset to get absolute position
|
|
184
|
+
next_header_pos = @offset + Constants::START_HEADER_SIZE + @header.next_header_offset
|
|
177
185
|
io.seek(next_header_pos)
|
|
178
186
|
next_header_data = io.read(@header.next_header_size)
|
|
179
187
|
|
|
@@ -272,7 +280,8 @@ module Omnizip
|
|
|
272
280
|
end
|
|
273
281
|
|
|
274
282
|
# Decompress the header using the stream info
|
|
275
|
-
|
|
283
|
+
# For embedded archives, add offset to get absolute file position
|
|
284
|
+
pack_pos = @offset + @header.start_pos_after_header + stream_info.pack_pos
|
|
276
285
|
folder = stream_info.folders[0]
|
|
277
286
|
pack_size = stream_info.pack_sizes[0]
|
|
278
287
|
unpack_size = folder.uncompressed_size
|
|
@@ -383,16 +392,86 @@ module Omnizip
|
|
|
383
392
|
folder = @stream_info.folders[entry.folder_index]
|
|
384
393
|
return "" unless folder
|
|
385
394
|
|
|
386
|
-
# Calculate pack position
|
|
387
|
-
pack_pos = @header.start_pos_after_header +
|
|
395
|
+
# Calculate pack position (add offset for embedded archives)
|
|
396
|
+
pack_pos = @offset + @header.start_pos_after_header +
|
|
388
397
|
@stream_info.pack_pos
|
|
389
398
|
|
|
390
|
-
# Get pack
|
|
399
|
+
# Get pack sizes for this folder
|
|
391
400
|
pack_idx = 0
|
|
392
401
|
entry.folder_index.times do |i|
|
|
393
402
|
num_streams = @stream_info.folders[i].pack_stream_indices.size
|
|
394
403
|
pack_idx += num_streams
|
|
395
404
|
end
|
|
405
|
+
|
|
406
|
+
# Check if this is a BCJ2 multi-stream folder
|
|
407
|
+
if Bcj2StreamDecompressor.bcj2_folder?(folder)
|
|
408
|
+
extract_bcj2_entry(io, entry, folder, pack_pos, pack_idx)
|
|
409
|
+
else
|
|
410
|
+
extract_regular_entry(io, entry, folder, pack_pos, pack_idx)
|
|
411
|
+
end
|
|
412
|
+
rescue StandardError => e
|
|
413
|
+
warn "Extraction failed for #{entry.name}: #{e.message}"
|
|
414
|
+
raise
|
|
415
|
+
end
|
|
416
|
+
|
|
417
|
+
# Extract entry from BCJ2 multi-stream folder
|
|
418
|
+
#
|
|
419
|
+
# @param io [IO] Archive file handle
|
|
420
|
+
# @param entry [Models::FileEntry] Entry to extract
|
|
421
|
+
# @param folder [Models::Folder] Folder specification
|
|
422
|
+
# @param pack_pos [Integer] Base pack position
|
|
423
|
+
# @param pack_idx [Integer] Starting pack index
|
|
424
|
+
# @return [String] Extracted data
|
|
425
|
+
def extract_bcj2_entry(io, entry, folder, pack_pos, pack_idx)
|
|
426
|
+
# BCJ2 folders have multiple pack streams
|
|
427
|
+
num_pack_streams = folder.pack_stream_indices.size
|
|
428
|
+
pack_sizes = Array.new(num_pack_streams) do |i|
|
|
429
|
+
@stream_info.pack_sizes[pack_idx + i] || 0
|
|
430
|
+
end
|
|
431
|
+
|
|
432
|
+
# Decompress the entire BCJ2 folder
|
|
433
|
+
decompressor = Bcj2StreamDecompressor.new(
|
|
434
|
+
io, folder, pack_pos, pack_sizes, @stream_info
|
|
435
|
+
)
|
|
436
|
+
full_data = decompressor.decompress(folder.uncompressed_size)
|
|
437
|
+
|
|
438
|
+
# For solid archives, extract this file's portion
|
|
439
|
+
num_files_in_folder = @stream_info.num_unpack_streams_in_folders[entry.folder_index] || 1
|
|
440
|
+
|
|
441
|
+
if num_files_in_folder > 1
|
|
442
|
+
# Find offset of this file within the uncompressed stream
|
|
443
|
+
file_offset = 0
|
|
444
|
+
@entries.each do |e|
|
|
445
|
+
break if e.file_index == entry.file_index
|
|
446
|
+
|
|
447
|
+
file_offset += e.size if e.has_stream? && e.folder_index == entry.folder_index
|
|
448
|
+
end
|
|
449
|
+
data = full_data[file_offset, entry.size]
|
|
450
|
+
else
|
|
451
|
+
data = full_data[0, entry.size]
|
|
452
|
+
end
|
|
453
|
+
|
|
454
|
+
# Verify CRC if available
|
|
455
|
+
if entry.crc
|
|
456
|
+
crc = Omnizip::Checksums::Crc32.new
|
|
457
|
+
crc.update(data)
|
|
458
|
+
unless crc.value == entry.crc
|
|
459
|
+
raise "CRC mismatch for #{entry.name}: expected 0x#{entry.crc.to_s(16)}, got 0x#{crc.value.to_s(16)}"
|
|
460
|
+
end
|
|
461
|
+
end
|
|
462
|
+
|
|
463
|
+
data
|
|
464
|
+
end
|
|
465
|
+
|
|
466
|
+
# Extract entry from regular (non-BCJ2) folder
|
|
467
|
+
#
|
|
468
|
+
# @param io [IO] Archive file handle
|
|
469
|
+
# @param entry [Models::FileEntry] Entry to extract
|
|
470
|
+
# @param folder [Models::Folder] Folder specification
|
|
471
|
+
# @param pack_pos [Integer] Base pack position
|
|
472
|
+
# @param pack_idx [Integer] Starting pack index
|
|
473
|
+
# @return [String] Extracted data
|
|
474
|
+
def extract_regular_entry(io, entry, folder, pack_pos, pack_idx)
|
|
396
475
|
pack_size = @stream_info.pack_sizes[pack_idx] || 0
|
|
397
476
|
|
|
398
477
|
# For solid archives, multiple files share one compressed stream
|
|
@@ -434,9 +513,6 @@ module Omnizip
|
|
|
434
513
|
expected_crc = entry.crc
|
|
435
514
|
decompressor.decompress_and_verify(entry.size, expected_crc)
|
|
436
515
|
end
|
|
437
|
-
rescue StandardError => e
|
|
438
|
-
warn "Extraction failed for #{entry.name}: #{e.message}"
|
|
439
|
-
raise
|
|
440
516
|
end
|
|
441
517
|
|
|
442
518
|
# Check if file path indicates a split archive
|
|
@@ -54,7 +54,8 @@ module Omnizip
|
|
|
54
54
|
|
|
55
55
|
# For 7-Zip format, use raw_mode (no property byte in compressed data)
|
|
56
56
|
# The properties are encoded in the 7-Zip header instead
|
|
57
|
-
encoder.compress(input_io, output_io,
|
|
57
|
+
encoder.compress(input_io, output_io,
|
|
58
|
+
{ raw_mode: true, standalone: false })
|
|
58
59
|
result = output_io.string
|
|
59
60
|
end
|
|
60
61
|
|
|
@@ -80,6 +80,12 @@ module Omnizip
|
|
|
80
80
|
filter_class = FilterRegistry.get(filter_sym)
|
|
81
81
|
next unless filter_class
|
|
82
82
|
|
|
83
|
+
# BCJ2 requires special handling with multiple streams
|
|
84
|
+
if filter_sym == :bcj2
|
|
85
|
+
raise "BCJ2 archives require multi-stream decompression which is not yet implemented. " \
|
|
86
|
+
"Please use the 7z command-line tool for this archive."
|
|
87
|
+
end
|
|
88
|
+
|
|
83
89
|
filter = filter_class.new
|
|
84
90
|
filtered = StringIO.new
|
|
85
91
|
filter.reverse(StringIO.new(result), filtered)
|
|
@@ -134,7 +134,8 @@ module Omnizip
|
|
|
134
134
|
|
|
135
135
|
# Step 3: Build Next Header properties
|
|
136
136
|
# This includes kHeader, MAIN_STREAMS_INFO, FILES_INFO, etc.
|
|
137
|
-
next_header_data = build_next_header_properties(file_data,
|
|
137
|
+
next_header_data = build_next_header_properties(file_data,
|
|
138
|
+
packed_sizes)
|
|
138
139
|
|
|
139
140
|
# Step 4: Write the complete data section
|
|
140
141
|
# Note: CRC is stored in StartHeader, NOT appended to Next Header
|
|
@@ -150,7 +151,8 @@ module Omnizip
|
|
|
150
151
|
# (CRC32 is appended after the header data, not included in size)
|
|
151
152
|
next_header_size = next_header_data.bytesize
|
|
152
153
|
|
|
153
|
-
write_start_header(io, next_header_offset, next_header_size,
|
|
154
|
+
write_start_header(io, next_header_offset, next_header_size,
|
|
155
|
+
next_header_data)
|
|
154
156
|
end
|
|
155
157
|
|
|
156
158
|
# Build packed data for solid mode (LZMA2 compression)
|
|
@@ -192,7 +194,8 @@ module Omnizip
|
|
|
192
194
|
entry.size = data.bytesize
|
|
193
195
|
end
|
|
194
196
|
|
|
195
|
-
{ data: combined, total_size: total_size,
|
|
197
|
+
{ data: combined, total_size: total_size,
|
|
198
|
+
streams: [{ data: combined, size: total_size }] }
|
|
196
199
|
else
|
|
197
200
|
# Non-solid mode: each file gets its own stream
|
|
198
201
|
streams = []
|
|
@@ -240,7 +243,8 @@ module Omnizip
|
|
|
240
243
|
# Solid mode: one pack stream, one folder
|
|
241
244
|
# packed_sizes is a single-element array with compressed size
|
|
242
245
|
compressed_size = packed_sizes.first
|
|
243
|
-
build_solid_streams_info(metadata, unpack_size, compressed_size,
|
|
246
|
+
build_solid_streams_info(metadata, unpack_size, compressed_size,
|
|
247
|
+
num_files)
|
|
244
248
|
else
|
|
245
249
|
# Non-solid mode: one pack stream per file, one folder per file
|
|
246
250
|
build_non_solid_streams_info(metadata, file_data[:streams])
|
|
@@ -286,7 +290,8 @@ module Omnizip
|
|
|
286
290
|
encrypted_header.to_binary
|
|
287
291
|
end
|
|
288
292
|
|
|
289
|
-
def build_solid_streams_info(metadata, unpack_size, compressed_size,
|
|
293
|
+
def build_solid_streams_info(metadata, unpack_size, compressed_size,
|
|
294
|
+
num_files)
|
|
290
295
|
# kPackInfo property (0x06)
|
|
291
296
|
metadata << [PropertyId::PACK_INFO].pack("C")
|
|
292
297
|
metadata << write_number(0) # Pack position
|
|
@@ -624,7 +629,8 @@ module Omnizip
|
|
|
624
629
|
footer
|
|
625
630
|
end
|
|
626
631
|
|
|
627
|
-
def write_start_header(io, next_header_offset, next_header_size,
|
|
632
|
+
def write_start_header(io, next_header_offset, next_header_size,
|
|
633
|
+
next_header_data)
|
|
628
634
|
header = String.new(encoding: "BINARY")
|
|
629
635
|
|
|
630
636
|
# Signature (6 bytes)
|
|
@@ -711,17 +717,23 @@ module Omnizip
|
|
|
711
717
|
# 11110xxx pattern
|
|
712
718
|
first_byte = 0xF0 | (value >> 32)
|
|
713
719
|
result << [first_byte].pack("C")
|
|
714
|
-
4.downto(1)
|
|
720
|
+
4.downto(1) do |i|
|
|
721
|
+
result << [(value >> (8 * (i - 1))) & 0xFF].pack("C")
|
|
722
|
+
end
|
|
715
723
|
when 6
|
|
716
724
|
# 111110xx pattern
|
|
717
725
|
first_byte = 0xF8 | (value >> 40)
|
|
718
726
|
result << [first_byte].pack("C")
|
|
719
|
-
5.downto(1)
|
|
727
|
+
5.downto(1) do |i|
|
|
728
|
+
result << [(value >> (8 * (i - 1))) & 0xFF].pack("C")
|
|
729
|
+
end
|
|
720
730
|
when 7
|
|
721
731
|
# 1111110x pattern
|
|
722
732
|
first_byte = 0xFC | (value >> 48)
|
|
723
733
|
result << [first_byte].pack("C")
|
|
724
|
-
6.downto(1)
|
|
734
|
+
6.downto(1) do |i|
|
|
735
|
+
result << [(value >> (8 * (i - 1))) & 0xFF].pack("C")
|
|
736
|
+
end
|
|
725
737
|
else
|
|
726
738
|
# 8 bytes: 11111110 or 11111111 prefix
|
|
727
739
|
result << if value < (1 << 56)
|
|
@@ -80,6 +80,16 @@ module Omnizip
|
|
|
80
80
|
reader
|
|
81
81
|
end
|
|
82
82
|
|
|
83
|
+
# Search for embedded .7z archive in self-extracting executable
|
|
84
|
+
#
|
|
85
|
+
# @param path [String] Path to potential self-extracting archive
|
|
86
|
+
# @return [Integer, nil] Offset of embedded 7z signature, or nil if not found
|
|
87
|
+
def self.search_embedded(path)
|
|
88
|
+
data = File.binread(path)
|
|
89
|
+
signature = Constants::SIGNATURE
|
|
90
|
+
data.index(signature)
|
|
91
|
+
end
|
|
92
|
+
|
|
83
93
|
# Auto-register .7z format when loaded
|
|
84
94
|
def self.register!
|
|
85
95
|
require_relative "../format_registry"
|
|
@@ -191,13 +191,26 @@ module Omnizip
|
|
|
191
191
|
data_hash[:length] = @data_size if @data_size&.positive?
|
|
192
192
|
|
|
193
193
|
if @data_encoding && @data_encoding != COMPRESSION_NONE
|
|
194
|
-
data_hash[:encoding] =
|
|
194
|
+
data_hash[:encoding] =
|
|
195
|
+
COMPRESSION_MIME_TYPES[@data_encoding] || @data_encoding
|
|
195
196
|
end
|
|
196
197
|
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
198
|
+
if @archived_checksum
|
|
199
|
+
data_hash[:archived_checksum] =
|
|
200
|
+
@archived_checksum
|
|
201
|
+
end
|
|
202
|
+
if @archived_checksum_style
|
|
203
|
+
data_hash[:archived_checksum_style] =
|
|
204
|
+
@archived_checksum_style
|
|
205
|
+
end
|
|
206
|
+
if @extracted_checksum
|
|
207
|
+
data_hash[:extracted_checksum] =
|
|
208
|
+
@extracted_checksum
|
|
209
|
+
end
|
|
210
|
+
if @extracted_checksum_style
|
|
211
|
+
data_hash[:extracted_checksum_style] =
|
|
212
|
+
@extracted_checksum_style
|
|
213
|
+
end
|
|
201
214
|
|
|
202
215
|
hash[:data] = data_hash
|
|
203
216
|
end
|
|
@@ -28,11 +28,35 @@ module Omnizip
|
|
|
28
28
|
# @return [Header] Parsed header object
|
|
29
29
|
# @raise [ArgumentError] If data is invalid
|
|
30
30
|
def self.parse(data)
|
|
31
|
-
|
|
31
|
+
if data.bytesize < HEADER_SIZE
|
|
32
|
+
raise ArgumentError,
|
|
33
|
+
"Header data too short (#{data.bytesize} bytes)"
|
|
34
|
+
end
|
|
32
35
|
|
|
33
36
|
magic = data[0, 4].unpack1("N")
|
|
34
|
-
header_size
|
|
35
|
-
|
|
37
|
+
# XAR spec: header_size and version are little-endian, rest are big-endian
|
|
38
|
+
# However, some tools (like macOS xar) store these in big-endian format.
|
|
39
|
+
# We detect this by checking if the parsed values make sense.
|
|
40
|
+
header_size_le = data[4, 2].unpack1("v") # little-endian
|
|
41
|
+
header_size_be = data[4, 2].unpack1("n") # big-endian
|
|
42
|
+
version_le = data[6, 2].unpack1("v") # little-endian
|
|
43
|
+
version_be = data[6, 2].unpack1("n") # big-endian
|
|
44
|
+
|
|
45
|
+
# Detect endianness: standard header is 28 bytes, version is 1
|
|
46
|
+
# If little-endian gives valid values, use it; otherwise use big-endian
|
|
47
|
+
if header_size_le == HEADER_SIZE && version_le == XAR_VERSION
|
|
48
|
+
header_size = header_size_le
|
|
49
|
+
version = version_le
|
|
50
|
+
elsif header_size_be == HEADER_SIZE && version_be == XAR_VERSION
|
|
51
|
+
header_size = header_size_be
|
|
52
|
+
version = version_be
|
|
53
|
+
else
|
|
54
|
+
# Default to little-endian (spec-compliant)
|
|
55
|
+
header_size = header_size_le
|
|
56
|
+
version = version_le
|
|
57
|
+
# Normalize version 256 to 1 (big-endian encoding of version 1)
|
|
58
|
+
version = 1 if version == 256
|
|
59
|
+
end
|
|
36
60
|
toc_compressed_size = data[8, 8].unpack1("Q>") # big-endian uint64
|
|
37
61
|
toc_uncompressed_size = data[16, 8].unpack1("Q>") # big-endian uint64
|
|
38
62
|
checksum_algorithm = data[24, 4].unpack1("N")
|
|
@@ -104,7 +128,9 @@ module Omnizip
|
|
|
104
128
|
# @raise [ArgumentError] If header is invalid
|
|
105
129
|
def validate!
|
|
106
130
|
unless @magic == MAGIC
|
|
107
|
-
raise ArgumentError,
|
|
131
|
+
raise ArgumentError,
|
|
132
|
+
format("Invalid magic: 0x%08x (expected 0x%08x)", @magic,
|
|
133
|
+
MAGIC)
|
|
108
134
|
end
|
|
109
135
|
|
|
110
136
|
unless @header_size >= HEADER_SIZE
|
|
@@ -115,8 +141,10 @@ module Omnizip
|
|
|
115
141
|
raise ArgumentError, "Unsupported version: #{@version}"
|
|
116
142
|
end
|
|
117
143
|
|
|
118
|
-
unless [CKSUM_NONE, CKSUM_SHA1, CKSUM_MD5,
|
|
119
|
-
|
|
144
|
+
unless [CKSUM_NONE, CKSUM_SHA1, CKSUM_MD5,
|
|
145
|
+
CKSUM_OTHER].include?(@checksum_algorithm)
|
|
146
|
+
raise ArgumentError,
|
|
147
|
+
"Unknown checksum algorithm: #{@checksum_algorithm}"
|
|
120
148
|
end
|
|
121
149
|
|
|
122
150
|
if @checksum_algorithm == CKSUM_OTHER && @checksum_name.to_s.strip.empty?
|
|
@@ -114,10 +114,13 @@ module Omnizip
|
|
|
114
114
|
# @param entry [Entry] Entry to read
|
|
115
115
|
# @return [String, nil] Entry data or nil if no data
|
|
116
116
|
def read_data(entry)
|
|
117
|
-
return nil unless entry.
|
|
117
|
+
return nil unless entry.data_length&.positive?
|
|
118
118
|
return nil unless @file
|
|
119
119
|
|
|
120
120
|
@file.seek(@heap_offset + entry.data_offset)
|
|
121
|
+
# In XAR format:
|
|
122
|
+
# - data_length is the compressed (archived) size (what to read from heap)
|
|
123
|
+
# - data_size is the uncompressed (extracted) size (decompressed size)
|
|
121
124
|
compressed_data = @file.read(entry.data_length)
|
|
122
125
|
|
|
123
126
|
decompress_data(compressed_data, entry.data_encoding, entry.data_size)
|
|
@@ -130,7 +133,9 @@ module Omnizip
|
|
|
130
133
|
FileUtils.mkdir_p(output_dir)
|
|
131
134
|
|
|
132
135
|
# Sort entries to ensure directories are created first
|
|
133
|
-
sorted_entries = @entries.sort_by
|
|
136
|
+
sorted_entries = @entries.sort_by do |e|
|
|
137
|
+
[e.directory? ? 0 : 1, e.name]
|
|
138
|
+
end
|
|
134
139
|
|
|
135
140
|
sorted_entries.each do |entry|
|
|
136
141
|
extract_entry(entry, output_dir)
|
|
@@ -206,9 +211,11 @@ module Omnizip
|
|
|
206
211
|
@toc = Toc.parse(compressed_toc, @header.toc_uncompressed_size)
|
|
207
212
|
@entries = @toc.entries
|
|
208
213
|
|
|
209
|
-
# Calculate heap offset
|
|
214
|
+
# Calculate heap offset:
|
|
215
|
+
# The heap starts immediately after the compressed TOC.
|
|
216
|
+
# The TOC checksum is stored INSIDE the heap (at offset 0), not after it.
|
|
217
|
+
# File data offsets in the TOC are relative to the heap start.
|
|
210
218
|
@heap_offset = @header.header_size + @header.toc_compressed_size
|
|
211
|
-
@heap_offset += @header.checksum_size if @header.checksum?
|
|
212
219
|
end
|
|
213
220
|
|
|
214
221
|
# Decompress data based on encoding
|
|
@@ -235,14 +242,40 @@ module Omnizip
|
|
|
235
242
|
|
|
236
243
|
# Decompress gzip data
|
|
237
244
|
#
|
|
238
|
-
# @param data [String]
|
|
245
|
+
# @param data [String] Zlib compressed data (XAR uses zlib, not actual gzip)
|
|
239
246
|
# @return [String] Decompressed data
|
|
240
247
|
def decompress_gzip(data)
|
|
241
|
-
zlib
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
zlib
|
|
245
|
-
|
|
248
|
+
# XAR "gzip" compression is actually zlib format (with 0x78xx header)
|
|
249
|
+
# Try different decompression methods for robustness
|
|
250
|
+
|
|
251
|
+
# Method 1: Standard zlib format (with header)
|
|
252
|
+
begin
|
|
253
|
+
result = Zlib::Inflate.inflate(data)
|
|
254
|
+
return result
|
|
255
|
+
rescue Zlib::Error
|
|
256
|
+
# Continue to next method
|
|
257
|
+
end
|
|
258
|
+
|
|
259
|
+
# Method 2: Raw deflate (some implementations may use this)
|
|
260
|
+
begin
|
|
261
|
+
inf = Zlib::Inflate.new(-Zlib::MAX_WBITS)
|
|
262
|
+
result = inf.inflate(data)
|
|
263
|
+
inf.finish
|
|
264
|
+
inf.close
|
|
265
|
+
return result
|
|
266
|
+
rescue Zlib::Error
|
|
267
|
+
# Continue to next method
|
|
268
|
+
end
|
|
269
|
+
|
|
270
|
+
# Method 3: Raw deflate without finish (for truncated data)
|
|
271
|
+
begin
|
|
272
|
+
inf = Zlib::Inflate.new(-Zlib::MAX_WBITS)
|
|
273
|
+
result = inf.inflate(data)
|
|
274
|
+
inf.close
|
|
275
|
+
result
|
|
276
|
+
rescue Zlib::Error => e
|
|
277
|
+
raise "Failed to decompress data: #{e.message}"
|
|
278
|
+
end
|
|
246
279
|
end
|
|
247
280
|
|
|
248
281
|
# Decompress bzip2 data
|
|
@@ -20,7 +20,8 @@ module Omnizip
|
|
|
20
20
|
class Toc
|
|
21
21
|
include Constants
|
|
22
22
|
|
|
23
|
-
attr_accessor :creation_time, :checksum_offset, :checksum_size,
|
|
23
|
+
attr_accessor :creation_time, :checksum_offset, :checksum_size,
|
|
24
|
+
:checksum_style
|
|
24
25
|
attr_reader :entries
|
|
25
26
|
|
|
26
27
|
# Parse TOC from compressed data
|
|
@@ -36,17 +37,23 @@ module Omnizip
|
|
|
36
37
|
|
|
37
38
|
# Decompress TOC data
|
|
38
39
|
#
|
|
39
|
-
# @param compressed_data [String]
|
|
40
|
+
# @param compressed_data [String] Zlib-compressed data
|
|
40
41
|
# @param expected_size [Integer, nil] Expected size for validation
|
|
41
42
|
# @return [String] Decompressed XML
|
|
42
43
|
def self.decompress(compressed_data, expected_size = nil)
|
|
43
|
-
zlib
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
44
|
+
# XAR TOC is zlib compressed (with zlib headers, 0x78xx)
|
|
45
|
+
# Try zlib format first (most common), then fall back to raw deflate
|
|
46
|
+
result = begin
|
|
47
|
+
# Try standard zlib format (with header)
|
|
48
|
+
Zlib::Inflate.inflate(compressed_data)
|
|
49
|
+
rescue Zlib::DataError
|
|
50
|
+
# Fall back to raw deflate for non-conforming implementations
|
|
51
|
+
Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(compressed_data)
|
|
52
|
+
end
|
|
47
53
|
|
|
48
54
|
if expected_size && result.bytesize != expected_size
|
|
49
|
-
raise ArgumentError,
|
|
55
|
+
raise ArgumentError,
|
|
56
|
+
"TOC size mismatch: #{result.bytesize} != #{expected_size}"
|
|
50
57
|
end
|
|
51
58
|
|
|
52
59
|
result
|
|
@@ -279,10 +286,10 @@ module Omnizip
|
|
|
279
286
|
if (data = elem.elements["data"])
|
|
280
287
|
options[:data_offset] = int_content(data.elements["offset"]) || 0
|
|
281
288
|
# In XAR format:
|
|
282
|
-
# - <
|
|
283
|
-
# - <
|
|
284
|
-
options[:data_size] = int_content(data.elements["
|
|
285
|
-
options[:data_length] = int_content(data.elements["
|
|
289
|
+
# - <size> is the uncompressed (extracted) size
|
|
290
|
+
# - <length> is the compressed (archived) size in the heap
|
|
291
|
+
options[:data_size] = int_content(data.elements["size"]) || 0
|
|
292
|
+
options[:data_length] = int_content(data.elements["length"]) || 0
|
|
286
293
|
|
|
287
294
|
if (encoding = data.elements["encoding"])
|
|
288
295
|
style = encoding.attributes["style"]
|
|
@@ -291,12 +298,14 @@ module Omnizip
|
|
|
291
298
|
|
|
292
299
|
if (archived_sum = data.elements["archived-checksum"])
|
|
293
300
|
options[:archived_checksum] = text_content(archived_sum)
|
|
294
|
-
options[:archived_checksum_style] =
|
|
301
|
+
options[:archived_checksum_style] =
|
|
302
|
+
archived_sum.attributes["style"]
|
|
295
303
|
end
|
|
296
304
|
|
|
297
305
|
if (extracted_sum = data.elements["extracted-checksum"])
|
|
298
306
|
options[:extracted_checksum] = text_content(extracted_sum)
|
|
299
|
-
options[:extracted_checksum_style] =
|
|
307
|
+
options[:extracted_checksum_style] =
|
|
308
|
+
extracted_sum.attributes["style"]
|
|
300
309
|
end
|
|
301
310
|
end
|
|
302
311
|
|
|
@@ -347,7 +356,10 @@ module Omnizip
|
|
|
347
356
|
elem.elements.each("file") do |file_elem|
|
|
348
357
|
entry = parse_file_element(file_elem)
|
|
349
358
|
# Prepend parent path to name
|
|
350
|
-
|
|
359
|
+
unless parent_entry.name.empty?
|
|
360
|
+
entry.name = File.join(parent_entry.name,
|
|
361
|
+
entry.name)
|
|
362
|
+
end
|
|
351
363
|
toc.add_entry(entry)
|
|
352
364
|
|
|
353
365
|
# Recurse for deeper nesting
|
|
@@ -392,7 +404,8 @@ module Omnizip
|
|
|
392
404
|
# @param entry [Entry] Entry to add
|
|
393
405
|
# @param children_map [Hash] Children by parent path
|
|
394
406
|
# @param parent_path [String] Path of parent directory (for nested entries)
|
|
395
|
-
def add_file_element(parent_elem, entry, children_map,
|
|
407
|
+
def add_file_element(parent_elem, entry, children_map,
|
|
408
|
+
parent_path = nil)
|
|
396
409
|
file_elem = parent_elem.add_element("file")
|
|
397
410
|
file_elem.add_attribute("id", entry.id.to_s)
|
|
398
411
|
|
|
@@ -453,16 +466,16 @@ module Omnizip
|
|
|
453
466
|
offset_elem.add_text(entry.data_offset.to_s)
|
|
454
467
|
|
|
455
468
|
# In XAR format:
|
|
456
|
-
# - <length> is the
|
|
457
|
-
# - <size> is the
|
|
458
|
-
if entry.
|
|
469
|
+
# - <length> is the compressed (archived) size in the heap
|
|
470
|
+
# - <size> is the uncompressed (extracted) size
|
|
471
|
+
if entry.data_length&.positive?
|
|
459
472
|
length_elem = data_elem.add_element("length")
|
|
460
|
-
length_elem.add_text(entry.
|
|
473
|
+
length_elem.add_text(entry.data_length.to_s)
|
|
461
474
|
end
|
|
462
475
|
|
|
463
|
-
if entry.
|
|
476
|
+
if entry.data_size&.positive?
|
|
464
477
|
size_elem = data_elem.add_element("size")
|
|
465
|
-
size_elem.add_text(entry.
|
|
478
|
+
size_elem.add_text(entry.data_size.to_s)
|
|
466
479
|
end
|
|
467
480
|
|
|
468
481
|
if entry.data_encoding && entry.data_encoding != COMPRESSION_NONE
|