omnizip 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +243 -368
  3. data/README.adoc +101 -5
  4. data/docs/guides/archive-formats/index.adoc +31 -1
  5. data/docs/guides/archive-formats/ole-format.adoc +316 -0
  6. data/docs/guides/archive-formats/rpm-format.adoc +249 -0
  7. data/docs/index.adoc +12 -2
  8. data/lib/omnizip/algorithms/lzma/distance_coder.rb +29 -18
  9. data/lib/omnizip/algorithms/lzma/encoder.rb +2 -1
  10. data/lib/omnizip/algorithms/lzma/length_coder.rb +6 -3
  11. data/lib/omnizip/algorithms/lzma/literal_decoder.rb +2 -1
  12. data/lib/omnizip/algorithms/lzma/lzip_decoder.rb +40 -13
  13. data/lib/omnizip/algorithms/lzma/range_decoder.rb +36 -2
  14. data/lib/omnizip/algorithms/lzma/range_encoder.rb +19 -0
  15. data/lib/omnizip/algorithms/lzma/xz_encoder_fast.rb +2 -1
  16. data/lib/omnizip/algorithms/lzma/xz_utils_decoder.rb +148 -112
  17. data/lib/omnizip/algorithms/lzma.rb +20 -5
  18. data/lib/omnizip/algorithms/ppmd7/decoder.rb +25 -21
  19. data/lib/omnizip/algorithms/ppmd7/encoder.rb +4 -11
  20. data/lib/omnizip/algorithms/sevenzip_lzma2.rb +2 -1
  21. data/lib/omnizip/algorithms/xz_lzma2.rb +2 -1
  22. data/lib/omnizip/algorithms/zstandard/constants.rb +125 -9
  23. data/lib/omnizip/algorithms/zstandard/decoder.rb +202 -17
  24. data/lib/omnizip/algorithms/zstandard/encoder.rb +197 -17
  25. data/lib/omnizip/algorithms/zstandard/frame/block.rb +128 -0
  26. data/lib/omnizip/algorithms/zstandard/frame/header.rb +224 -0
  27. data/lib/omnizip/algorithms/zstandard/fse/bitstream.rb +186 -0
  28. data/lib/omnizip/algorithms/zstandard/fse/encoder.rb +325 -0
  29. data/lib/omnizip/algorithms/zstandard/fse/table.rb +269 -0
  30. data/lib/omnizip/algorithms/zstandard/huffman.rb +272 -0
  31. data/lib/omnizip/algorithms/zstandard/huffman_encoder.rb +339 -0
  32. data/lib/omnizip/algorithms/zstandard/literals.rb +178 -0
  33. data/lib/omnizip/algorithms/zstandard/literals_encoder.rb +251 -0
  34. data/lib/omnizip/algorithms/zstandard/sequences.rb +346 -0
  35. data/lib/omnizip/buffer/memory_extractor.rb +3 -3
  36. data/lib/omnizip/buffer.rb +2 -2
  37. data/lib/omnizip/filters/delta.rb +2 -1
  38. data/lib/omnizip/filters/registry.rb +6 -6
  39. data/lib/omnizip/formats/cpio/bounded_io.rb +66 -0
  40. data/lib/omnizip/formats/lzip.rb +2 -1
  41. data/lib/omnizip/formats/lzma_alone.rb +2 -1
  42. data/lib/omnizip/formats/ole/allocation_table.rb +244 -0
  43. data/lib/omnizip/formats/ole/constants.rb +61 -0
  44. data/lib/omnizip/formats/ole/dirent.rb +380 -0
  45. data/lib/omnizip/formats/ole/header.rb +198 -0
  46. data/lib/omnizip/formats/ole/ranges_io.rb +264 -0
  47. data/lib/omnizip/formats/ole/storage.rb +305 -0
  48. data/lib/omnizip/formats/ole/types/variant.rb +328 -0
  49. data/lib/omnizip/formats/ole.rb +145 -0
  50. data/lib/omnizip/formats/rar/compression/ppmd/decoder.rb +92 -49
  51. data/lib/omnizip/formats/rar/compression/ppmd/encoder.rb +13 -20
  52. data/lib/omnizip/formats/rar/rar5/compression/lzss.rb +6 -2
  53. data/lib/omnizip/formats/rar3/reader.rb +6 -2
  54. data/lib/omnizip/formats/rar5/reader.rb +4 -1
  55. data/lib/omnizip/formats/rpm/constants.rb +58 -0
  56. data/lib/omnizip/formats/rpm/entry.rb +102 -0
  57. data/lib/omnizip/formats/rpm/header.rb +113 -0
  58. data/lib/omnizip/formats/rpm/lead.rb +122 -0
  59. data/lib/omnizip/formats/rpm/tag.rb +230 -0
  60. data/lib/omnizip/formats/rpm.rb +434 -0
  61. data/lib/omnizip/formats/seven_zip/bcj2_stream_decompressor.rb +239 -0
  62. data/lib/omnizip/formats/seven_zip/coder_chain.rb +32 -8
  63. data/lib/omnizip/formats/seven_zip/constants.rb +1 -1
  64. data/lib/omnizip/formats/seven_zip/reader.rb +84 -8
  65. data/lib/omnizip/formats/seven_zip/stream_compressor.rb +2 -1
  66. data/lib/omnizip/formats/seven_zip/stream_decompressor.rb +6 -0
  67. data/lib/omnizip/formats/seven_zip/writer.rb +21 -9
  68. data/lib/omnizip/formats/seven_zip.rb +10 -0
  69. data/lib/omnizip/formats/xar/entry.rb +18 -5
  70. data/lib/omnizip/formats/xar/header.rb +34 -6
  71. data/lib/omnizip/formats/xar/reader.rb +43 -10
  72. data/lib/omnizip/formats/xar/toc.rb +34 -21
  73. data/lib/omnizip/formats/xar/writer.rb +15 -5
  74. data/lib/omnizip/formats/xz_impl/block_decoder.rb +45 -33
  75. data/lib/omnizip/formats/xz_impl/block_encoder.rb +2 -1
  76. data/lib/omnizip/formats/xz_impl/index_decoder.rb +3 -1
  77. data/lib/omnizip/formats/xz_impl/stream_header_parser.rb +2 -1
  78. data/lib/omnizip/formats/zip/end_of_central_directory.rb +4 -3
  79. data/lib/omnizip/implementations/seven_zip/lzma/decoder.rb +14 -6
  80. data/lib/omnizip/implementations/seven_zip/lzma/encoder.rb +2 -1
  81. data/lib/omnizip/implementations/seven_zip/lzma2/encoder.rb +28 -13
  82. data/lib/omnizip/implementations/xz_utils/lzma2/encoder.rb +13 -6
  83. data/lib/omnizip/pipe/stream_compressor.rb +1 -1
  84. data/lib/omnizip/version.rb +1 -1
  85. data/readme-docs/compression-algorithms.adoc +6 -2
  86. metadata +30 -2
@@ -22,17 +22,20 @@ module Omnizip
22
22
  def self.build_from_folder(folder)
23
23
  return nil if folder.coders.empty?
24
24
 
25
- # For now, support single coder or coder+filter combinations
26
- main_coder = folder.coders.last
25
+ # Find the compression method (not a filter) among coders
26
+ # Filters like BCJ, BCJ2 have specific method IDs
27
+ main_coder = find_compression_coder(folder.coders)
28
+ raise "No compression method found in folder" unless main_coder
29
+
27
30
  algorithm = algorithm_for_method(main_coder.method_id)
28
31
 
29
- # Check for filters
32
+ # Check for filters (all coders except the compression method)
30
33
  filters = []
31
- if folder.coders.size > 1
32
- folder.coders[0..-2].each do |coder|
33
- filter = filter_for_method(coder.method_id)
34
- filters << filter if filter
35
- end
34
+ folder.coders.each do |coder|
35
+ next if coder == main_coder
36
+
37
+ filter = filter_for_method(coder.method_id)
38
+ filters << filter if filter
36
39
  end
37
40
 
38
41
  {
@@ -43,6 +46,25 @@ module Omnizip
43
46
  }
44
47
  end
45
48
 
49
+ # Find the compression coder among all coders
50
+ #
51
+ # @param coders [Array<Models::CoderInfo>] All coders in the folder
52
+ # @return [Models::CoderInfo, nil] The compression coder or nil
53
+ def self.find_compression_coder(coders)
54
+ # Try to find a known compression method
55
+ coders.each do |coder|
56
+ case coder.method_id
57
+ when MethodId::LZMA, MethodId::LZMA2, MethodId::BZIP2,
58
+ MethodId::DEFLATE, MethodId::DEFLATE64, MethodId::PPMD,
59
+ MethodId::COPY
60
+ return coder
61
+ end
62
+ end
63
+
64
+ # Fall back to last coder if no known compression method found
65
+ coders.last
66
+ end
67
+
46
68
  # Map method ID to algorithm
47
69
  #
48
70
  # @param method_id [Integer] Method ID from .7z file
@@ -92,6 +114,8 @@ module Omnizip
92
114
  :bcj_arm64
93
115
  when FilterId::DELTA
94
116
  :delta
117
+ when FilterId::BCJ2
118
+ :bcj2
95
119
  end
96
120
  end
97
121
 
@@ -84,7 +84,7 @@ module Omnizip
84
84
  ARM64 = 0x03030601
85
85
 
86
86
  # BCJ2 filter
87
- BCJ2 = 0x03030111
87
+ BCJ2 = 0x0303011B
88
88
 
89
89
  # Swap filters for byte order conversion
90
90
  SWAP2 = 0x03030204
@@ -6,6 +6,7 @@ require_relative "parser"
6
6
  require_relative "models/stream_info"
7
7
  require_relative "models/file_entry"
8
8
  require_relative "stream_decompressor"
9
+ require_relative "bcj2_stream_decompressor"
9
10
  require_relative "split_archive_reader"
10
11
  require_relative "header_encryptor"
11
12
  require_relative "encrypted_header"
@@ -26,12 +27,14 @@ module Omnizip
26
27
  # @param file_path [String] Path to .7z file
27
28
  # @param options [Hash] Reader options
28
29
  # @option options [String] :password Password for encrypted headers
30
+ # @option options [Integer] :offset Offset for embedded archives
29
31
  def initialize(file_path, options = {})
30
32
  @file_path = file_path
31
33
  @entries = []
32
34
  @stream_info = nil
33
35
  @split_reader = nil
34
36
  @password = options[:password]
37
+ @offset = options[:offset] || 0
35
38
  end
36
39
 
37
40
  # Open and parse .7z archive
@@ -47,6 +50,8 @@ module Omnizip
47
50
  @stream_info = @split_reader.stream_info
48
51
  else
49
52
  File.open(@file_path, "rb") do |io|
53
+ # Seek to offset for embedded archives
54
+ io.seek(@offset) if @offset.positive?
50
55
  parse_archive(io)
51
56
  end
52
57
  end
@@ -111,6 +116,8 @@ module Omnizip
111
116
  FileUtils.mkdir_p(output_path)
112
117
  elsif entry.has_stream?
113
118
  File.open(@file_path, "rb") do |io|
119
+ # Seek to offset for embedded archives
120
+ io.seek(@offset) if @offset.positive?
114
121
  data = extract_entry_data(io, entry)
115
122
  File.binwrite(output_path, data)
116
123
  end
@@ -173,7 +180,8 @@ module Omnizip
173
180
  # Read next header metadata
174
181
  # NOTE: next_header_offset is from the END of the Start Header (byte 32)
175
182
  # NOT from the end of the file
176
- next_header_pos = Constants::START_HEADER_SIZE + @header.next_header_offset
183
+ # For embedded archives, add offset to get absolute position
184
+ next_header_pos = @offset + Constants::START_HEADER_SIZE + @header.next_header_offset
177
185
  io.seek(next_header_pos)
178
186
  next_header_data = io.read(@header.next_header_size)
179
187
 
@@ -272,7 +280,8 @@ module Omnizip
272
280
  end
273
281
 
274
282
  # Decompress the header using the stream info
275
- pack_pos = @header.start_pos_after_header + stream_info.pack_pos
283
+ # For embedded archives, add offset to get absolute file position
284
+ pack_pos = @offset + @header.start_pos_after_header + stream_info.pack_pos
276
285
  folder = stream_info.folders[0]
277
286
  pack_size = stream_info.pack_sizes[0]
278
287
  unpack_size = folder.uncompressed_size
@@ -383,16 +392,86 @@ module Omnizip
383
392
  folder = @stream_info.folders[entry.folder_index]
384
393
  return "" unless folder
385
394
 
386
- # Calculate pack position
387
- pack_pos = @header.start_pos_after_header +
395
+ # Calculate pack position (add offset for embedded archives)
396
+ pack_pos = @offset + @header.start_pos_after_header +
388
397
  @stream_info.pack_pos
389
398
 
390
- # Get pack size for this folder
399
+ # Get pack sizes for this folder
391
400
  pack_idx = 0
392
401
  entry.folder_index.times do |i|
393
402
  num_streams = @stream_info.folders[i].pack_stream_indices.size
394
403
  pack_idx += num_streams
395
404
  end
405
+
406
+ # Check if this is a BCJ2 multi-stream folder
407
+ if Bcj2StreamDecompressor.bcj2_folder?(folder)
408
+ extract_bcj2_entry(io, entry, folder, pack_pos, pack_idx)
409
+ else
410
+ extract_regular_entry(io, entry, folder, pack_pos, pack_idx)
411
+ end
412
+ rescue StandardError => e
413
+ warn "Extraction failed for #{entry.name}: #{e.message}"
414
+ raise
415
+ end
416
+
417
+ # Extract entry from BCJ2 multi-stream folder
418
+ #
419
+ # @param io [IO] Archive file handle
420
+ # @param entry [Models::FileEntry] Entry to extract
421
+ # @param folder [Models::Folder] Folder specification
422
+ # @param pack_pos [Integer] Base pack position
423
+ # @param pack_idx [Integer] Starting pack index
424
+ # @return [String] Extracted data
425
+ def extract_bcj2_entry(io, entry, folder, pack_pos, pack_idx)
426
+ # BCJ2 folders have multiple pack streams
427
+ num_pack_streams = folder.pack_stream_indices.size
428
+ pack_sizes = Array.new(num_pack_streams) do |i|
429
+ @stream_info.pack_sizes[pack_idx + i] || 0
430
+ end
431
+
432
+ # Decompress the entire BCJ2 folder
433
+ decompressor = Bcj2StreamDecompressor.new(
434
+ io, folder, pack_pos, pack_sizes, @stream_info
435
+ )
436
+ full_data = decompressor.decompress(folder.uncompressed_size)
437
+
438
+ # For solid archives, extract this file's portion
439
+ num_files_in_folder = @stream_info.num_unpack_streams_in_folders[entry.folder_index] || 1
440
+
441
+ if num_files_in_folder > 1
442
+ # Find offset of this file within the uncompressed stream
443
+ file_offset = 0
444
+ @entries.each do |e|
445
+ break if e.file_index == entry.file_index
446
+
447
+ file_offset += e.size if e.has_stream? && e.folder_index == entry.folder_index
448
+ end
449
+ data = full_data[file_offset, entry.size]
450
+ else
451
+ data = full_data[0, entry.size]
452
+ end
453
+
454
+ # Verify CRC if available
455
+ if entry.crc
456
+ crc = Omnizip::Checksums::Crc32.new
457
+ crc.update(data)
458
+ unless crc.value == entry.crc
459
+ raise "CRC mismatch for #{entry.name}: expected 0x#{entry.crc.to_s(16)}, got 0x#{crc.value.to_s(16)}"
460
+ end
461
+ end
462
+
463
+ data
464
+ end
465
+
466
+ # Extract entry from regular (non-BCJ2) folder
467
+ #
468
+ # @param io [IO] Archive file handle
469
+ # @param entry [Models::FileEntry] Entry to extract
470
+ # @param folder [Models::Folder] Folder specification
471
+ # @param pack_pos [Integer] Base pack position
472
+ # @param pack_idx [Integer] Starting pack index
473
+ # @return [String] Extracted data
474
+ def extract_regular_entry(io, entry, folder, pack_pos, pack_idx)
396
475
  pack_size = @stream_info.pack_sizes[pack_idx] || 0
397
476
 
398
477
  # For solid archives, multiple files share one compressed stream
@@ -434,9 +513,6 @@ module Omnizip
434
513
  expected_crc = entry.crc
435
514
  decompressor.decompress_and_verify(entry.size, expected_crc)
436
515
  end
437
- rescue StandardError => e
438
- warn "Extraction failed for #{entry.name}: #{e.message}"
439
- raise
440
516
  end
441
517
 
442
518
  # Check if file path indicates a split archive
@@ -54,7 +54,8 @@ module Omnizip
54
54
 
55
55
  # For 7-Zip format, use raw_mode (no property byte in compressed data)
56
56
  # The properties are encoded in the 7-Zip header instead
57
- encoder.compress(input_io, output_io, { raw_mode: true, standalone: false })
57
+ encoder.compress(input_io, output_io,
58
+ { raw_mode: true, standalone: false })
58
59
  result = output_io.string
59
60
  end
60
61
 
@@ -80,6 +80,12 @@ module Omnizip
80
80
  filter_class = FilterRegistry.get(filter_sym)
81
81
  next unless filter_class
82
82
 
83
+ # BCJ2 requires special handling with multiple streams
84
+ if filter_sym == :bcj2
85
+ raise "BCJ2 archives require multi-stream decompression which is not yet implemented. " \
86
+ "Please use the 7z command-line tool for this archive."
87
+ end
88
+
83
89
  filter = filter_class.new
84
90
  filtered = StringIO.new
85
91
  filter.reverse(StringIO.new(result), filtered)
@@ -134,7 +134,8 @@ module Omnizip
134
134
 
135
135
  # Step 3: Build Next Header properties
136
136
  # This includes kHeader, MAIN_STREAMS_INFO, FILES_INFO, etc.
137
- next_header_data = build_next_header_properties(file_data, packed_sizes)
137
+ next_header_data = build_next_header_properties(file_data,
138
+ packed_sizes)
138
139
 
139
140
  # Step 4: Write the complete data section
140
141
  # Note: CRC is stored in StartHeader, NOT appended to Next Header
@@ -150,7 +151,8 @@ module Omnizip
150
151
  # (CRC32 is appended after the header data, not included in size)
151
152
  next_header_size = next_header_data.bytesize
152
153
 
153
- write_start_header(io, next_header_offset, next_header_size, next_header_data)
154
+ write_start_header(io, next_header_offset, next_header_size,
155
+ next_header_data)
154
156
  end
155
157
 
156
158
  # Build packed data for solid mode (LZMA2 compression)
@@ -192,7 +194,8 @@ module Omnizip
192
194
  entry.size = data.bytesize
193
195
  end
194
196
 
195
- { data: combined, total_size: total_size, streams: [{ data: combined, size: total_size }] }
197
+ { data: combined, total_size: total_size,
198
+ streams: [{ data: combined, size: total_size }] }
196
199
  else
197
200
  # Non-solid mode: each file gets its own stream
198
201
  streams = []
@@ -240,7 +243,8 @@ module Omnizip
240
243
  # Solid mode: one pack stream, one folder
241
244
  # packed_sizes is a single-element array with compressed size
242
245
  compressed_size = packed_sizes.first
243
- build_solid_streams_info(metadata, unpack_size, compressed_size, num_files)
246
+ build_solid_streams_info(metadata, unpack_size, compressed_size,
247
+ num_files)
244
248
  else
245
249
  # Non-solid mode: one pack stream per file, one folder per file
246
250
  build_non_solid_streams_info(metadata, file_data[:streams])
@@ -286,7 +290,8 @@ module Omnizip
286
290
  encrypted_header.to_binary
287
291
  end
288
292
 
289
- def build_solid_streams_info(metadata, unpack_size, compressed_size, num_files)
293
+ def build_solid_streams_info(metadata, unpack_size, compressed_size,
294
+ num_files)
290
295
  # kPackInfo property (0x06)
291
296
  metadata << [PropertyId::PACK_INFO].pack("C")
292
297
  metadata << write_number(0) # Pack position
@@ -624,7 +629,8 @@ module Omnizip
624
629
  footer
625
630
  end
626
631
 
627
- def write_start_header(io, next_header_offset, next_header_size, next_header_data)
632
+ def write_start_header(io, next_header_offset, next_header_size,
633
+ next_header_data)
628
634
  header = String.new(encoding: "BINARY")
629
635
 
630
636
  # Signature (6 bytes)
@@ -711,17 +717,23 @@ module Omnizip
711
717
  # 11110xxx pattern
712
718
  first_byte = 0xF0 | (value >> 32)
713
719
  result << [first_byte].pack("C")
714
- 4.downto(1) { |i| result << [(value >> (8 * (i - 1))) & 0xFF].pack("C") }
720
+ 4.downto(1) do |i|
721
+ result << [(value >> (8 * (i - 1))) & 0xFF].pack("C")
722
+ end
715
723
  when 6
716
724
  # 111110xx pattern
717
725
  first_byte = 0xF8 | (value >> 40)
718
726
  result << [first_byte].pack("C")
719
- 5.downto(1) { |i| result << [(value >> (8 * (i - 1))) & 0xFF].pack("C") }
727
+ 5.downto(1) do |i|
728
+ result << [(value >> (8 * (i - 1))) & 0xFF].pack("C")
729
+ end
720
730
  when 7
721
731
  # 1111110x pattern
722
732
  first_byte = 0xFC | (value >> 48)
723
733
  result << [first_byte].pack("C")
724
- 6.downto(1) { |i| result << [(value >> (8 * (i - 1))) & 0xFF].pack("C") }
734
+ 6.downto(1) do |i|
735
+ result << [(value >> (8 * (i - 1))) & 0xFF].pack("C")
736
+ end
725
737
  else
726
738
  # 8 bytes: 11111110 or 11111111 prefix
727
739
  result << if value < (1 << 56)
@@ -80,6 +80,16 @@ module Omnizip
80
80
  reader
81
81
  end
82
82
 
83
+ # Search for embedded .7z archive in self-extracting executable
84
+ #
85
+ # @param path [String] Path to potential self-extracting archive
86
+ # @return [Integer, nil] Offset of embedded 7z signature, or nil if not found
87
+ def self.search_embedded(path)
88
+ data = File.binread(path)
89
+ signature = Constants::SIGNATURE
90
+ data.index(signature)
91
+ end
92
+
83
93
  # Auto-register .7z format when loaded
84
94
  def self.register!
85
95
  require_relative "../format_registry"
@@ -191,13 +191,26 @@ module Omnizip
191
191
  data_hash[:length] = @data_size if @data_size&.positive?
192
192
 
193
193
  if @data_encoding && @data_encoding != COMPRESSION_NONE
194
- data_hash[:encoding] = COMPRESSION_MIME_TYPES[@data_encoding] || @data_encoding
194
+ data_hash[:encoding] =
195
+ COMPRESSION_MIME_TYPES[@data_encoding] || @data_encoding
195
196
  end
196
197
 
197
- data_hash[:archived_checksum] = @archived_checksum if @archived_checksum
198
- data_hash[:archived_checksum_style] = @archived_checksum_style if @archived_checksum_style
199
- data_hash[:extracted_checksum] = @extracted_checksum if @extracted_checksum
200
- data_hash[:extracted_checksum_style] = @extracted_checksum_style if @extracted_checksum_style
198
+ if @archived_checksum
199
+ data_hash[:archived_checksum] =
200
+ @archived_checksum
201
+ end
202
+ if @archived_checksum_style
203
+ data_hash[:archived_checksum_style] =
204
+ @archived_checksum_style
205
+ end
206
+ if @extracted_checksum
207
+ data_hash[:extracted_checksum] =
208
+ @extracted_checksum
209
+ end
210
+ if @extracted_checksum_style
211
+ data_hash[:extracted_checksum_style] =
212
+ @extracted_checksum_style
213
+ end
201
214
 
202
215
  hash[:data] = data_hash
203
216
  end
@@ -28,11 +28,35 @@ module Omnizip
28
28
  # @return [Header] Parsed header object
29
29
  # @raise [ArgumentError] If data is invalid
30
30
  def self.parse(data)
31
- raise ArgumentError, "Header data too short (#{data.bytesize} bytes)" if data.bytesize < HEADER_SIZE
31
+ if data.bytesize < HEADER_SIZE
32
+ raise ArgumentError,
33
+ "Header data too short (#{data.bytesize} bytes)"
34
+ end
32
35
 
33
36
  magic = data[0, 4].unpack1("N")
34
- header_size = data[4, 2].unpack1("v") # little-endian
35
- version = data[6, 2].unpack1("v") # little-endian
37
+ # XAR spec: header_size and version are little-endian, rest are big-endian
38
+ # However, some tools (like macOS xar) store these in big-endian format.
39
+ # We detect this by checking if the parsed values make sense.
40
+ header_size_le = data[4, 2].unpack1("v") # little-endian
41
+ header_size_be = data[4, 2].unpack1("n") # big-endian
42
+ version_le = data[6, 2].unpack1("v") # little-endian
43
+ version_be = data[6, 2].unpack1("n") # big-endian
44
+
45
+ # Detect endianness: standard header is 28 bytes, version is 1
46
+ # If little-endian gives valid values, use it; otherwise use big-endian
47
+ if header_size_le == HEADER_SIZE && version_le == XAR_VERSION
48
+ header_size = header_size_le
49
+ version = version_le
50
+ elsif header_size_be == HEADER_SIZE && version_be == XAR_VERSION
51
+ header_size = header_size_be
52
+ version = version_be
53
+ else
54
+ # Default to little-endian (spec-compliant)
55
+ header_size = header_size_le
56
+ version = version_le
57
+ # Normalize version 256 to 1 (big-endian encoding of version 1)
58
+ version = 1 if version == 256
59
+ end
36
60
  toc_compressed_size = data[8, 8].unpack1("Q>") # big-endian uint64
37
61
  toc_uncompressed_size = data[16, 8].unpack1("Q>") # big-endian uint64
38
62
  checksum_algorithm = data[24, 4].unpack1("N")
@@ -104,7 +128,9 @@ module Omnizip
104
128
  # @raise [ArgumentError] If header is invalid
105
129
  def validate!
106
130
  unless @magic == MAGIC
107
- raise ArgumentError, format("Invalid magic: 0x%08x (expected 0x%08x)", @magic, MAGIC)
131
+ raise ArgumentError,
132
+ format("Invalid magic: 0x%08x (expected 0x%08x)", @magic,
133
+ MAGIC)
108
134
  end
109
135
 
110
136
  unless @header_size >= HEADER_SIZE
@@ -115,8 +141,10 @@ module Omnizip
115
141
  raise ArgumentError, "Unsupported version: #{@version}"
116
142
  end
117
143
 
118
- unless [CKSUM_NONE, CKSUM_SHA1, CKSUM_MD5, CKSUM_OTHER].include?(@checksum_algorithm)
119
- raise ArgumentError, "Unknown checksum algorithm: #{@checksum_algorithm}"
144
+ unless [CKSUM_NONE, CKSUM_SHA1, CKSUM_MD5,
145
+ CKSUM_OTHER].include?(@checksum_algorithm)
146
+ raise ArgumentError,
147
+ "Unknown checksum algorithm: #{@checksum_algorithm}"
120
148
  end
121
149
 
122
150
  if @checksum_algorithm == CKSUM_OTHER && @checksum_name.to_s.strip.empty?
@@ -114,10 +114,13 @@ module Omnizip
114
114
  # @param entry [Entry] Entry to read
115
115
  # @return [String, nil] Entry data or nil if no data
116
116
  def read_data(entry)
117
- return nil unless entry.data_size&.positive?
117
+ return nil unless entry.data_length&.positive?
118
118
  return nil unless @file
119
119
 
120
120
  @file.seek(@heap_offset + entry.data_offset)
121
+ # In XAR format:
122
+ # - data_length is the compressed (archived) size (what to read from heap)
123
+ # - data_size is the uncompressed (extracted) size (decompressed size)
121
124
  compressed_data = @file.read(entry.data_length)
122
125
 
123
126
  decompress_data(compressed_data, entry.data_encoding, entry.data_size)
@@ -130,7 +133,9 @@ module Omnizip
130
133
  FileUtils.mkdir_p(output_dir)
131
134
 
132
135
  # Sort entries to ensure directories are created first
133
- sorted_entries = @entries.sort_by { |e| [e.directory? ? 0 : 1, e.name] }
136
+ sorted_entries = @entries.sort_by do |e|
137
+ [e.directory? ? 0 : 1, e.name]
138
+ end
134
139
 
135
140
  sorted_entries.each do |entry|
136
141
  extract_entry(entry, output_dir)
@@ -206,9 +211,11 @@ module Omnizip
206
211
  @toc = Toc.parse(compressed_toc, @header.toc_uncompressed_size)
207
212
  @entries = @toc.entries
208
213
 
209
- # Calculate heap offset (after header + compressed TOC + TOC checksum)
214
+ # Calculate heap offset:
215
+ # The heap starts immediately after the compressed TOC.
216
+ # The TOC checksum is stored INSIDE the heap (at offset 0), not after it.
217
+ # File data offsets in the TOC are relative to the heap start.
210
218
  @heap_offset = @header.header_size + @header.toc_compressed_size
211
- @heap_offset += @header.checksum_size if @header.checksum?
212
219
  end
213
220
 
214
221
  # Decompress data based on encoding
@@ -235,14 +242,40 @@ module Omnizip
235
242
 
236
243
  # Decompress gzip data
237
244
  #
238
- # @param data [String] Gzip compressed data
245
+ # @param data [String] Zlib compressed data (XAR uses zlib, not actual gzip)
239
246
  # @return [String] Decompressed data
240
247
  def decompress_gzip(data)
241
- zlib = Zlib::Inflate.new(-Zlib::MAX_WBITS)
242
- result = zlib.inflate(data)
243
- zlib.finish
244
- zlib.close
245
- result
248
+ # XAR "gzip" compression is actually zlib format (with 0x78xx header)
249
+ # Try different decompression methods for robustness
250
+
251
+ # Method 1: Standard zlib format (with header)
252
+ begin
253
+ result = Zlib::Inflate.inflate(data)
254
+ return result
255
+ rescue Zlib::Error
256
+ # Continue to next method
257
+ end
258
+
259
+ # Method 2: Raw deflate (some implementations may use this)
260
+ begin
261
+ inf = Zlib::Inflate.new(-Zlib::MAX_WBITS)
262
+ result = inf.inflate(data)
263
+ inf.finish
264
+ inf.close
265
+ return result
266
+ rescue Zlib::Error
267
+ # Continue to next method
268
+ end
269
+
270
+ # Method 3: Raw deflate without finish (for truncated data)
271
+ begin
272
+ inf = Zlib::Inflate.new(-Zlib::MAX_WBITS)
273
+ result = inf.inflate(data)
274
+ inf.close
275
+ result
276
+ rescue Zlib::Error => e
277
+ raise "Failed to decompress data: #{e.message}"
278
+ end
246
279
  end
247
280
 
248
281
  # Decompress bzip2 data
@@ -20,7 +20,8 @@ module Omnizip
20
20
  class Toc
21
21
  include Constants
22
22
 
23
- attr_accessor :creation_time, :checksum_offset, :checksum_size, :checksum_style
23
+ attr_accessor :creation_time, :checksum_offset, :checksum_size,
24
+ :checksum_style
24
25
  attr_reader :entries
25
26
 
26
27
  # Parse TOC from compressed data
@@ -36,17 +37,23 @@ module Omnizip
36
37
 
37
38
  # Decompress TOC data
38
39
  #
39
- # @param compressed_data [String] GZIP-compressed data
40
+ # @param compressed_data [String] Zlib-compressed data
40
41
  # @param expected_size [Integer, nil] Expected size for validation
41
42
  # @return [String] Decompressed XML
42
43
  def self.decompress(compressed_data, expected_size = nil)
43
- zlib = Zlib::Inflate.new(-Zlib::MAX_WBITS) # Raw deflate
44
- result = zlib.inflate(compressed_data)
45
- zlib.finish
46
- zlib.close
44
+ # XAR TOC is zlib compressed (with zlib headers, 0x78xx)
45
+ # Try zlib format first (most common), then fall back to raw deflate
46
+ result = begin
47
+ # Try standard zlib format (with header)
48
+ Zlib::Inflate.inflate(compressed_data)
49
+ rescue Zlib::DataError
50
+ # Fall back to raw deflate for non-conforming implementations
51
+ Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(compressed_data)
52
+ end
47
53
 
48
54
  if expected_size && result.bytesize != expected_size
49
- raise ArgumentError, "TOC size mismatch: #{result.bytesize} != #{expected_size}"
55
+ raise ArgumentError,
56
+ "TOC size mismatch: #{result.bytesize} != #{expected_size}"
50
57
  end
51
58
 
52
59
  result
@@ -279,10 +286,10 @@ module Omnizip
279
286
  if (data = elem.elements["data"])
280
287
  options[:data_offset] = int_content(data.elements["offset"]) || 0
281
288
  # In XAR format:
282
- # - <length> is the uncompressed (extracted) size
283
- # - <size> is the compressed (archived) size
284
- options[:data_size] = int_content(data.elements["length"]) || 0
285
- options[:data_length] = int_content(data.elements["size"]) || 0
289
+ # - <size> is the uncompressed (extracted) size
290
+ # - <length> is the compressed (archived) size in the heap
291
+ options[:data_size] = int_content(data.elements["size"]) || 0
292
+ options[:data_length] = int_content(data.elements["length"]) || 0
286
293
 
287
294
  if (encoding = data.elements["encoding"])
288
295
  style = encoding.attributes["style"]
@@ -291,12 +298,14 @@ module Omnizip
291
298
 
292
299
  if (archived_sum = data.elements["archived-checksum"])
293
300
  options[:archived_checksum] = text_content(archived_sum)
294
- options[:archived_checksum_style] = archived_sum.attributes["style"]
301
+ options[:archived_checksum_style] =
302
+ archived_sum.attributes["style"]
295
303
  end
296
304
 
297
305
  if (extracted_sum = data.elements["extracted-checksum"])
298
306
  options[:extracted_checksum] = text_content(extracted_sum)
299
- options[:extracted_checksum_style] = extracted_sum.attributes["style"]
307
+ options[:extracted_checksum_style] =
308
+ extracted_sum.attributes["style"]
300
309
  end
301
310
  end
302
311
 
@@ -347,7 +356,10 @@ module Omnizip
347
356
  elem.elements.each("file") do |file_elem|
348
357
  entry = parse_file_element(file_elem)
349
358
  # Prepend parent path to name
350
- entry.name = File.join(parent_entry.name, entry.name) unless parent_entry.name.empty?
359
+ unless parent_entry.name.empty?
360
+ entry.name = File.join(parent_entry.name,
361
+ entry.name)
362
+ end
351
363
  toc.add_entry(entry)
352
364
 
353
365
  # Recurse for deeper nesting
@@ -392,7 +404,8 @@ module Omnizip
392
404
  # @param entry [Entry] Entry to add
393
405
  # @param children_map [Hash] Children by parent path
394
406
  # @param parent_path [String] Path of parent directory (for nested entries)
395
- def add_file_element(parent_elem, entry, children_map, parent_path = nil)
407
+ def add_file_element(parent_elem, entry, children_map,
408
+ parent_path = nil)
396
409
  file_elem = parent_elem.add_element("file")
397
410
  file_elem.add_attribute("id", entry.id.to_s)
398
411
 
@@ -453,16 +466,16 @@ module Omnizip
453
466
  offset_elem.add_text(entry.data_offset.to_s)
454
467
 
455
468
  # In XAR format:
456
- # - <length> is the uncompressed (extracted) size
457
- # - <size> is the compressed (archived) size
458
- if entry.data_size&.positive?
469
+ # - <length> is the compressed (archived) size in the heap
470
+ # - <size> is the uncompressed (extracted) size
471
+ if entry.data_length&.positive?
459
472
  length_elem = data_elem.add_element("length")
460
- length_elem.add_text(entry.data_size.to_s)
473
+ length_elem.add_text(entry.data_length.to_s)
461
474
  end
462
475
 
463
- if entry.data_length&.positive?
476
+ if entry.data_size&.positive?
464
477
  size_elem = data_elem.add_element("size")
465
- size_elem.add_text(entry.data_length.to_s)
478
+ size_elem.add_text(entry.data_size.to_s)
466
479
  end
467
480
 
468
481
  if entry.data_encoding && entry.data_encoding != COMPRESSION_NONE