zip_kit 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. checksums.yaml +7 -0
  2. data/.codeclimate.yml +7 -0
  3. data/.document +5 -0
  4. data/.github/workflows/ci.yml +29 -0
  5. data/.gitignore +61 -0
  6. data/.rspec +1 -0
  7. data/.standard.yml +8 -0
  8. data/.yardopts +1 -0
  9. data/CHANGELOG.md +255 -0
  10. data/CODE_OF_CONDUCT.md +46 -0
  11. data/CONTRIBUTING.md +153 -0
  12. data/Gemfile +4 -0
  13. data/IMPLEMENTATION_DETAILS.md +97 -0
  14. data/LICENSE.txt +20 -0
  15. data/README.md +234 -0
  16. data/Rakefile +21 -0
  17. data/bench/buffered_crc32_bench.rb +109 -0
  18. data/examples/archive_size_estimate.rb +15 -0
  19. data/examples/config.ru +7 -0
  20. data/examples/deferred_write.rb +58 -0
  21. data/examples/parallel_compression_with_block_deflate.rb +86 -0
  22. data/examples/rack_application.rb +63 -0
  23. data/examples/s3_upload.rb +23 -0
  24. data/lib/zip_kit/block_deflate.rb +130 -0
  25. data/lib/zip_kit/block_write.rb +47 -0
  26. data/lib/zip_kit/file_reader/inflating_reader.rb +36 -0
  27. data/lib/zip_kit/file_reader/stored_reader.rb +35 -0
  28. data/lib/zip_kit/file_reader.rb +740 -0
  29. data/lib/zip_kit/null_writer.rb +12 -0
  30. data/lib/zip_kit/output_enumerator.rb +150 -0
  31. data/lib/zip_kit/path_set.rb +163 -0
  32. data/lib/zip_kit/rack_chunked_body.rb +32 -0
  33. data/lib/zip_kit/rack_tempfile_body.rb +61 -0
  34. data/lib/zip_kit/rails_streaming.rb +37 -0
  35. data/lib/zip_kit/remote_io.rb +114 -0
  36. data/lib/zip_kit/remote_uncap.rb +22 -0
  37. data/lib/zip_kit/size_estimator.rb +84 -0
  38. data/lib/zip_kit/stream_crc32.rb +60 -0
  39. data/lib/zip_kit/streamer/deflated_writer.rb +45 -0
  40. data/lib/zip_kit/streamer/entry.rb +37 -0
  41. data/lib/zip_kit/streamer/filler.rb +9 -0
  42. data/lib/zip_kit/streamer/heuristic.rb +68 -0
  43. data/lib/zip_kit/streamer/stored_writer.rb +39 -0
  44. data/lib/zip_kit/streamer/writable.rb +36 -0
  45. data/lib/zip_kit/streamer.rb +614 -0
  46. data/lib/zip_kit/uniquify_filename.rb +39 -0
  47. data/lib/zip_kit/version.rb +5 -0
  48. data/lib/zip_kit/write_and_tell.rb +40 -0
  49. data/lib/zip_kit/write_buffer.rb +71 -0
  50. data/lib/zip_kit/write_shovel.rb +22 -0
  51. data/lib/zip_kit/zip_writer.rb +436 -0
  52. data/lib/zip_kit.rb +24 -0
  53. data/zip_kit.gemspec +41 -0
  54. metadata +335 -0
data/lib/zip_kit/file_reader.rb
@@ -0,0 +1,740 @@
# frozen_string_literal: true

require "stringio"

# A very barebones ZIP file reader. Is made for maximum interoperability, but at the same
# time we attempt to keep it somewhat concise.
#
# ## REALLY CRAZY IMPORTANT STUFF: SECURITY IMPLICATIONS
#
# Please **BEWARE** - using this is a security risk if you are reading files that have been
# supplied by users. This implementation has _not_ been formally verified for correctness. As
# ZIP files contain relative offsets in lots of places it might be possible for a maliciously
# crafted ZIP file to put the decode procedure in an endless loop, make it attempt huge reads
# from the input file and so on. Additionally, the reader module for deflated data has
# no support for ZIP bomb protection. So either limit the `FileReader` usage to the files you
# trust, or triple-check all the inputs upfront. Patches to make this reader more secure
# are welcome of course.
#
# ## Usage
#
#     File.open('zipfile.zip', 'rb') do |f|
#       entries = ZipKit::FileReader.read_zip_structure(io: f)
#       entries.each do |e|
#         File.open(e.filename, 'wb') do |extracted_file|
#           ex = e.extractor_from(f)
#           extracted_file << ex.extract(1024 * 1024) until ex.eof?
#         end
#       end
#     end
#
# ## Supported features
#
# * Deflate and stored storage modes
# * Zip64 (extra fields and offsets)
# * Data descriptors
#
# ## Unsupported features
#
# * Archives split over multiple disks/files
# * Any ZIP encryption
# * EFS language flag and InfoZIP filename extra field
# * CRC32 checksums are _not_ verified
#
# ## Mode of operation
#
# By default, `FileReader` _ignores_ the data in local file headers (as it is
# often unreliable). It reads the ZIP file "from the tail", finds the
# end-of-central-directory signatures, then reads the central directory entries,
# reconstitutes the entries with their filenames, attributes and so on, and
# sets these entries up with the absolute _offsets_ into the source file/IO object.
# These offsets can then be used to extract the actual compressed data of
# the files and to expand it.
#
# ## Recovering damaged or incomplete ZIP files
#
# If the ZIP file you are trying to read does not contain the central directory
# records, `read_zip_structure` will not work, since it starts the read process
# from the EOCD marker at the end of the central directory and then crawls
# "back" in the IO to figure out the rest. You can explicitly apply a fallback
# for reading the archive "straight ahead" instead using `read_zip_straight_ahead`
# - the method will instead scan your IO from the very start, skipping over
# the actual entry data. This is less efficient than central directory parsing since
# it involves a much larger number of reads (1 read from the IO per entry in the ZIP).
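#
# A minimal sketch of that fallback (assuming a truncated archive at the
# hypothetical path `truncated.zip`; entries read before the damaged spot
# are returned, the rest are lost):
#
#     File.open('truncated.zip', 'rb') do |f|
#       entries = ZipKit::FileReader.read_zip_straight_ahead(io: f)
#       entries.each { |e| puts "#{e.filename} (#{e.compressed_size} bytes)" }
#     end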

class ZipKit::FileReader
  require_relative "file_reader/stored_reader"
  require_relative "file_reader/inflating_reader"

  ReadError = Class.new(StandardError)
  UnsupportedFeature = Class.new(StandardError)
  InvalidStructure = Class.new(ReadError)
  LocalHeaderPending = Class.new(StandardError) do
    def message
      "The compressed data offset is not available (local header has not been read)"
    end
  end
  MissingEOCD = Class.new(StandardError) do
    def message
      "Could not find the EOCD signature in the buffer - maybe a malformed ZIP file"
    end
  end

  private_constant :StoredReader, :InflatingReader

  # Represents a file within the ZIP archive being read. This is different from
  # the Entry object used in Streamer for ZIP writing, since during writing more
  # data can be kept in memory for immediate use.
  class ZipEntry
    # @return [Fixnum] bit-packed version signature of the program that made the archive
    attr_accessor :made_by

    # @return [Fixnum] ZIP version support needed to extract this file
    attr_accessor :version_needed_to_extract

    # @return [Fixnum] bit-packed general purpose flags
    attr_accessor :gp_flags

    # @return [Fixnum] Storage mode (0 for stored, 8 for deflate)
    attr_accessor :storage_mode

    # @return [Fixnum] the bit-packed DOS time
    attr_accessor :dos_time

    # @return [Fixnum] the bit-packed DOS date
    attr_accessor :dos_date

    # @return [Fixnum] the CRC32 checksum of this file
    attr_accessor :crc32

    # @return [Fixnum] size of compressed file data in the ZIP
    attr_accessor :compressed_size

    # @return [Fixnum] size of the file once uncompressed
    attr_accessor :uncompressed_size

    # @return [String] the filename
    attr_accessor :filename

    # @return [Fixnum] disk number where this file starts
    attr_accessor :disk_number_start

    # @return [Fixnum] internal attributes of the file
    attr_accessor :internal_attrs

    # @return [Fixnum] external attributes of the file
    attr_accessor :external_attrs

    # @return [Fixnum] at what offset the local file header starts
    #   in your original IO object
    attr_accessor :local_file_header_offset

    # @return [String] the file comment
    attr_accessor :comment

    # Returns a reader for the actual compressed data of the entry.
    #
    #     reader = entry.extractor_from(source_file)
    #     outfile << reader.extract(512 * 1024) until reader.eof?
    #
    # @return [#extract(n_bytes), #eof?] the reader for the data
    def extractor_from(from_io)
      from_io.seek(compressed_data_offset, IO::SEEK_SET)
      case storage_mode
      when 8
        InflatingReader.new(from_io, compressed_size)
      when 0
        StoredReader.new(from_io, compressed_size)
      else
        raise UnsupportedFeature, "Unsupported storage mode for reading - %<storage_mode>d" %
          {storage_mode: storage_mode}
      end
    end

    # @return [Fixnum] at what offset you should start reading
    #   for the compressed data in your original IO object
    def compressed_data_offset
      @compressed_data_offset || raise(LocalHeaderPending)
    end

    # Tells whether the compressed data offset is already known for this entry
    # @return [Boolean]
    def known_offset?
      !@compressed_data_offset.nil?
    end

    # Tells whether the entry uses a data descriptor (this is defined
    # by bit 3 in the GP flags).
    def uses_data_descriptor?
      (gp_flags & 0x0008) == 0x0008
    end
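
    # For instance, an entry whose general purpose flags have only bit 3 set
    # (0x0008) stores its CRC32 and sizes in a data descriptor after the data:
    #
    #     entry.gp_flags = 0x0008
    #     entry.uses_data_descriptor? # => true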

    # Sets the offset at which the compressed data for this file starts in the ZIP.
    # By default, the value will be set by the Reader for you. If you use delayed
    # reading, you need to set it by using the `get_compressed_data_offset` on the Reader:
    #
    #     entry.compressed_data_offset = reader.get_compressed_data_offset(io: file,
    #       local_file_header_offset: entry.local_file_header_offset)
    def compressed_data_offset=(offset)
      @compressed_data_offset = offset.to_i
    end
  end

  # Parse an IO handle to a ZIP archive into an array of Entry objects.
  #
  # @param io[#tell, #seek, #read, #size] an IO-ish object
  # @param read_local_headers[Boolean] whether the local headers must be read upfront. When reading
  #   a locally available ZIP file this option will not have much use since the small reads from
  #   the file handle are not going to be that important. However, if you are using remote reads
  #   to decipher a ZIP file located on an HTTP server, the operation _must_ perform an HTTP
  #   request for _each entry in the ZIP file_ to determine where the actual file data starts.
  #   This, for a ZIP archive of 1000 files, will incur 1000 extra HTTP requests - which you might
  #   not want to perform upfront, or - at least - not want to perform _at once_. When the option is
  #   set to `false`, you will be getting instances of `LazyEntry` instead of `Entry`. Those objects
  #   will raise an exception when you attempt to access their compressed data offset in the ZIP
  #   (since the reads have not been performed yet). As a rule, this option can be left at its
  #   default setting (`true`) unless you want to _only_ read the central directory, or you need
  #   to limit the number of HTTP requests.
  # @return [Array<ZipEntry>] an array of entries within the ZIP being parsed
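  #
  # For example, to read only the central directory of a remotely hosted archive
  # and resolve a single entry's data offset on demand (a sketch - `remote_io`
  # and the filename are placeholders for your own objects):
  #
  #     reader = ZipKit::FileReader.new
  #     entries = reader.read_zip_structure(io: remote_io, read_local_headers: false)
  #     wanted = entries.find { |e| e.filename == "report.pdf" }
  #     wanted.compressed_data_offset = reader.get_compressed_data_offset(io: remote_io,
  #       local_file_header_offset: wanted.local_file_header_offset)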
  def read_zip_structure(io:, read_local_headers: true)
    zip_file_size = io.size
    eocd_offset = get_eocd_offset(io, zip_file_size)

    zip64_end_of_cdir_location = get_zip64_eocd_location(io, eocd_offset)
    num_files, cdir_location, _cdir_size =
      if zip64_end_of_cdir_location
        num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
      else
        num_files_and_central_directory_offset(io, eocd_offset)
      end

    log do
      "Located the central directory start at %<location>d" %
        {location: cdir_location}
    end
    seek(io, cdir_location)

    # Read the entire central directory AND anything behind it, in one fell swoop.
    # Strictly speaking, we should be able to read `cdir_size` bytes and not a byte more.
    # However, we know for a fact that in some of our files the central directory size
    # is in fact misreported. `zipinfo` then says:
    #
    #   warning [ktsglobal-2b03bc.zip]: 1 extra byte at beginning or within zipfile
    #     (attempting to process anyway)
    #   error [ktsglobal-2b03bc.zip]: reported length of central directory is
    #     -1 bytes too long (Atari STZip zipfile? J.H.Holm ZIPSPLIT 1.1
    #     zipfile?). Compensating...
    #
    # Since the EOCD is not that big anyway, we just read the entire "tail" of the ZIP ignoring
    # the central directory size altogether.
    central_directory_str = io.read # and not read_n(io, cdir_size), see above
    central_directory_io = StringIO.new(central_directory_str)
    log do
      "Read %<byte_size>d bytes with central directory + EOCD record and locator" %
        {byte_size: central_directory_str.bytesize}
    end

    entries = (0...num_files).map { |entry_n|
      offset_location = cdir_location + central_directory_io.tell
      log do
        "Reading the central directory entry %<entry_n>d starting at offset %<offset>d" %
          {entry_n: entry_n, offset: offset_location}
      end
      read_cdir_entry(central_directory_io)
    }

    read_local_headers(entries, io) if read_local_headers

    entries
  end

  # Sometimes you might encounter truncated ZIP files, which do not contain
  # any central directory whatsoever - or where the central directory is
  # truncated. In that case, employing the technique of reading the ZIP
  # "from the end" is impossible, and the only recourse is reading each
  # local file header in succession. If the entries in such a ZIP use data
  # descriptors, you would need to scan after the entry until you encounter
  # the data descriptor signature - and that might be unreliable at best.
  # Therefore, this reading technique does not support data descriptors.
  # It can however recover the entries you still can read if these entries
  # contain all the necessary information about the contained file.
  #
  # @param io[#tell, #read, #seek] the IO-ish object to read the local file
  #   headers from
  # @return [Array<ZipEntry>] an array of entries that could be
  #   recovered before hitting EOF
  def read_zip_straight_ahead(io:)
    entries = []
    loop do
      cur_offset = io.tell
      entry = read_local_file_header(io: io)
      if entry.uses_data_descriptor?
        raise UnsupportedFeature, "The local file header at #{cur_offset} uses \
          a data descriptor and the start of next entry \
          cannot be found"
      end
      entries << entry
      next_local_header_offset = entry.compressed_data_offset + entry.compressed_size
      log do
        "Recovered a local file header at offset %<cur_offset>d, seeking to the next at %<header_offset>d" %
          {cur_offset: cur_offset, header_offset: next_local_header_offset}
      end
      seek(io, next_local_header_offset) # Seek to the next entry, and raise if seek is impossible
    end
    entries
  rescue ReadError, RangeError # RangeError is raised if offset exceeds int32/int64 range
    log do
      "Got a read/seek error after reaching %<cur_offset>d, no more entries can be recovered" %
        {cur_offset: cur_offset}
    end
    entries
  end

  # Parse the local header entry and get the offset in the IO at which the
  # actual compressed data of the file starts within the ZIP.
  # The method will eager-read the entire local header for the file
  # (the maximum size the local header may use), starting at the given offset,
  # and will then compute its size. That size plus the local header offset
  # given will be the compressed data offset of the entry (read starting at
  # this offset to get the data).
  #
  # @param io[#read] an IO-ish object the ZIP file can be read from
  # @return [ZipEntry] the parsed local header entry, with its
  #   compressed data offset set
  def read_local_file_header(io:)
    local_file_header_offset = io.tell

    # Reading in bulk is cheaper - grab the maximum length of the local header,
    # including any headroom for extra fields etc.
    local_file_header_str_plus_headroom = io.read(MAX_LOCAL_HEADER_SIZE)
    raise ReadError if local_file_header_str_plus_headroom.nil? # reached EOF

    io_starting_at_local_header = StringIO.new(local_file_header_str_plus_headroom)

    assert_signature(io_starting_at_local_header, 0x04034b50)
    e = ZipEntry.new
    e.version_needed_to_extract = read_2b(io_starting_at_local_header) # Version needed to extract
    e.gp_flags = read_2b(io_starting_at_local_header) # gp flags
    e.storage_mode = read_2b(io_starting_at_local_header) # storage mode
    e.dos_time = read_2b(io_starting_at_local_header) # dos time
    e.dos_date = read_2b(io_starting_at_local_header) # dos date
    e.crc32 = read_4b(io_starting_at_local_header) # CRC32
    e.compressed_size = read_4b(io_starting_at_local_header) # Comp size
    e.uncompressed_size = read_4b(io_starting_at_local_header) # Uncomp size

    filename_size = read_2b(io_starting_at_local_header)
    extra_size = read_2b(io_starting_at_local_header)
    e.filename = read_n(io_starting_at_local_header, filename_size)
    extra_fields_str = read_n(io_starting_at_local_header, extra_size)

    # Parse out the extra fields
    extra_table = parse_out_extra_fields(extra_fields_str)

    # ...of which we really only need the Zip64 extra
    if (zip64_extra_contents = extra_table[1])
      # If the Zip64 extra is present, we let it override all
      # the values fetched from the conventional header
      zip64_extra = StringIO.new(zip64_extra_contents)
      log do
        "Will read Zip64 extra data from local header field for %<filename>s, %<size>d bytes" %
          {filename: e.filename, size: zip64_extra.size}
      end
      # Now here be dragons. The APPNOTE specifies that
      #
      # > The order of the fields in the ZIP64 extended
      # > information record is fixed, but the fields will
      # > only appear if the corresponding Local or Central
      # > directory record field is set to 0xFFFF or 0xFFFFFFFF.
      #
      # It means that before we read this stuff we need to check if the previously-read
      # values are at overflow, and only _then_ proceed to read them. Bah.
      e.uncompressed_size = read_8b(zip64_extra) if e.uncompressed_size == 0xFFFFFFFF
      e.compressed_size = read_8b(zip64_extra) if e.compressed_size == 0xFFFFFFFF
    end

    offset = local_file_header_offset + io_starting_at_local_header.tell
    e.compressed_data_offset = offset

    e
  end

  # Get the offset in the IO at which the actual compressed data of the file
  # starts within the ZIP. The method will eager-read the entire local header
  # for the file (the maximum size the local header may use), starting at the
  # given offset, and will then compute its size. That size plus the local
  # header offset given will be the compressed data offset of the entry
  # (read starting at this offset to get the data).
  #
  # @param io[#seek, #read] an IO-ish object the ZIP file can be read from
  # @param local_file_header_offset[Fixnum] absolute offset (0-based) where the
  #   local file header is supposed to begin
  # @return [Fixnum] absolute offset (0-based) of where the compressed data
  #   begins for this file within the ZIP
  def get_compressed_data_offset(io:, local_file_header_offset:)
    seek(io, local_file_header_offset)
    entry_recovered_from_local_file_header = read_local_file_header(io: io)
    entry_recovered_from_local_file_header.compressed_data_offset
  end

  # Parse an IO handle to a ZIP archive into an array of Entry objects, reading from the end
  # of the IO object.
  #
  # @see #read_zip_structure
  # @param options[Hash] any options the instance method of the same name accepts
  # @return [Array<ZipEntry>] an array of entries within the ZIP being parsed
  def self.read_zip_structure(**options)
    new.read_zip_structure(**options)
  end

  # Parse an IO handle to a ZIP archive into an array of Entry objects, reading from the start of
  # the file and parsing local file headers one-by-one
  #
  # @see #read_zip_straight_ahead
  # @param options[Hash] any options the instance method of the same name accepts
  # @return [Array<ZipEntry>] an array of entries within the ZIP being parsed
  def self.read_zip_straight_ahead(**options)
    new.read_zip_straight_ahead(**options)
  end

  private

  def read_local_headers(entries, io)
    entries.each_with_index do |entry, i|
      log do
        "Reading the local header for entry %<index>d at offset %<offset>d" %
          {index: i, offset: entry.local_file_header_offset}
      end
      off = get_compressed_data_offset(io: io,
        local_file_header_offset: entry.local_file_header_offset)
      entry.compressed_data_offset = off
    end
  end

  def skip_ahead_2(io)
    skip_ahead_n(io, 2)
  end

  def skip_ahead_4(io)
    skip_ahead_n(io, 4)
  end

  def skip_ahead_8(io)
    skip_ahead_n(io, 8)
  end

  def seek(io, absolute_pos)
    io.seek(absolute_pos, IO::SEEK_SET)
    unless absolute_pos == io.tell
      raise ReadError,
        "Expected to seek to #{absolute_pos} but only \
        got to #{io.tell}"
    end
    nil
  end

  def assert_signature(io, signature_magic_number)
    readback = read_4b(io)
    if readback != signature_magic_number
      expected = "0x0" + signature_magic_number.to_s(16)
      actual = "0x0" + readback.to_s(16)
      raise InvalidStructure, "Expected signature #{expected}, but read #{actual}"
    end
  end

  def skip_ahead_n(io, n)
    pos_before = io.tell
    io.seek(io.tell + n, IO::SEEK_SET)
    pos_after = io.tell
    delta = pos_after - pos_before
    unless delta == n
      raise ReadError, "Expected to seek #{n} bytes ahead, but could \
        only seek #{delta} bytes ahead"
    end
    nil
  end

  def read_n(io, n_bytes)
    io.read(n_bytes).tap do |d|
      raise ReadError, "Expected to read #{n_bytes} bytes, but the IO was at the end" if d.nil?
      unless d.bytesize == n_bytes
        raise ReadError, "Expected to read #{n_bytes} bytes, \
          read #{d.bytesize}"
      end
    end
  end

  def read_2b(io)
    read_n(io, 2).unpack(C_UINT2).shift
  end

  def read_4b(io)
    read_n(io, 4).unpack(C_UINT4).shift
  end

  def read_8b(io)
    read_n(io, 8).unpack(C_UINT8).shift
  end
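
  # As an illustration of the byte order used by these helpers: "V" unpacks a
  # little-endian uint32, so the first four bytes of a ZIP file ("PK\x03\x04")
  # decode to the local file header signature checked in read_local_file_header:
  #
  #     "PK\x03\x04".unpack("V") # => [0x04034b50], i.e. [67324752]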

  def read_cdir_entry(io)
    # read_cdir_entry is too high. [45.66/15]
    assert_signature(io, 0x02014b50)
    ZipEntry.new.tap do |e|
      e.made_by = read_2b(io)
      e.version_needed_to_extract = read_2b(io)
      e.gp_flags = read_2b(io)
      e.storage_mode = read_2b(io)
      e.dos_time = read_2b(io)
      e.dos_date = read_2b(io)
      e.crc32 = read_4b(io)
      e.compressed_size = read_4b(io)
      e.uncompressed_size = read_4b(io)
      filename_size = read_2b(io)
      extra_size = read_2b(io)
      comment_len = read_2b(io)
      e.disk_number_start = read_2b(io)
      e.internal_attrs = read_2b(io)
      e.external_attrs = read_4b(io)
      e.local_file_header_offset = read_4b(io)
      e.filename = read_n(io, filename_size)

      # Extra fields
      extras = read_n(io, extra_size)
      # Comment
      e.comment = read_n(io, comment_len)

      # Parse out the extra fields
      extra_table = parse_out_extra_fields(extras)

      # ...of which we really only need the Zip64 extra
      if (zip64_extra_contents = extra_table[1])
        # If the Zip64 extra is present, we let it override all
        # the values fetched from the conventional header
        zip64_extra = StringIO.new(zip64_extra_contents)
        log do
          "Will read Zip64 extra data for %<filename>s, %<size>d bytes" %
            {filename: e.filename, size: zip64_extra.size}
        end
        # Now here be dragons. The APPNOTE specifies that
        #
        # > The order of the fields in the ZIP64 extended
        # > information record is fixed, but the fields will
        # > only appear if the corresponding Local or Central
        # > directory record field is set to 0xFFFF or 0xFFFFFFFF.
        #
        # It means that before we read this stuff we need to check if the previously-read
        # values are at overflow, and only _then_ proceed to read them. Bah.
        e.uncompressed_size = read_8b(zip64_extra) if e.uncompressed_size == 0xFFFFFFFF
        e.compressed_size = read_8b(zip64_extra) if e.compressed_size == 0xFFFFFFFF
        e.local_file_header_offset = read_8b(zip64_extra) if e.local_file_header_offset == 0xFFFFFFFF
        # Disk number comes last and we can skip it anyway, since we do
        # not support multi-disk archives
      end
    end
  end

  def get_eocd_offset(file_io, zip_file_size)
    # Start reading from the _comment_ of the zip file (from the very end).
    # The maximum size of the comment is 0xFFFF (what fits in 2 bytes)
    implied_position_of_eocd_record = zip_file_size - MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE
    implied_position_of_eocd_record = 0 if implied_position_of_eocd_record < 0

    # Use a soft seek (we might not be able to get as far behind in the IO as we want)
    # and a soft read (we might not be able to read as many bytes as we want)
    file_io.seek(implied_position_of_eocd_record, IO::SEEK_SET)
    str_containing_eocd_record = file_io.read(MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE)
    eocd_idx_in_buf = locate_eocd_signature(str_containing_eocd_record)

    raise MissingEOCD unless eocd_idx_in_buf

    eocd_offset = implied_position_of_eocd_record + eocd_idx_in_buf
    log do
      "Found EOCD signature at offset %<offset>d" % {offset: eocd_offset}
    end

    eocd_offset
  end

  def all_indices_of_substr_in_str(of_substring, in_string)
    last_i = 0
    found_at_indices = []
    while (last_i = in_string.index(of_substring, last_i))
      found_at_indices << last_i
      last_i += of_substring.bytesize
    end
    found_at_indices
  end
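
  # For example:
  #
  #     all_indices_of_substr_in_str("PK", "PK..PK..") # => [0, 4]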

  # We have to scan the maximum possible number
  # of bytes that the EOCD can theoretically occupy including the comment after it,
  # and we have to find a combination of:
  # [EOCD signature, <some ZIP metadata>, comment byte size, comment of that size]
  # at the end. To do so, we first find all indices of the signature in the trailer
  # string, and then check whether the bytestring starting at the signature and
  # ending at the end of string satisfies that given pattern.
  def locate_eocd_signature(in_str)
    eocd_signature = 0x06054b50
    eocd_signature_str = [eocd_signature].pack("V")
    unpack_pattern = "VvvvvVVv"
    minimum_record_size = 22
    str_size = in_str.bytesize
    indices = all_indices_of_substr_in_str(eocd_signature_str, in_str)
    indices.each do |check_at|
      maybe_record = in_str[check_at..str_size]
      # If the record is smaller than the minimum - we will never recover anything
      break if maybe_record.bytesize < minimum_record_size
      # Now we check if the record ends with the combination
      # of the comment size and an arbitrary byte string of that size.
      # If it does - we found our match
      *_unused, comment_size = maybe_record.unpack(unpack_pattern)
      if (maybe_record.bytesize - minimum_record_size) == comment_size
        return check_at # Found the EOCD marker location
      end
    end
    # If we haven't caught anything, return nil deliberately instead of returning the last statement
    nil
  end
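
  # A worked example of the check above: a bare EOCD record with a zero-length
  # comment is exactly 22 bytes, so the signature located at index 0 passes the
  # comment-size test (the non-signature fields here are arbitrary sample values):
  #
  #     eocd = [0x06054b50, 0, 0, 1, 1, 46, 100, 0].pack("VvvvvVVv")
  #     locate_eocd_signature(eocd) # => 0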

  # Find the Zip64 EOCD locator segment offset. Do this by seeking backwards from the
  # EOCD record in the archive by fixed offsets
  # get_zip64_eocd_location is too high. [15.17/15]
  def get_zip64_eocd_location(file_io, eocd_offset)
    zip64_eocd_loc_offset = eocd_offset
    zip64_eocd_loc_offset -= 4 # The signature
    zip64_eocd_loc_offset -= 4 # Which disk has the Zip64 end of central directory record
    zip64_eocd_loc_offset -= 8 # Offset of the zip64 central directory record
    zip64_eocd_loc_offset -= 4 # Total number of disks

    log do
      "Will look for the Zip64 EOCD locator signature at offset %<offset>d" %
        {offset: zip64_eocd_loc_offset}
    end

    # If the offset is negative there is certainly no Zip64 EOCD locator here
    return unless zip64_eocd_loc_offset >= 0

    file_io.seek(zip64_eocd_loc_offset, IO::SEEK_SET)
    assert_signature(file_io, 0x07064b50)

    log do
      "Found Zip64 EOCD locator at offset %<offset>d" % {offset: zip64_eocd_loc_offset}
    end

    disk_num = read_4b(file_io) # number of the disk
    raise UnsupportedFeature, "The archive spans multiple disks" if disk_num != 0
    read_8b(file_io)
  rescue ReadError
    nil
  end

  # num_files_and_central_directory_offset_zip64 is too high. [21.12/15]
  def num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
    seek(io, zip64_end_of_cdir_location)

    assert_signature(io, 0x06064b50)

    zip64_eocdr_size = read_8b(io)
    zip64_eocdr = read_n(io, zip64_eocdr_size) # Reading in bulk is cheaper
    zip64_eocdr = StringIO.new(zip64_eocdr)
    skip_ahead_2(zip64_eocdr) # version made by
    skip_ahead_2(zip64_eocdr) # version needed to extract

    disk_n = read_4b(zip64_eocdr) # number of this disk
    disk_n_with_eocdr = read_4b(zip64_eocdr) # number of the disk with the EOCDR
    raise UnsupportedFeature, "The archive spans multiple disks" if disk_n != disk_n_with_eocdr

    num_files_this_disk = read_8b(zip64_eocdr) # number of files on this disk
    num_files_total = read_8b(zip64_eocdr) # files total in the central directory

    raise UnsupportedFeature, "The archive spans multiple disks" if num_files_this_disk != num_files_total

    log do
      "Zip64 EOCD record states there are %<amount>d files in the archive" %
        {amount: num_files_total}
    end

    central_dir_size = read_8b(zip64_eocdr) # Size of the central directory
    central_dir_offset = read_8b(zip64_eocdr) # Where the central directory starts

    [num_files_total, central_dir_offset, central_dir_size]
  end

  C_UINT4 = "V"
  C_UINT2 = "v"
  C_UINT8 = "Q<"

  # To prevent too many tiny reads, read the maximum possible size of end of
  # central directory record upfront (all the fixed fields + at most 0xFFFF
  # bytes of the archive comment)
  MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE = 4 + # Offset of the start of central directory
    4 + # Size of the central directory
    2 + # Number of files in the cdir
    4 + # End-of-central-directory signature
    2 + # Number of this disk
    2 + # Number of disk with the start of cdir
    2 + # Number of files in the cdir of this disk
    2 + # The comment size
    0xFFFF # Maximum comment size

  # To prevent too many tiny reads, read the maximum possible size of the local file header upfront.
  # The maximum size is all the usual items, plus the maximum size
  # of the filename (0xFFFF bytes) and the maximum size of the extras (0xFFFF bytes)
  MAX_LOCAL_HEADER_SIZE = 4 + # signature
    2 + # Version needed to extract
    2 + # gp flags
    2 + # storage mode
    2 + # dos time
    2 + # dos date
    4 + # CRC32
    4 + # Comp size
    4 + # Uncomp size
    2 + # Filename size
    2 + # Extra fields size
    0xFFFF + # Maximum filename size
    0xFFFF # Maximum extra fields size

  SIZE_OF_USABLE_EOCD_RECORD = 4 + # Signature
    2 + # Number of this disk
    2 + # Number of the disk with the EOCD record
    2 + # Number of entries in the central directory of this disk
    2 + # Number of entries in the central directory total
    4 + # Size of the central directory
    4 # Start of the central directory offset

  def num_files_and_central_directory_offset(file_io, eocd_offset)
    seek(file_io, eocd_offset)

    # The size of the EOCD record is known upfront, so use a strict read
    eocd_record_str = read_n(file_io, SIZE_OF_USABLE_EOCD_RECORD)
    io = StringIO.new(eocd_record_str)

    assert_signature(io, 0x06054b50)
    skip_ahead_2(io) # number_of_this_disk
    skip_ahead_2(io) # number of the disk with the EOCD record
    skip_ahead_2(io) # number of entries in the central directory of this disk
    num_files = read_2b(io) # number of entries in the central directory total
    cdir_size = read_4b(io) # size of the central directory
    cdir_offset = read_4b(io) # start of central directory offset
    [num_files, cdir_offset, cdir_size]
  end

  private_constant :C_UINT4, :C_UINT2, :C_UINT8, :MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE,
    :MAX_LOCAL_HEADER_SIZE, :SIZE_OF_USABLE_EOCD_RECORD

  # Is provided as a stub to be overridden in a subclass if you need it. Will report
  # during various stages of reading. The log message is contained in the return value
  # of `yield` in the method (the log messages are lazy-evaluated).
  def log
    # The most minimal implementation for the method is just this:
    # $stderr.puts(yield)
  end
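
  # For example, a subclass that actually prints the lazily-evaluated messages
  # might look like this (a sketch - `VerboseFileReader` is not part of the gem):
  #
  #     class VerboseFileReader < ZipKit::FileReader
  #       def log
  #         $stderr.puts(yield)
  #       end
  #     end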

  def parse_out_extra_fields(extra_fields_str)
    extra_table = {}
    extras_buf = StringIO.new(extra_fields_str)
    until extras_buf.eof?
      extra_id = read_2b(extras_buf)
      extra_size = read_2b(extras_buf)
      extra_contents = read_n(extras_buf, extra_size)
      extra_table[extra_id] = extra_contents
    end
    extra_table
  end
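
  # The extra fields blob is a sequence of little-endian [id, size, payload]
  # records. For instance, a Zip64 extra field (ID 0x0001) carrying two 8-byte
  # sizes would parse out like this (a sketch with made-up sizes):
  #
  #     blob = [1, 16].pack("vv") + [896, 1024].pack("Q<Q<")
  #     parse_out_extra_fields(blob) # => {1 => <16-byte payload>}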
end