zip_kit 6.0.0

Files changed (54)
  1. checksums.yaml +7 -0
  2. data/.codeclimate.yml +7 -0
  3. data/.document +5 -0
  4. data/.github/workflows/ci.yml +29 -0
  5. data/.gitignore +61 -0
  6. data/.rspec +1 -0
  7. data/.standard.yml +8 -0
  8. data/.yardopts +1 -0
  9. data/CHANGELOG.md +255 -0
  10. data/CODE_OF_CONDUCT.md +46 -0
  11. data/CONTRIBUTING.md +153 -0
  12. data/Gemfile +4 -0
  13. data/IMPLEMENTATION_DETAILS.md +97 -0
  14. data/LICENSE.txt +20 -0
  15. data/README.md +234 -0
  16. data/Rakefile +21 -0
  17. data/bench/buffered_crc32_bench.rb +109 -0
  18. data/examples/archive_size_estimate.rb +15 -0
  19. data/examples/config.ru +7 -0
  20. data/examples/deferred_write.rb +58 -0
  21. data/examples/parallel_compression_with_block_deflate.rb +86 -0
  22. data/examples/rack_application.rb +63 -0
  23. data/examples/s3_upload.rb +23 -0
  24. data/lib/zip_kit/block_deflate.rb +130 -0
  25. data/lib/zip_kit/block_write.rb +47 -0
  26. data/lib/zip_kit/file_reader/inflating_reader.rb +36 -0
  27. data/lib/zip_kit/file_reader/stored_reader.rb +35 -0
  28. data/lib/zip_kit/file_reader.rb +740 -0
  29. data/lib/zip_kit/null_writer.rb +12 -0
  30. data/lib/zip_kit/output_enumerator.rb +150 -0
  31. data/lib/zip_kit/path_set.rb +163 -0
  32. data/lib/zip_kit/rack_chunked_body.rb +32 -0
  33. data/lib/zip_kit/rack_tempfile_body.rb +61 -0
  34. data/lib/zip_kit/rails_streaming.rb +37 -0
  35. data/lib/zip_kit/remote_io.rb +114 -0
  36. data/lib/zip_kit/remote_uncap.rb +22 -0
  37. data/lib/zip_kit/size_estimator.rb +84 -0
  38. data/lib/zip_kit/stream_crc32.rb +60 -0
  39. data/lib/zip_kit/streamer/deflated_writer.rb +45 -0
  40. data/lib/zip_kit/streamer/entry.rb +37 -0
  41. data/lib/zip_kit/streamer/filler.rb +9 -0
  42. data/lib/zip_kit/streamer/heuristic.rb +68 -0
  43. data/lib/zip_kit/streamer/stored_writer.rb +39 -0
  44. data/lib/zip_kit/streamer/writable.rb +36 -0
  45. data/lib/zip_kit/streamer.rb +614 -0
  46. data/lib/zip_kit/uniquify_filename.rb +39 -0
  47. data/lib/zip_kit/version.rb +5 -0
  48. data/lib/zip_kit/write_and_tell.rb +40 -0
  49. data/lib/zip_kit/write_buffer.rb +71 -0
  50. data/lib/zip_kit/write_shovel.rb +22 -0
  51. data/lib/zip_kit/zip_writer.rb +436 -0
  52. data/lib/zip_kit.rb +24 -0
  53. data/zip_kit.gemspec +41 -0
  54. metadata +335 -0
data/lib/zip_kit/file_reader.rb
@@ -0,0 +1,740 @@
+ # frozen_string_literal: true
+
+ require "stringio"
+
+ # A very barebones ZIP file reader. It is made for maximum interoperability, but at the same
+ # time we attempt to keep it somewhat concise.
+ #
+ # ## REALLY CRAZY IMPORTANT STUFF: SECURITY IMPLICATIONS
+ #
+ # Please **BEWARE** - using this is a security risk if you are reading files that have been
+ # supplied by users. This implementation has _not_ been formally verified for correctness. As
+ # ZIP files contain relative offsets in lots of places it might be possible for a maliciously
+ # crafted ZIP file to put the decode procedure in an endless loop, make it attempt huge reads
+ # from the input file and so on. Additionally, the reader module for deflated data has
+ # no support for ZIP bomb protection. So either limit the `FileReader` usage to the files you
+ # trust, or triple-check all the inputs upfront. Patches to make this reader more secure
+ # are welcome of course.
+ #
+ # ## Usage
+ #
+ #     File.open('zipfile.zip', 'rb') do |f|
+ #       entries = ZipKit::FileReader.read_zip_structure(io: f)
+ #       entries.each do |e|
+ #         File.open(e.filename, 'wb') do |extracted_file|
+ #           ex = e.extractor_from(f)
+ #           extracted_file << ex.extract(1024 * 1024) until ex.eof?
+ #         end
+ #       end
+ #     end
+ #
+ # ## Supported features
+ #
+ # * Deflate and stored storage modes
+ # * Zip64 (extra fields and offsets)
+ # * Data descriptors
+ #
+ # ## Unsupported features
+ #
+ # * Archives split over multiple disks/files
+ # * Any ZIP encryption
+ # * EFS language flag and InfoZIP filename extra field
+ # * CRC32 checksums are _not_ verified
+ #
+ # ## Mode of operation
+ #
+ # By default, `FileReader` _ignores_ the data in local file headers (as it is
+ # often unreliable). It reads the ZIP file "from the tail", finds the
+ # end-of-central-directory signatures, then reads the central directory entries,
+ # reconstitutes the entries with their filenames, attributes and so on, and
+ # sets these entries up with the absolute _offsets_ into the source file/IO object.
+ # These offsets can then be used to extract the actual compressed data of
+ # the files and to expand it.
+ #
+ # ## Recovering damaged or incomplete ZIP files
+ #
+ # If the ZIP file you are trying to read does not contain the central directory
+ # records, `read_zip_structure` will not work, since it starts the read process
+ # from the EOCD marker at the end of the central directory and then crawls
+ # "back" in the IO to figure out the rest. You can explicitly apply a fallback
+ # and read the archive "straight ahead" instead, using `read_zip_straight_ahead`
+ # - the method will scan your IO from the very start, skipping over
+ # the actual entry data. This is less efficient than central directory parsing since
+ # it involves a much larger number of reads (1 read from the IO per entry in the ZIP).
+ # See the sketch below.
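+ #
+ # A minimal recovery sketch (`truncated.zip` stands in for such a damaged archive):
+ #
+ #     entries = File.open('truncated.zip', 'rb') do |f|
+ #       ZipKit::FileReader.read_zip_straight_ahead(io: f)
+ #     end
+ #     entries.each { |e| puts "#{e.filename} at #{e.compressed_data_offset}" }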
+
+ class ZipKit::FileReader
+   require_relative "file_reader/stored_reader"
+   require_relative "file_reader/inflating_reader"
+
+   ReadError = Class.new(StandardError)
+   UnsupportedFeature = Class.new(StandardError)
+   InvalidStructure = Class.new(ReadError)
+   LocalHeaderPending = Class.new(StandardError) do
+     def message
+       "The compressed data offset is not available (local header has not been read)"
+     end
+   end
+   MissingEOCD = Class.new(StandardError) do
+     def message
+       "Could not find the EOCD signature in the buffer - maybe a malformed ZIP file"
+     end
+   end
+
+   private_constant :StoredReader, :InflatingReader
+
+   # Represents a file within the ZIP archive being read. This is different from
+   # the Entry object used in Streamer for ZIP writing, since during writing more
+   # data can be kept in memory for immediate use.
+   class ZipEntry
+     # @return [Fixnum] bit-packed version signature of the program that made the archive
+     attr_accessor :made_by
+
+     # @return [Fixnum] ZIP version support needed to extract this file
+     attr_accessor :version_needed_to_extract
+
+     # @return [Fixnum] bit-packed general purpose flags
+     attr_accessor :gp_flags
+
+     # @return [Fixnum] Storage mode (0 for stored, 8 for deflate)
+     attr_accessor :storage_mode
+
+     # @return [Fixnum] the bit-packed DOS time
+     attr_accessor :dos_time
+
+     # @return [Fixnum] the bit-packed DOS date
+     attr_accessor :dos_date
+
+     # @return [Fixnum] the CRC32 checksum of this file
+     attr_accessor :crc32
+
+     # @return [Fixnum] size of compressed file data in the ZIP
+     attr_accessor :compressed_size
+
+     # @return [Fixnum] size of the file once uncompressed
+     attr_accessor :uncompressed_size
+
+     # @return [String] the filename
+     attr_accessor :filename
+
+     # @return [Fixnum] disk number where this file starts
+     attr_accessor :disk_number_start
+
+     # @return [Fixnum] internal attributes of the file
+     attr_accessor :internal_attrs
+
+     # @return [Fixnum] external attributes of the file
+     attr_accessor :external_attrs
+
+     # @return [Fixnum] at what offset the local file header starts
+     #   in your original IO object
+     attr_accessor :local_file_header_offset
+
+     # @return [String] the file comment
+     attr_accessor :comment
+
+     # Returns a reader for the actual compressed data of the entry.
+     #
+     #     reader = entry.extractor_from(source_file)
+     #     outfile << reader.extract(512 * 1024) until reader.eof?
+     #
+     # @return [#extract(n_bytes), #eof?] the reader for the data
+     def extractor_from(from_io)
+       from_io.seek(compressed_data_offset, IO::SEEK_SET)
+       case storage_mode
+       when 8
+         InflatingReader.new(from_io, compressed_size)
+       when 0
+         StoredReader.new(from_io, compressed_size)
+       else
+         raise UnsupportedFeature, "Unsupported storage mode for reading - %<storage_mode>d" %
+           {storage_mode: storage_mode}
+       end
+     end
+
+     # @return [Fixnum] at what offset you should start reading
+     #   for the compressed data in your original IO object
+     def compressed_data_offset
+       @compressed_data_offset || raise(LocalHeaderPending)
+     end
+
+     # Tells whether the compressed data offset is already known for this entry
+     # @return [Boolean]
+     def known_offset?
+       !@compressed_data_offset.nil?
+     end
+
+     # Tells whether the entry uses a data descriptor (this is defined
+     # by bit 3 in the GP flags).
+     def uses_data_descriptor?
+       (gp_flags & 0x0008) == 0x0008
+     end
+
+     # Sets the offset at which the compressed data for this file starts in the ZIP.
+     # By default, the value will be set by the Reader for you. If you use delayed
+     # reading, you need to set it by calling `get_compressed_data_offset` on the Reader:
+     #
+     #     entry.compressed_data_offset = reader.get_compressed_data_offset(io: file,
+     #       local_file_header_offset: entry.local_file_header_offset)
+     def compressed_data_offset=(offset)
+       @compressed_data_offset = offset.to_i
+     end
+   end
+
+   # Parse an IO handle to a ZIP archive into an array of Entry objects.
+   #
+   # @param io[#tell, #seek, #read, #size] an IO-ish object
+   # @param read_local_headers[Boolean] whether the local headers must be read upfront. When reading
+   #   a locally available ZIP file this option will not have much use, since the small reads from
+   #   the file handle are not going to be that important. However, if you are using remote reads
+   #   to decipher a ZIP file located on an HTTP server, the operation _must_ perform an HTTP
+   #   request for _each entry in the ZIP file_ to determine where the actual file data starts.
+   #   This, for a ZIP archive of 1000 files, will incur 1000 extra HTTP requests - which you might
+   #   not want to perform upfront, or - at least - not want to perform _at once_. When the option is
+   #   set to `false`, the returned entries will not have their compressed data offsets resolved yet -
+   #   accessing `compressed_data_offset` on such an entry will raise `LocalHeaderPending`
+   #   (since the reads have not been performed yet). As a rule, this option can be left at its
+   #   default setting (`true`) unless you want to _only_ read the central directory, or you need
+   #   to limit the number of HTTP requests.
+   # @return [Array<ZipEntry>] an array of entries within the ZIP being parsed
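+   #
+   # A sketch of the deferred variant (assuming `f` is an open, seekable file handle):
+   #
+   #     reader = ZipKit::FileReader.new
+   #     entries = reader.read_zip_structure(io: f, read_local_headers: false)
+   #     entry = entries.first
+   #     entry.compressed_data_offset = reader.get_compressed_data_offset(io: f,
+   #       local_file_header_offset: entry.local_file_header_offset)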
+   def read_zip_structure(io:, read_local_headers: true)
+     zip_file_size = io.size
+     eocd_offset = get_eocd_offset(io, zip_file_size)
+
+     zip64_end_of_cdir_location = get_zip64_eocd_location(io, eocd_offset)
+     num_files, cdir_location, _cdir_size =
+       if zip64_end_of_cdir_location
+         num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
+       else
+         num_files_and_central_directory_offset(io, eocd_offset)
+       end
+
+     log do
+       "Located the central directory start at %<location>d" %
+         {location: cdir_location}
+     end
+     seek(io, cdir_location)
+
+     # Read the entire central directory AND anything behind it, in one fell swoop.
+     # Strictly speaking, we should be able to read `cdir_size` bytes and not a byte more.
+     # However, we know for a fact that in some of our files the central directory size
+     # is in fact misreported. `zipinfo` then says:
+     #
+     #     warning [ktsglobal-2b03bc.zip]: 1 extra byte at beginning or within zipfile
+     #       (attempting to process anyway)
+     #     error [ktsglobal-2b03bc.zip]: reported length of central directory is
+     #       -1 bytes too long (Atari STZip zipfile? J.H.Holm ZIPSPLIT 1.1
+     #       zipfile?). Compensating...
+     #
+     # Since the EOCD is not that big anyway, we just read the entire "tail" of the ZIP, ignoring
+     # the central directory size altogether.
+     central_directory_str = io.read # and not read_n(io, cdir_size), see above
+     central_directory_io = StringIO.new(central_directory_str)
+     log do
+       "Read %<byte_size>d bytes with central directory + EOCD record and locator" %
+         {byte_size: central_directory_str.bytesize}
+     end
+
+     entries = (0...num_files).map { |entry_n|
+       offset_location = cdir_location + central_directory_io.tell
+       log do
+         "Reading the central directory entry %<entry_n>d starting at offset %<offset>d" %
+           {entry_n: entry_n, offset: offset_location}
+       end
+       read_cdir_entry(central_directory_io)
+     }
+
+     read_local_headers(entries, io) if read_local_headers
+
+     entries
+   end
+
+   # Sometimes you might encounter truncated ZIP files, which do not contain
+   # any central directory whatsoever - or where the central directory is
+   # truncated. In that case, employing the technique of reading the ZIP
+   # "from the end" is impossible, and the only recourse is reading each
+   # local file header in succession. If the entries in such a ZIP use data
+   # descriptors, you would need to scan after the entry until you encounter
+   # the data descriptor signature - and that might be unreliable at best.
+   # Therefore, this reading technique does not support data descriptors.
+   # It can however recover the entries you still can read, if these entries
+   # contain all the necessary information about the contained file.
+   #
+   # @param io[#tell, #read, #seek] the IO-ish object to read the local file
+   #   headers from
+   # @return [Array<ZipEntry>] an array of entries that could be
+   #   recovered before hitting EOF
+   def read_zip_straight_ahead(io:)
+     entries = []
+     cur_offset = io.tell # hoisted out of the loop so the rescue clause below can reference it
+     loop do
+       cur_offset = io.tell
+       entry = read_local_file_header(io: io)
+       if entry.uses_data_descriptor?
+         raise UnsupportedFeature, "The local file header at #{cur_offset} uses \
+                                    a data descriptor and the start of next entry \
+                                    cannot be found"
+       end
+       entries << entry
+       next_local_header_offset = entry.compressed_data_offset + entry.compressed_size
+       log do
+         "Recovered a local file header at offset %<cur_offset>d, seeking to the next at %<header_offset>d" %
+           {cur_offset: cur_offset, header_offset: next_local_header_offset}
+       end
+       seek(io, next_local_header_offset) # Seek to the next entry, and raise if seek is impossible
+     end
+     entries
+   rescue ReadError, RangeError # RangeError is raised if offset exceeds int32/int64 range
+     log do
+       "Got a read/seek error after reaching %<cur_offset>d, no more entries can be recovered" %
+         {cur_offset: cur_offset}
+     end
+     entries
+   end
+
+   # Parse the local header entry and get the offset in the IO at which the
+   # actual compressed data of the file starts within the ZIP.
+   # The method will eager-read the entire local header for the file
+   # (the maximum size the local header may use), starting at the given offset,
+   # and will then compute its size. That size plus the local header offset
+   # given will be the compressed data offset of the entry (read starting at
+   # this offset to get the data).
+   #
+   # @param io[#read] an IO-ish object the ZIP file can be read from
+   # @return [ZipEntry] the parsed local header entry, with its
+   #   compressed data offset set
+   def read_local_file_header(io:)
+     local_file_header_offset = io.tell
+
+     # Reading in bulk is cheaper - grab the maximum length of the local header,
+     # including any headroom for extra fields etc.
+     local_file_header_str_plus_headroom = io.read(MAX_LOCAL_HEADER_SIZE)
+     raise ReadError if local_file_header_str_plus_headroom.nil? # reached EOF
+
+     io_starting_at_local_header = StringIO.new(local_file_header_str_plus_headroom)
+
+     assert_signature(io_starting_at_local_header, 0x04034b50)
+     e = ZipEntry.new
+     e.version_needed_to_extract = read_2b(io_starting_at_local_header) # Version needed to extract
+     e.gp_flags = read_2b(io_starting_at_local_header) # gp flags
+     e.storage_mode = read_2b(io_starting_at_local_header) # storage mode
+     e.dos_time = read_2b(io_starting_at_local_header) # dos time
+     e.dos_date = read_2b(io_starting_at_local_header) # dos date
+     e.crc32 = read_4b(io_starting_at_local_header) # CRC32
+     e.compressed_size = read_4b(io_starting_at_local_header) # Comp size
+     e.uncompressed_size = read_4b(io_starting_at_local_header) # Uncomp size
+
+     filename_size = read_2b(io_starting_at_local_header)
+     extra_size = read_2b(io_starting_at_local_header)
+     e.filename = read_n(io_starting_at_local_header, filename_size)
+     extra_fields_str = read_n(io_starting_at_local_header, extra_size)
+
+     # Parse out the extra fields
+     extra_table = parse_out_extra_fields(extra_fields_str)
+
+     # ...of which we really only need the Zip64 extra
+     if (zip64_extra_contents = extra_table[1])
+       # If the Zip64 extra is present, we let it override all
+       # the values fetched from the conventional header
+       zip64_extra = StringIO.new(zip64_extra_contents)
+       log do
+         "Will read Zip64 extra data from local header field for %<filename>s, %<size>d bytes" %
+           {filename: e.filename, size: zip64_extra.size}
+       end
+       # Now here be dragons. The APPNOTE specifies that
+       #
+       # > The order of the fields in the ZIP64 extended
+       # > information record is fixed, but the fields will
+       # > only appear if the corresponding Local or Central
+       # > directory record field is set to 0xFFFF or 0xFFFFFFFF.
+       #
+       # It means that before we read this stuff we need to check if the previously-read
+       # values are at overflow, and only _then_ proceed to read them. Bah.
+       e.uncompressed_size = read_8b(zip64_extra) if e.uncompressed_size == 0xFFFFFFFF
+       e.compressed_size = read_8b(zip64_extra) if e.compressed_size == 0xFFFFFFFF
+     end
+
+     offset = local_file_header_offset + io_starting_at_local_header.tell
+     e.compressed_data_offset = offset
+
+     e
+   end
+
+   # Get the offset in the IO at which the actual compressed data of the file
+   # starts within the ZIP. The method will eager-read the entire local header
+   # for the file (the maximum size the local header may use), starting at the
+   # given offset, and will then compute its size. That size plus the local
+   # header offset given will be the compressed data offset of the entry
+   # (read starting at this offset to get the data).
+   #
+   # @param io[#seek, #read] an IO-ish object the ZIP file can be read from
+   # @param local_file_header_offset[Fixnum] absolute offset (0-based) where the
+   #   local file header is supposed to begin
+   # @return [Fixnum] absolute offset (0-based) of where the compressed data
+   #   begins for this file within the ZIP
+   def get_compressed_data_offset(io:, local_file_header_offset:)
+     seek(io, local_file_header_offset)
+     entry_recovered_from_local_file_header = read_local_file_header(io: io)
+     entry_recovered_from_local_file_header.compressed_data_offset
+   end
+
+   # Parse an IO handle to a ZIP archive into an array of Entry objects, reading from the end
+   # of the IO object.
+   #
+   # @see #read_zip_structure
+   # @param options[Hash] any options the instance method of the same name accepts
+   # @return [Array<ZipEntry>] an array of entries within the ZIP being parsed
+   def self.read_zip_structure(**options)
+     new.read_zip_structure(**options)
+   end
+
+   # Parse an IO handle to a ZIP archive into an array of Entry objects, reading from the start of
+   # the file and parsing local file headers one-by-one
+   #
+   # @see #read_zip_straight_ahead
+   # @param options[Hash] any options the instance method of the same name accepts
+   # @return [Array<ZipEntry>] an array of entries within the ZIP being parsed
+   def self.read_zip_straight_ahead(**options)
+     new.read_zip_straight_ahead(**options)
+   end
+
+   private
+
+   def read_local_headers(entries, io)
+     entries.each_with_index do |entry, i|
+       log do
+         "Reading the local header for entry %<index>d at offset %<offset>d" %
+           {index: i, offset: entry.local_file_header_offset}
+       end
+       off = get_compressed_data_offset(io: io,
+         local_file_header_offset: entry.local_file_header_offset)
+       entry.compressed_data_offset = off
+     end
+   end
+
+   def skip_ahead_2(io)
+     skip_ahead_n(io, 2)
+   end
+
+   def skip_ahead_4(io)
+     skip_ahead_n(io, 4)
+   end
+
+   def skip_ahead_8(io)
+     skip_ahead_n(io, 8)
+   end
+
+   def seek(io, absolute_pos)
+     io.seek(absolute_pos, IO::SEEK_SET)
+     unless absolute_pos == io.tell
+       raise ReadError,
+         "Expected to seek to #{absolute_pos} but only \
+          got to #{io.tell}"
+     end
+     nil
+   end
+
+   def assert_signature(io, signature_magic_number)
+     readback = read_4b(io)
+     if readback != signature_magic_number
+       expected = "0x0" + signature_magic_number.to_s(16)
+       actual = "0x0" + readback.to_s(16)
+       raise InvalidStructure, "Expected signature #{expected}, but read #{actual}"
+     end
+   end
+
+   def skip_ahead_n(io, n)
+     pos_before = io.tell
+     io.seek(io.tell + n, IO::SEEK_SET)
+     pos_after = io.tell
+     delta = pos_after - pos_before
+     unless delta == n
+       raise ReadError, "Expected to seek #{n} bytes ahead, but could \
+                         only seek #{delta} bytes ahead"
+     end
+     nil
+   end
+
+   def read_n(io, n_bytes)
+     io.read(n_bytes).tap do |d|
+       raise ReadError, "Expected to read #{n_bytes} bytes, but the IO was at the end" if d.nil?
+       unless d.bytesize == n_bytes
+         raise ReadError, "Expected to read #{n_bytes} bytes, \
+                           read #{d.bytesize}"
+       end
+     end
+   end
+
+   def read_2b(io)
+     read_n(io, 2).unpack(C_UINT2).shift
+   end
+
+   def read_4b(io)
+     read_n(io, 4).unpack(C_UINT4).shift
+   end
+
+   def read_8b(io)
+     read_n(io, 8).unpack(C_UINT8).shift
+   end
+
+   # Note: Rubocop reports that the Assignment Branch Condition size
+   # for read_cdir_entry is too high. [45.66/15]
+   def read_cdir_entry(io)
+     assert_signature(io, 0x02014b50)
+     ZipEntry.new.tap do |e|
+       e.made_by = read_2b(io)
+       e.version_needed_to_extract = read_2b(io)
+       e.gp_flags = read_2b(io)
+       e.storage_mode = read_2b(io)
+       e.dos_time = read_2b(io)
+       e.dos_date = read_2b(io)
+       e.crc32 = read_4b(io)
+       e.compressed_size = read_4b(io)
+       e.uncompressed_size = read_4b(io)
+       filename_size = read_2b(io)
+       extra_size = read_2b(io)
+       comment_len = read_2b(io)
+       e.disk_number_start = read_2b(io)
+       e.internal_attrs = read_2b(io)
+       e.external_attrs = read_4b(io)
+       e.local_file_header_offset = read_4b(io)
+       e.filename = read_n(io, filename_size)
+
+       # Extra fields
+       extras = read_n(io, extra_size)
+       # Comment
+       e.comment = read_n(io, comment_len)
+
+       # Parse out the extra fields
+       extra_table = parse_out_extra_fields(extras)
+
+       # ...of which we really only need the Zip64 extra
+       if (zip64_extra_contents = extra_table[1])
+         # If the Zip64 extra is present, we let it override all
+         # the values fetched from the conventional header
+         zip64_extra = StringIO.new(zip64_extra_contents)
+         log do
+           "Will read Zip64 extra data for %<filename>s, %<size>d bytes" %
+             {filename: e.filename, size: zip64_extra.size}
+         end
+         # Now here be dragons. The APPNOTE specifies that
+         #
+         # > The order of the fields in the ZIP64 extended
+         # > information record is fixed, but the fields will
+         # > only appear if the corresponding Local or Central
+         # > directory record field is set to 0xFFFF or 0xFFFFFFFF.
+         #
+         # It means that before we read this stuff we need to check if the previously-read
+         # values are at overflow, and only _then_ proceed to read them. Bah.
+         e.uncompressed_size = read_8b(zip64_extra) if e.uncompressed_size == 0xFFFFFFFF
+         e.compressed_size = read_8b(zip64_extra) if e.compressed_size == 0xFFFFFFFF
+         e.local_file_header_offset = read_8b(zip64_extra) if e.local_file_header_offset == 0xFFFFFFFF
+         # Disk number comes last and we can skip it anyway, since we do
+         # not support multi-disk archives
+       end
+     end
+   end
+
+   def get_eocd_offset(file_io, zip_file_size)
+     # Start reading from the _comment_ of the zip file (from the very end).
+     # The maximum size of the comment is 0xFFFF (what fits in 2 bytes)
+     implied_position_of_eocd_record = zip_file_size - MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE
+     implied_position_of_eocd_record = 0 if implied_position_of_eocd_record < 0
+
+     # Use a soft seek (we might not be able to get as far behind in the IO as we want)
+     # and a soft read (we might not be able to read as many bytes as we want)
+     file_io.seek(implied_position_of_eocd_record, IO::SEEK_SET)
+     str_containing_eocd_record = file_io.read(MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE)
+     eocd_idx_in_buf = locate_eocd_signature(str_containing_eocd_record)
+
+     raise MissingEOCD unless eocd_idx_in_buf
+
+     eocd_offset = implied_position_of_eocd_record + eocd_idx_in_buf
+     log do
+       "Found EOCD signature at offset %<offset>d" % {offset: eocd_offset}
+     end
+
+     eocd_offset
+   end
+
+   def all_indices_of_substr_in_str(of_substring, in_string)
+     last_i = 0
+     found_at_indices = []
+     while (last_i = in_string.index(of_substring, last_i))
+       found_at_indices << last_i
+       last_i += of_substring.bytesize
+     end
+     found_at_indices
+   end
+
+   # We have to scan the maximum possible number
+   # of bytes that the EOCD can theoretically occupy including the comment after it,
+   # and we have to find a combination of:
+   # [EOCD signature, <some ZIP metadata>, comment byte size, comment of that size]
+   # at the end. To do so, we first find all indices of the signature in the trailer
+   # string, and then check whether the bytestring starting at the signature and
+   # ending at the end of string satisfies that given pattern.
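+   # For example, in an archive ending with a 5-byte comment "Hello", the trailer
+   # ends with [0x06054b50][16 bytes of other EOCD fields]["\x05\x00"]["Hello"]:
+   # the candidate record found at the signature is 22 + 5 bytes long and its
+   # comment size field reads 5, so that signature index is accepted.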
+   def locate_eocd_signature(in_str)
+     eocd_signature = 0x06054b50
+     eocd_signature_str = [eocd_signature].pack("V")
+     unpack_pattern = "VvvvvVVv"
+     minimum_record_size = 22
+     str_size = in_str.bytesize
+     indices = all_indices_of_substr_in_str(eocd_signature_str, in_str)
+     indices.each do |check_at|
+       maybe_record = in_str[check_at..str_size]
+       # If the record is smaller than the minimum - we will never recover anything
+       break if maybe_record.bytesize < minimum_record_size
+       # Now we check if the record ends with the combination
+       # of the comment size and an arbitrary byte string of that size.
+       # If it does - we found our match
+       *_unused, comment_size = maybe_record.unpack(unpack_pattern)
+       if (maybe_record.bytesize - minimum_record_size) == comment_size
+         return check_at # Found the EOCD marker location
+       end
+     end
+     # If we haven't caught anything, return nil deliberately instead of returning the last statement
+     nil
+   end
+
+   # Find the Zip64 EOCD locator segment offset. Do this by seeking backwards from the
+   # EOCD record in the archive by fixed offsets.
+   # Note: Rubocop reports that the Assignment Branch Condition size
+   # for get_zip64_eocd_location is too high. [15.17/15]
+   def get_zip64_eocd_location(file_io, eocd_offset)
+     zip64_eocd_loc_offset = eocd_offset
+     zip64_eocd_loc_offset -= 4 # The signature
+     zip64_eocd_loc_offset -= 4 # Which disk has the Zip64 end of central directory record
+     zip64_eocd_loc_offset -= 8 # Offset of the zip64 central directory record
+     zip64_eocd_loc_offset -= 4 # Total number of disks
+
+     log do
+       "Will look for the Zip64 EOCD locator signature at offset %<offset>d" %
+         {offset: zip64_eocd_loc_offset}
+     end
+
+     # If the offset is negative there is certainly no Zip64 EOCD locator here
+     return unless zip64_eocd_loc_offset >= 0
+
+     file_io.seek(zip64_eocd_loc_offset, IO::SEEK_SET)
+     assert_signature(file_io, 0x07064b50)
+
+     log do
+       "Found Zip64 EOCD locator at offset %<offset>d" % {offset: zip64_eocd_loc_offset}
+     end
+
+     disk_num = read_4b(file_io) # number of the disk
+     raise UnsupportedFeature, "The archive spans multiple disks" if disk_num != 0
+     read_8b(file_io) # offset of the Zip64 EOCD record - the return value of this method
+   rescue ReadError
+     nil
+   end
+
+   # Note: Rubocop reports that the Assignment Branch Condition size
+   # for num_files_and_central_directory_offset_zip64 is too high. [21.12/15]
+   def num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
+     seek(io, zip64_end_of_cdir_location)
+
+     assert_signature(io, 0x06064b50)
+
+     zip64_eocdr_size = read_8b(io)
+     zip64_eocdr = read_n(io, zip64_eocdr_size) # Reading in bulk is cheaper
+     zip64_eocdr = StringIO.new(zip64_eocdr)
+     skip_ahead_2(zip64_eocdr) # version made by
+     skip_ahead_2(zip64_eocdr) # version needed to extract
+
+     disk_n = read_4b(zip64_eocdr) # number of this disk
+     disk_n_with_eocdr = read_4b(zip64_eocdr) # number of the disk with the EOCDR
+     raise UnsupportedFeature, "The archive spans multiple disks" if disk_n != disk_n_with_eocdr
+
+     num_files_this_disk = read_8b(zip64_eocdr) # number of files on this disk
+     num_files_total = read_8b(zip64_eocdr) # files total in the central directory
+
+     raise UnsupportedFeature, "The archive spans multiple disks" if num_files_this_disk != num_files_total
+
+     log do
+       "Zip64 EOCD record states there are %<amount>d files in the archive" %
+         {amount: num_files_total}
+     end
+
+     central_dir_size = read_8b(zip64_eocdr) # Size of the central directory
+     central_dir_offset = read_8b(zip64_eocdr) # Where the central directory starts
+
+     [num_files_total, central_dir_offset, central_dir_size]
+   end
+
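+   # `pack`/`unpack` directives for the fixed-width integers used in ZIP records:
+   # "v" is a 16-bit little-endian unsigned integer, "V" a 32-bit one, and "Q<"
+   # a 64-bit one. For example, [0x04034b50].pack("V") produces the familiar
+   # "PK\x03\x04" local file header signature.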
+   C_UINT4 = "V"
+   C_UINT2 = "v"
+   C_UINT8 = "Q<"
+
+   # To prevent too many tiny reads, read the maximum possible size of end of
+   # central directory record upfront (all the fixed fields - 22 bytes in total -
+   # plus at most 0xFFFF bytes of the archive comment)
+   MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE = 4 + # Offset of the start of central directory
+     4 + # Size of the central directory
+     2 + # Number of files in the cdir
+     4 + # End-of-central-directory signature
+     2 + # Number of this disk
+     2 + # Number of disk with the start of cdir
+     2 + # Number of files in the cdir of this disk
+     2 + # The comment size
+     0xFFFF # Maximum comment size
+
+   # To prevent too many tiny reads, read the maximum possible size of the local file header upfront.
+   # The maximum size is all the usual items, plus the maximum size
+   # of the filename (0xFFFF bytes) and the maximum size of the extras (0xFFFF bytes)
+   MAX_LOCAL_HEADER_SIZE = 4 + # signature
+     2 + # Version needed to extract
+     2 + # gp flags
+     2 + # storage mode
+     2 + # dos time
+     2 + # dos date
+     4 + # CRC32
+     4 + # Comp size
+     4 + # Uncomp size
+     2 + # Filename size
+     2 + # Extra fields size
+     0xFFFF + # Maximum filename size
+     0xFFFF # Maximum extra fields size
+
+   SIZE_OF_USABLE_EOCD_RECORD = 4 + # Signature
+     2 + # Number of this disk
+     2 + # Number of the disk with the EOCD record
+     2 + # Number of entries in the central directory of this disk
+     2 + # Number of entries in the central directory total
+     4 + # Size of the central directory
+     4 # Start of the central directory offset
+
+   def num_files_and_central_directory_offset(file_io, eocd_offset)
+     seek(file_io, eocd_offset)
+
+     # The size of the EOCD record is known upfront, so use a strict read
+     eocd_record_str = read_n(file_io, SIZE_OF_USABLE_EOCD_RECORD)
+     io = StringIO.new(eocd_record_str)
+
+     assert_signature(io, 0x06054b50)
+     skip_ahead_2(io) # number_of_this_disk
+     skip_ahead_2(io) # number of the disk with the EOCD record
+     skip_ahead_2(io) # number of entries in the central directory of this disk
+     num_files = read_2b(io) # number of entries in the central directory total
+     cdir_size = read_4b(io) # size of the central directory
+     cdir_offset = read_4b(io) # start of central directory offset
+     [num_files, cdir_offset, cdir_size]
+   end
+
+   private_constant :C_UINT4, :C_UINT2, :C_UINT8, :MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE,
+     :MAX_LOCAL_HEADER_SIZE, :SIZE_OF_USABLE_EOCD_RECORD
+
+   # Provided as a stub to be overridden in a subclass if you need logging. Reports progress
+   # during various stages of reading. The log message is the return value
+   # of the block given to the method (so the log messages are lazy-evaluated).
+   def log
+     # The most minimal implementation for the method is just this:
+     # $stderr.puts(yield)
+   end
+
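+   # Extra fields are packed as a sequence of [2-byte field ID, 2-byte payload
+   # size, payload bytes] records. For example, a single Zip64 extra (ID 0x0001)
+   # carrying two 8-byte sizes parses as id 1, size 16 and a 16-byte payload,
+   # producing {1 => <16 payload bytes>}.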
+   def parse_out_extra_fields(extra_fields_str)
+     extra_table = {}
+     extras_buf = StringIO.new(extra_fields_str)
+     until extras_buf.eof?
+       extra_id = read_2b(extras_buf)
+       extra_size = read_2b(extras_buf)
+       extra_contents = read_n(extras_buf, extra_size)
+       extra_table[extra_id] = extra_contents
+     end
+     extra_table
+   end
+ end