zip_kit 6.0.0
- checksums.yaml +7 -0
- data/.codeclimate.yml +7 -0
- data/.document +5 -0
- data/.github/workflows/ci.yml +29 -0
- data/.gitignore +61 -0
- data/.rspec +1 -0
- data/.standard.yml +8 -0
- data/.yardopts +1 -0
- data/CHANGELOG.md +255 -0
- data/CODE_OF_CONDUCT.md +46 -0
- data/CONTRIBUTING.md +153 -0
- data/Gemfile +4 -0
- data/IMPLEMENTATION_DETAILS.md +97 -0
- data/LICENSE.txt +20 -0
- data/README.md +234 -0
- data/Rakefile +21 -0
- data/bench/buffered_crc32_bench.rb +109 -0
- data/examples/archive_size_estimate.rb +15 -0
- data/examples/config.ru +7 -0
- data/examples/deferred_write.rb +58 -0
- data/examples/parallel_compression_with_block_deflate.rb +86 -0
- data/examples/rack_application.rb +63 -0
- data/examples/s3_upload.rb +23 -0
- data/lib/zip_kit/block_deflate.rb +130 -0
- data/lib/zip_kit/block_write.rb +47 -0
- data/lib/zip_kit/file_reader/inflating_reader.rb +36 -0
- data/lib/zip_kit/file_reader/stored_reader.rb +35 -0
- data/lib/zip_kit/file_reader.rb +740 -0
- data/lib/zip_kit/null_writer.rb +12 -0
- data/lib/zip_kit/output_enumerator.rb +150 -0
- data/lib/zip_kit/path_set.rb +163 -0
- data/lib/zip_kit/rack_chunked_body.rb +32 -0
- data/lib/zip_kit/rack_tempfile_body.rb +61 -0
- data/lib/zip_kit/rails_streaming.rb +37 -0
- data/lib/zip_kit/remote_io.rb +114 -0
- data/lib/zip_kit/remote_uncap.rb +22 -0
- data/lib/zip_kit/size_estimator.rb +84 -0
- data/lib/zip_kit/stream_crc32.rb +60 -0
- data/lib/zip_kit/streamer/deflated_writer.rb +45 -0
- data/lib/zip_kit/streamer/entry.rb +37 -0
- data/lib/zip_kit/streamer/filler.rb +9 -0
- data/lib/zip_kit/streamer/heuristic.rb +68 -0
- data/lib/zip_kit/streamer/stored_writer.rb +39 -0
- data/lib/zip_kit/streamer/writable.rb +36 -0
- data/lib/zip_kit/streamer.rb +614 -0
- data/lib/zip_kit/uniquify_filename.rb +39 -0
- data/lib/zip_kit/version.rb +5 -0
- data/lib/zip_kit/write_and_tell.rb +40 -0
- data/lib/zip_kit/write_buffer.rb +71 -0
- data/lib/zip_kit/write_shovel.rb +22 -0
- data/lib/zip_kit/zip_writer.rb +436 -0
- data/lib/zip_kit.rb +24 -0
- data/zip_kit.gemspec +41 -0
- metadata +335 -0
data/lib/zip_kit/file_reader.rb
@@ -0,0 +1,740 @@
# frozen_string_literal: true

require "stringio"

# A very barebones ZIP file reader. Is made for maximum interoperability, but at the same
# time we attempt to keep it somewhat concise.
#
# ## REALLY CRAZY IMPORTANT STUFF: SECURITY IMPLICATIONS
#
# Please **BEWARE** - using this is a security risk if you are reading files that have been
# supplied by users. This implementation has _not_ been formally verified for correctness. As
# ZIP files contain relative offsets in lots of places it might be possible for a maliciously
# crafted ZIP file to put the decode procedure in an endless loop, make it attempt huge reads
# from the input file and so on. Additionally, the reader module for deflated data has
# no support for ZIP bomb protection. So either limit the `FileReader` usage to the files you
# trust, or triple-check all the inputs upfront. Patches to make this reader more secure
# are welcome of course.
#
# ## Usage
#
#     File.open('zipfile.zip', 'rb') do |f|
#       entries = ZipKit::FileReader.read_zip_structure(io: f)
#       entries.each do |e|
#         File.open(e.filename, 'wb') do |extracted_file|
#           ex = e.extractor_from(f)
#           extracted_file << ex.extract(1024 * 1024) until ex.eof?
#         end
#       end
#     end
#
# ## Supported features
#
# * Deflate and stored storage modes
# * Zip64 (extra fields and offsets)
# * Data descriptors
#
# ## Unsupported features
#
# * Archives split over multiple disks/files
# * Any ZIP encryption
# * EFS language flag and InfoZIP filename extra field
# * CRC32 checksums are _not_ verified
#
# ## Mode of operation
#
# By default, `FileReader` _ignores_ the data in local file headers (as it is
# often unreliable). It reads the ZIP file "from the tail", finds the
# end-of-central-directory signatures, then reads the central directory entries,
# reconstitutes the entries with their filenames, attributes and so on, and
# sets these entries up with the absolute _offsets_ into the source file/IO object.
# These offsets can then be used to extract the actual compressed data of
# the files and to expand it.
#
# ## Recovering damaged or incomplete ZIP files
#
# If the ZIP file you are trying to read does not contain the central directory
# records `read_zip_structure` will not work, since it starts the read process
# from the EOCD marker at the end of the central directory and then crawls
# "back" in the IO to figure out the rest. You can explicitly apply a fallback
# for reading the archive "straight ahead" instead using `read_zip_straight_ahead`
# - the method will instead scan your IO from the very start, skipping over
# the actual entry data. This is less efficient than central directory parsing since
# it involves a much larger number of reads (1 read from the IO per entry in the ZIP).
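#
# A minimal sketch of that fallback (assuming a truncated archive at
# `broken.zip` whose entries do not use data descriptors):
#
#     recovered_entries = File.open('broken.zip', 'rb') do |f|
#       ZipKit::FileReader.read_zip_straight_ahead(io: f)
#     end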

class ZipKit::FileReader
  require_relative "file_reader/stored_reader"
  require_relative "file_reader/inflating_reader"

  ReadError = Class.new(StandardError)
  UnsupportedFeature = Class.new(StandardError)
  InvalidStructure = Class.new(ReadError)
  LocalHeaderPending = Class.new(StandardError) do
    def message
      "The compressed data offset is not available (local header has not been read)"
    end
  end
  MissingEOCD = Class.new(StandardError) do
    def message
      "Could not find the EOCD signature in the buffer - maybe a malformed ZIP file"
    end
  end

  private_constant :StoredReader, :InflatingReader

  # Represents a file within the ZIP archive being read. This is different from
  # the Entry object used in Streamer for ZIP writing, since during writing more
  # data can be kept in memory for immediate use.
  class ZipEntry
    # @return [Fixnum] bit-packed version signature of the program that made the archive
    attr_accessor :made_by

    # @return [Fixnum] ZIP version support needed to extract this file
    attr_accessor :version_needed_to_extract

    # @return [Fixnum] bit-packed general purpose flags
    attr_accessor :gp_flags

    # @return [Fixnum] Storage mode (0 for stored, 8 for deflate)
    attr_accessor :storage_mode

    # @return [Fixnum] the bit-packed DOS time
    attr_accessor :dos_time

    # @return [Fixnum] the bit-packed DOS date
    attr_accessor :dos_date

    # @return [Fixnum] the CRC32 checksum of this file
    attr_accessor :crc32

    # @return [Fixnum] size of compressed file data in the ZIP
    attr_accessor :compressed_size

    # @return [Fixnum] size of the file once uncompressed
    attr_accessor :uncompressed_size

    # @return [String] the filename
    attr_accessor :filename

    # @return [Fixnum] disk number where this file starts
    attr_accessor :disk_number_start

    # @return [Fixnum] internal attributes of the file
    attr_accessor :internal_attrs

    # @return [Fixnum] external attributes of the file
    attr_accessor :external_attrs

    # @return [Fixnum] at what offset the local file header starts
    # in your original IO object
    attr_accessor :local_file_header_offset

    # @return [String] the file comment
    attr_accessor :comment

    # Returns a reader for the actual compressed data of the entry.
    #
    #     reader = entry.extractor_from(source_file)
    #     outfile << reader.extract(512 * 1024) until reader.eof?
    #
    # @return [#extract(n_bytes), #eof?] the reader for the data
    def extractor_from(from_io)
      from_io.seek(compressed_data_offset, IO::SEEK_SET)
      case storage_mode
      when 8
        InflatingReader.new(from_io, compressed_size)
      when 0
        StoredReader.new(from_io, compressed_size)
      else
        raise UnsupportedFeature, "Unsupported storage mode for reading - %<storage_mode>d" %
          {storage_mode: storage_mode}
      end
    end

    # @return [Fixnum] at what offset you should start reading
    # for the compressed data in your original IO object
    def compressed_data_offset
      @compressed_data_offset || raise(LocalHeaderPending)
    end

    # Tells whether the compressed data offset is already known for this entry
    # @return [Boolean]
    def known_offset?
      !@compressed_data_offset.nil?
    end

    # Tells whether the entry uses a data descriptor (this is defined
    # by bit 3 in the GP flags).
    def uses_data_descriptor?
      (gp_flags & 0x0008) == 0x0008
    end
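
    # As an illustration (not part of the library API): gp_flags of 0x0808
    # (the UTF-8/EFS flag plus bit 3) has the data descriptor bit set, while
    # 0x0800 alone does not:
    #
    #     0x0808 & 0x0008 == 0x0008 #=> true
    #     0x0800 & 0x0008 == 0x0008 #=> false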

    # Sets the offset at which the compressed data for this file starts in the ZIP.
    # By default, the value will be set by the Reader for you. If you use delayed
    # reading, you need to set it by using the `get_compressed_data_offset` on the Reader:
    #
    #     entry.compressed_data_offset = reader.get_compressed_data_offset(io: file,
    #       local_file_header_offset: entry.local_file_header_offset)
    def compressed_data_offset=(offset)
      @compressed_data_offset = offset.to_i
    end
  end

  # Parse an IO handle to a ZIP archive into an array of Entry objects.
  #
  # @param io[#tell, #seek, #read, #size] an IO-ish object
  # @param read_local_headers[Boolean] whether the local headers must be read upfront. When reading
  #   a locally available ZIP file this option will not have much use since the small reads from
  #   the file handle are not going to be that important. However, if you are using remote reads
  #   to decipher a ZIP file located on an HTTP server, the operation _must_ perform an HTTP
  #   request for _each entry in the ZIP file_ to determine where the actual file data starts.
  #   This, for a ZIP archive of 1000 files, will incur 1000 extra HTTP requests - which you might
  #   not want to perform upfront, or - at least - not want to perform _at once_. When the option is
  #   set to `false`, you will be getting instances of `LazyEntry` instead of `Entry`. Those objects
  #   will raise an exception when you attempt to access their compressed data offset in the ZIP
  #   (since the reads have not been performed yet). As a rule, this option can be left in its
  #   default setting (`true`) unless you want to _only_ read the central directory, or you need
  #   to limit the number of HTTP requests.
  # @return [Array<ZipEntry>] an array of entries within the ZIP being parsed
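  #
  # For instance, to defer the per-entry reads when the ZIP lives on a remote
  # server (a sketch - `remote_io` stands in for any object responding to
  # #tell, #seek, #read and #size):
  #
  #     reader = ZipKit::FileReader.new
  #     entries = reader.read_zip_structure(io: remote_io, read_local_headers: false)
  #     entry = entries.first
  #     entry.compressed_data_offset = reader.get_compressed_data_offset(
  #       io: remote_io, local_file_header_offset: entry.local_file_header_offset)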
  def read_zip_structure(io:, read_local_headers: true)
    zip_file_size = io.size
    eocd_offset = get_eocd_offset(io, zip_file_size)

    zip64_end_of_cdir_location = get_zip64_eocd_location(io, eocd_offset)
    num_files, cdir_location, _cdir_size =
      if zip64_end_of_cdir_location
        num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
      else
        num_files_and_central_directory_offset(io, eocd_offset)
      end

    log do
      "Located the central directory start at %<location>d" %
        {location: cdir_location}
    end
    seek(io, cdir_location)

    # Read the entire central directory AND anything behind it, in one fell swoop.
    # Strictly speaking, we should be able to read `cdir_size` bytes and not a byte more.
    # However, we know for a fact that in some of our files the central directory size
    # is in fact misreported. `zipinfo` then says:
    #
    #   warning [ktsglobal-2b03bc.zip]: 1 extra byte at beginning or within zipfile
    #     (attempting to process anyway)
    #   error [ktsglobal-2b03bc.zip]: reported length of central directory is
    #     -1 bytes too long (Atari STZip zipfile? J.H.Holm ZIPSPLIT 1.1
    #     zipfile?). Compensating...
    #
    # Since the EOCD is not that big anyway, we just read the entire "tail" of the ZIP ignoring
    # the central directory size altogether.
    central_directory_str = io.read # and not read_n(io, cdir_size), see above
    central_directory_io = StringIO.new(central_directory_str)
    log do
      "Read %<byte_size>d bytes with central directory + EOCD record and locator" %
        {byte_size: central_directory_str.bytesize}
    end

    entries = (0...num_files).map { |entry_n|
      offset_location = cdir_location + central_directory_io.tell
      log do
        "Reading the central directory entry %<entry_n>d starting at offset %<offset>d" %
          {entry_n: entry_n, offset: offset_location}
      end
      read_cdir_entry(central_directory_io)
    }

    read_local_headers(entries, io) if read_local_headers

    entries
  end

  # Sometimes you might encounter truncated ZIP files, which do not contain
  # any central directory whatsoever - or where the central directory is
  # truncated. In that case, employing the technique of reading the ZIP
  # "from the end" is impossible, and the only recourse is reading each
  # local file header in succession. If the entries in such a ZIP use data
  # descriptors, you would need to scan after the entry until you encounter
  # the data descriptor signature - and that might be unreliable at best.
  # Therefore, this reading technique does not support data descriptors.
  # It can however recover the entries you still can read if these entries
  # contain all the necessary information about the contained file.
  #
  # @param io[#tell, #read, #seek] the IO-ish object to read the local file
  #   headers from
  # @return [Array<ZipEntry>] an array of entries that could be
  #   recovered before hitting EOF
  def read_zip_straight_ahead(io:)
    entries = []
    # Initialize before the loop so that the rescue clause below can still
    # reference the offset (variables first assigned inside the block would
    # not be visible there)
    cur_offset = io.tell
    loop do
      cur_offset = io.tell
      entry = read_local_file_header(io: io)
      if entry.uses_data_descriptor?
        raise UnsupportedFeature, "The local file header at #{cur_offset} uses " \
          "a data descriptor and the start of next entry cannot be found"
      end
      entries << entry
      next_local_header_offset = entry.compressed_data_offset + entry.compressed_size
      log do
        "Recovered a local file header at offset %<cur_offset>d, seeking to the next at %<header_offset>d" %
          {cur_offset: cur_offset, header_offset: next_local_header_offset}
      end
      seek(io, next_local_header_offset) # Seek to the next entry, and raise if seek is impossible
    end
    entries
  rescue ReadError, RangeError # RangeError is raised if offset exceeds int32/int64 range
    log do
      "Got a read/seek error after reaching %<cur_offset>d, no more entries can be recovered" %
        {cur_offset: cur_offset}
    end
    entries
  end

  # Parse the local header entry and get the offset in the IO at which the
  # actual compressed data of the file starts within the ZIP.
  # The method will eager-read the entire local header for the file
  # (the maximum size the local header may use), starting at the given offset,
  # and will then compute its size. That size plus the local header offset
  # given will be the compressed data offset of the entry (read starting at
  # this offset to get the data).
  #
  # @param io[#read] an IO-ish object the ZIP file can be read from
  # @return [ZipEntry] the parsed local header entry, with its
  #   compressed data offset set
  def read_local_file_header(io:)
    local_file_header_offset = io.tell

    # Reading in bulk is cheaper - grab the maximum length of the local header,
    # including any headroom for extra fields etc.
    local_file_header_str_plus_headroom = io.read(MAX_LOCAL_HEADER_SIZE)
    raise ReadError if local_file_header_str_plus_headroom.nil? # reached EOF

    io_starting_at_local_header = StringIO.new(local_file_header_str_plus_headroom)

    assert_signature(io_starting_at_local_header, 0x04034b50)
    e = ZipEntry.new
    e.version_needed_to_extract = read_2b(io_starting_at_local_header) # Version needed to extract
    e.gp_flags = read_2b(io_starting_at_local_header) # gp flags
    e.storage_mode = read_2b(io_starting_at_local_header) # storage mode
    e.dos_time = read_2b(io_starting_at_local_header) # dos time
    e.dos_date = read_2b(io_starting_at_local_header) # dos date
    e.crc32 = read_4b(io_starting_at_local_header) # CRC32
    e.compressed_size = read_4b(io_starting_at_local_header) # Comp size
    e.uncompressed_size = read_4b(io_starting_at_local_header) # Uncomp size

    filename_size = read_2b(io_starting_at_local_header)
    extra_size = read_2b(io_starting_at_local_header)
    e.filename = read_n(io_starting_at_local_header, filename_size)
    extra_fields_str = read_n(io_starting_at_local_header, extra_size)

    # Parse out the extra fields
    extra_table = parse_out_extra_fields(extra_fields_str)

    # ...of which we really only need the Zip64 extra
    if (zip64_extra_contents = extra_table[1])
      # If the Zip64 extra is present, we let it override all
      # the values fetched from the conventional header
      zip64_extra = StringIO.new(zip64_extra_contents)
      log do
        "Will read Zip64 extra data from local header field for %<filename>s, %<size>d bytes" %
          {filename: e.filename, size: zip64_extra.size}
      end
      # Now here be dragons. The APPNOTE specifies that
      #
      # > The order of the fields in the ZIP64 extended
      # > information record is fixed, but the fields will
      # > only appear if the corresponding Local or Central
      # > directory record field is set to 0xFFFF or 0xFFFFFFFF.
      #
      # It means that before we read this stuff we need to check if the previously-read
      # values are at overflow, and only _then_ proceed to read them. Bah.
      e.uncompressed_size = read_8b(zip64_extra) if e.uncompressed_size == 0xFFFFFFFF
      e.compressed_size = read_8b(zip64_extra) if e.compressed_size == 0xFFFFFFFF
    end

    offset = local_file_header_offset + io_starting_at_local_header.tell
    e.compressed_data_offset = offset

    e
  end

  # Get the offset in the IO at which the actual compressed data of the file
  # starts within the ZIP. The method will eager-read the entire local header
  # for the file (the maximum size the local header may use), starting at the
  # given offset, and will then compute its size. That size plus the local
  # header offset given will be the compressed data offset of the entry
  # (read starting at this offset to get the data).
  #
  # @param io[#seek, #read] an IO-ish object the ZIP file can be read from
  # @param local_file_header_offset[Fixnum] absolute offset (0-based) where the
  #   local file header is supposed to begin
  # @return [Fixnum] absolute offset (0-based) of where the compressed data
  #   begins for this file within the ZIP
  def get_compressed_data_offset(io:, local_file_header_offset:)
    seek(io, local_file_header_offset)
    entry_recovered_from_local_file_header = read_local_file_header(io: io)
    entry_recovered_from_local_file_header.compressed_data_offset
  end

  # Parse an IO handle to a ZIP archive into an array of Entry objects, reading from the end
  # of the IO object.
  #
  # @see #read_zip_structure
  # @param options[Hash] any options the instance method of the same name accepts
  # @return [Array<ZipEntry>] an array of entries within the ZIP being parsed
  def self.read_zip_structure(**options)
    new.read_zip_structure(**options)
  end

  # Parse an IO handle to a ZIP archive into an array of Entry objects, reading from the start of
  # the file and parsing local file headers one-by-one
  #
  # @see #read_zip_straight_ahead
  # @param options[Hash] any options the instance method of the same name accepts
  # @return [Array<ZipEntry>] an array of entries within the ZIP being parsed
  def self.read_zip_straight_ahead(**options)
    new.read_zip_straight_ahead(**options)
  end

  private

  def read_local_headers(entries, io)
    entries.each_with_index do |entry, i|
      log do
        "Reading the local header for entry %<index>d at offset %<offset>d" %
          {index: i, offset: entry.local_file_header_offset}
      end
      off = get_compressed_data_offset(io: io,
        local_file_header_offset: entry.local_file_header_offset)
      entry.compressed_data_offset = off
    end
  end

  def skip_ahead_2(io)
    skip_ahead_n(io, 2)
  end

  def skip_ahead_4(io)
    skip_ahead_n(io, 4)
  end

  def skip_ahead_8(io)
    skip_ahead_n(io, 8)
  end

  def seek(io, absolute_pos)
    io.seek(absolute_pos, IO::SEEK_SET)
    unless absolute_pos == io.tell
      raise ReadError,
        "Expected to seek to #{absolute_pos} but only got to #{io.tell}"
    end
    nil
  end

  def assert_signature(io, signature_magic_number)
    readback = read_4b(io)
    if readback != signature_magic_number
      expected = "0x0" + signature_magic_number.to_s(16)
      actual = "0x0" + readback.to_s(16)
      raise InvalidStructure, "Expected signature #{expected}, but read #{actual}"
    end
  end

  def skip_ahead_n(io, n)
    pos_before = io.tell
    io.seek(io.tell + n, IO::SEEK_SET)
    pos_after = io.tell
    delta = pos_after - pos_before
    unless delta == n
      raise ReadError, "Expected to seek #{n} bytes ahead, but could " \
        "only seek #{delta} bytes ahead"
    end
    nil
  end

  def read_n(io, n_bytes)
    io.read(n_bytes).tap do |d|
      raise ReadError, "Expected to read #{n_bytes} bytes, but the IO was at the end" if d.nil?
      unless d.bytesize == n_bytes
        raise ReadError, "Expected to read #{n_bytes} bytes, read #{d.bytesize}"
      end
    end
  end

  def read_2b(io)
    read_n(io, 2).unpack(C_UINT2).shift
  end

  def read_4b(io)
    read_n(io, 4).unpack(C_UINT4).shift
  end

  def read_8b(io)
    read_n(io, 8).unpack(C_UINT8).shift
  end

  # Note: the linter flags this method as too complex
  # (Assignment Branch Condition size 45.66/15)
  def read_cdir_entry(io)
    assert_signature(io, 0x02014b50)
    ZipEntry.new.tap do |e|
      e.made_by = read_2b(io)
      e.version_needed_to_extract = read_2b(io)
      e.gp_flags = read_2b(io)
      e.storage_mode = read_2b(io)
      e.dos_time = read_2b(io)
      e.dos_date = read_2b(io)
      e.crc32 = read_4b(io)
      e.compressed_size = read_4b(io)
      e.uncompressed_size = read_4b(io)
      filename_size = read_2b(io)
      extra_size = read_2b(io)
      comment_len = read_2b(io)
      e.disk_number_start = read_2b(io)
      e.internal_attrs = read_2b(io)
      e.external_attrs = read_4b(io)
      e.local_file_header_offset = read_4b(io)
      e.filename = read_n(io, filename_size)

      # Extra fields
      extras = read_n(io, extra_size)
      # Comment
      e.comment = read_n(io, comment_len)

      # Parse out the extra fields
      extra_table = parse_out_extra_fields(extras)

      # ...of which we really only need the Zip64 extra
      if (zip64_extra_contents = extra_table[1])
        # If the Zip64 extra is present, we let it override all
        # the values fetched from the conventional header
        zip64_extra = StringIO.new(zip64_extra_contents)
        log do
          "Will read Zip64 extra data for %<filename>s, %<size>d bytes" %
            {filename: e.filename, size: zip64_extra.size}
        end
        # Now here be dragons. The APPNOTE specifies that
        #
        # > The order of the fields in the ZIP64 extended
        # > information record is fixed, but the fields will
        # > only appear if the corresponding Local or Central
        # > directory record field is set to 0xFFFF or 0xFFFFFFFF.
        #
        # It means that before we read this stuff we need to check if the previously-read
        # values are at overflow, and only _then_ proceed to read them. Bah.
        e.uncompressed_size = read_8b(zip64_extra) if e.uncompressed_size == 0xFFFFFFFF
        e.compressed_size = read_8b(zip64_extra) if e.compressed_size == 0xFFFFFFFF
        e.local_file_header_offset = read_8b(zip64_extra) if e.local_file_header_offset == 0xFFFFFFFF
        # Disk number comes last and we can skip it anyway, since we do
        # not support multi-disk archives
      end
    end
  end

  def get_eocd_offset(file_io, zip_file_size)
    # Start reading from the _comment_ of the zip file (from the very end).
    # The maximum size of the comment is 0xFFFF (what fits in 2 bytes)
    implied_position_of_eocd_record = zip_file_size - MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE
    implied_position_of_eocd_record = 0 if implied_position_of_eocd_record < 0

    # Use a soft seek (we might not be able to get as far behind in the IO as we want)
    # and a soft read (we might not be able to read as many bytes as we want)
    file_io.seek(implied_position_of_eocd_record, IO::SEEK_SET)
    str_containing_eocd_record = file_io.read(MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE)
    eocd_idx_in_buf = locate_eocd_signature(str_containing_eocd_record)

    raise MissingEOCD unless eocd_idx_in_buf

    eocd_offset = implied_position_of_eocd_record + eocd_idx_in_buf
    log do
      "Found EOCD signature at offset %<offset>d" % {offset: eocd_offset}
    end

    eocd_offset
  end

  def all_indices_of_substr_in_str(of_substring, in_string)
    last_i = 0
    found_at_indices = []
    while (last_i = in_string.index(of_substring, last_i))
      found_at_indices << last_i
      last_i += of_substring.bytesize
    end
    found_at_indices
  end

  # We have to scan the maximum possible number
  # of bytes that the EOCD can theoretically occupy including the comment after it,
  # and we have to find a combination of:
  #   [EOCD signature, <some ZIP metadata>, comment byte size, comment of that size]
  # at the end. To do so, we first find all indices of the signature in the trailer
  # string, and then check whether the bytestring starting at the signature and
  # ending at the end of string satisfies that given pattern.
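  #
  # For instance, a trailer that ends with a bare EOCD record followed by the
  # two-byte comment "hi" passes the check: the unpacked comment size field
  # reads 2, and exactly 2 bytes remain after the 22 fixed record bytes.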
  def locate_eocd_signature(in_str)
    eocd_signature = 0x06054b50
    eocd_signature_str = [eocd_signature].pack("V")
    unpack_pattern = "VvvvvVVv"
    minimum_record_size = 22
    str_size = in_str.bytesize
    indices = all_indices_of_substr_in_str(eocd_signature_str, in_str)
    indices.each do |check_at|
      maybe_record = in_str[check_at..str_size]
      # If the record is smaller than the minimum - we will never recover anything
      break if maybe_record.bytesize < minimum_record_size
      # Now we check if the record ends with the combination
      # of the comment size and an arbitrary byte string of that size.
      # If it does - we found our match
      *_unused, comment_size = maybe_record.unpack(unpack_pattern)
      if (maybe_record.bytesize - minimum_record_size) == comment_size
        return check_at # Found the EOCD marker location
      end
    end
    # If we haven't caught anything, return nil deliberately instead of returning the last statement
    nil
  end

  # Find the Zip64 EOCD locator segment offset. Do this by seeking backwards from the
  # EOCD record in the archive by fixed offsets.
  # Note: the linter flags this method as too complex
  # (Assignment Branch Condition size 15.17/15)
  def get_zip64_eocd_location(file_io, eocd_offset)
    zip64_eocd_loc_offset = eocd_offset
    zip64_eocd_loc_offset -= 4 # The signature
    zip64_eocd_loc_offset -= 4 # Which disk has the Zip64 end of central directory record
    zip64_eocd_loc_offset -= 8 # Offset of the zip64 central directory record
    zip64_eocd_loc_offset -= 4 # Total number of disks

    log do
      "Will look for the Zip64 EOCD locator signature at offset %<offset>d" %
        {offset: zip64_eocd_loc_offset}
    end

    # If the offset is negative there is certainly no Zip64 EOCD locator here
    return unless zip64_eocd_loc_offset >= 0

    file_io.seek(zip64_eocd_loc_offset, IO::SEEK_SET)
    assert_signature(file_io, 0x07064b50)

    log do
      "Found Zip64 EOCD locator at offset %<offset>d" % {offset: zip64_eocd_loc_offset}
    end

    disk_num = read_4b(file_io) # number of the disk
    raise UnsupportedFeature, "The archive spans multiple disks" if disk_num != 0
    read_8b(file_io)
  rescue ReadError
    nil
  end

  # Note: the linter flags this method as too complex
  # (Assignment Branch Condition size 21.12/15)
  def num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
    seek(io, zip64_end_of_cdir_location)

    assert_signature(io, 0x06064b50)

    zip64_eocdr_size = read_8b(io)
    zip64_eocdr = read_n(io, zip64_eocdr_size) # Reading in bulk is cheaper
    zip64_eocdr = StringIO.new(zip64_eocdr)
    skip_ahead_2(zip64_eocdr) # version made by
    skip_ahead_2(zip64_eocdr) # version needed to extract

    disk_n = read_4b(zip64_eocdr) # number of this disk
    disk_n_with_eocdr = read_4b(zip64_eocdr) # number of the disk with the EOCDR
    raise UnsupportedFeature, "The archive spans multiple disks" if disk_n != disk_n_with_eocdr

    num_files_this_disk = read_8b(zip64_eocdr) # number of files on this disk
    num_files_total = read_8b(zip64_eocdr) # files total in the central directory

    raise UnsupportedFeature, "The archive spans multiple disks" if num_files_this_disk != num_files_total

    log do
      "Zip64 EOCD record states there are %<amount>d files in the archive" %
        {amount: num_files_total}
    end

    central_dir_size = read_8b(zip64_eocdr) # Size of the central directory
    central_dir_offset = read_8b(zip64_eocdr) # Where the central directory starts

    [num_files_total, central_dir_offset, central_dir_size]
  end

  C_UINT4 = "V"
  C_UINT2 = "v"
  C_UINT8 = "Q<"

  # To prevent too many tiny reads, read the maximum possible size of end of
  # central directory record upfront (all the fixed fields + at most 0xFFFF
  # bytes of the archive comment)
  MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE = 4 + # Offset of the start of central directory
    4 + # Size of the central directory
    2 + # Number of files in the cdir
    4 + # End-of-central-directory signature
    2 + # Number of this disk
    2 + # Number of disk with the start of cdir
    2 + # Number of files in the cdir of this disk
    2 + # The comment size
    0xFFFF # Maximum comment size
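  # (a worked sum, for reference: 22 fixed bytes + 65535 = 65557 bytes)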

  # To prevent too many tiny reads, read the maximum possible size of the local file header upfront.
  # The maximum size is all the usual items, plus the maximum size
  # of the filename (0xFFFF bytes) and the maximum size of the extras (0xFFFF bytes)
  MAX_LOCAL_HEADER_SIZE = 4 + # signature
    2 + # Version needed to extract
    2 + # gp flags
    2 + # storage mode
    2 + # dos time
    2 + # dos date
    4 + # CRC32
    4 + # Comp size
    4 + # Uncomp size
    2 + # Filename size
    2 + # Extra fields size
    0xFFFF + # Maximum filename size
    0xFFFF # Maximum extra fields size
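  # (a worked sum, for reference: 30 fixed bytes + 65535 + 65535 = 131100 bytes)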

  SIZE_OF_USABLE_EOCD_RECORD = 4 + # Signature
    2 + # Number of this disk
    2 + # Number of the disk with the EOCD record
    2 + # Number of entries in the central directory of this disk
    2 + # Number of entries in the central directory total
    4 + # Size of the central directory
    4 # Start of the central directory offset
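  # (20 bytes - the fixed EOCD fields up to, but not including, the comment
  # size field and the comment itself)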

  def num_files_and_central_directory_offset(file_io, eocd_offset)
    seek(file_io, eocd_offset)

    # The size of the EOCD record is known upfront, so use a strict read
    eocd_record_str = read_n(file_io, SIZE_OF_USABLE_EOCD_RECORD)
    io = StringIO.new(eocd_record_str)

    assert_signature(io, 0x06054b50)
    skip_ahead_2(io) # number_of_this_disk
    skip_ahead_2(io) # number of the disk with the EOCD record
    skip_ahead_2(io) # number of entries in the central directory of this disk
    num_files = read_2b(io) # number of entries in the central directory total
    cdir_size = read_4b(io) # size of the central directory
    cdir_offset = read_4b(io) # start of central directory offset
    [num_files, cdir_offset, cdir_size]
  end

  private_constant :C_UINT4, :C_UINT2, :C_UINT8, :MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE,
    :MAX_LOCAL_HEADER_SIZE, :SIZE_OF_USABLE_EOCD_RECORD

  # Is provided as a stub to be overridden in a subclass if you need it. Will report
  # during various stages of reading. The log message is contained in the return value
  # of `yield` in the method (the log messages are lazy-evaluated).
  def log
    # The most minimal implementation for the method is just this:
    # $stderr.puts(yield)
  end
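
  # For example, a subclass that actually prints the lazily-built messages
  # (a sketch - `VerboseFileReader` is a hypothetical name, not part of the
  # library):
  #
  #     class VerboseFileReader < ZipKit::FileReader
  #       def log
  #         $stderr.puts(yield)
  #       end
  #     end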

  def parse_out_extra_fields(extra_fields_str)
    extra_table = {}
    extras_buf = StringIO.new(extra_fields_str)
    until extras_buf.eof?
      extra_id = read_2b(extras_buf)
      extra_size = read_2b(extras_buf)
      extra_contents = read_n(extras_buf, extra_size)
      extra_table[extra_id] = extra_contents
    end
    extra_table
  end
end