zip_kit 6.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.codeclimate.yml +7 -0
- data/.document +5 -0
- data/.github/workflows/ci.yml +29 -0
- data/.gitignore +61 -0
- data/.rspec +1 -0
- data/.standard.yml +8 -0
- data/.yardopts +1 -0
- data/CHANGELOG.md +255 -0
- data/CODE_OF_CONDUCT.md +46 -0
- data/CONTRIBUTING.md +153 -0
- data/Gemfile +4 -0
- data/IMPLEMENTATION_DETAILS.md +97 -0
- data/LICENSE.txt +20 -0
- data/README.md +234 -0
- data/Rakefile +21 -0
- data/bench/buffered_crc32_bench.rb +109 -0
- data/examples/archive_size_estimate.rb +15 -0
- data/examples/config.ru +7 -0
- data/examples/deferred_write.rb +58 -0
- data/examples/parallel_compression_with_block_deflate.rb +86 -0
- data/examples/rack_application.rb +63 -0
- data/examples/s3_upload.rb +23 -0
- data/lib/zip_kit/block_deflate.rb +130 -0
- data/lib/zip_kit/block_write.rb +47 -0
- data/lib/zip_kit/file_reader/inflating_reader.rb +36 -0
- data/lib/zip_kit/file_reader/stored_reader.rb +35 -0
- data/lib/zip_kit/file_reader.rb +740 -0
- data/lib/zip_kit/null_writer.rb +12 -0
- data/lib/zip_kit/output_enumerator.rb +150 -0
- data/lib/zip_kit/path_set.rb +163 -0
- data/lib/zip_kit/rack_chunked_body.rb +32 -0
- data/lib/zip_kit/rack_tempfile_body.rb +61 -0
- data/lib/zip_kit/rails_streaming.rb +37 -0
- data/lib/zip_kit/remote_io.rb +114 -0
- data/lib/zip_kit/remote_uncap.rb +22 -0
- data/lib/zip_kit/size_estimator.rb +84 -0
- data/lib/zip_kit/stream_crc32.rb +60 -0
- data/lib/zip_kit/streamer/deflated_writer.rb +45 -0
- data/lib/zip_kit/streamer/entry.rb +37 -0
- data/lib/zip_kit/streamer/filler.rb +9 -0
- data/lib/zip_kit/streamer/heuristic.rb +68 -0
- data/lib/zip_kit/streamer/stored_writer.rb +39 -0
- data/lib/zip_kit/streamer/writable.rb +36 -0
- data/lib/zip_kit/streamer.rb +614 -0
- data/lib/zip_kit/uniquify_filename.rb +39 -0
- data/lib/zip_kit/version.rb +5 -0
- data/lib/zip_kit/write_and_tell.rb +40 -0
- data/lib/zip_kit/write_buffer.rb +71 -0
- data/lib/zip_kit/write_shovel.rb +22 -0
- data/lib/zip_kit/zip_writer.rb +436 -0
- data/lib/zip_kit.rb +24 -0
- data/zip_kit.gemspec +41 -0
- metadata +335 -0
data/lib/zip_kit/file_reader.rb
@@ -0,0 +1,740 @@
# frozen_string_literal: true

require "stringio"

# A very barebones ZIP file reader. It is made for maximum interoperability, but at the same
# time we attempt to keep it somewhat concise.
#
# ## REALLY CRAZY IMPORTANT STUFF: SECURITY IMPLICATIONS
#
# Please **BEWARE** - using this is a security risk if you are reading files that have been
# supplied by users. This implementation has _not_ been formally verified for correctness. As
# ZIP files contain relative offsets in lots of places it might be possible for a maliciously
# crafted ZIP file to put the decode procedure in an endless loop, make it attempt huge reads
# from the input file and so on. Additionally, the reader module for deflated data has
# no support for ZIP bomb protection. So either limit the `FileReader` usage to the files you
# trust, or triple-check all the inputs upfront. Patches to make this reader more secure
# are welcome of course.
#
# ## Usage
#
#     File.open('zipfile.zip', 'rb') do |f|
#       entries = ZipKit::FileReader.read_zip_structure(io: f)
#       entries.each do |e|
#         File.open(e.filename, 'wb') do |extracted_file|
#           ex = e.extractor_from(f)
#           extracted_file << ex.extract(1024 * 1024) until ex.eof?
#         end
#       end
#     end
#
# ## Supported features
#
# * Deflate and stored storage modes
# * Zip64 (extra fields and offsets)
# * Data descriptors
#
# ## Unsupported features
#
# * Archives split over multiple disks/files
# * Any ZIP encryption
# * EFS language flag and InfoZIP filename extra field
# * CRC32 checksums are _not_ verified
#
# ## Mode of operation
#
# By default, `FileReader` _ignores_ the data in local file headers (as it is
# often unreliable). It reads the ZIP file "from the tail", finds the
# end-of-central-directory signatures, then reads the central directory entries,
# reconstitutes the entries with their filenames, attributes and so on, and
# sets these entries up with the absolute _offsets_ into the source file/IO object.
# These offsets can then be used to extract the actual compressed data of
# the files and to expand it.
#
# ## Recovering damaged or incomplete ZIP files
#
# If the ZIP file you are trying to read does not contain the central directory
# records `read_zip_structure` will not work, since it starts the read process
# from the EOCD marker at the end of the central directory and then crawls
# "back" in the IO to figure out the rest. You can explicitly apply a fallback
# for reading the archive "straight ahead" instead using `read_zip_straight_ahead`
# - the method will instead scan your IO from the very start, skipping over
# the actual entry data. This is less efficient than central directory parsing since
# it involves a much larger number of reads (1 read from the IO per entry in the ZIP).
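#
# For example, a minimal recovery pass over such a truncated archive could
# look like this (a sketch; the filename is illustrative, and it assumes the
# readable entries do not use data descriptors):
#
#     File.open('truncated.zip', 'rb') do |f|
#       recovered = ZipKit::FileReader.read_zip_straight_ahead(io: f)
#       recovered.each { |e| puts "#{e.filename}: #{e.compressed_size} bytes" }
#     end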

class ZipKit::FileReader
  require_relative "file_reader/stored_reader"
  require_relative "file_reader/inflating_reader"

  ReadError = Class.new(StandardError)
  UnsupportedFeature = Class.new(StandardError)
  InvalidStructure = Class.new(ReadError)
  LocalHeaderPending = Class.new(StandardError) do
    def message
      "The compressed data offset is not available (local header has not been read)"
    end
  end
  MissingEOCD = Class.new(StandardError) do
    def message
      "Could not find the EOCD signature in the buffer - maybe a malformed ZIP file"
    end
  end

  private_constant :StoredReader, :InflatingReader

  # Represents a file within the ZIP archive being read. This is different from
  # the Entry object used in Streamer for ZIP writing, since during writing more
  # data can be kept in memory for immediate use.
  class ZipEntry
    # @return [Fixnum] bit-packed version signature of the program that made the archive
    attr_accessor :made_by

    # @return [Fixnum] ZIP version support needed to extract this file
    attr_accessor :version_needed_to_extract

    # @return [Fixnum] bit-packed general purpose flags
    attr_accessor :gp_flags

    # @return [Fixnum] storage mode (0 for stored, 8 for deflate)
    attr_accessor :storage_mode

    # @return [Fixnum] the bit-packed DOS time
    attr_accessor :dos_time

    # @return [Fixnum] the bit-packed DOS date
    attr_accessor :dos_date

    # @return [Fixnum] the CRC32 checksum of this file
    attr_accessor :crc32

    # @return [Fixnum] size of compressed file data in the ZIP
    attr_accessor :compressed_size

    # @return [Fixnum] size of the file once uncompressed
    attr_accessor :uncompressed_size

    # @return [String] the filename
    attr_accessor :filename

    # @return [Fixnum] disk number where this file starts
    attr_accessor :disk_number_start

    # @return [Fixnum] internal attributes of the file
    attr_accessor :internal_attrs

    # @return [Fixnum] external attributes of the file
    attr_accessor :external_attrs

    # @return [Fixnum] at what offset the local file header starts
    #   in your original IO object
    attr_accessor :local_file_header_offset

    # @return [String] the file comment
    attr_accessor :comment

    # Returns a reader for the actual compressed data of the entry.
    #
    #     reader = entry.extractor_from(source_file)
    #     outfile << reader.extract(512 * 1024) until reader.eof?
    #
    # @return [#extract(n_bytes), #eof?] the reader for the data
    def extractor_from(from_io)
      from_io.seek(compressed_data_offset, IO::SEEK_SET)
      case storage_mode
      when 8
        InflatingReader.new(from_io, compressed_size)
      when 0
        StoredReader.new(from_io, compressed_size)
      else
        raise UnsupportedFeature, "Unsupported storage mode for reading - %<storage_mode>d" %
          {storage_mode: storage_mode}
      end
    end

    # @return [Fixnum] at what offset you should start reading
    #   for the compressed data in your original IO object
    def compressed_data_offset
      @compressed_data_offset || raise(LocalHeaderPending)
    end

    # Tells whether the compressed data offset is already known for this entry
    # @return [Boolean]
    def known_offset?
      !@compressed_data_offset.nil?
    end

    # Tells whether the entry uses a data descriptor (this is defined
    # by bit 3 in the GP flags).
    def uses_data_descriptor?
      (gp_flags & 0x0008) == 0x0008
    end

    # Sets the offset at which the compressed data for this file starts in the ZIP.
    # By default, the value will be set by the Reader for you. If you use delayed
    # reading, you need to set it using `get_compressed_data_offset` on the Reader:
    #
    #     entry.compressed_data_offset = reader.get_compressed_data_offset(io: file,
    #       local_file_header_offset: entry.local_file_header_offset)
    def compressed_data_offset=(offset)
      @compressed_data_offset = offset.to_i
    end
  end

  # Parse an IO handle to a ZIP archive into an array of Entry objects.
  #
  # @param io[#tell, #seek, #read, #size] an IO-ish object
  # @param read_local_headers[Boolean] whether the local headers must be read upfront. When reading
  #   a locally available ZIP file this option will not have much use since the small reads from
  #   the file handle are not going to be that important. However, if you are using remote reads
  #   to decipher a ZIP file located on an HTTP server, the operation _must_ perform an HTTP
  #   request for _each entry in the ZIP file_ to determine where the actual file data starts.
  #   This, for a ZIP archive of 1000 files, will incur 1000 extra HTTP requests - which you might
  #   not want to perform upfront, or - at least - not want to perform _at once_. When the option is
  #   set to `false`, the returned entries will not have their compressed data offset resolved, and
  #   will raise an exception when you attempt to access it
  #   (since the reads have not been performed yet). As a rule, this option can be left in its
  #   default setting (`true`) unless you want to _only_ read the central directory, or you need
  #   to limit the number of HTTP requests.
  # @return [Array<ZipEntry>] an array of entries within the ZIP being parsed
  def read_zip_structure(io:, read_local_headers: true)
    zip_file_size = io.size
    eocd_offset = get_eocd_offset(io, zip_file_size)

    zip64_end_of_cdir_location = get_zip64_eocd_location(io, eocd_offset)
    num_files, cdir_location, _cdir_size =
      if zip64_end_of_cdir_location
        num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
      else
        num_files_and_central_directory_offset(io, eocd_offset)
      end

    log do
      "Located the central directory start at %<location>d" %
        {location: cdir_location}
    end
    seek(io, cdir_location)

    # Read the entire central directory AND anything behind it, in one fell swoop.
    # Strictly speaking, we should be able to read `cdir_size` bytes and not a byte more.
    # However, we know for a fact that in some of our files the central directory size
    # is in fact misreported. `zipinfo` then says:
    #
    #   warning [ktsglobal-2b03bc.zip]: 1 extra byte at beginning or within zipfile
    #     (attempting to process anyway)
    #   error [ktsglobal-2b03bc.zip]: reported length of central directory is
    #     -1 bytes too long (Atari STZip zipfile? J.H.Holm ZIPSPLIT 1.1
    #     zipfile?). Compensating...
    #
    # Since the EOCD is not that big anyway, we just read the entire "tail" of the ZIP ignoring
    # the central directory size altogether.
    central_directory_str = io.read # and not read_n(io, cdir_size), see above
    central_directory_io = StringIO.new(central_directory_str)
    log do
      "Read %<byte_size>d bytes with central directory + EOCD record and locator" %
        {byte_size: central_directory_str.bytesize}
    end

    entries = (0...num_files).map { |entry_n|
      offset_location = cdir_location + central_directory_io.tell
      log do
        "Reading the central directory entry %<entry_n>d starting at offset %<offset>d" %
          {entry_n: entry_n, offset: offset_location}
      end
      read_cdir_entry(central_directory_io)
    }

    read_local_headers(entries, io) if read_local_headers

    entries
  end
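
  # A sketch of the deferred flow described above - the `remote_io` name and
  # the filename are illustrative placeholders for any seekable IO, e.g. one
  # backed by HTTP range requests:
  #
  #     reader = ZipKit::FileReader.new
  #     entries = reader.read_zip_structure(io: remote_io, read_local_headers: false)
  #     entry = entries.find { |e| e.filename == "wanted.bin" }
  #     entry.compressed_data_offset = reader.get_compressed_data_offset(io: remote_io,
  #       local_file_header_offset: entry.local_file_header_offset)
  #     extracted = entry.extractor_from(remote_io)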

  # Sometimes you might encounter truncated ZIP files, which do not contain
  # any central directory whatsoever - or where the central directory is
  # truncated. In that case, employing the technique of reading the ZIP
  # "from the end" is impossible, and the only recourse is reading each
  # local file header in succession. If the entries in such a ZIP use data
  # descriptors, you would need to scan after the entry until you encounter
  # the data descriptor signature - and that might be unreliable at best.
  # Therefore, this reading technique does not support data descriptors.
  # It can however recover the entries you still can read if these entries
  # contain all the necessary information about the contained file.
  #
  # @param io[#tell, #read, #seek] the IO-ish object to read the local file
  #   headers from
  # @return [Array<ZipEntry>] an array of entries that could be
  #   recovered before hitting EOF
  def read_zip_straight_ahead(io:)
    entries = []
    loop do
      cur_offset = io.tell
      entry = read_local_file_header(io: io)
      if entry.uses_data_descriptor?
        raise UnsupportedFeature, "The local file header at #{cur_offset} uses " \
          "a data descriptor and the start of next entry cannot be found"
      end
      entries << entry
      next_local_header_offset = entry.compressed_data_offset + entry.compressed_size
      log do
        "Recovered a local file header at offset %<cur_offset>d, seeking to the next at %<header_offset>d" %
          {cur_offset: cur_offset, header_offset: next_local_header_offset}
      end
      seek(io, next_local_header_offset) # Seek to the next entry, and raise if seek is impossible
    end
    entries
  rescue ReadError, RangeError # RangeError is raised if offset exceeds int32/int64 range
    log do
      "Got a read/seek error after reaching %<cur_offset>d, no more entries can be recovered" %
        {cur_offset: cur_offset}
    end
    entries
  end

  # Parse the local header entry and get the offset in the IO at which the
  # actual compressed data of the file starts within the ZIP.
  # The method will eager-read the entire local header for the file
  # (the maximum size the local header may use), starting at the given offset,
  # and will then compute its size. That size plus the local header offset
  # given will be the compressed data offset of the entry (read starting at
  # this offset to get the data).
  #
  # @param io[#read] an IO-ish object the ZIP file can be read from
  # @return [ZipEntry] the parsed local header entry, with its
  #   compressed data offset set
  def read_local_file_header(io:)
    local_file_header_offset = io.tell

    # Reading in bulk is cheaper - grab the maximum length of the local header,
    # including any headroom for extra fields etc.
    local_file_header_str_plus_headroom = io.read(MAX_LOCAL_HEADER_SIZE)
    raise ReadError if local_file_header_str_plus_headroom.nil? # reached EOF

    io_starting_at_local_header = StringIO.new(local_file_header_str_plus_headroom)

    assert_signature(io_starting_at_local_header, 0x04034b50)
    e = ZipEntry.new
    e.version_needed_to_extract = read_2b(io_starting_at_local_header) # Version needed to extract
    e.gp_flags = read_2b(io_starting_at_local_header) # gp flags
    e.storage_mode = read_2b(io_starting_at_local_header) # storage mode
    e.dos_time = read_2b(io_starting_at_local_header) # dos time
    e.dos_date = read_2b(io_starting_at_local_header) # dos date
    e.crc32 = read_4b(io_starting_at_local_header) # CRC32
    e.compressed_size = read_4b(io_starting_at_local_header) # Comp size
    e.uncompressed_size = read_4b(io_starting_at_local_header) # Uncomp size

    filename_size = read_2b(io_starting_at_local_header)
    extra_size = read_2b(io_starting_at_local_header)
    e.filename = read_n(io_starting_at_local_header, filename_size)
    extra_fields_str = read_n(io_starting_at_local_header, extra_size)

    # Parse out the extra fields
    extra_table = parse_out_extra_fields(extra_fields_str)

    # ...of which we really only need the Zip64 extra
    if (zip64_extra_contents = extra_table[1])
      # If the Zip64 extra is present, we let it override all
      # the values fetched from the conventional header
      zip64_extra = StringIO.new(zip64_extra_contents)
      log do
        "Will read Zip64 extra data from local header field for %<filename>s, %<size>d bytes" %
          {filename: e.filename, size: zip64_extra.size}
      end
      # Now here be dragons. The APPNOTE specifies that
      #
      # > The order of the fields in the ZIP64 extended
      # > information record is fixed, but the fields will
      # > only appear if the corresponding Local or Central
      # > directory record field is set to 0xFFFF or 0xFFFFFFFF.
      #
      # It means that before we read this stuff we need to check if the previously-read
      # values are at overflow, and only _then_ proceed to read them. Bah.
      e.uncompressed_size = read_8b(zip64_extra) if e.uncompressed_size == 0xFFFFFFFF
      e.compressed_size = read_8b(zip64_extra) if e.compressed_size == 0xFFFFFFFF
    end

    offset = local_file_header_offset + io_starting_at_local_header.tell
    e.compressed_data_offset = offset

    e
  end

  # Get the offset in the IO at which the actual compressed data of the file
  # starts within the ZIP. The method will eager-read the entire local header
  # for the file (the maximum size the local header may use), starting at the
  # given offset, and will then compute its size. That size plus the local
  # header offset given will be the compressed data offset of the entry
  # (read starting at this offset to get the data).
  #
  # @param io[#seek, #read] an IO-ish object the ZIP file can be read from
  # @param local_file_header_offset[Fixnum] absolute offset (0-based) where the
  #   local file header is supposed to begin
  # @return [Fixnum] absolute offset (0-based) of where the compressed data
  #   begins for this file within the ZIP
  def get_compressed_data_offset(io:, local_file_header_offset:)
    seek(io, local_file_header_offset)
    entry_recovered_from_local_file_header = read_local_file_header(io: io)
    entry_recovered_from_local_file_header.compressed_data_offset
  end

  # Parse an IO handle to a ZIP archive into an array of Entry objects, reading from the end
  # of the IO object.
  #
  # @see #read_zip_structure
  # @param options[Hash] any options the instance method of the same name accepts
  # @return [Array<ZipEntry>] an array of entries within the ZIP being parsed
  def self.read_zip_structure(**options)
    new.read_zip_structure(**options)
  end

  # Parse an IO handle to a ZIP archive into an array of Entry objects, reading from the start of
  # the file and parsing local file headers one-by-one
  #
  # @see #read_zip_straight_ahead
  # @param options[Hash] any options the instance method of the same name accepts
  # @return [Array<ZipEntry>] an array of entries within the ZIP being parsed
  def self.read_zip_straight_ahead(**options)
    new.read_zip_straight_ahead(**options)
  end

  private

  def read_local_headers(entries, io)
    entries.each_with_index do |entry, i|
      log do
        "Reading the local header for entry %<index>d at offset %<offset>d" %
          {index: i, offset: entry.local_file_header_offset}
      end
      off = get_compressed_data_offset(io: io,
        local_file_header_offset: entry.local_file_header_offset)
      entry.compressed_data_offset = off
    end
  end

  def skip_ahead_2(io)
    skip_ahead_n(io, 2)
  end

  def skip_ahead_4(io)
    skip_ahead_n(io, 4)
  end

  def skip_ahead_8(io)
    skip_ahead_n(io, 8)
  end

  def seek(io, absolute_pos)
    io.seek(absolute_pos, IO::SEEK_SET)
    unless absolute_pos == io.tell
      raise ReadError,
        "Expected to seek to #{absolute_pos} but only got to #{io.tell}"
    end
    nil
  end

  def assert_signature(io, signature_magic_number)
    readback = read_4b(io)
    if readback != signature_magic_number
      expected = "0x0" + signature_magic_number.to_s(16)
      actual = "0x0" + readback.to_s(16)
      raise InvalidStructure, "Expected signature #{expected}, but read #{actual}"
    end
  end

  def skip_ahead_n(io, n)
    pos_before = io.tell
    io.seek(io.tell + n, IO::SEEK_SET)
    pos_after = io.tell
    delta = pos_after - pos_before
    unless delta == n
      raise ReadError, "Expected to seek #{n} bytes ahead, " \
        "but could only seek #{delta} bytes ahead"
    end
    nil
  end

  def read_n(io, n_bytes)
    io.read(n_bytes).tap do |d|
      raise ReadError, "Expected to read #{n_bytes} bytes, but the IO was at the end" if d.nil?
      unless d.bytesize == n_bytes
        raise ReadError, "Expected to read #{n_bytes} bytes, read #{d.bytesize}"
      end
    end
  end

  def read_2b(io)
    read_n(io, 2).unpack(C_UINT2).shift
  end

  def read_4b(io)
    read_n(io, 4).unpack(C_UINT4).shift
  end

  def read_8b(io)
    read_n(io, 8).unpack(C_UINT8).shift
  end

  def read_cdir_entry(io)
    # read_cdir_entry is too high. [45.66/15]
    assert_signature(io, 0x02014b50)
    ZipEntry.new.tap do |e|
      e.made_by = read_2b(io)
      e.version_needed_to_extract = read_2b(io)
      e.gp_flags = read_2b(io)
      e.storage_mode = read_2b(io)
      e.dos_time = read_2b(io)
      e.dos_date = read_2b(io)
      e.crc32 = read_4b(io)
      e.compressed_size = read_4b(io)
      e.uncompressed_size = read_4b(io)
      filename_size = read_2b(io)
      extra_size = read_2b(io)
      comment_len = read_2b(io)
      e.disk_number_start = read_2b(io)
      e.internal_attrs = read_2b(io)
      e.external_attrs = read_4b(io)
      e.local_file_header_offset = read_4b(io)
      e.filename = read_n(io, filename_size)

      # Extra fields
      extras = read_n(io, extra_size)
      # Comment
      e.comment = read_n(io, comment_len)

      # Parse out the extra fields
      extra_table = parse_out_extra_fields(extras)

      # ...of which we really only need the Zip64 extra
      if (zip64_extra_contents = extra_table[1])
        # If the Zip64 extra is present, we let it override all
        # the values fetched from the conventional header
        zip64_extra = StringIO.new(zip64_extra_contents)
        log do
          "Will read Zip64 extra data for %<filename>s, %<size>d bytes" %
            {filename: e.filename, size: zip64_extra.size}
        end
        # Now here be dragons. The APPNOTE specifies that
        #
        # > The order of the fields in the ZIP64 extended
        # > information record is fixed, but the fields will
        # > only appear if the corresponding Local or Central
        # > directory record field is set to 0xFFFF or 0xFFFFFFFF.
        #
        # It means that before we read this stuff we need to check if the previously-read
        # values are at overflow, and only _then_ proceed to read them. Bah.
        e.uncompressed_size = read_8b(zip64_extra) if e.uncompressed_size == 0xFFFFFFFF
        e.compressed_size = read_8b(zip64_extra) if e.compressed_size == 0xFFFFFFFF
        e.local_file_header_offset = read_8b(zip64_extra) if e.local_file_header_offset == 0xFFFFFFFF
        # Disk number comes last and we can skip it anyway, since we do
        # not support multi-disk archives
      end
    end
  end

  def get_eocd_offset(file_io, zip_file_size)
    # Start reading from the _comment_ of the zip file (from the very end).
    # The maximum size of the comment is 0xFFFF (what fits in 2 bytes)
    implied_position_of_eocd_record = zip_file_size - MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE
    implied_position_of_eocd_record = 0 if implied_position_of_eocd_record < 0

    # Use a soft seek (we might not be able to get as far behind in the IO as we want)
    # and a soft read (we might not be able to read as many bytes as we want)
    file_io.seek(implied_position_of_eocd_record, IO::SEEK_SET)
    str_containing_eocd_record = file_io.read(MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE)
    eocd_idx_in_buf = locate_eocd_signature(str_containing_eocd_record)

    raise MissingEOCD unless eocd_idx_in_buf

    eocd_offset = implied_position_of_eocd_record + eocd_idx_in_buf
    log do
      "Found EOCD signature at offset %<offset>d" % {offset: eocd_offset}
    end

    eocd_offset
  end

  def all_indices_of_substr_in_str(of_substring, in_string)
    last_i = 0
    found_at_indices = []
    while (last_i = in_string.index(of_substring, last_i))
      found_at_indices << last_i
      last_i += of_substring.bytesize
    end
    found_at_indices
  end

  # We have to scan the maximum possible number
  # of bytes that the EOCD can theoretically occupy including the comment after it,
  # and we have to find a combination of:
  # [EOCD signature, <some ZIP metadata>, comment byte size, comment of that size]
  # at the end. To do so, we first find all indices of the signature in the trailer
  # string, and then check whether the bytestring starting at the signature and
  # ending at the end of string satisfies that given pattern.
  def locate_eocd_signature(in_str)
    eocd_signature = 0x06054b50
    eocd_signature_str = [eocd_signature].pack("V")
    unpack_pattern = "VvvvvVVv"
    minimum_record_size = 22
    str_size = in_str.bytesize
    indices = all_indices_of_substr_in_str(eocd_signature_str, in_str)
    indices.each do |check_at|
      maybe_record = in_str[check_at..str_size]
      # If the record is smaller than the minimum - we will never recover anything
      break if maybe_record.bytesize < minimum_record_size
      # Now we check if the record ends with the combination
      # of the comment size and an arbitrary byte string of that size.
      # If it does - we found our match
      *_unused, comment_size = maybe_record.unpack(unpack_pattern)
      if (maybe_record.bytesize - minimum_record_size) == comment_size
        return check_at # Found the EOCD marker location
      end
    end
    # If we haven't caught anything, return nil deliberately instead of returning the last statement
    nil
  end
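
  # A worked example of the check above (the metadata values are illustrative):
  # a minimal EOCD record followed by a two-byte comment can be built as
  #
  #     tail = [0x06054b50, 0, 0, 1, 1, 46, 120, 2].pack("VvvvvVVv") + "hi"
  #
  # The signature is found at index 0, and since the remaining bytestring is
  # 24 bytes long - the 22-byte minimum record plus the declared 2-byte
  # comment - locate_eocd_signature would return 0 for it.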

  # Find the Zip64 EOCD locator segment offset. Do this by seeking backwards from the
  # EOCD record in the archive by fixed offsets
  # get_zip64_eocd_location is too high. [15.17/15]
  def get_zip64_eocd_location(file_io, eocd_offset)
    zip64_eocd_loc_offset = eocd_offset
    zip64_eocd_loc_offset -= 4 # The signature
    zip64_eocd_loc_offset -= 4 # Which disk has the Zip64 end of central directory record
    zip64_eocd_loc_offset -= 8 # Offset of the zip64 central directory record
    zip64_eocd_loc_offset -= 4 # Total number of disks

    log do
      "Will look for the Zip64 EOCD locator signature at offset %<offset>d" %
        {offset: zip64_eocd_loc_offset}
    end

    # If the offset is negative there is certainly no Zip64 EOCD locator here
    return unless zip64_eocd_loc_offset >= 0

    file_io.seek(zip64_eocd_loc_offset, IO::SEEK_SET)
    assert_signature(file_io, 0x07064b50)

    log do
      "Found Zip64 EOCD locator at offset %<offset>d" % {offset: zip64_eocd_loc_offset}
    end

    disk_num = read_4b(file_io) # number of the disk
    raise UnsupportedFeature, "The archive spans multiple disks" if disk_num != 0
    read_8b(file_io)
  rescue ReadError
    nil
  end

  # num_files_and_central_directory_offset_zip64 is too high. [21.12/15]
  def num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
    seek(io, zip64_end_of_cdir_location)

    assert_signature(io, 0x06064b50)

    zip64_eocdr_size = read_8b(io)
    zip64_eocdr = read_n(io, zip64_eocdr_size) # Reading in bulk is cheaper
    zip64_eocdr = StringIO.new(zip64_eocdr)
    skip_ahead_2(zip64_eocdr) # version made by
    skip_ahead_2(zip64_eocdr) # version needed to extract

    disk_n = read_4b(zip64_eocdr) # number of this disk
    disk_n_with_eocdr = read_4b(zip64_eocdr) # number of the disk with the EOCDR
    raise UnsupportedFeature, "The archive spans multiple disks" if disk_n != disk_n_with_eocdr

    num_files_this_disk = read_8b(zip64_eocdr) # number of files on this disk
    num_files_total = read_8b(zip64_eocdr) # files total in the central directory

    raise UnsupportedFeature, "The archive spans multiple disks" if num_files_this_disk != num_files_total

    log do
      "Zip64 EOCD record states there are %<amount>d files in the archive" %
        {amount: num_files_total}
    end

    central_dir_size = read_8b(zip64_eocdr) # Size of the central directory
    central_dir_offset = read_8b(zip64_eocdr) # Where the central directory starts

    [num_files_total, central_dir_offset, central_dir_size]
  end

  C_UINT4 = "V"
  C_UINT2 = "v"
  C_UINT8 = "Q<"

  # To prevent too many tiny reads, read the maximum possible size of end of
  # central directory record upfront (all the fixed fields + at most 0xFFFF
  # bytes of the archive comment)
  MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE = 4 + # Offset of the start of central directory
    4 + # Size of the central directory
    2 + # Number of files in the cdir
    4 + # End-of-central-directory signature
    2 + # Number of this disk
    2 + # Number of disk with the start of cdir
    2 + # Number of files in the cdir of this disk
    2 + # The comment size
    0xFFFF # Maximum comment size

  # To prevent too many tiny reads, read the maximum possible size of the local file header upfront.
  # The maximum size is all the usual items, plus the maximum size
  # of the filename (0xFFFF bytes) and the maximum size of the extras (0xFFFF bytes)
  MAX_LOCAL_HEADER_SIZE = 4 + # signature
    2 + # Version needed to extract
    2 + # gp flags
    2 + # storage mode
    2 + # dos time
    2 + # dos date
    4 + # CRC32
    4 + # Comp size
    4 + # Uncomp size
    2 + # Filename size
    2 + # Extra fields size
    0xFFFF + # Maximum filename size
    0xFFFF # Maximum extra fields size

  SIZE_OF_USABLE_EOCD_RECORD = 4 + # Signature
    2 + # Number of this disk
    2 + # Number of the disk with the EOCD record
    2 + # Number of entries in the central directory of this disk
    2 + # Number of entries in the central directory total
    4 + # Size of the central directory
    4 # Start of the central directory offset

  def num_files_and_central_directory_offset(file_io, eocd_offset)
    seek(file_io, eocd_offset)

    # The size of the EOCD record is known upfront, so use a strict read
    eocd_record_str = read_n(file_io, SIZE_OF_USABLE_EOCD_RECORD)
    io = StringIO.new(eocd_record_str)

    assert_signature(io, 0x06054b50)
    skip_ahead_2(io) # number_of_this_disk
    skip_ahead_2(io) # number of the disk with the EOCD record
    skip_ahead_2(io) # number of entries in the central directory of this disk
    num_files = read_2b(io) # number of entries in the central directory total
    cdir_size = read_4b(io) # size of the central directory
    cdir_offset = read_4b(io) # start of central directory offset
    [num_files, cdir_offset, cdir_size]
  end

  private_constant :C_UINT4, :C_UINT2, :C_UINT8, :MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE,
    :MAX_LOCAL_HEADER_SIZE, :SIZE_OF_USABLE_EOCD_RECORD

  # Is provided as a stub to be overridden in a subclass if you need it. Will report
  # during various stages of reading. The log message is contained in the return value
  # of `yield` in the method (the log messages are lazy-evaluated).
  def log
    # The most minimal implementation for the method is just this:
    # $stderr.puts(yield)
  end

  def parse_out_extra_fields(extra_fields_str)
    extra_table = {}
    extras_buf = StringIO.new(extra_fields_str)
    until extras_buf.eof?
      extra_id = read_2b(extras_buf)
      extra_size = read_2b(extras_buf)
      extra_contents = read_n(extras_buf, extra_size)
      extra_table[extra_id] = extra_contents
    end
    extra_table
  end
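
  # A worked example of the layout parsed above: each extra field is a
  # little-endian [id, size, payload] record, so a lone Zip64 extra carrying
  # one 8-byte value could be built as
  #
  #     extra = [0x0001, 8, 0xFFFFFFFF + 1].pack("vvQ<")
  #
  # and parse_out_extra_fields(extra) would return the payload keyed by its
  # numeric id: {1 => "\x00\x00\x00\x00\x01\x00\x00\x00"}.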
end