zip_tricks 2.8.1 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +3 -3
- data/IMPLEMENTATION_DETAILS.md +2 -10
- data/README.md +62 -59
- data/examples/archive_size_estimate.rb +4 -4
- data/examples/rack_application.rb +3 -5
- data/lib/zip_tricks/block_deflate.rb +21 -0
- data/lib/zip_tricks/file_reader.rb +491 -0
- data/lib/zip_tricks/null_writer.rb +7 -2
- data/lib/zip_tricks/rack_body.rb +3 -3
- data/lib/zip_tricks/remote_io.rb +30 -20
- data/lib/zip_tricks/remote_uncap.rb +10 -10
- data/lib/zip_tricks/size_estimator.rb +64 -0
- data/lib/zip_tricks/stream_crc32.rb +2 -2
- data/lib/zip_tricks/streamer/deflated_writer.rb +26 -0
- data/lib/zip_tricks/streamer/entry.rb +21 -0
- data/lib/zip_tricks/streamer/stored_writer.rb +25 -0
- data/lib/zip_tricks/streamer/writable.rb +20 -0
- data/lib/zip_tricks/streamer.rb +172 -66
- data/lib/zip_tricks/zip_writer.rb +346 -0
- data/lib/zip_tricks.rb +1 -4
- data/spec/spec_helper.rb +1 -38
- data/spec/zip_tricks/file_reader_spec.rb +47 -0
- data/spec/zip_tricks/rack_body_spec.rb +2 -2
- data/spec/zip_tricks/remote_io_spec.rb +8 -20
- data/spec/zip_tricks/remote_uncap_spec.rb +4 -4
- data/spec/zip_tricks/size_estimator_spec.rb +31 -0
- data/spec/zip_tricks/streamer_spec.rb +59 -36
- data/spec/zip_tricks/zip_writer_spec.rb +408 -0
- data/zip_tricks.gemspec +20 -14
- metadata +33 -16
- data/lib/zip_tricks/manifest.rb +0 -85
- data/lib/zip_tricks/microzip.rb +0 -339
- data/lib/zip_tricks/stored_size_estimator.rb +0 -44
- data/spec/zip_tricks/manifest_spec.rb +0 -60
- data/spec/zip_tricks/microzip_interop_spec.rb +0 -48
- data/spec/zip_tricks/microzip_spec.rb +0 -546
- data/spec/zip_tricks/stored_size_estimator_spec.rb +0 -22
@@ -0,0 +1,491 @@
|
|
1
|
+
require 'stringio'
require 'zlib'
|
2
|
+
|
3
|
+
# A very barebones ZIP file reader. Is made for maximum interoperability, but at the same
|
4
|
+
# time we attempt to keep it somewhat concise.
|
5
|
+
#
|
6
|
+
# ## REALLY CRAZY IMPORTANT STUFF: SECURITY IMPLICATIONS
|
7
|
+
#
|
8
|
+
# Please **BEWARE** - using this is a security risk if you are reading files that have been
|
9
|
+
# supplied by users. This implementation has _not_ been formally verified for correctness. As
|
10
|
+
# ZIP files contain relative offsets in lots of places it might be possible for a maliciously
|
11
|
+
# crafted ZIP file to put the decode procedure in an endless loop, make it attempt huge reads
|
12
|
+
# from the input file and so on. Additionally, the reader module for deflated data has
|
13
|
+
# no support for ZIP bomb protection. So either limit the `FileReader` usage to the files you
|
14
|
+
# trust, or triple-check all the inputs upfront. Patches to make this reader more secure
|
15
|
+
# are welcome of course.
|
16
|
+
#
|
17
|
+
# ## Usage
|
18
|
+
#
|
19
|
+
# File.open('zipfile.zip', 'rb') do |f|
|
20
|
+
# entries = FileReader.read_zip_structure(f)
|
21
|
+
# entries.each do |e|
|
22
|
+
# File.open(e.filename, 'wb') do |extracted_file|
|
23
|
+
# ex = e.extractor_from(f)
|
24
|
+
# extracted_file << ex.extract(1024 * 1024) until ex.eof?
|
25
|
+
# end
|
26
|
+
# end
|
27
|
+
# end
|
28
|
+
#
|
29
|
+
# ## Supported features
|
30
|
+
#
|
31
|
+
# * Deflate and stored storage modes
|
32
|
+
# * Zip64 (extra fields and offsets)
|
33
|
+
# * Data descriptors
|
34
|
+
#
|
35
|
+
# ## Unsupported features
|
36
|
+
#
|
37
|
+
# * Archives split over multiple disks/files
|
38
|
+
# * Any ZIP encryption
|
39
|
+
# * EFS language flag and InfoZIP filename extra field
|
40
|
+
# * CRC32 checksums are _not_ verified
|
41
|
+
#
|
42
|
+
# ## Mode of operation
|
43
|
+
#
|
44
|
+
# Basically, `FileReader` _ignores_ the data in local file headers (as it is often unreliable).
|
45
|
+
# It reads the ZIP file "from the tail", finds the end-of-central-directory signatures, then
|
46
|
+
# reads the central directory entries, reconstitutes the entries with their filenames, attributes
|
47
|
+
# and so on, and sets these entries up with the absolute _offsets_ into the source file/IO object.
|
48
|
+
# These offsets can then be used to extract the actual compressed data of the files and to expand it.
|
49
|
+
class ZipTricks::FileReader
|
50
|
+
# Raised when the IO cannot be positioned where requested or yields fewer bytes than needed
ReadError = Class.new(StandardError)

# Raised for archive features this reader does not handle (such as archives spanning multiple disks)
UnsupportedFeature = Class.new(StandardError)

# Raised when an expected record signature is not found; it is a kind of ReadError
InvalidStructure = Class.new(ReadError)
|
53
|
+
|
54
|
+
# Reads a span of raw-deflated bytes from an IO and inflates it chunk by chunk.
# The IO is expected to be positioned at the first compressed byte already.
class InflatingReader
  # @param from_io[#read] the IO to pull compressed bytes from
  # @param compressed_data_size[Integer] how many compressed bytes belong to the entry
  def initialize(from_io, compressed_data_size)
    @io = from_io
    @compressed_data_size = compressed_data_size
    @already_read = 0
    @source_exhausted = false
    # Negative window bits select a raw deflate stream (no zlib header/trailer)
    @zlib_inflater = ::Zlib::Inflate.new(-Zlib::MAX_WBITS)
  end

  # Reads up to `n_bytes` of _compressed_ data and returns whatever it inflates to.
  #
  # @param n_bytes[Integer, nil] how many compressed bytes to consume, or `nil` for all that remain
  # @return [String, nil] the inflated data, or `nil` once nothing more can be read
  def extract(n_bytes = nil)
    return if eof?

    available = @compressed_data_size - @already_read
    n_bytes = available if n_bytes.nil? || n_bytes > available
    return '' if n_bytes.zero?

    compressed_chunk = @io.read(n_bytes)
    # A nil/empty read means the IO ended before the entry did (truncated archive).
    # Remember that so that `extract ... until eof?` loops terminate instead of spinning.
    if compressed_chunk.nil? || compressed_chunk.empty?
      @source_exhausted = true
      return
    end

    @already_read += compressed_chunk.bytesize
    @zlib_inflater.inflate(compressed_chunk)
  end

  # @return [Boolean] true when the deflate stream has ended, or when no more
  #   compressed input can be obtained for it
  def eof?
    @zlib_inflater.finished? || @source_exhausted || @already_read >= @compressed_data_size
  end
end
|
84
|
+
|
85
|
+
# Reads a span of stored (uncompressed) bytes from an IO chunk by chunk.
# The IO is expected to be positioned at the first byte of the entry already.
class StoredReader
  # @param from_io[#read] the IO to pull bytes from
  # @param compressed_data_size[Integer] how many bytes belong to the entry
  def initialize(from_io, compressed_data_size)
    @io = from_io
    @compressed_data_size = compressed_data_size
    @already_read = 0
    @source_exhausted = false
  end

  # Reads up to `n_bytes` of the entry, never going past the end of the entry.
  #
  # @param n_bytes[Integer, nil] how many bytes to read, or `nil` for all that remain
  # @return [String, nil] the bytes read, or `nil` once nothing more can be read
  def extract(n_bytes = nil)
    return if eof?

    available = @compressed_data_size - @already_read
    n_bytes = available if n_bytes.nil? || n_bytes > available
    return '' if n_bytes.zero?

    chunk = @io.read(n_bytes)
    # A nil/empty read means the IO ended before the entry did (truncated archive).
    # Remember that so that `extract ... until eof?` loops terminate instead of spinning.
    if chunk.nil? || chunk.empty?
      @source_exhausted = true
      return
    end

    @already_read += chunk.bytesize
    chunk
  end

  # @return [Boolean] true when the whole entry was read or the IO ran dry
  def eof?
    @source_exhausted || @already_read >= @compressed_data_size
  end
end
|
114
|
+
|
115
|
+
private_constant :StoredReader, :InflatingReader
|
116
|
+
|
117
|
+
# Represents a file within the ZIP archive being read
class ZipEntry
  # @return [Integer] bit-packed version signature of the program that made the archive
  attr_accessor :made_by

  # @return [Integer] ZIP version support needed to extract this file
  attr_accessor :version_needed_to_extract

  # @return [Integer] bit-packed general purpose flags
  attr_accessor :gp_flags

  # @return [Integer] Storage mode (0 for stored, 8 for deflate)
  attr_accessor :storage_mode

  # @return [Integer] the bit-packed DOS time
  attr_accessor :dos_time

  # @return [Integer] the bit-packed DOS date
  attr_accessor :dos_date

  # @return [Integer] the CRC32 checksum of this file
  attr_accessor :crc32

  # @return [Integer] size of compressed file data in the ZIP
  attr_accessor :compressed_size

  # @return [Integer] size of the file once uncompressed
  attr_accessor :uncompressed_size

  # @return [String] the filename
  attr_accessor :filename

  # @return [Integer] disk number where this file starts
  attr_accessor :disk_number_start

  # @return [Integer] internal attributes of the file
  attr_accessor :internal_attrs

  # @return [Integer] external attributes of the file
  attr_accessor :external_attrs

  # @return [Integer] at what offset the local file header starts
  #   in your original IO object
  attr_accessor :local_file_header_offset

  # @return [String] the file comment
  attr_accessor :comment

  # @return [Integer] at what offset you should start reading
  #   for the compressed data in your original IO object
  attr_accessor :compressed_data_offset

  # Seeks the given IO to the start of this entry's data and returns a reader for it.
  #
  #     reader = entry.extractor_from(source_file)
  #     outfile << reader.extract(512 * 1024) until reader.eof?
  #
  # @param from_io[#seek, #read] the IO the archive was read from
  # @return [#extract(n_bytes), #eof?] the reader for the data
  # @raise [RuntimeError] if the storage mode is neither stored (0) nor deflate (8)
  def extractor_from(from_io)
    from_io.seek(compressed_data_offset, IO::SEEK_SET)
    case storage_mode
    when 8 then InflatingReader.new(from_io, compressed_size)
    when 0 then StoredReader.new(from_io, compressed_size)
    else raise "Unsupported storage mode for reading (#{storage_mode})"
    end
  end
end
|
187
|
+
|
188
|
+
# Parse an IO handle to a ZIP archive into an array of ZipEntry objects.
# The archive is read "from the tail": the end-of-central-directory record is located
# first, then the central directory is read, and finally the start of the compressed
# data is resolved for every entry.
#
# @param io[#tell, #seek, #read, #size] an IO-ish object
# @return [Array<ZipEntry>] an array of entries within the ZIP being parsed
def read_zip_structure(io)
  zip_file_size = io.size
  eocd_offset = get_eocd_offset(io, zip_file_size)

  # Prefer the Zip64 record when its locator sits right in front of the EOCD record
  zip64_end_of_cdir_location = get_zip64_eocd_locator_offset(io, eocd_offset)
  num_files, cdir_location, cdir_size =
    if zip64_end_of_cdir_location
      num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
    else
      num_files_and_central_directory_offset(io, eocd_offset)
    end
  seek(io, cdir_location)

  # Read the entire central directory in one fell swoop
  central_directory_io = StringIO.new(read_n(io, cdir_size))

  entries = (1..num_files).map { read_cdir_entry(central_directory_io) }
  # `each` returns its receiver, so the entries array is the return value
  entries.each do |entry|
    entry.compressed_data_offset = find_compressed_data_start_offset(io, entry.local_file_header_offset)
  end
end
|
213
|
+
|
214
|
+
# Parse an IO handle to a ZIP archive into an array of entry objects.
# Convenience form that creates a reader instance and delegates to its
# `read_zip_structure` method.
#
# @param io[#tell, #seek, #read, #size] an IO-ish object
# @return [Array<ZipEntry>] an array of entries within the ZIP being parsed
def self.read_zip_structure(io)
  new.read_zip_structure(io)
end
|
221
|
+
|
222
|
+
private
|
223
|
+
|
224
|
+
# Skips over a 2-byte field in the IO
def skip_ahead_2(io)
  skip_ahead_n(io, 2)
end

# Skips over a 4-byte field in the IO
def skip_ahead_4(io)
  skip_ahead_n(io, 4)
end

# Skips over an 8-byte field in the IO
def skip_ahead_8(io)
  skip_ahead_n(io, 8)
end
|
235
|
+
|
236
|
+
# Positions the IO at an absolute offset and verifies that it actually got there.
#
# @param io[#seek, #tell] the IO to position
# @param absolute_pos[Integer] the offset from the start of the IO
# @return [nil]
# @raise [ReadError] if the IO reports a different position after seeking
def seek(io, absolute_pos)
  io.seek(absolute_pos, IO::SEEK_SET)
  raise ReadError, "Expected to seek to #{absolute_pos} but only got to #{io.tell}" unless absolute_pos == io.tell
  nil
end
|
241
|
+
|
242
|
+
# Reads 4 bytes from the IO and checks that they are the expected record signature.
#
# @param io[#read] the IO positioned at the signature
# @param signature_magic_number[Integer] the expected signature as a 32-bit integer
# @return [nil]
# @raise [InvalidStructure] if a different value was read
def assert_signature(io, signature_magic_number)
  readback = read_4b(io)
  return if readback == signature_magic_number

  expected = '0x0' + signature_magic_number.to_s(16)
  actual = '0x0' + readback.to_s(16)
  raise InvalidStructure, "Expected signature #{expected}, but read #{actual}"
end
|
251
|
+
|
252
|
+
# Moves the IO forward by `n` bytes and verifies that it moved by exactly that much.
#
# @param io[#seek, #tell] the IO to advance
# @param n[Integer] how many bytes to skip
# @return [nil]
# @raise [ReadError] if the position changed by a different amount
def skip_ahead_n(io, n)
  pos_before = io.tell
  io.seek(pos_before + n, IO::SEEK_SET)
  delta = io.tell - pos_before
  raise ReadError, "Expected to seek #{n} bytes ahead, but could only seek #{delta} bytes ahead" unless delta == n
  nil
end
|
260
|
+
|
261
|
+
# Reads exactly `n_bytes` from the IO.
#
# @param io[#read] the IO to read from
# @param n_bytes[Integer] how many bytes are required
# @return [String] the bytes read
# @raise [ReadError] if the IO is at its end or returns fewer bytes than requested
def read_n(io, n_bytes)
  data = io.read(n_bytes)
  raise ReadError, "Expected to read #{n_bytes} bytes, but the IO was at the end" if data.nil?
  raise ReadError, "Expected to read #{n_bytes} bytes, read #{data.bytesize}" unless data.bytesize == n_bytes
  data
end
|
267
|
+
|
268
|
+
# Reads a little-endian unsigned 16-bit integer from the IO
def read_2b(io)
  read_n(io, 2).unpack(C_v).first
end

# Reads a little-endian unsigned 32-bit integer from the IO
def read_4b(io)
  read_n(io, 4).unpack(C_V).first
end

# Reads a little-endian unsigned 64-bit integer from the IO
def read_8b(io)
  read_n(io, 8).unpack(C_Qe).first
end
|
279
|
+
|
280
|
+
# Works out where the compressed data of an entry begins, by walking over the
# local file header that precedes it.
#
# @param file_io[#seek, #tell, #read] the IO of the whole archive
# @param local_header_offset[Integer] where the local file header of the entry starts
# @return [Integer] the absolute offset of the first byte of entry data
# @raise [ReadError] if the header cannot be reached or read in full
# @raise [InvalidStructure] if there is no local file header signature at the offset
def find_compressed_data_start_offset(file_io, local_header_offset)
  seek(file_io, local_header_offset)

  # Reading in bulk is cheaper - grab the maximum length of the local header, including
  # any headroom. This is a soft read: near the end of the file it returns less.
  local_file_header_str_plus_headroom = file_io.read(MAX_LOCAL_HEADER_SIZE)
  if local_file_header_str_plus_headroom.nil?
    raise ReadError, "Expected a local file header at offset #{local_header_offset}, but the IO was at the end"
  end
  io = StringIO.new(local_file_header_str_plus_headroom)

  assert_signature(io, 0x04034b50)

  # The rest is unreliable, and we have that information from the central directory already.
  # So just skip over it to get at the offset where the compressed data begins
  skip_ahead_2(io) # Version needed to extract
  skip_ahead_2(io) # gp flags
  skip_ahead_2(io) # storage mode
  skip_ahead_2(io) # dos time
  skip_ahead_2(io) # dos date
  skip_ahead_4(io) # CRC32
  skip_ahead_4(io) # Comp size
  skip_ahead_4(io) # Uncomp size

  filename_size = read_2b(io)
  extra_size = read_2b(io)

  skip_ahead_n(io, filename_size)
  skip_ahead_n(io, extra_size)

  # The position inside the header buffer is relative to the start of the header
  local_header_offset + io.tell
end
|
310
|
+
|
311
|
+
|
312
|
+
# Reads one central directory file header from the IO and turns it into a ZipEntry.
#
# @param io[#read] an IO positioned at the signature of a central directory entry
# @return [ZipEntry] the entry, without `compressed_data_offset` set
# @raise [InvalidStructure] if the central directory entry signature is not found
# @raise [ReadError] if the record is shorter than its fields say it is
def read_cdir_entry(io)
  assert_signature(io, 0x02014b50)
  ZipEntry.new.tap do |e|
    e.made_by = read_2b(io)
    e.version_needed_to_extract = read_2b(io)
    e.gp_flags = read_2b(io)
    e.storage_mode = read_2b(io)
    e.dos_time = read_2b(io)
    e.dos_date = read_2b(io)
    e.crc32 = read_4b(io)
    e.compressed_size = read_4b(io)
    e.uncompressed_size = read_4b(io)
    filename_size = read_2b(io)
    extra_size = read_2b(io)
    comment_len = read_2b(io)
    e.disk_number_start = read_2b(io)
    e.internal_attrs = read_2b(io)
    e.external_attrs = read_4b(io)
    e.local_file_header_offset = read_4b(io)
    e.filename = read_n(io, filename_size)

    # Extra fields
    extras = read_n(io, extra_size)
    # Comment
    e.comment = read_n(io, comment_len)

    # Parse out the extra fields: a sequence of (id, size, contents) records
    extra_table = {}
    extras_buf = StringIO.new(extras)
    until extras_buf.eof?
      extra_id = read_2b(extras_buf)
      extra_field_size = read_2b(extras_buf)
      extra_table[extra_id] = read_n(extras_buf, extra_field_size)
    end

    # ...of which we really only need the Zip64 extra
    if (zip64_extra_contents = extra_table[1])
      zip64_extra = StringIO.new(zip64_extra_contents)
      # The order of the fields in the Zip64 extra is fixed, but a field is only
      # present when the matching 32-bit field in the header is saturated
      # (0xFFFFFFFF). Reading all of them unconditionally fails or picks up
      # the wrong values for archives where only some fields overflowed.
      e.uncompressed_size = read_8b(zip64_extra) if e.uncompressed_size == 0xFFFFFFFF
      e.compressed_size = read_8b(zip64_extra) if e.compressed_size == 0xFFFFFFFF
      e.local_file_header_offset = read_8b(zip64_extra) if e.local_file_header_offset == 0xFFFFFFFF
    end
  end
end
|
358
|
+
|
359
|
+
# Locates the end-of-central-directory (EOCD) record by scanning the tail of the archive.
#
# The tail may contain the EOCD signature more than once (inside file data, or inside
# the archive comment). Candidates are examined from the end backwards, and the one
# whose comment-length field accounts exactly for the bytes that follow the record is
# chosen. If no candidate validates that way (for example when junk was appended to
# the archive) the last occurrence of the signature is used.
#
# @param file_io[#seek, #read] the IO of the whole archive
# @param zip_file_size[Integer] the size of the archive in bytes
# @return [Integer] the absolute offset of the EOCD record signature
# @raise [RuntimeError] if the signature is not found at all
def get_eocd_offset(file_io, zip_file_size)
  # Start reading from the _comment_ of the zip file (from the very end).
  # The maximum size of the comment is 0xFFFF (what fits in 2 bytes)
  implied_position_of_eocd_record = [zip_file_size - MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE, 0].max

  # Use a soft seek (we might not be able to get as far behind in the IO as we want)
  # and a soft read (we might not be able to read as many bytes as we want).
  # A nil read (empty IO) is treated as an empty buffer.
  file_io.seek(implied_position_of_eocd_record, IO::SEEK_SET)
  tail = file_io.read(MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE).to_s.b

  eocd_sig = [0x06054b50].pack(C_V)
  last_idx_in_buf = tail.rindex(eocd_sig)
  raise "Could not find the EOCD signature in the buffer - maybe a malformed ZIP file" unless last_idx_in_buf

  fixed_record_size = 22 # everything up to and including the comment length
  comment_length_at = 20 # offset of the comment length within the record
  candidate = last_idx_in_buf
  while candidate
    comment_length_bytes = tail.byteslice(candidate + comment_length_at, 2)
    if comment_length_bytes && comment_length_bytes.bytesize == 2
      comment_length = comment_length_bytes.unpack(C_v).first
      # The record is genuine if its comment runs exactly up to the end of the archive
      if candidate + fixed_record_size + comment_length == tail.bytesize
        return implied_position_of_eocd_record + candidate
      end
    end
    break if candidate.zero?
    candidate = tail.rindex(eocd_sig, candidate - 1)
  end

  implied_position_of_eocd_record + last_idx_in_buf
end
|
378
|
+
|
379
|
+
# Find the Zip64 EOCD locator segment offset. Do this by seeking backwards from the
# EOCD record in the archive by fixed offsets, then read the locator to learn where
# the Zip64 end of central directory record is.
#
# @param file_io[#seek, #read] the IO of the whole archive
# @param eocd_offset[Integer] the absolute offset of the EOCD record
# @return [Integer, nil] the offset of the Zip64 EOCD record, or `nil` if there is no locator
# @raise [UnsupportedFeature] if the locator points at a disk other than the first
def get_zip64_eocd_locator_offset(file_io, eocd_offset)
  locator_size =
    4 + # The signature
    4 + # Which disk has the Zip64 end of central directory record
    8 + # Offset of the zip64 central directory record
    4   # Total number of disks
  zip64_eocd_loc_offset = eocd_offset - locator_size

  # If the offset is negative there is certainly no Zip64 EOCD locator here
  return unless zip64_eocd_loc_offset >= 0

  file_io.seek(zip64_eocd_loc_offset, IO::SEEK_SET)
  assert_signature(file_io, 0x07064b50)
  disk_num = read_4b(file_io) # number of the disk
  raise UnsupportedFeature, "The archive spans multiple disks" if disk_num != 0
  read_8b(file_io)
rescue ReadError
  # A missing signature (InvalidStructure is a ReadError) or a short read
  # means there is no locator, so this is not a Zip64 archive
  nil
end
|
399
|
+
|
400
|
+
# Reads the Zip64 end of central directory record and extracts the location
# of the central directory from it.
#
# @param io[#seek, #tell, #read] the IO of the whole archive
# @param zip64_end_of_cdir_location[Integer] the absolute offset of the Zip64 EOCD record
# @return [Array<Integer>] number of files, central directory offset, central directory size
# @raise [InvalidStructure] if the signature is wrong or the record is too small
# @raise [UnsupportedFeature] if the archive spans multiple disks
def num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
  seek(io, zip64_end_of_cdir_location)

  assert_signature(io, 0x06064b50)

  zip64_eocdr_size = read_8b(io)

  # Only the fixed fields are of interest. The declared size comes from the archive
  # and can be an arbitrary 64-bit number, so do not use it as the read length.
  fixed_fields_size =
    2 + # version made by
    2 + # version needed to extract
    4 + # number of this disk
    4 + # number of the disk with the EOCDR
    8 + # number of files on this disk
    8 + # files total in the central directory
    8 + # size of the central directory
    8   # where the central directory starts
  if zip64_eocdr_size < fixed_fields_size
    raise InvalidStructure,
          "Zip64 EOCD record is #{zip64_eocdr_size} bytes, need at least #{fixed_fields_size}"
  end

  zip64_eocdr = StringIO.new(read_n(io, fixed_fields_size)) # Reading in bulk is cheaper
  skip_ahead_2(zip64_eocdr) # version made by
  skip_ahead_2(zip64_eocdr) # version needed to extract

  disk_n = read_4b(zip64_eocdr) # number of this disk
  disk_n_with_eocdr = read_4b(zip64_eocdr) # number of the disk with the EOCDR
  raise UnsupportedFeature, "The archive spans multiple disks" if disk_n != disk_n_with_eocdr

  num_files_this_disk = read_8b(zip64_eocdr) # number of files on this disk
  num_files_total = read_8b(zip64_eocdr) # files total in the central directory

  raise UnsupportedFeature, "The archive spans multiple disks" if num_files_this_disk != num_files_total

  central_dir_size = read_8b(zip64_eocdr) # Size of the central directory
  central_dir_offset = read_8b(zip64_eocdr) # Where the central directory starts

  [num_files_total, central_dir_offset, central_dir_size]
end
|
425
|
+
|
426
|
+
# Pack/unpack directives for little-endian unsigned 32, 16 and 64 bit integers
C_V = 'V'.freeze
C_v = 'v'.freeze
C_Qe = 'Q<'.freeze

# To prevent too many tiny reads, read the maximum possible size of end of central directory record
# upfront (all the fixed fields + at most 0xFFFF bytes of the archive comment)
MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE =
  4 + # End-of-central-directory signature
  2 + # Number of this disk
  2 + # Number of disk with the start of cdir
  2 + # Number of files in the cdir of this disk
  2 + # Number of files in the cdir
  4 + # Size of the central directory
  4 + # Offset of the start of central directory
  2 + # The comment size
  0xFFFF # Maximum comment size

# To prevent too many tiny reads, read the maximum possible size of the local file header upfront.
# The maximum size is all the usual items, plus the maximum size
# of the filename (0xFFFF bytes) and the maximum size of the extras (0xFFFF bytes)
MAX_LOCAL_HEADER_SIZE =
  4 + # signature
  2 + # Version needed to extract
  2 + # gp flags
  2 + # storage mode
  2 + # dos time
  2 + # dos date
  4 + # CRC32
  4 + # Comp size
  4 + # Uncomp size
  2 + # Filename size
  2 + # Extra fields size
  0xFFFF + # Maximum filename size
  0xFFFF # Maximum extra fields size

# The part of the EOCD record that gets parsed: everything up to,
# but not including, the comment size and the comment
SIZE_OF_USABLE_EOCD_RECORD =
  4 + # Signature
  2 + # Number of this disk
  2 + # Number of the disk with the EOCD record
  2 + # Number of entries in the central directory of this disk
  2 + # Number of entries in the central directory total
  4 + # Size of the central directory
  4 # Start of the central directory offset
|
472
|
+
|
473
|
+
# Reads the (non-Zip64) end of central directory record and extracts the location
# of the central directory from it.
#
# @param file_io[#seek, #tell, #read] the IO of the whole archive
# @param eocd_offset[Integer] the absolute offset of the EOCD record
# @return [Array<Integer>] number of files, central directory offset, central directory size
# @raise [InvalidStructure] if there is no EOCD signature at the offset
# @raise [ReadError] if the record cannot be reached or read in full
def num_files_and_central_directory_offset(file_io, eocd_offset)
  seek(file_io, eocd_offset)

  # Pull the fixed part of the record in with one read and parse it from memory
  io = StringIO.new(read_n(file_io, SIZE_OF_USABLE_EOCD_RECORD))

  assert_signature(io, 0x06054b50)

  skip_ahead_2(io) # number_of_this_disk
  skip_ahead_2(io) # number of the disk with the EOCD record
  skip_ahead_2(io) # number of entries in the central directory of this disk
  num_files = read_2b(io) # number of entries in the central directory total
  cdir_size = read_4b(io) # size of the central directory
  cdir_offset = read_4b(io) # start of central directory offset
  [num_files, cdir_offset, cdir_size]
end
|
488
|
+
|
489
|
+
private_constant :C_V, :C_v, :C_Qe, :MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE,
|
490
|
+
:MAX_LOCAL_HEADER_SIZE, :SIZE_OF_USABLE_EOCD_RECORD
|
491
|
+
end
|
@@ -2,6 +2,11 @@
|
|
2
2
|
# write operations, but want to discard the data (like when
|
3
3
|
# estimating the size of a ZIP)
|
4
4
|
module ZipTricks::NullWriter
  # Discards the data given.
  #
  # @param data[String] the data to write
  # @return [self]
  def self.<<(data)
    self
  end

  # Discards the data given, and reports its size as written.
  #
  # @param data[String] the data to write
  # @return [Integer] the amount of data that was supposed to be written
  def self.write(data)
    data.bytesize
  end
end
|
data/lib/zip_tricks/rack_body.rb
CHANGED
@@ -9,13 +9,13 @@ class ZipTricks::RackBody
|
|
9
9
|
# The archive will be automatically closed at the end of the block.
|
10
10
|
#
|
11
11
|
# # Precompute the Content-Length ahead of time
|
12
|
-
# content_length = ZipTricks::
|
13
|
-
# estimator.add_stored_entry('large.tif', size
|
12
|
+
# content_length = ZipTricks::SizeEstimator.estimate do | estimator |
|
13
|
+
# estimator.add_stored_entry(filename: 'large.tif', size: 1289894)
|
14
14
|
# end
|
15
15
|
#
|
16
16
|
# # Prepare the response body. The block will only be called when the response starts to be written.
|
17
17
|
# body = ZipTricks::RackBody.new do | streamer |
|
18
|
-
# streamer.add_stored_entry('large.tif', size
|
18
|
+
# streamer.add_stored_entry(filename: 'large.tif', size: 1289894, crc32: 198210)
|
19
19
|
# streamer << large_file.read(1024*1024) until large_file.eof?
|
20
20
|
# ...
|
21
21
|
# end
|
data/lib/zip_tricks/remote_io.rb
CHANGED
@@ -1,9 +1,12 @@
|
|
1
1
|
# An object that fakes just-enough of an IO to be dangerous
|
2
|
-
# - or, more precisely, to be useful as a source for the
|
3
|
-
# central directory parser
|
2
|
+
# - or, more precisely, to be useful as a source for the FileReader
|
3
|
+
# central directory parser. Effectively we substitute an IO object
|
4
|
+
# for an object that fetches parts of the remote file over HTTP using `Range:`
|
5
|
+
# headers. The `RemoteIO` acts as an adapter between an object that performs the
|
6
|
+
# actual fetches over HTTP and an object that expects a handful of IO methods to be
|
7
|
+
# available.
|
4
8
|
class ZipTricks::RemoteIO
|
5
|
-
|
6
|
-
# @param fetcher[#request_object_size, #request_range] an object that can fetch
|
9
|
+
# @param fetcher[#request_object_size, #request_range] an object that performs fetches
|
7
10
|
def initialize(fetcher = :NOT_SET)
|
8
11
|
@pos = 0
|
9
12
|
@fetcher = fetcher
|
@@ -12,21 +15,29 @@ class ZipTricks::RemoteIO
|
|
12
15
|
|
13
16
|
# Emulates IO#seek. Only absolute positioning (`IO::SEEK_SET`) is supported.
# The size of the remote resource is requested once and memoized; the offset
# is passed through `clamp` together with 0 and that size before being stored.
#
# @param offset[Integer] the absolute offset to move to
# @param mode[Integer] the seek mode, must be `IO::SEEK_SET`
# @return [Integer] always 0
# @raise [RuntimeError] for any seek mode other than `IO::SEEK_SET`
def seek(offset, mode = IO::SEEK_SET)
  raise "Unsupported seek mode #{mode}" unless mode == IO::SEEK_SET
  @remote_size ||= request_object_size
  @pos = clamp(0, offset, @remote_size)
  0 # always return 0!
end
|
27
|
-
|
28
|
-
# Emulates IO#
|
29
|
-
|
23
|
+
|
24
|
+
# Emulates IO#size. The size is requested on first use and memoized afterwards.
#
# @return [Integer] the size of the remote resource
def size
  @remote_size ||= request_object_size
end
|
30
|
+
|
31
|
+
# Emulates IO#read, but requires the number of bytes to read
|
32
|
+
# The method will raise if the number of bytes read from remote does
|
33
|
+
# not match the number requested. The read will be limited to the
|
34
|
+
# size of the remote resource relative to the current offset in the IO,
|
35
|
+
# so if you are at offset 0 in the IO of size 10, doing a `read(20)`
|
36
|
+
# will only return you 10 bytes of result, and not raise any exceptions.
|
37
|
+
#
|
38
|
+
# @param n_bytes[Fixnum, nil] how many bytes to read, or `nil` to read all the way to the end
|
39
|
+
# @return [String] the read bytes
|
40
|
+
def read(n_bytes=nil)
|
30
41
|
@remote_size ||= request_object_size
|
31
42
|
|
32
43
|
# If the resource is empty there is nothing to read
|
@@ -47,11 +58,10 @@ class ZipTricks::RemoteIO
|
|
47
58
|
end
|
48
59
|
end
|
49
60
|
|
50
|
-
# Returns the current pointer position within the IO
|
51
|
-
# Not used by RubyZip but used in tests of our own
|
61
|
+
# Returns the current pointer position within the IO
#
# @return [Integer]
def tell
  @pos
end
|
57
67
|
|
@@ -2,6 +2,9 @@
|
|
2
2
|
# downloading the entire file. The central directory provides the
|
3
3
|
# offsets at which the actual file contents is located. You can then
|
4
4
|
# use the `Range:` HTTP headers to download those entries separately.
|
5
|
+
#
|
6
|
+
# Please read the security warning in `FileReader` _VERY CAREFULLY_
|
7
|
+
# before you use this module.
|
5
8
|
class ZipTricks::RemoteUncap
|
6
9
|
|
7
10
|
# Represents a file embedded within a remote ZIP archive
|
@@ -37,17 +40,14 @@ class ZipTricks::RemoteUncap
|
|
37
40
|
# Reads the structure of the ZIP at the given URI through a RemoteIO and
# describes each file in it as a RemoteZipEntry.
#
# @param uri the location of the archive, passed to `new` as-is
# @return [Array<RemoteZipEntry>] one entry per file in the archive
def self.files_within_zip_at(uri)
  fetcher = new(uri)
  fake_io = ZipTricks::RemoteIO.new(fetcher)
  # The reader constant is looked up by name at call time
  entries = ZipTricks.const_get(:FileReader).read_zip_structure(fake_io)
  entries.map do |remote_entry|
    RemoteZipEntry.new do |entry|
      entry.name = remote_entry.filename
      entry.starts_at_offset = remote_entry.compressed_data_offset
      entry.size_uncompressed = remote_entry.uncompressed_size
      entry.size_compressed = remote_entry.compressed_size
      entry.compression_method = remote_entry.storage_mode
    end
  end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# Helps to estimate archive sizes
class ZipTricks::SizeEstimator
  require_relative 'streamer'

  # Used to mark a couple of methods public
  class DetailStreamer < ::ZipTricks::Streamer
    public :add_file_and_write_local_header, :write_data_descriptor_for_last_entry
  end
  private_constant :DetailStreamer

  # Creates a new estimator with a Streamer object. Normally you should use
  # `estimate` instead and not use this method directly.
  #
  # @param streamer the streamer that the fake entries get written to
  def initialize(streamer)
    @streamer = streamer
  end
  private :initialize

  # Performs the estimate using fake archiving. It needs to know the sizes of the
  # entries upfront. Usage:
  #
  #     expected_zip_size = SizeEstimator.estimate do | estimator |
  #       estimator.add_stored_entry(filename: "file.doc", size: 898291)
  #       estimator.add_compressed_entry(filename: "family.tif", uncompressed_size: 89281911, compressed_size: 121908)
  #     end
  #
  # @return [Integer] the size of the resulting archive, in bytes
  # @yield [SizeEstimator] the estimator
  def self.estimate
    # Everything gets "written" into a writer that discards the data,
    # wrapped in an object that keeps track of the output position
    output_io = ZipTricks::WriteAndTell.new(ZipTricks::NullWriter)
    DetailStreamer.open(output_io) { |zip| yield(new(zip)) }
    output_io.tell
  end

  # Add a fake entry to the archive, to see how big it is going to be in the end.
  #
  # @param filename [String] the name of the file (filenames are variable-width in the ZIP)
  # @param size [Integer] size of the uncompressed entry
  # @param use_data_descriptor[Boolean] whether the entry uses a postfix data descriptor to specify size
  # @return self
  def add_stored_entry(filename:, size:, use_data_descriptor: false)
    udd = !!use_data_descriptor
    @streamer.add_file_and_write_local_header(filename: filename, crc32: 0, storage_mode: 0,
      compressed_size: size, uncompressed_size: size, use_data_descriptor: udd)
    @streamer.simulate_write(size)
    @streamer.write_data_descriptor_for_last_entry if udd
    self
  end

  # Add a fake entry to the archive, to see how big it is going to be in the end.
  #
  # @param filename [String] the name of the file (filenames are variable-width in the ZIP)
  # @param uncompressed_size [Integer] size of the uncompressed entry
  # @param compressed_size [Integer] size of the compressed entry
  # @param use_data_descriptor[Boolean] whether the entry uses a postfix data descriptor to specify size
  # @return self
  def add_compressed_entry(filename:, uncompressed_size:, compressed_size:, use_data_descriptor: false)
    udd = !!use_data_descriptor
    @streamer.add_file_and_write_local_header(filename: filename, crc32: 0, storage_mode: 8,
      compressed_size: compressed_size, uncompressed_size: uncompressed_size, use_data_descriptor: udd)
    @streamer.simulate_write(compressed_size)
    @streamer.write_data_descriptor_for_last_entry if udd
    self
  end
end
|