zip_tricks 2.8.1 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +3 -3
  3. data/IMPLEMENTATION_DETAILS.md +2 -10
  4. data/README.md +62 -59
  5. data/examples/archive_size_estimate.rb +4 -4
  6. data/examples/rack_application.rb +3 -5
  7. data/lib/zip_tricks/block_deflate.rb +21 -0
  8. data/lib/zip_tricks/file_reader.rb +491 -0
  9. data/lib/zip_tricks/null_writer.rb +7 -2
  10. data/lib/zip_tricks/rack_body.rb +3 -3
  11. data/lib/zip_tricks/remote_io.rb +30 -20
  12. data/lib/zip_tricks/remote_uncap.rb +10 -10
  13. data/lib/zip_tricks/size_estimator.rb +64 -0
  14. data/lib/zip_tricks/stream_crc32.rb +2 -2
  15. data/lib/zip_tricks/streamer/deflated_writer.rb +26 -0
  16. data/lib/zip_tricks/streamer/entry.rb +21 -0
  17. data/lib/zip_tricks/streamer/stored_writer.rb +25 -0
  18. data/lib/zip_tricks/streamer/writable.rb +20 -0
  19. data/lib/zip_tricks/streamer.rb +172 -66
  20. data/lib/zip_tricks/zip_writer.rb +346 -0
  21. data/lib/zip_tricks.rb +1 -4
  22. data/spec/spec_helper.rb +1 -38
  23. data/spec/zip_tricks/file_reader_spec.rb +47 -0
  24. data/spec/zip_tricks/rack_body_spec.rb +2 -2
  25. data/spec/zip_tricks/remote_io_spec.rb +8 -20
  26. data/spec/zip_tricks/remote_uncap_spec.rb +4 -4
  27. data/spec/zip_tricks/size_estimator_spec.rb +31 -0
  28. data/spec/zip_tricks/streamer_spec.rb +59 -36
  29. data/spec/zip_tricks/zip_writer_spec.rb +408 -0
  30. data/zip_tricks.gemspec +20 -14
  31. metadata +33 -16
  32. data/lib/zip_tricks/manifest.rb +0 -85
  33. data/lib/zip_tricks/microzip.rb +0 -339
  34. data/lib/zip_tricks/stored_size_estimator.rb +0 -44
  35. data/spec/zip_tricks/manifest_spec.rb +0 -60
  36. data/spec/zip_tricks/microzip_interop_spec.rb +0 -48
  37. data/spec/zip_tricks/microzip_spec.rb +0 -546
  38. data/spec/zip_tricks/stored_size_estimator_spec.rb +0 -22
@@ -0,0 +1,491 @@
1
+ require 'stringio'
2
+
3
+ # A very barebones ZIP file reader. Is made for maximum interoperability, but at the same
4
+ # time we attempt to keep it somewhat concise.
5
+ #
6
+ # ## REALLY CRAZY IMPORTANT STUFF: SECURITY IMPLICATIONS
7
+ #
8
+ # Please **BEWARE** - using this is a security risk if you are reading files that have been
9
+ # supplied by users. This implementation has _not_ been formally verified for correctness. As
10
+ # ZIP files contain relative offsets in lots of places it might be possible for a maliciously
11
+ # crafted ZIP file to put the decode procedure in an endless loop, make it attempt huge reads
12
+ # from the input file and so on. Additionally, the reader module for deflated data has
13
+ # no support for ZIP bomb protection. So either limit the `FileReader` usage to the files you
14
+ # trust, or triple-check all the inputs upfront. Patches to make this reader more secure
15
+ # are welcome of course.
16
+ #
17
+ # ## Usage
18
+ #
19
+ # File.open('zipfile.zip', 'rb') do |f|
20
+ # entries = FileReader.read_zip_structure(f)
21
+ # entries.each do |e|
22
+ # File.open(e.filename, 'wb') do |extracted_file|
23
+ # ex = e.extractor_from(f)
24
+ # extracted_file << ex.extract(1024 * 1024) until ex.eof?
25
+ # end
26
+ # end
27
+ # end
28
+ #
29
+ # ## Supported features
30
+ #
31
+ # * Deflate and stored storage modes
32
+ # * Zip64 (extra fields and offsets)
33
+ # * Data descriptors
34
+ #
35
+ # ## Unsupported features
36
+ #
37
+ # * Archives split over multiple disks/files
38
+ # * Any ZIP encryption
39
+ # * EFS language flag and InfoZIP filename extra field
40
+ # * CRC32 checksums are _not_ verified
41
+ #
42
+ # ## Mode of operation
43
+ #
44
+ # Basically, `FileReader` _ignores_ the data in local file headers (as it is often unreliable).
45
+ # It reads the ZIP file "from the tail", finds the end-of-central-directory signatures, then
46
+ # reads the central directory entries, reconstitutes the entries with their filenames, attributes
47
+ # and so on, and sets these entries up with the absolute _offsets_ into the source file/IO object.
48
+ # These offsets can then be used to extract the actual compressed data of the files and to expand it.
49
+ class ZipTricks::FileReader
50
+ ReadError = Class.new(StandardError)
51
+ UnsupportedFeature = Class.new(StandardError)
52
+ InvalidStructure = Class.new(ReadError)
53
+
54
+ class InflatingReader
55
+ def initialize(from_io, compressed_data_size)
56
+ @io = from_io
57
+ @compressed_data_size = compressed_data_size
58
+ @already_read = 0
59
+ @zlib_inflater = ::Zlib::Inflate.new(-Zlib::MAX_WBITS)
60
+ end
61
+
62
+ def extract(n_bytes=nil)
63
+ n_bytes ||= (@compressed_data_size - @already_read)
64
+
65
+ return if eof?
66
+
67
+ available = @compressed_data_size - @already_read
68
+
69
+ return if available.zero?
70
+
71
+ n_bytes = available if n_bytes > available
72
+
73
+ return '' if n_bytes.zero?
74
+
75
+ compressed_chunk = @io.read(n_bytes)
76
+ @already_read += compressed_chunk.bytesize
77
+ @zlib_inflater.inflate(compressed_chunk)
78
+ end
79
+
80
+ def eof?
81
+ @zlib_inflater.finished?
82
+ end
83
+ end
84
+
85
+ class StoredReader
86
+ def initialize(from_io, compressed_data_size)
87
+ @io = from_io
88
+ @compressed_data_size = compressed_data_size
89
+ @already_read = 0
90
+ end
91
+
92
+ def extract(n_bytes=nil)
93
+ n_bytes ||= (@compressed_data_size - @already_read)
94
+
95
+ return if eof?
96
+
97
+ available = @compressed_data_size - @already_read
98
+
99
+ return if available.zero?
100
+
101
+ n_bytes = available if n_bytes > available
102
+
103
+ return '' if n_bytes.zero?
104
+
105
+ compressed_chunk = @io.read(n_bytes)
106
+ @already_read += compressed_chunk.bytesize
107
+ compressed_chunk
108
+ end
109
+
110
+ def eof?
111
+ @already_read >= @compressed_data_size
112
+ end
113
+ end
114
+
115
+ private_constant :StoredReader, :InflatingReader
116
+
117
+ # Represents a file within the ZIP archive being read
118
+ class ZipEntry
119
+ # @return [Fixnum] bit-packed version signature of the program that made the archive
120
+ attr_accessor :made_by
121
+
122
+ # @return [Fixnum] ZIP version support needed to extract this file
123
+ attr_accessor :version_needed_to_extract
124
+
125
+ # @return [Fixnum] bit-packed general purpose flags
126
+ attr_accessor :gp_flags
127
+
128
+ # @return [Fixnum] Storage mode (0 for stored, 8 for deflate)
129
+ attr_accessor :storage_mode
130
+
131
+ # @return [Fixnum] the bit-packed DOS time
132
+ attr_accessor :dos_time
133
+
134
+ # @return [Fixnum] the bit-packed DOS date
135
+ attr_accessor :dos_date
136
+
137
+ # @return [Fixnum] the CRC32 checksum of this file
138
+ attr_accessor :crc32
139
+
140
+ # @return [Fixnum] size of compressed file data in the ZIP
141
+ attr_accessor :compressed_size
142
+
143
+ # @return [Fixnum] size of the file once uncompressed
144
+ attr_accessor :uncompressed_size
145
+
146
+ # @return [String] the filename
147
+ attr_accessor :filename
148
+
149
+ # @return [Fixnum] disk number where this file starts
150
+ attr_accessor :disk_number_start
151
+
152
+ # @return [Fixnum] internal attributes of the file
153
+ attr_accessor :internal_attrs
154
+
155
+ # @return [Fixnum] external attributes of the file
156
+ attr_accessor :external_attrs
157
+
158
+ # @return [Fixnum] at what offset the local file header starts
159
+ # in your original IO object
160
+ attr_accessor :local_file_header_offset
161
+
162
+ # @return [String] the file comment
163
+ attr_accessor :comment
164
+
165
+ # @return [Fixnum] at what offset you should start reading
166
+ # for the compressed data in your original IO object
167
+ attr_accessor :compressed_data_offset
168
+
169
+ # Returns a reader for the actual compressed data of the entry.
170
+ #
171
+ # reader = entry.reader(source_file)
172
+ # outfile << reader.extract(512 * 1024) until reader.eof?
173
+ #
174
+ # @return [#extract(n_bytes), #eof?] the reader for the data
175
+ def extractor_from(from_io)
176
+ from_io.seek(compressed_data_offset, IO::SEEK_SET)
177
+ case storage_mode
178
+ when 8
179
+ InflatingReader.new(from_io, compressed_size)
180
+ when 0
181
+ StoredReader.new(from_io, compressed_size)
182
+ else
183
+ raise "Unsupported storage mode for reading (#{storage_mode})"
184
+ end
185
+ end
186
+ end
187
+
188
+ # Parse an IO handle to a ZIP archive into an array of Entry objects.
189
+ #
190
+ # @param io[#tell, #seek, #read, #size] an IO-ish object
191
+ # @return [Array<Entry>] an array of entries within the ZIP being parsed
192
+ def read_zip_structure(io)
193
+ zip_file_size = io.size
194
+ eocd_offset = get_eocd_offset(io, zip_file_size)
195
+
196
+ zip64_end_of_cdir_location = get_zip64_eocd_locator_offset(io, eocd_offset)
197
+ num_files, cdir_location, cdir_size = if zip64_end_of_cdir_location
198
+ num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
199
+ else
200
+ num_files_and_central_directory_offset(io, eocd_offset)
201
+ end
202
+ seek(io, cdir_location)
203
+
204
+ # Read the entire central directory in one fell swoop
205
+ central_directory_str = read_n(io, cdir_size)
206
+ central_directory_io = StringIO.new(central_directory_str)
207
+
208
+ entries = (1..num_files).map { read_cdir_entry(central_directory_io) }
209
+ entries.each do |entry|
210
+ entry.compressed_data_offset = find_compressed_data_start_offset(io, entry.local_file_header_offset)
211
+ end
212
+ end
213
+
214
+ # Parse an IO handle to a ZIP archive into an array of Entry objects.
215
+ #
216
+ # @param io[#tell, #seek, #read, #size] an IO-ish object
217
+ # @return [Array<Entry>] an array of entries within the ZIP being parsed
218
+ def self.read_zip_structure(io)
219
+ new.read_zip_structure(io)
220
+ end
221
+
222
+ private
223
+
224
+ def skip_ahead_2(io)
225
+ skip_ahead_n(io, 2)
226
+ end
227
+
228
+ def skip_ahead_4(io)
229
+ skip_ahead_n(io, 4)
230
+ end
231
+
232
+ def skip_ahead_8(io)
233
+ skip_ahead_n(io, 8)
234
+ end
235
+
236
+ def seek(io, absolute_pos)
237
+ io.seek(absolute_pos, IO::SEEK_SET)
238
+ raise ReadError, "Expected to seek to #{absolute_pos} but only got to #{io.tell}" unless absolute_pos == io.tell
239
+ nil
240
+ end
241
+
242
+ def assert_signature(io, signature_magic_number)
243
+ packed = [signature_magic_number].pack(C_V)
244
+ readback = read_4b(io)
245
+ if readback != signature_magic_number
246
+ expected = '0x0' + signature_magic_number.to_s(16)
247
+ actual = '0x0' + readback.to_s(16)
248
+ raise InvalidStructure, "Expected signature #{expected}, but read #{actual}"
249
+ end
250
+ end
251
+
252
+ def skip_ahead_n(io, n)
253
+ pos_before = io.tell
254
+ io.seek(io.tell + n, IO::SEEK_SET)
255
+ pos_after = io.tell
256
+ delta = pos_after - pos_before
257
+ raise ReadError, "Expected to seek #{n} bytes ahead, but could only seek #{delta} bytes ahead" unless delta == n
258
+ nil
259
+ end
260
+
261
+ def read_n(io, n_bytes)
262
+ io.read(n_bytes).tap {|d|
263
+ raise ReadError, "Expected to read #{n_bytes} bytes, but the IO was at the end" if d.nil?
264
+ raise ReadError, "Expected to read #{n_bytes} bytes, read #{d.bytesize}" unless d.bytesize == n_bytes
265
+ }
266
+ end
267
+
268
+ def read_2b(io)
269
+ read_n(io, 2).unpack(C_v).shift
270
+ end
271
+
272
+ def read_4b(io)
273
+ read_n(io, 4).unpack(C_V).shift
274
+ end
275
+
276
+ def read_8b(io)
277
+ read_n(io, 8).unpack(C_Qe).shift
278
+ end
279
+
280
+ def find_compressed_data_start_offset(file_io, local_header_offset)
281
+ seek(file_io, local_header_offset)
282
+
283
+ # Reading in bulk is cheaper - grab the maximum length of the local header, including
284
+ # any headroom
285
+ local_file_header_str_plus_headroom = file_io.read(MAX_LOCAL_HEADER_SIZE)
286
+ io = StringIO.new(local_file_header_str_plus_headroom)
287
+
288
+ assert_signature(io, 0x04034b50)
289
+
290
+ # The rest is unreliable, and we have that information from the central directory already.
291
+ # So just skip over it to get at the offset where the compressed data begins
292
+ skip_ahead_2(io) # Version needed to extract
293
+ skip_ahead_2(io) # gp flags
294
+ skip_ahead_2(io) # storage mode
295
+ skip_ahead_2(io) # dos time
296
+ skip_ahead_2(io) # dos date
297
+ skip_ahead_4(io) # CRC32
298
+
299
+ skip_ahead_4(io) # Comp size
300
+ skip_ahead_4(io) # Uncomp size
301
+
302
+ filename_size = read_2b(io)
303
+ extra_size = read_2b(io)
304
+
305
+ skip_ahead_n(io, filename_size)
306
+ skip_ahead_n(io, extra_size)
307
+
308
+ local_header_offset + io.tell
309
+ end
310
+
311
+
312
+ def read_cdir_entry(io)
313
+ expected_at = io.tell
314
+ assert_signature(io, 0x02014b50)
315
+ ZipEntry.new.tap do |e|
316
+ e.made_by = read_2b(io)
317
+ e.version_needed_to_extract = read_2b(io)
318
+ e.gp_flags = read_2b(io)
319
+ e.storage_mode = read_2b(io)
320
+ e.dos_time = read_2b(io)
321
+ e.dos_date = read_2b(io)
322
+ e.crc32 = read_4b(io)
323
+ e.compressed_size = read_4b(io)
324
+ e.uncompressed_size = read_4b(io)
325
+ filename_size = read_2b(io)
326
+ extra_size = read_2b(io)
327
+ comment_len = read_2b(io)
328
+ e.disk_number_start = read_2b(io)
329
+ e.internal_attrs = read_2b(io)
330
+ e.external_attrs = read_4b(io)
331
+ e.local_file_header_offset = read_4b(io)
332
+ e.filename = read_n(io, filename_size)
333
+
334
+ # Extra fields
335
+ extras = read_n(io, extra_size)
336
+ # Comment
337
+ e.comment = read_n(io, comment_len)
338
+
339
+ # Parse out the extra fields
340
+ extra_table = {}
341
+ extras_buf = StringIO.new(extras)
342
+ until extras_buf.eof? do
343
+ extra_id = read_2b(extras_buf)
344
+ extra_size = read_2b(extras_buf)
345
+ extra_contents = read_n(extras_buf, extra_size)
346
+ extra_table[extra_id] = extra_contents
347
+ end
348
+
349
+ # ...of which we really only need the Zip64 extra
350
+ if zip64_extra_contents = extra_table[1] # Zip64 extra
351
+ zip64_extra = StringIO.new(zip64_extra_contents)
352
+ e.uncompressed_size = read_8b(zip64_extra)
353
+ e.compressed_size = read_8b(zip64_extra)
354
+ e.local_file_header_offset = read_8b(zip64_extra)
355
+ end
356
+ end
357
+ end
358
+
359
+ def get_eocd_offset(file_io, zip_file_size)
360
+ # Start reading from the _comment_ of the zip file (from the very end).
361
+ # The maximum size of the comment is 0xFFFF (what fits in 2 bytes)
362
+ implied_position_of_eocd_record = zip_file_size - MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE
363
+ implied_position_of_eocd_record = 0 if implied_position_of_eocd_record < 0
364
+
365
+ # Use a soft seek (we might not be able to get as far behind in the IO as we want)
366
+ # and a soft read (we might not be able to read as many bytes as we want)
367
+ file_io.seek(implied_position_of_eocd_record, IO::SEEK_SET)
368
+ str_containing_eocd_record = file_io.read(MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE)
369
+
370
+ # TODO: what to do if multiple occurrences of the signature are found, somehow?
371
+ eocd_sig = [0x06054b50].pack(C_V)
372
+ eocd_idx_in_buf = str_containing_eocd_record.index(eocd_sig)
373
+
374
+ raise "Could not find the EOCD signature in the buffer - maybe a malformed ZIP file" unless eocd_idx_in_buf
375
+
376
+ implied_position_of_eocd_record + eocd_idx_in_buf
377
+ end
378
+
379
+ # Find the Zip64 EOCD locator segment offset. Do this by seeking backwards from the
380
+ # EOCD record in the archive by fixed offsets
381
+ def get_zip64_eocd_locator_offset(file_io, eocd_offset)
382
+ zip64_eocd_loc_offset = eocd_offset
383
+ zip64_eocd_loc_offset -= 4 # The signature
384
+ zip64_eocd_loc_offset -= 4 # Which disk has the Zip64 end of central directory record
385
+ zip64_eocd_loc_offset -= 8 # Offset of the zip64 central directory record
386
+ zip64_eocd_loc_offset -= 4 # Total number of disks
387
+
388
+ # If the offset is negative there is certainly no Zip64 EOCD locator here
389
+ return unless zip64_eocd_loc_offset >= 0
390
+
391
+ file_io.seek(zip64_eocd_loc_offset, IO::SEEK_SET)
392
+ assert_signature(file_io, 0x07064b50)
393
+ disk_num = read_4b(file_io) # number of the disk
394
+ raise UnsupportedFeature, "The archive spans multiple disks" if disk_num != 0
395
+ read_8b(file_io)
396
+ rescue ReadError
397
+ nil
398
+ end
399
+
400
+ def num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
401
+ seek(io, zip64_end_of_cdir_location)
402
+
403
+ assert_signature(io, 0x06064b50)
404
+
405
+ zip64_eocdr_size = read_8b(io)
406
+ zip64_eocdr = read_n(io, zip64_eocdr_size) # Reading in bulk is cheaper
407
+ zip64_eocdr = StringIO.new(zip64_eocdr)
408
+ skip_ahead_2(zip64_eocdr) # version made by
409
+ skip_ahead_2(zip64_eocdr) # version needed to extract
410
+
411
+ disk_n = read_4b(zip64_eocdr) # number of this disk
412
+ disk_n_with_eocdr = read_4b(zip64_eocdr) # number of the disk with the EOCDR
413
+ raise UnsupportedFeature, "The archive spans multiple disks" if disk_n != disk_n_with_eocdr
414
+
415
+ num_files_this_disk = read_8b(zip64_eocdr) # number of files on this disk
416
+ num_files_total = read_8b(zip64_eocdr) # files total in the central directory
417
+
418
+ raise UnsupportedFeature, "The archive spans multiple disks" if num_files_this_disk != num_files_total
419
+
420
+ central_dir_size = read_8b(zip64_eocdr) # Size of the central directory
421
+ central_dir_offset = read_8b(zip64_eocdr) # Where the central directory starts
422
+
423
+ [num_files_total, central_dir_offset, central_dir_size]
424
+ end
425
+
426
+ C_V = 'V'.freeze
427
+ C_v = 'v'.freeze
428
+ C_Qe = 'Q<'.freeze
429
+
430
+ # To prevent too many tiny reads, read the maximum possible size of end of central directory record
431
+ # upfront (all the fixed fields + at most 0xFFFF bytes of the archive comment)
432
+ MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE = begin
433
+ 4 + # Offset of the start of central directory
434
+ 4 + # Size of the central directory
435
+ 2 + # Number of files in the cdir
436
+ 4 + # End-of-central-directory signature
437
+ 2 + # Number of this disk
438
+ 2 + # Number of disk with the start of cdir
439
+ 2 + # Number of files in the cdir of this disk
440
+ 2 + # The comment size
441
+ 0xFFFF # Maximum comment size
442
+ end
443
+
444
+ # To prevent too many tiny reads, read the maximum possible size of the local file header upfront.
445
+ # The maximum size is all the usual items, plus the maximum size
446
+ # of the filename (0xFFFF bytes) and the maximum size of the extras (0xFFFF bytes)
447
+ MAX_LOCAL_HEADER_SIZE = begin
448
+ 4 + # signature
449
+ 2 + # Version needed to extract
450
+ 2 + # gp flags
451
+ 2 + # storage mode
452
+ 2 + # dos time
453
+ 2 + # dos date
454
+ 4 + # CRC32
455
+ 4 + # Comp size
456
+ 4 + # Uncomp size
457
+ 2 + # Filename size
458
+ 2 + # Extra fields size
459
+ 0xFFFF + # Maximum filename size
460
+ 0xFFFF # Maximum extra fields size
461
+ end
462
+
463
+ SIZE_OF_USABLE_EOCD_RECORD = begin
464
+ 4 + # Signature
465
+ 2 + # Number of this disk
466
+ 2 + # Number of the disk with the EOCD record
467
+ 2 + # Number of entries in the central directory of this disk
468
+ 2 + # Number of entries in the central directory total
469
+ 4 + # Size of the central directory
470
+ 4 # Start of the central directory offset
471
+ end
472
+
473
+ def num_files_and_central_directory_offset(file_io, eocd_offset)
474
+ seek(file_io, eocd_offset)
475
+
476
+ io = StringIO.new(read_n(file_io, SIZE_OF_USABLE_EOCD_RECORD))
477
+
478
+ assert_signature(io, 0x06054b50)
479
+
480
+ skip_ahead_2(io) # number_of_this_disk
481
+ skip_ahead_2(io) # number of the disk with the EOCD record
482
+ skip_ahead_2(io) # number of entries in the central directory of this disk
483
+ num_files = read_2b(io) # number of entries in the central directory total
484
+ cdir_size = read_4b(io) # size of the central directory
485
+ cdir_offset = read_4b(io) # start of central directorty offset
486
+ [num_files, cdir_offset, cdir_size]
487
+ end
488
+
489
+ private_constant :C_V, :C_v, :C_Qe, :MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE,
490
+ :MAX_LOCAL_HEADER_SIZE, :SIZE_OF_USABLE_EOCD_RECORD
491
+ end
# write operations, but want to discard the data (like when
# estimating the size of a ZIP)
module ZipTricks::NullWriter
  # Discards the given data and returns self so that writes can be chained.
  #
  # @param data[String] the data to write
  # @return [self]
  def self.<<(data)
    self
  end

  # Discards the given data and pretends it has been written in full.
  #
  # @param data[String] the data to write
  # @return [Fixnum] the amount of data that was supposed to be written
  def self.write(data)
    data.bytesize
  end
end
@@ -9,13 +9,13 @@ class ZipTricks::RackBody
9
9
  # The archive will be automatically closed at the end of the block.
10
10
  #
11
11
  # # Precompute the Content-Length ahead of time
12
- # content_length = ZipTricks::StoredSizeEstimator.perform_fake_archiving do | estimator |
13
- # estimator.add_stored_entry('large.tif', size=1289894)
12
+ # content_length = ZipTricks::SizeEstimator.estimate do | estimator |
13
+ # estimator.add_stored_entry(filename: 'large.tif', size: 1289894)
14
14
  # end
15
15
  #
16
16
  # # Prepare the response body. The block will only be called when the response starts to be written.
17
17
  # body = ZipTricks::RackBody.new do | streamer |
18
- # streamer.add_stored_entry('large.tif', size=1289894, crc32=198210)
18
+ # streamer.add_stored_entry(filename: 'large.tif', size: 1289894, crc32: 198210)
19
19
  # streamer << large_file.read(1024*1024) until large_file.eof?
20
20
  # ...
21
21
  # end
@@ -1,9 +1,12 @@
1
1
  # An object that fakes just-enough of an IO to be dangerous
2
- # - or, more precisely, to be useful as a source for the RubyZip
3
- # central directory parser
2
+ # - or, more precisely, to be useful as a source for the FileReader
3
+ # central directory parser. Effectively we substitute an IO object
4
+ # for an object that fetches parts of the remote file over HTTP using `Range:`
5
+ # headers. The `RemoteIO` acts as an adapter between an object that performs the
6
+ # actual fetches over HTTP and an object that expects a handful of IO methods to be
7
+ # available.
4
8
  class ZipTricks::RemoteIO
5
-
6
- # @param fetcher[#request_object_size, #request_range] an object that can fetch
9
+ # @param fetcher[#request_object_size, #request_range] an object that perform fetches
7
10
  def initialize(fetcher = :NOT_SET)
8
11
  @pos = 0
9
12
  @fetcher = fetcher
@@ -12,21 +15,29 @@ class ZipTricks::RemoteIO
12
15
 
13
16
  # Emulates IO#seek
14
17
  def seek(offset, mode = IO::SEEK_SET)
15
- case mode
16
- when IO::SEEK_SET
17
- @remote_size ||= request_object_size
18
- @pos = clamp(0, offset, @remote_size)
19
- when IO::SEEK_END
20
- @remote_size ||= request_object_size
21
- @pos = clamp(0, @remote_size + offset, @remote_size)
22
- else
23
- raise Errno::ENOTSUP, "Seek mode #{mode.inspect} not supported"
24
- end
18
+ raise "Unsupported read mode #{mode}" unless mode == IO::SEEK_SET
19
+ @remote_size ||= request_object_size
20
+ @pos = clamp(0, offset, @remote_size)
25
21
  0 # always return 0!
26
22
  end
27
-
28
- # Emulates IO#read
29
- def read(n_bytes = nil)
23
+
24
+ # Emulates IO#size.
25
+ #
26
+ # @return [Fixnum] the size of the remote resource
27
+ def size
28
+ @remote_size ||= request_object_size
29
+ end
30
+
31
+ # Emulates IO#read, but requires the number of bytes to read
32
+ # The method will raise if the number of bytes read from remote does
33
+ # not match the number requested. The read will be limited to the
34
+ # size of the remote resource relative to the current offset in the IO,
35
+ # so if you are at offset 0 in the IO of size 10, doing a `read(20)`
36
+ # will only return you 10 bytes of result, and not raise any exceptions.
37
+ #
38
+ # @param n_bytes[Fixnum, nil] how many bytes to read, or `nil` to read all the way to the end
39
+ # @return [String] the read bytes
40
+ def read(n_bytes=nil)
30
41
  @remote_size ||= request_object_size
31
42
 
32
43
  # If the resource is empty there is nothing to read
@@ -47,11 +58,10 @@ class ZipTricks::RemoteIO
47
58
  end
48
59
  end
49
60
 
50
- # Returns the current pointer position within the IO.
51
- # Not used by RubyZip but used in tests of our own
61
+ # Returns the current pointer position within the IO
52
62
  #
53
63
  # @return [Fixnum]
54
- def pos
64
+ def tell
55
65
  @pos
56
66
  end
57
67
 
@@ -2,6 +2,9 @@
2
2
  # downloading the entire file. The central directory provides the
3
3
  # offsets at which the actual file contents is located. You can then
4
4
  # use the `Range:` HTTP headers to download those entries separately.
5
+ #
6
+ # Please read the security warning in `FileReader` _VERY CAREFULLY_
7
+ # before you use this module.
5
8
  class ZipTricks::RemoteUncap
6
9
 
7
10
  # Represents a file embedded within a remote ZIP archive
@@ -37,17 +40,14 @@ class ZipTricks::RemoteUncap
37
40
  def self.files_within_zip_at(uri)
38
41
  fetcher = new(uri)
39
42
  fake_io = ZipTricks::RemoteIO.new(fetcher)
40
- dir = Zip::CentralDirectory.read_from_stream(fake_io)
41
-
42
- dir.entries.map do | rubyzip_entry |
43
+ entries = ZipTricks.const_get(:FileReader).read_zip_structure(fake_io)
44
+ entries.map do | remote_entry |
43
45
  RemoteZipEntry.new do | entry |
44
- entry.name = rubyzip_entry.name
45
- entry.size_uncompressed = rubyzip_entry.size
46
- entry.size_compressed = rubyzip_entry.compressed_size
47
- entry.compression_method = rubyzip_entry.compression_method
48
-
49
- entry.starts_at_offset = rubyzip_entry.local_header_offset + rubyzip_entry.calculate_local_header_size
50
- entry.ends_at_offset = entry.starts_at_offset + rubyzip_entry.compressed_size
46
+ entry.name = remote_entry.filename
47
+ entry.starts_at_offset = remote_entry.compressed_data_offset
48
+ entry.size_uncompressed = remote_entry.uncompressed_size
49
+ entry.size_compressed = remote_entry.compressed_size
50
+ entry.compression_method = remote_entry.storage_mode
51
51
  end
52
52
  end
53
53
  end
# Helps to estimate archive sizes
class ZipTricks::SizeEstimator
  require_relative 'streamer'

  # Subclass used solely to mark a couple of the Streamer methods public
  class DetailStreamer < ::ZipTricks::Streamer
    public :add_file_and_write_local_header, :write_data_descriptor_for_last_entry
  end
  private_constant :DetailStreamer

  # Creates a new estimator with a Streamer object. Normally you should use
  # `estimate` instead and not use this method directly.
  #
  # @param streamer [ZipTricks::Streamer] the streamer that performs the fake writes
  def initialize(streamer)
    @streamer = streamer
  end
  private :initialize

  # Performs the estimate using fake archiving. It needs to know the sizes of the
  # entries upfront. Usage:
  #
  #     expected_zip_size = SizeEstimator.estimate do | estimator |
  #       estimator.add_stored_entry(filename: "file.doc", size: 898291)
  #       estimator.add_compressed_entry(filename: "family.tif", uncompressed_size: 89281911, compressed_size: 121908)
  #     end
  #
  # @return [Fixnum] the size of the resulting archive, in bytes
  # @yield [SizeEstimator] the estimator
  def self.estimate
    # Count the bytes that would have been written, without writing anything
    output_io = ZipTricks::WriteAndTell.new(ZipTricks::NullWriter)
    DetailStreamer.open(output_io) { |zip| yield(new(zip)) }
    output_io.tell
  end

  # Add a fake entry to the archive, to see how big it is going to be in the end.
  #
  # @param filename [String] the name of the file (filenames are variable-width in the ZIP)
  # @param size [Fixnum] size of the uncompressed entry
  # @param use_data_descriptor[Boolean] whether the entry uses a postfix data descriptor to specify size
  # @return self
  def add_stored_entry(filename:, size:, use_data_descriptor: false)
    with_descriptor = use_data_descriptor ? true : false
    @streamer.add_file_and_write_local_header(filename: filename, crc32: 0, storage_mode: 0,
      compressed_size: size, uncompressed_size: size, use_data_descriptor: with_descriptor)
    @streamer.simulate_write(size)
    @streamer.write_data_descriptor_for_last_entry if with_descriptor
    self
  end

  # Add a fake entry to the archive, to see how big it is going to be in the end.
  #
  # @param filename [String] the name of the file (filenames are variable-width in the ZIP)
  # @param uncompressed_size [Fixnum] size of the uncompressed entry
  # @param compressed_size [Fixnum] size of the compressed entry
  # @param use_data_descriptor[Boolean] whether the entry uses a postfix data descriptor to specify size
  # @return self
  def add_compressed_entry(filename:, uncompressed_size:, compressed_size:, use_data_descriptor: false)
    with_descriptor = use_data_descriptor ? true : false
    @streamer.add_file_and_write_local_header(filename: filename, crc32: 0, storage_mode: 8,
      compressed_size: compressed_size, uncompressed_size: uncompressed_size, use_data_descriptor: with_descriptor)
    @streamer.simulate_write(compressed_size)
    @streamer.write_data_descriptor_for_last_entry if with_descriptor
    self
  end
end