zip_tricks 2.8.1 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +4 -4
  2. data/Gemfile +3 -3
  3. data/IMPLEMENTATION_DETAILS.md +2 -10
  4. data/README.md +62 -59
  5. data/examples/archive_size_estimate.rb +4 -4
  6. data/examples/rack_application.rb +3 -5
  7. data/lib/zip_tricks/block_deflate.rb +21 -0
  8. data/lib/zip_tricks/file_reader.rb +491 -0
  9. data/lib/zip_tricks/null_writer.rb +7 -2
  10. data/lib/zip_tricks/rack_body.rb +3 -3
  11. data/lib/zip_tricks/remote_io.rb +30 -20
  12. data/lib/zip_tricks/remote_uncap.rb +10 -10
  13. data/lib/zip_tricks/size_estimator.rb +64 -0
  14. data/lib/zip_tricks/stream_crc32.rb +2 -2
  15. data/lib/zip_tricks/streamer/deflated_writer.rb +26 -0
  16. data/lib/zip_tricks/streamer/entry.rb +21 -0
  17. data/lib/zip_tricks/streamer/stored_writer.rb +25 -0
  18. data/lib/zip_tricks/streamer/writable.rb +20 -0
  19. data/lib/zip_tricks/streamer.rb +172 -66
  20. data/lib/zip_tricks/zip_writer.rb +346 -0
  21. data/lib/zip_tricks.rb +1 -4
  22. data/spec/spec_helper.rb +1 -38
  23. data/spec/zip_tricks/file_reader_spec.rb +47 -0
  24. data/spec/zip_tricks/rack_body_spec.rb +2 -2
  25. data/spec/zip_tricks/remote_io_spec.rb +8 -20
  26. data/spec/zip_tricks/remote_uncap_spec.rb +4 -4
  27. data/spec/zip_tricks/size_estimator_spec.rb +31 -0
  28. data/spec/zip_tricks/streamer_spec.rb +59 -36
  29. data/spec/zip_tricks/zip_writer_spec.rb +408 -0
  30. data/zip_tricks.gemspec +20 -14
  31. metadata +33 -16
  32. data/lib/zip_tricks/manifest.rb +0 -85
  33. data/lib/zip_tricks/microzip.rb +0 -339
  34. data/lib/zip_tricks/stored_size_estimator.rb +0 -44
  35. data/spec/zip_tricks/manifest_spec.rb +0 -60
  36. data/spec/zip_tricks/microzip_interop_spec.rb +0 -48
  37. data/spec/zip_tricks/microzip_spec.rb +0 -546
  38. data/spec/zip_tricks/stored_size_estimator_spec.rb +0 -22
@@ -0,0 +1,491 @@
1
+ require 'stringio'
2
+
3
+ # A very barebones ZIP file reader. Is made for maximum interoperability, but at the same
4
+ # time we attempt to keep it somewhat concise.
5
+ #
6
+ # ## REALLY CRAZY IMPORTANT STUFF: SECURITY IMPLICATIONS
7
+ #
8
+ # Please **BEWARE** - using this is a security risk if you are reading files that have been
9
+ # supplied by users. This implementation has _not_ been formally verified for correctness. As
10
+ # ZIP files contain relative offsets in lots of places it might be possible for a maliciously
11
+ # crafted ZIP file to put the decode procedure in an endless loop, make it attempt huge reads
12
+ # from the input file and so on. Additionally, the reader module for deflated data has
13
+ # no support for ZIP bomb protection. So either limit the `FileReader` usage to the files you
14
+ # trust, or triple-check all the inputs upfront. Patches to make this reader more secure
15
+ # are welcome of course.
16
+ #
17
+ # ## Usage
18
+ #
19
+ # File.open('zipfile.zip', 'rb') do |f|
20
+ # entries = FileReader.read_zip_structure(f)
21
+ # entries.each do |e|
22
+ # File.open(e.filename, 'wb') do |extracted_file|
23
+ # ex = e.extractor_from(f)
24
+ # extracted_file << ex.extract(1024 * 1024) until ex.eof?
25
+ # end
26
+ # end
27
+ # end
28
+ #
29
+ # ## Supported features
30
+ #
31
+ # * Deflate and stored storage modes
32
+ # * Zip64 (extra fields and offsets)
33
+ # * Data descriptors
34
+ #
35
+ # ## Unsupported features
36
+ #
37
+ # * Archives split over multiple disks/files
38
+ # * Any ZIP encryption
39
+ # * EFS language flag and InfoZIP filename extra field
40
+ # * CRC32 checksums are _not_ verified
41
+ #
42
+ # ## Mode of operation
43
+ #
44
+ # Basically, `FileReader` _ignores_ the data in local file headers (as it is often unreliable).
45
+ # It reads the ZIP file "from the tail", finds the end-of-central-directory signatures, then
46
+ # reads the central directory entries, reconstitutes the entries with their filenames, attributes
47
+ # and so on, and sets these entries up with the absolute _offsets_ into the source file/IO object.
48
+ # These offsets can then be used to extract the actual compressed data of the files and to expand it.
49
+ class ZipTricks::FileReader
50
+ ReadError = Class.new(StandardError)
51
+ UnsupportedFeature = Class.new(StandardError)
52
+ InvalidStructure = Class.new(ReadError)
53
+
54
+ class InflatingReader
55
+ def initialize(from_io, compressed_data_size)
56
+ @io = from_io
57
+ @compressed_data_size = compressed_data_size
58
+ @already_read = 0
59
+ @zlib_inflater = ::Zlib::Inflate.new(-Zlib::MAX_WBITS)
60
+ end
61
+
62
+ def extract(n_bytes=nil)
63
+ n_bytes ||= (@compressed_data_size - @already_read)
64
+
65
+ return if eof?
66
+
67
+ available = @compressed_data_size - @already_read
68
+
69
+ return if available.zero?
70
+
71
+ n_bytes = available if n_bytes > available
72
+
73
+ return '' if n_bytes.zero?
74
+
75
+ compressed_chunk = @io.read(n_bytes)
76
+ @already_read += compressed_chunk.bytesize
77
+ @zlib_inflater.inflate(compressed_chunk)
78
+ end
79
+
80
+ def eof?
81
+ @zlib_inflater.finished?
82
+ end
83
+ end
84
+
85
+ class StoredReader
86
+ def initialize(from_io, compressed_data_size)
87
+ @io = from_io
88
+ @compressed_data_size = compressed_data_size
89
+ @already_read = 0
90
+ end
91
+
92
+ def extract(n_bytes=nil)
93
+ n_bytes ||= (@compressed_data_size - @already_read)
94
+
95
+ return if eof?
96
+
97
+ available = @compressed_data_size - @already_read
98
+
99
+ return if available.zero?
100
+
101
+ n_bytes = available if n_bytes > available
102
+
103
+ return '' if n_bytes.zero?
104
+
105
+ compressed_chunk = @io.read(n_bytes)
106
+ @already_read += compressed_chunk.bytesize
107
+ compressed_chunk
108
+ end
109
+
110
+ def eof?
111
+ @already_read >= @compressed_data_size
112
+ end
113
+ end
114
+
115
+ private_constant :StoredReader, :InflatingReader
116
+
117
+ # Represents a file within the ZIP archive being read
118
+ class ZipEntry
119
+ # @return [Fixnum] bit-packed version signature of the program that made the archive
120
+ attr_accessor :made_by
121
+
122
+ # @return [Fixnum] ZIP version support needed to extract this file
123
+ attr_accessor :version_needed_to_extract
124
+
125
+ # @return [Fixnum] bit-packed general purpose flags
126
+ attr_accessor :gp_flags
127
+
128
+ # @return [Fixnum] Storage mode (0 for stored, 8 for deflate)
129
+ attr_accessor :storage_mode
130
+
131
+ # @return [Fixnum] the bit-packed DOS time
132
+ attr_accessor :dos_time
133
+
134
+ # @return [Fixnum] the bit-packed DOS date
135
+ attr_accessor :dos_date
136
+
137
+ # @return [Fixnum] the CRC32 checksum of this file
138
+ attr_accessor :crc32
139
+
140
+ # @return [Fixnum] size of compressed file data in the ZIP
141
+ attr_accessor :compressed_size
142
+
143
+ # @return [Fixnum] size of the file once uncompressed
144
+ attr_accessor :uncompressed_size
145
+
146
+ # @return [String] the filename
147
+ attr_accessor :filename
148
+
149
+ # @return [Fixnum] disk number where this file starts
150
+ attr_accessor :disk_number_start
151
+
152
+ # @return [Fixnum] internal attributes of the file
153
+ attr_accessor :internal_attrs
154
+
155
+ # @return [Fixnum] external attributes of the file
156
+ attr_accessor :external_attrs
157
+
158
+ # @return [Fixnum] at what offset the local file header starts
159
+ # in your original IO object
160
+ attr_accessor :local_file_header_offset
161
+
162
+ # @return [String] the file comment
163
+ attr_accessor :comment
164
+
165
+ # @return [Fixnum] at what offset you should start reading
166
+ # for the compressed data in your original IO object
167
+ attr_accessor :compressed_data_offset
168
+
169
+ # Returns a reader for the actual compressed data of the entry.
170
+ #
171
+ # reader = entry.reader(source_file)
172
+ # outfile << reader.extract(512 * 1024) until reader.eof?
173
+ #
174
+ # @return [#extract(n_bytes), #eof?] the reader for the data
175
+ def extractor_from(from_io)
176
+ from_io.seek(compressed_data_offset, IO::SEEK_SET)
177
+ case storage_mode
178
+ when 8
179
+ InflatingReader.new(from_io, compressed_size)
180
+ when 0
181
+ StoredReader.new(from_io, compressed_size)
182
+ else
183
+ raise "Unsupported storage mode for reading (#{storage_mode})"
184
+ end
185
+ end
186
+ end
187
+
188
+ # Parse an IO handle to a ZIP archive into an array of Entry objects.
189
+ #
190
+ # @param io[#tell, #seek, #read, #size] an IO-ish object
191
+ # @return [Array<Entry>] an array of entries within the ZIP being parsed
192
+ def read_zip_structure(io)
193
+ zip_file_size = io.size
194
+ eocd_offset = get_eocd_offset(io, zip_file_size)
195
+
196
+ zip64_end_of_cdir_location = get_zip64_eocd_locator_offset(io, eocd_offset)
197
+ num_files, cdir_location, cdir_size = if zip64_end_of_cdir_location
198
+ num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
199
+ else
200
+ num_files_and_central_directory_offset(io, eocd_offset)
201
+ end
202
+ seek(io, cdir_location)
203
+
204
+ # Read the entire central directory in one fell swoop
205
+ central_directory_str = read_n(io, cdir_size)
206
+ central_directory_io = StringIO.new(central_directory_str)
207
+
208
+ entries = (1..num_files).map { read_cdir_entry(central_directory_io) }
209
+ entries.each do |entry|
210
+ entry.compressed_data_offset = find_compressed_data_start_offset(io, entry.local_file_header_offset)
211
+ end
212
+ end
213
+
214
+ # Parse an IO handle to a ZIP archive into an array of Entry objects.
215
+ #
216
+ # @param io[#tell, #seek, #read, #size] an IO-ish object
217
+ # @return [Array<Entry>] an array of entries within the ZIP being parsed
218
+ def self.read_zip_structure(io)
219
+ new.read_zip_structure(io)
220
+ end
221
+
222
+ private
223
+
224
+ def skip_ahead_2(io)
225
+ skip_ahead_n(io, 2)
226
+ end
227
+
228
+ def skip_ahead_4(io)
229
+ skip_ahead_n(io, 4)
230
+ end
231
+
232
+ def skip_ahead_8(io)
233
+ skip_ahead_n(io, 8)
234
+ end
235
+
236
+ def seek(io, absolute_pos)
237
+ io.seek(absolute_pos, IO::SEEK_SET)
238
+ raise ReadError, "Expected to seek to #{absolute_pos} but only got to #{io.tell}" unless absolute_pos == io.tell
239
+ nil
240
+ end
241
+
242
+ def assert_signature(io, signature_magic_number)
243
+ packed = [signature_magic_number].pack(C_V)
244
+ readback = read_4b(io)
245
+ if readback != signature_magic_number
246
+ expected = '0x0' + signature_magic_number.to_s(16)
247
+ actual = '0x0' + readback.to_s(16)
248
+ raise InvalidStructure, "Expected signature #{expected}, but read #{actual}"
249
+ end
250
+ end
251
+
252
+ def skip_ahead_n(io, n)
253
+ pos_before = io.tell
254
+ io.seek(io.tell + n, IO::SEEK_SET)
255
+ pos_after = io.tell
256
+ delta = pos_after - pos_before
257
+ raise ReadError, "Expected to seek #{n} bytes ahead, but could only seek #{delta} bytes ahead" unless delta == n
258
+ nil
259
+ end
260
+
261
+ def read_n(io, n_bytes)
262
+ io.read(n_bytes).tap {|d|
263
+ raise ReadError, "Expected to read #{n_bytes} bytes, but the IO was at the end" if d.nil?
264
+ raise ReadError, "Expected to read #{n_bytes} bytes, read #{d.bytesize}" unless d.bytesize == n_bytes
265
+ }
266
+ end
267
+
268
+ def read_2b(io)
269
+ read_n(io, 2).unpack(C_v).shift
270
+ end
271
+
272
+ def read_4b(io)
273
+ read_n(io, 4).unpack(C_V).shift
274
+ end
275
+
276
+ def read_8b(io)
277
+ read_n(io, 8).unpack(C_Qe).shift
278
+ end
279
+
280
+ def find_compressed_data_start_offset(file_io, local_header_offset)
281
+ seek(file_io, local_header_offset)
282
+
283
+ # Reading in bulk is cheaper - grab the maximum length of the local header, including
284
+ # any headroom
285
+ local_file_header_str_plus_headroom = file_io.read(MAX_LOCAL_HEADER_SIZE)
286
+ io = StringIO.new(local_file_header_str_plus_headroom)
287
+
288
+ assert_signature(io, 0x04034b50)
289
+
290
+ # The rest is unreliable, and we have that information from the central directory already.
291
+ # So just skip over it to get at the offset where the compressed data begins
292
+ skip_ahead_2(io) # Version needed to extract
293
+ skip_ahead_2(io) # gp flags
294
+ skip_ahead_2(io) # storage mode
295
+ skip_ahead_2(io) # dos time
296
+ skip_ahead_2(io) # dos date
297
+ skip_ahead_4(io) # CRC32
298
+
299
+ skip_ahead_4(io) # Comp size
300
+ skip_ahead_4(io) # Uncomp size
301
+
302
+ filename_size = read_2b(io)
303
+ extra_size = read_2b(io)
304
+
305
+ skip_ahead_n(io, filename_size)
306
+ skip_ahead_n(io, extra_size)
307
+
308
+ local_header_offset + io.tell
309
+ end
310
+
311
+
312
+ def read_cdir_entry(io)
313
+ expected_at = io.tell
314
+ assert_signature(io, 0x02014b50)
315
+ ZipEntry.new.tap do |e|
316
+ e.made_by = read_2b(io)
317
+ e.version_needed_to_extract = read_2b(io)
318
+ e.gp_flags = read_2b(io)
319
+ e.storage_mode = read_2b(io)
320
+ e.dos_time = read_2b(io)
321
+ e.dos_date = read_2b(io)
322
+ e.crc32 = read_4b(io)
323
+ e.compressed_size = read_4b(io)
324
+ e.uncompressed_size = read_4b(io)
325
+ filename_size = read_2b(io)
326
+ extra_size = read_2b(io)
327
+ comment_len = read_2b(io)
328
+ e.disk_number_start = read_2b(io)
329
+ e.internal_attrs = read_2b(io)
330
+ e.external_attrs = read_4b(io)
331
+ e.local_file_header_offset = read_4b(io)
332
+ e.filename = read_n(io, filename_size)
333
+
334
+ # Extra fields
335
+ extras = read_n(io, extra_size)
336
+ # Comment
337
+ e.comment = read_n(io, comment_len)
338
+
339
+ # Parse out the extra fields
340
+ extra_table = {}
341
+ extras_buf = StringIO.new(extras)
342
+ until extras_buf.eof? do
343
+ extra_id = read_2b(extras_buf)
344
+ extra_size = read_2b(extras_buf)
345
+ extra_contents = read_n(extras_buf, extra_size)
346
+ extra_table[extra_id] = extra_contents
347
+ end
348
+
349
+ # ...of which we really only need the Zip64 extra
350
+ if zip64_extra_contents = extra_table[1] # Zip64 extra
351
+ zip64_extra = StringIO.new(zip64_extra_contents)
352
+ e.uncompressed_size = read_8b(zip64_extra)
353
+ e.compressed_size = read_8b(zip64_extra)
354
+ e.local_file_header_offset = read_8b(zip64_extra)
355
+ end
356
+ end
357
+ end
358
+
359
+ def get_eocd_offset(file_io, zip_file_size)
360
+ # Start reading from the _comment_ of the zip file (from the very end).
361
+ # The maximum size of the comment is 0xFFFF (what fits in 2 bytes)
362
+ implied_position_of_eocd_record = zip_file_size - MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE
363
+ implied_position_of_eocd_record = 0 if implied_position_of_eocd_record < 0
364
+
365
+ # Use a soft seek (we might not be able to get as far behind in the IO as we want)
366
+ # and a soft read (we might not be able to read as many bytes as we want)
367
+ file_io.seek(implied_position_of_eocd_record, IO::SEEK_SET)
368
+ str_containing_eocd_record = file_io.read(MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE)
369
+
370
+ # TODO: what to do if multiple occurrences of the signature are found, somehow?
371
+ eocd_sig = [0x06054b50].pack(C_V)
372
+ eocd_idx_in_buf = str_containing_eocd_record.index(eocd_sig)
373
+
374
+ raise "Could not find the EOCD signature in the buffer - maybe a malformed ZIP file" unless eocd_idx_in_buf
375
+
376
+ implied_position_of_eocd_record + eocd_idx_in_buf
377
+ end
378
+
379
+ # Find the Zip64 EOCD locator segment offset. Do this by seeking backwards from the
380
+ # EOCD record in the archive by fixed offsets
381
+ def get_zip64_eocd_locator_offset(file_io, eocd_offset)
382
+ zip64_eocd_loc_offset = eocd_offset
383
+ zip64_eocd_loc_offset -= 4 # The signature
384
+ zip64_eocd_loc_offset -= 4 # Which disk has the Zip64 end of central directory record
385
+ zip64_eocd_loc_offset -= 8 # Offset of the zip64 central directory record
386
+ zip64_eocd_loc_offset -= 4 # Total number of disks
387
+
388
+ # If the offset is negative there is certainly no Zip64 EOCD locator here
389
+ return unless zip64_eocd_loc_offset >= 0
390
+
391
+ file_io.seek(zip64_eocd_loc_offset, IO::SEEK_SET)
392
+ assert_signature(file_io, 0x07064b50)
393
+ disk_num = read_4b(file_io) # number of the disk
394
+ raise UnsupportedFeature, "The archive spans multiple disks" if disk_num != 0
395
+ read_8b(file_io)
396
+ rescue ReadError
397
+ nil
398
+ end
399
+
400
+ def num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
401
+ seek(io, zip64_end_of_cdir_location)
402
+
403
+ assert_signature(io, 0x06064b50)
404
+
405
+ zip64_eocdr_size = read_8b(io)
406
+ zip64_eocdr = read_n(io, zip64_eocdr_size) # Reading in bulk is cheaper
407
+ zip64_eocdr = StringIO.new(zip64_eocdr)
408
+ skip_ahead_2(zip64_eocdr) # version made by
409
+ skip_ahead_2(zip64_eocdr) # version needed to extract
410
+
411
+ disk_n = read_4b(zip64_eocdr) # number of this disk
412
+ disk_n_with_eocdr = read_4b(zip64_eocdr) # number of the disk with the EOCDR
413
+ raise UnsupportedFeature, "The archive spans multiple disks" if disk_n != disk_n_with_eocdr
414
+
415
+ num_files_this_disk = read_8b(zip64_eocdr) # number of files on this disk
416
+ num_files_total = read_8b(zip64_eocdr) # files total in the central directory
417
+
418
+ raise UnsupportedFeature, "The archive spans multiple disks" if num_files_this_disk != num_files_total
419
+
420
+ central_dir_size = read_8b(zip64_eocdr) # Size of the central directory
421
+ central_dir_offset = read_8b(zip64_eocdr) # Where the central directory starts
422
+
423
+ [num_files_total, central_dir_offset, central_dir_size]
424
+ end
425
+
426
+ C_V = 'V'.freeze
427
+ C_v = 'v'.freeze
428
+ C_Qe = 'Q<'.freeze
429
+
430
+ # To prevent too many tiny reads, read the maximum possible size of end of central directory record
431
+ # upfront (all the fixed fields + at most 0xFFFF bytes of the archive comment)
432
+ MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE = begin
433
+ 4 + # Offset of the start of central directory
434
+ 4 + # Size of the central directory
435
+ 2 + # Number of files in the cdir
436
+ 4 + # End-of-central-directory signature
437
+ 2 + # Number of this disk
438
+ 2 + # Number of disk with the start of cdir
439
+ 2 + # Number of files in the cdir of this disk
440
+ 2 + # The comment size
441
+ 0xFFFF # Maximum comment size
442
+ end
443
+
444
+ # To prevent too many tiny reads, read the maximum possible size of the local file header upfront.
445
+ # The maximum size is all the usual items, plus the maximum size
446
+ # of the filename (0xFFFF bytes) and the maximum size of the extras (0xFFFF bytes)
447
+ MAX_LOCAL_HEADER_SIZE = begin
448
+ 4 + # signature
449
+ 2 + # Version needed to extract
450
+ 2 + # gp flags
451
+ 2 + # storage mode
452
+ 2 + # dos time
453
+ 2 + # dos date
454
+ 4 + # CRC32
455
+ 4 + # Comp size
456
+ 4 + # Uncomp size
457
+ 2 + # Filename size
458
+ 2 + # Extra fields size
459
+ 0xFFFF + # Maximum filename size
460
+ 0xFFFF # Maximum extra fields size
461
+ end
462
+
463
+ SIZE_OF_USABLE_EOCD_RECORD = begin
464
+ 4 + # Signature
465
+ 2 + # Number of this disk
466
+ 2 + # Number of the disk with the EOCD record
467
+ 2 + # Number of entries in the central directory of this disk
468
+ 2 + # Number of entries in the central directory total
469
+ 4 + # Size of the central directory
470
+ 4 # Start of the central directory offset
471
+ end
472
+
473
+ def num_files_and_central_directory_offset(file_io, eocd_offset)
474
+ seek(file_io, eocd_offset)
475
+
476
+ io = StringIO.new(read_n(file_io, SIZE_OF_USABLE_EOCD_RECORD))
477
+
478
+ assert_signature(io, 0x06054b50)
479
+
480
+ skip_ahead_2(io) # number_of_this_disk
481
+ skip_ahead_2(io) # number of the disk with the EOCD record
482
+ skip_ahead_2(io) # number of entries in the central directory of this disk
483
+ num_files = read_2b(io) # number of entries in the central directory total
484
+ cdir_size = read_4b(io) # size of the central directory
485
+ cdir_offset = read_4b(io) # start of central directory offset
486
+ [num_files, cdir_offset, cdir_size]
487
+ end
488
+
489
+ private_constant :C_V, :C_v, :C_Qe, :MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE,
490
+ :MAX_LOCAL_HEADER_SIZE, :SIZE_OF_USABLE_EOCD_RECORD
491
+ end
@@ -2,6 +2,11 @@
2
2
  # write operations, but want to discard the data (like when
3
3
  # estimating the size of a ZIP)
4
4
  module ZipTricks::NullWriter
5
- def <<(data); self; end
6
- extend self
5
+ # @param data[String] the data to write
6
+ # @return [self]
7
+ def self.<<(data); self; end
8
+
9
+ # @param data[String] the data to write
10
+ # @return [Fixnum] the amount of data that was supposed to be written
11
+ def self.write(data); data.bytesize; end
7
12
  end
@@ -9,13 +9,13 @@ class ZipTricks::RackBody
9
9
  # The archive will be automatically closed at the end of the block.
10
10
  #
11
11
  # # Precompute the Content-Length ahead of time
12
- # content_length = ZipTricks::StoredSizeEstimator.perform_fake_archiving do | estimator |
13
- # estimator.add_stored_entry('large.tif', size=1289894)
12
+ # content_length = ZipTricks::SizeEstimator.estimate do | estimator |
13
+ # estimator.add_stored_entry(filename: 'large.tif', size: 1289894)
14
14
  # end
15
15
  #
16
16
  # # Prepare the response body. The block will only be called when the response starts to be written.
17
17
  # body = ZipTricks::RackBody.new do | streamer |
18
- # streamer.add_stored_entry('large.tif', size=1289894, crc32=198210)
18
+ # streamer.add_stored_entry(filename: 'large.tif', size: 1289894, crc32: 198210)
19
19
  # streamer << large_file.read(1024*1024) until large_file.eof?
20
20
  # ...
21
21
  # end
@@ -1,9 +1,12 @@
1
1
  # An object that fakes just-enough of an IO to be dangerous
2
- # - or, more precisely, to be useful as a source for the RubyZip
3
- # central directory parser
2
+ # - or, more precisely, to be useful as a source for the FileReader
3
+ # central directory parser. Effectively we substitute an IO object
4
+ # for an object that fetches parts of the remote file over HTTP using `Range:`
5
+ # headers. The `RemoteIO` acts as an adapter between an object that performs the
6
+ # actual fetches over HTTP and an object that expects a handful of IO methods to be
7
+ # available.
4
8
  class ZipTricks::RemoteIO
5
-
6
- # @param fetcher[#request_object_size, #request_range] an object that can fetch
9
+ # @param fetcher[#request_object_size, #request_range] an object that performs fetches
7
10
  def initialize(fetcher = :NOT_SET)
8
11
  @pos = 0
9
12
  @fetcher = fetcher
@@ -12,21 +15,29 @@ class ZipTricks::RemoteIO
12
15
 
13
16
  # Emulates IO#seek
14
17
  def seek(offset, mode = IO::SEEK_SET)
15
- case mode
16
- when IO::SEEK_SET
17
- @remote_size ||= request_object_size
18
- @pos = clamp(0, offset, @remote_size)
19
- when IO::SEEK_END
20
- @remote_size ||= request_object_size
21
- @pos = clamp(0, @remote_size + offset, @remote_size)
22
- else
23
- raise Errno::ENOTSUP, "Seek mode #{mode.inspect} not supported"
24
- end
18
+ raise "Unsupported read mode #{mode}" unless mode == IO::SEEK_SET
19
+ @remote_size ||= request_object_size
20
+ @pos = clamp(0, offset, @remote_size)
25
21
  0 # always return 0!
26
22
  end
27
-
28
- # Emulates IO#read
29
- def read(n_bytes = nil)
23
+
24
+ # Emulates IO#size.
25
+ #
26
+ # @return [Fixnum] the size of the remote resource
27
+ def size
28
+ @remote_size ||= request_object_size
29
+ end
30
+
31
+ # Emulates IO#read, but requires the number of bytes to read
32
+ # The method will raise if the number of bytes read from remote does
33
+ # not match the number requested. The read will be limited to the
34
+ # size of the remote resource relative to the current offset in the IO,
35
+ # so if you are at offset 0 in the IO of size 10, doing a `read(20)`
36
+ # will only return you 10 bytes of result, and not raise any exceptions.
37
+ #
38
+ # @param n_bytes[Fixnum, nil] how many bytes to read, or `nil` to read all the way to the end
39
+ # @return [String] the read bytes
40
+ def read(n_bytes=nil)
30
41
  @remote_size ||= request_object_size
31
42
 
32
43
  # If the resource is empty there is nothing to read
@@ -47,11 +58,10 @@ class ZipTricks::RemoteIO
47
58
  end
48
59
  end
49
60
 
50
- # Returns the current pointer position within the IO.
51
- # Not used by RubyZip but used in tests of our own
61
+ # Returns the current pointer position within the IO
52
62
  #
53
63
  # @return [Fixnum]
54
- def pos
64
+ def tell
55
65
  @pos
56
66
  end
57
67
 
@@ -2,6 +2,9 @@
2
2
  # downloading the entire file. The central directory provides the
3
3
  # offsets at which the actual file contents is located. You can then
4
4
  # use the `Range:` HTTP headers to download those entries separately.
5
+ #
6
+ # Please read the security warning in `FileReader` _VERY CAREFULLY_
7
+ # before you use this module.
5
8
  class ZipTricks::RemoteUncap
6
9
 
7
10
  # Represents a file embedded within a remote ZIP archive
@@ -37,17 +40,14 @@ class ZipTricks::RemoteUncap
37
40
  def self.files_within_zip_at(uri)
38
41
  fetcher = new(uri)
39
42
  fake_io = ZipTricks::RemoteIO.new(fetcher)
40
- dir = Zip::CentralDirectory.read_from_stream(fake_io)
41
-
42
- dir.entries.map do | rubyzip_entry |
43
+ entries = ZipTricks.const_get(:FileReader).read_zip_structure(fake_io)
44
+ entries.map do | remote_entry |
43
45
  RemoteZipEntry.new do | entry |
44
- entry.name = rubyzip_entry.name
45
- entry.size_uncompressed = rubyzip_entry.size
46
- entry.size_compressed = rubyzip_entry.compressed_size
47
- entry.compression_method = rubyzip_entry.compression_method
48
-
49
- entry.starts_at_offset = rubyzip_entry.local_header_offset + rubyzip_entry.calculate_local_header_size
50
- entry.ends_at_offset = entry.starts_at_offset + rubyzip_entry.compressed_size
46
+ entry.name = remote_entry.filename
47
+ entry.starts_at_offset = remote_entry.compressed_data_offset
48
+ entry.size_uncompressed = remote_entry.uncompressed_size
49
+ entry.size_compressed = remote_entry.compressed_size
50
+ entry.compression_method = remote_entry.storage_mode
51
51
  end
52
52
  end
53
53
  end
@@ -0,0 +1,64 @@
1
+ # Helps to estimate archive sizes
2
+ class ZipTricks::SizeEstimator
3
+ require_relative 'streamer'
4
+
5
+ # Used to mark a couple of methods public
6
+ class DetailStreamer < ::ZipTricks::Streamer
7
+ public :add_file_and_write_local_header, :write_data_descriptor_for_last_entry
8
+ end
9
+ private_constant :DetailStreamer
10
+
11
+ # Creates a new estimator with a Streamer object. Normally you should use
12
+ # `estimate` instead and not use this method directly.
13
+ def initialize(streamer)
14
+ @streamer = streamer
15
+ end
16
+ private :initialize
17
+
18
+ # Performs the estimate using fake archiving. It needs to know the sizes of the
19
+ # entries upfront. Usage:
20
+ #
21
+ # expected_zip_size = SizeEstimator.estimate do | estimator |
22
+ # estimator.add_stored_entry(filename: "file.doc", size: 898291)
23
+ # estimator.add_compressed_entry(filename: "family.tif", uncompressed_size: 89281911, compressed_size: 121908)
24
+ # end
25
+ #
26
+ # @return [Fixnum] the size of the resulting archive, in bytes
27
+ # @yield [SizeEstimator] the estimator
28
+ def self.estimate
29
+ output_io = ZipTricks::WriteAndTell.new(ZipTricks::NullWriter)
30
+ DetailStreamer.open(output_io) { |zip| yield(new(zip)) }
31
+ output_io.tell
32
+ end
33
+
34
+ # Add a fake entry to the archive, to see how big it is going to be in the end.
35
+ #
36
+ # @param filename [String] the name of the file (filenames are variable-width in the ZIP)
37
+ # @param size [Fixnum] size of the uncompressed entry
38
+ # @param use_data_descriptor[Boolean] whether the entry uses a postfix data descriptor to specify size
39
+ # @return self
40
+ def add_stored_entry(filename:, size:, use_data_descriptor: false)
41
+ udd = !!use_data_descriptor
42
+ @streamer.add_file_and_write_local_header(filename: filename, crc32: 0, storage_mode: 0,
43
+ compressed_size: size, uncompressed_size: size, use_data_descriptor: udd)
44
+ @streamer.simulate_write(size)
45
+ @streamer.write_data_descriptor_for_last_entry if udd
46
+ self
47
+ end
48
+
49
+ # Add a fake entry to the archive, to see how big it is going to be in the end.
50
+ #
51
+ # @param filename [String] the name of the file (filenames are variable-width in the ZIP)
52
+ # @param uncompressed_size [Fixnum] size of the uncompressed entry
53
+ # @param compressed_size [Fixnum] size of the compressed entry
54
+ # @param use_data_descriptor[Boolean] whether the entry uses a postfix data descriptor to specify size
55
+ # @return self
56
+ def add_compressed_entry(filename:, uncompressed_size:, compressed_size:, use_data_descriptor: false)
57
+ udd = !!use_data_descriptor
58
+ @streamer.add_file_and_write_local_header(filename: filename, crc32: 0, storage_mode: 8,
59
+ compressed_size: compressed_size, uncompressed_size: uncompressed_size, use_data_descriptor: udd)
60
+ @streamer.simulate_write(compressed_size)
61
+ @streamer.write_data_descriptor_for_last_entry if udd
62
+ self
63
+ end
64
+ end