zip_tricks 4.0.0 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6b478a7ffbae2dbb20270a1e4c4b63f495feef87
4
- data.tar.gz: d9a8ca3a1596ee653fcf2e2b9153a9e59bd912fc
3
+ metadata.gz: f6d3a4df27461f1235e526d7d40eb089f4df5c64
4
+ data.tar.gz: 179df69ea7d164f9164aabcc0c19d951e8d5748f
5
5
  SHA512:
6
- metadata.gz: 689d25f85a79987750fe3bbdf845cfe76c987713709a3d7fc8d2bc944a570958168c2b6c38a6f657b254d499052c079aad84c85b6421b9c228740dd3c9f79903
7
- data.tar.gz: 434ffe473e5b0339b40ead5437e070cc1bd6dcce910db58e516a207806cd0eab16f86cc04d342201e2f31f121438308351239d923f4b4e5311850755c77ce339
6
+ metadata.gz: 68131f0f180074731223f145f06d42f65d7ee327bfcc73c7445a9ae2be198ba8ed2372cffb9deba8729c7b3af46eb4cacdeb3ae408fd37a202c848933ce93ffa
7
+ data.tar.gz: 9e38d0ce079b7e367b75db3b6ebd032fef11802f12926cf586f061bdb7f059c1087505b2556cfd76062e288601923d51e4c7a1ca5ca5bb928fcae385ee92807b
data/README.md CHANGED
@@ -74,7 +74,7 @@ to that socket using some accelerated writing technique, and only use the Stream
74
74
  ZipTricks::Streamer.open(io) do | zip |
75
75
  # raw_file is written "as is" (STORED mode).
76
76
  # Write the local file header first..
77
- zip.add_stored_entry("first-file.bin", raw_file.size, raw_file_crc32)
77
+ zip.add_stored_entry(filename: "first-file.bin", size: raw_file.size, crc32: raw_file_crc32)
78
78
 
79
79
  # then send the actual file contents bypassing the Streamer interface
80
80
  io.sendfile(my_temp_file)
data/Rakefile CHANGED
@@ -18,8 +18,8 @@ Jeweler::Tasks.new do |gem|
18
18
  gem.homepage = "http://github.com/wetransfer/zip_tricks"
19
19
  gem.license = "MIT"
20
20
  gem.version = ZipTricks::VERSION
21
- gem.summary = %Q{Makes rubyzip stream, for real}
22
- gem.description = %Q{Makes rubyzip stream, for real}
21
+ gem.summary = 'Stream out ZIP files from Ruby'
22
+ gem.description = 'Stream out ZIP files from Ruby'
23
23
  gem.email = "me@julik.nl"
24
24
  gem.authors = ["Julik Tarkhanov"]
25
25
  gem.files.exclude "testing/**/*"
data/lib/zip_tricks.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  module ZipTricks
2
- VERSION = '4.0.0'
2
+ VERSION = '4.1.0'
3
3
 
4
4
  # Require all the sub-components except myself
5
5
  Dir.glob(__dir__ + '/**/*.rb').sort.each {|p| require p unless p == __FILE__ }
@@ -41,11 +41,20 @@ require 'stringio'
41
41
  #
42
42
  # ## Mode of operation
43
43
  #
44
- # Basically, `FileReader` _ignores_ the data in local file headers (as it is often unreliable).
44
+ # By default, `FileReader` _ignores_ the data in local file headers (as it is often unreliable).
45
45
  # It reads the ZIP file "from the tail", finds the end-of-central-directory signatures, then
46
46
  # reads the central directory entries, reconstitutes the entries with their filenames, attributes
47
47
  # and so on, and sets these entries up with the absolute _offsets_ into the source file/IO object.
48
48
  # These offsets can then be used to extract the actual compressed data of the files and to expand it.
49
+ #
50
+ # ## Recovering damaged or incomplete ZIP files
51
+ #
52
+ # If the ZIP file you are trying to read does not contain the central directory records, `read_zip_structure`
53
+ # will not work, since it starts the read process from the EOCD marker at the end of the central directory
54
+ # and then crawls "back" in the IO to figure out the rest. You can explicitly apply a fallback for reading the
55
+ # archive "straight ahead" instead using `read_zip_straight_ahead` - the method will instead scan your IO from
56
+ # the very start, skipping over the actual entry data. This is less efficient than central directory parsing since
57
+ # it involves a much larger number of reads (1 read from the IO per entry in the ZIP).
49
58
  class ZipTricks::FileReader
50
59
  require_relative 'file_reader/stored_reader'
51
60
  require_relative 'file_reader/inflating_reader'
@@ -58,6 +67,11 @@ class ZipTricks::FileReader
58
67
  "The compressed data offset is not available (local header has not been read)"
59
68
  end
60
69
  end
70
+ MissingEOCD = Class.new(StandardError) do
71
+ def message
72
+ "Could not find the EOCD signature in the buffer - maybe a malformed ZIP file"
73
+ end
74
+ end
61
75
 
62
76
  private_constant :StoredReader, :InflatingReader
63
77
 
@@ -139,6 +153,12 @@ class ZipTricks::FileReader
139
153
  !@compressed_data_offset.nil?
140
154
  end
141
155
 
156
+ # Tells whether the entry uses a data descriptor (this is defined
157
+ # by bit 3 in the GP flags).
158
+ def uses_data_descriptor?
159
+ (gp_flags & 0x0008) == 0x0008
160
+ end
161
+
142
162
  # Sets the offset at which the compressed data for this file starts in the ZIP.
143
163
  # By default, the value will be set by the Reader for you. If you use delayed
144
164
  # reading, you need to set it by using the `get_compressed_data_offset` on the Reader:
@@ -165,7 +185,7 @@ class ZipTricks::FileReader
165
185
  # (since the reads have not been performed yet). As a rule, this option can be left in its
166
186
  # default setting (`true`) unless you want to _only_ read the central directory, or you need
167
187
  # to limit the number of HTTP requests.
168
- # @return [Array<Entry>] an array of entries within the ZIP being parsed
188
+ # @return [Array<ZipEntry>] an array of entries within the ZIP being parsed
169
189
  def read_zip_structure(io:, read_local_headers: true)
170
190
  zip_file_size = io.size
171
191
  eocd_offset = get_eocd_offset(io, zip_file_size)
@@ -194,52 +214,130 @@ class ZipTricks::FileReader
194
214
  entries
195
215
  end
196
216
 
197
- # Get the offset in the IO at which the actual compressed data of the file starts within the ZIP.
217
+ # Sometimes you might encounter truncated ZIP files, which do not contain any central directory
218
+ # whatsoever - or where the central directory is truncated. In that case, employing the technique
219
+ # of reading the ZIP "from the end" is impossible, and the only recourse is reading each local file header
220
+ # in succession. If the entries in such a ZIP use data descriptors, you would need to scan after the entry until
221
+ # you encounter the data descriptor signature - and that might be unreliable at best. Therefore, this reading
222
+ # technique does not support data descriptors. It can however recover the entries you still can read if these
223
+ # entries contain all the necessary information about the contained file.
224
+ #
225
+ # @param io[#tell, #read, #seek] the IO-ish object to read the local file headers from
226
+ # @return [Array<ZipEntry>] an array of entries that could be recovered before hitting EOF
227
+ def read_zip_straight_ahead(io:)
228
+ entries = []
229
+ loop do
230
+ cur_offset = io.tell
231
+ entry = read_local_file_header(io: io)
232
+ if entry.uses_data_descriptor?
233
+ raise UnsupportedFeature, "The local file header at #{cur_offset} uses a data descriptor and the start of next entry cannot be found"
234
+ end
235
+ entries << entry
236
+ next_local_header_offset = entry.compressed_data_offset + entry.compressed_size
237
+ log { 'Recovered a local file file header at offset %d, seeking to the next at %d' % [cur_offset, next_local_header_offset] }
238
+ seek(io, next_local_header_offset) # Seek to the next entry, and raise if seek is impossible
239
+ end
240
+ entries
241
+ rescue ReadError
242
+ log { 'Got a read/seek error after reaching %d, no more entries can be recovered' % cur_offset }
243
+ entries
244
+ end
245
+
246
+ # Parse the local header entry and get the offset in the IO at which the actual compressed data of the
247
+ # file starts within the ZIP.
198
248
  # The method will eager-read the entire local header for the file (the maximum size the local header may use),
199
249
  # starting at the given offset, and will then compute its size. That size plus the local header offset
200
250
  # given will be the compressed data offset of the entry (read starting at this offset to get the data).
201
251
  #
202
- # @param io[#seek, #read] an IO-ish object the ZIP file can be read from
203
- # @param local_header_offset[Fixnum] absolute offset (0-based) where the local file header is supposed to begin
204
- # @return [Fixnum] absolute offset (0-based) of where the compressed data begins for this file within the ZIP
205
- def get_compressed_data_offset(io:, local_file_header_offset:)
206
- seek(io, local_file_header_offset)
207
-
252
+ # @param io[#tell, #read] an IO-ish object the ZIP file can be read from
253
+ # @return [ZipEntry] the parsed local header entry, with its compressed data offset set
254
+ def read_local_file_header(io:)
255
+ local_file_header_offset = io.tell
256
+
208
257
  # Reading in bulk is cheaper - grab the maximum length of the local header,
209
- # including any headroom
258
+ # including any headroom for extra fields etc.
210
259
  local_file_header_str_plus_headroom = io.read(MAX_LOCAL_HEADER_SIZE)
260
+ raise ReadError if local_file_header_str_plus_headroom.nil? # reached EOF
261
+
211
262
  io_starting_at_local_header = StringIO.new(local_file_header_str_plus_headroom)
212
263
 
213
264
  assert_signature(io_starting_at_local_header, 0x04034b50)
214
-
215
- # The rest is unreliable, and we have that information from the central directory already.
216
- # So just skip over it to get at the offset where the compressed data begins
217
- skip_ahead_2(io_starting_at_local_header) # Version needed to extract
218
- skip_ahead_2(io_starting_at_local_header) # gp flags
219
- skip_ahead_2(io_starting_at_local_header) # storage mode
220
- skip_ahead_2(io_starting_at_local_header) # dos time
221
- skip_ahead_2(io_starting_at_local_header) # dos date
222
- skip_ahead_4(io_starting_at_local_header) # CRC32
223
-
224
- skip_ahead_4(io_starting_at_local_header) # Comp size
225
- skip_ahead_4(io_starting_at_local_header) # Uncomp size
265
+ e = ZipEntry.new
266
+ e.version_needed_to_extract = read_2b(io_starting_at_local_header) # Version needed to extract
267
+ e.gp_flags = read_2b(io_starting_at_local_header) # gp flags
268
+ e.storage_mode = read_2b(io_starting_at_local_header) # storage mode
269
+ e.dos_time = read_2b(io_starting_at_local_header) # dos time
270
+ e.dos_date = read_2b(io_starting_at_local_header) # dos date
271
+ e.crc32 = read_4b(io_starting_at_local_header) # CRC32
272
+ e.compressed_size = read_4b(io_starting_at_local_header) # Comp size
273
+ e.uncompressed_size = read_4b(io_starting_at_local_header) # Uncomp size
226
274
 
227
275
  filename_size = read_2b(io_starting_at_local_header)
228
276
  extra_size = read_2b(io_starting_at_local_header)
277
+ e.filename = read_n(io_starting_at_local_header, filename_size)
278
+ extra_fields_str = read_n(io_starting_at_local_header, extra_size)
279
+
280
+ # Parse out the extra fields
281
+ extra_table = parse_out_extra_fields(extra_fields_str)
282
+
283
+ # ...of which we really only need the Zip64 extra
284
+ if zip64_extra_contents = extra_table[1]
285
+ # If the Zip64 extra is present, we let it override all
286
+ # the values fetched from the conventional header
287
+ zip64_extra = StringIO.new(zip64_extra_contents)
288
+ log { 'Will read Zip64 extra data from local header field for %s, %d bytes' % [e.filename, zip64_extra.size] }
289
+ # Now here be dragons. The APPNOTE specifies that
290
+ #
291
+ # > The order of the fields in the ZIP64 extended
292
+ # > information record is fixed, but the fields will
293
+ # > only appear if the corresponding Local or Central
294
+ # > directory record field is set to 0xFFFF or 0xFFFFFFFF.
295
+ #
296
+ # It means that before we read this stuff we need to check if the previously-read
297
+ # values are at overflow, and only _then_ proceed to read them. Bah.
298
+ e.uncompressed_size = read_8b(zip64_extra) if e.uncompressed_size == 0xFFFFFFFF
299
+ e.compressed_size = read_8b(zip64_extra) if e.compressed_size == 0xFFFFFFFF
300
+ end
229
301
 
230
- skip_ahead_n(io_starting_at_local_header, filename_size)
231
- skip_ahead_n(io_starting_at_local_header, extra_size)
302
+ offset = local_file_header_offset + io_starting_at_local_header.tell
303
+ e.compressed_data_offset = offset
232
304
 
233
- local_file_header_offset + io_starting_at_local_header.tell
305
+ e
234
306
  end
235
-
236
- # Parse an IO handle to a ZIP archive into an array of Entry objects.
307
+
308
+ # Get the offset in the IO at which the actual compressed data of the file starts within the ZIP.
309
+ # The method will eager-read the entire local header for the file (the maximum size the local header may use),
310
+ # starting at the given offset, and will then compute its size. That size plus the local header offset
311
+ # given will be the compressed data offset of the entry (read starting at this offset to get the data).
312
+ #
313
+ # @param io[#seek, #read] an IO-ish object the ZIP file can be read from
314
+ # @param local_file_header_offset[Fixnum] absolute offset (0-based) where the local file header is supposed to begin
315
+ # @return [Fixnum] absolute offset (0-based) of where the compressed data begins for this file within the ZIP
316
+ def get_compressed_data_offset(io:, local_file_header_offset:)
317
+ seek(io, local_file_header_offset)
318
+ entry_recovered_from_local_file_header = read_local_file_header(io: io)
319
+ entry_recovered_from_local_file_header.compressed_data_offset
320
+ end
321
+
322
+ # Parse an IO handle to a ZIP archive into an array of ZipEntry objects, reading from the end
323
+ # of the IO object.
237
324
  #
325
+ # @see {#read_zip_structure}
238
326
  # @param options[Hash] any options the instance method of the same name accepts
239
- # @return [Array<Entry>] an array of entries within the ZIP being parsed
327
+ # @return [Array<ZipEntry>] an array of entries within the ZIP being parsed
240
328
  def self.read_zip_structure(**options)
241
329
  new.read_zip_structure(**options)
242
330
  end
331
+
332
+ # Parse an IO handle to a ZIP archive into an array of ZipEntry objects, reading from the start of
333
+ # the file and parsing local file headers one-by-one
334
+ #
335
+ # @see {#read_zip_straight_ahead}
336
+ # @param options[Hash] any options the instance method of the same name accepts
337
+ # @return [Array<ZipEntry>] an array of entries within the ZIP being parsed
338
+ def self.read_zip_straight_ahead(**options)
339
+ new.read_zip_straight_ahead(**options)
340
+ end
243
341
 
244
342
  private
245
343
 
@@ -334,14 +432,7 @@ class ZipTricks::FileReader
334
432
  e.comment = read_n(io, comment_len)
335
433
 
336
434
  # Parse out the extra fields
337
- extra_table = {}
338
- extras_buf = StringIO.new(extras)
339
- until extras_buf.eof? do
340
- extra_id = read_2b(extras_buf)
341
- extra_size = read_2b(extras_buf)
342
- extra_contents = read_n(extras_buf, extra_size)
343
- extra_table[extra_id] = extra_contents
344
- end
435
+ extra_table = parse_out_extra_fields(extras)
345
436
 
346
437
  # ...of which we really only need the Zip64 extra
347
438
  if zip64_extra_contents = extra_table[1]
@@ -378,7 +469,7 @@ class ZipTricks::FileReader
378
469
  str_containing_eocd_record = file_io.read(MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE)
379
470
  eocd_idx_in_buf = locate_eocd_signature(str_containing_eocd_record)
380
471
 
381
- raise "Could not find the EOCD signature in the buffer - maybe a malformed ZIP file" unless eocd_idx_in_buf
472
+ raise MissingEOCD unless eocd_idx_in_buf
382
473
 
383
474
  eocd_offset = implied_position_of_eocd_record + eocd_idx_in_buf
384
475
  log { 'Found EOCD signature at offset %d' % eocd_offset }
@@ -546,4 +637,16 @@ class ZipTricks::FileReader
546
637
  # The most minimal implementation for the method is just this:
547
638
  # $stderr.puts(yield)
548
639
  end
640
+
641
+ def parse_out_extra_fields(extra_fields_str)
642
+ extra_table = {}
643
+ extras_buf = StringIO.new(extra_fields_str)
644
+ until extras_buf.eof? do
645
+ extra_id = read_2b(extras_buf)
646
+ extra_size = read_2b(extras_buf)
647
+ extra_contents = read_n(extras_buf, extra_size)
648
+ extra_table[extra_id] = extra_contents
649
+ end
650
+ extra_table
651
+ end
549
652
  end
@@ -1,6 +1,78 @@
1
1
  require 'spec_helper'
2
2
  describe ZipTricks::FileReader do
3
3
 
4
+ describe 'with a file without EOCD' do
5
+ it 'raises the MissingEOCD exception and refuses to read' do
6
+ f = StringIO.new
7
+ 10.times { f << ('A' * 1024 ) }
8
+ f.rewind
9
+
10
+ expect {
11
+ described_class.read_zip_structure(io: f)
12
+ }.to raise_error(described_class::MissingEOCD)
13
+ end
14
+ end
15
+
16
+ describe 'read_zip_straight_ahead' do
17
+ it 'returns all the entries it can recover' do
18
+ zipfile = StringIO.new
19
+ war_and_peace = File.read(__dir__ + '/war-and-peace.txt')
20
+ ZipTricks::Streamer.open(zipfile) do |zip|
21
+ zip.add_stored_entry filename: 'text1.txt', crc32: Zlib.crc32(war_and_peace), size: war_and_peace.bytesize
22
+ zip << war_and_peace
23
+ zip.add_stored_entry filename: 'text2.txt', crc32: Zlib.crc32(war_and_peace), size: war_and_peace.bytesize
24
+ zip << war_and_peace
25
+ zip.add_stored_entry filename: 'text3.txt', crc32: Zlib.crc32(war_and_peace), size: war_and_peace.bytesize
26
+ zip << war_and_peace
27
+ end
28
+ zipfile.rewind
29
+
30
+ recovered_entries = described_class.read_zip_straight_ahead(io: zipfile)
31
+ expect(recovered_entries.length).to eq(3)
32
+ recovered_entries.each do |entry|
33
+ expect(entry.storage_mode).to eq(0)
34
+ expect(entry.compressed_size).to eq(496006)
35
+ expect(entry.uncompressed_size).to eq(496006)
36
+ end
37
+
38
+ first, second, third = recovered_entries
39
+ expect(first.compressed_data_offset).to eq(39)
40
+ expect(second.compressed_data_offset).to eq(496084)
41
+ expect(third.compressed_data_offset).to eq(992129)
42
+
43
+ recovered_entries.each do |entry|
44
+ zipfile.seek(entry.compressed_data_offset)
45
+ expect(zipfile.read(5)).to eq(war_and_peace[0...5])
46
+ end
47
+ end
48
+
49
+ it 'recovers an entry that uses Zip64 extra fields' do
50
+ zipfile = StringIO.new
51
+ w = ZipTricks::ZipWriter.new
52
+ w.write_local_file_header(io: zipfile, filename: 'big.bin', compressed_size: 0xFFFFFFFFFF, uncompressed_size: 0xFFFFFFFFF,
53
+ crc32: 0, gp_flags: 0, mtime: Time.now, storage_mode: 0)
54
+ zipfile.rewind
55
+ recovered_entries = described_class.read_zip_straight_ahead(io: zipfile)
56
+ expect(recovered_entries.length).to eq(1)
57
+ entry = recovered_entries.shift
58
+ expect(entry.compressed_size).to eq(0xFFFFFFFFFF)
59
+ end
60
+
61
+ it 'raises when an entry uses a data descriptor' do
62
+ zipfile = StringIO.new
63
+ ZipTricks::Streamer.open(zipfile) do |zip|
64
+ zip.write_deflated_file('war-and-peace.txt') do |sink|
65
+ sink << File.read(__dir__ + '/war-and-peace.txt')
66
+ end
67
+ end
68
+ zipfile.rewind
69
+
70
+ expect {
71
+ described_class.read_zip_straight_ahead(io: zipfile)
72
+ }.to raise_error(described_class::UnsupportedFeature)
73
+ end
74
+ end
75
+
4
76
  describe 'with an end-to-end ZIP file to read' do
5
77
  it 'reads and uncompresses the file written deflated with data descriptors' do
6
78
  zipfile = StringIO.new
data/zip_tricks.gemspec CHANGED
@@ -2,17 +2,17 @@
2
2
  # DO NOT EDIT THIS FILE DIRECTLY
3
3
  # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
4
  # -*- encoding: utf-8 -*-
5
- # stub: zip_tricks 4.0.0 ruby lib
5
+ # stub: zip_tricks 4.1.0 ruby lib
6
6
 
7
7
  Gem::Specification.new do |s|
8
8
  s.name = "zip_tricks"
9
- s.version = "4.0.0"
9
+ s.version = "4.1.0"
10
10
 
11
11
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
12
12
  s.require_paths = ["lib"]
13
13
  s.authors = ["Julik Tarkhanov"]
14
- s.date = "2016-08-19"
15
- s.description = "Makes rubyzip stream, for real"
14
+ s.date = "2016-09-14"
15
+ s.description = "Stream out ZIP files from Ruby"
16
16
  s.email = "me@julik.nl"
17
17
  s.extra_rdoc_files = [
18
18
  "LICENSE.txt",
@@ -70,8 +70,8 @@ Gem::Specification.new do |s|
70
70
  ]
71
71
  s.homepage = "http://github.com/wetransfer/zip_tricks"
72
72
  s.licenses = ["MIT"]
73
- s.rubygems_version = "2.2.2"
74
- s.summary = "Makes rubyzip stream, for real"
73
+ s.rubygems_version = "2.5.1"
74
+ s.summary = "Stream out ZIP files from Ruby"
75
75
 
76
76
  if s.respond_to? :specification_version then
77
77
  s.specification_version = 4
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: zip_tricks
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.0.0
4
+ version: 4.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Julik Tarkhanov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-08-19 00:00:00.000000000 Z
11
+ date: 2016-09-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rubyzip
@@ -162,7 +162,7 @@ dependencies:
162
162
  - - "~>"
163
163
  - !ruby/object:Gem::Version
164
164
  version: 2.0.1
165
- description: Makes rubyzip stream, for real
165
+ description: Stream out ZIP files from Ruby
166
166
  email: me@julik.nl
167
167
  executables: []
168
168
  extensions: []
@@ -238,8 +238,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
238
238
  version: '0'
239
239
  requirements: []
240
240
  rubyforge_project:
241
- rubygems_version: 2.2.2
241
+ rubygems_version: 2.5.1
242
242
  signing_key:
243
243
  specification_version: 4
244
- summary: Makes rubyzip stream, for real
244
+ summary: Stream out ZIP files from Ruby
245
245
  test_files: []