zip_tricks 4.0.0 → 4.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6b478a7ffbae2dbb20270a1e4c4b63f495feef87
4
- data.tar.gz: d9a8ca3a1596ee653fcf2e2b9153a9e59bd912fc
3
+ metadata.gz: f6d3a4df27461f1235e526d7d40eb089f4df5c64
4
+ data.tar.gz: 179df69ea7d164f9164aabcc0c19d951e8d5748f
5
5
  SHA512:
6
- metadata.gz: 689d25f85a79987750fe3bbdf845cfe76c987713709a3d7fc8d2bc944a570958168c2b6c38a6f657b254d499052c079aad84c85b6421b9c228740dd3c9f79903
7
- data.tar.gz: 434ffe473e5b0339b40ead5437e070cc1bd6dcce910db58e516a207806cd0eab16f86cc04d342201e2f31f121438308351239d923f4b4e5311850755c77ce339
6
+ metadata.gz: 68131f0f180074731223f145f06d42f65d7ee327bfcc73c7445a9ae2be198ba8ed2372cffb9deba8729c7b3af46eb4cacdeb3ae408fd37a202c848933ce93ffa
7
+ data.tar.gz: 9e38d0ce079b7e367b75db3b6ebd032fef11802f12926cf586f061bdb7f059c1087505b2556cfd76062e288601923d51e4c7a1ca5ca5bb928fcae385ee92807b
data/README.md CHANGED
@@ -74,7 +74,7 @@ to that socket using some accelerated writing technique, and only use the Stream
74
74
  ZipTricks::Streamer.open(io) do | zip |
75
75
  # raw_file is written "as is" (STORED mode).
76
76
  # Write the local file header first..
77
- zip.add_stored_entry("first-file.bin", raw_file.size, raw_file_crc32)
77
+ zip.add_stored_entry(filename: "first-file.bin", size: raw_file.size, crc32: raw_file_crc32)
78
78
 
79
79
  # then send the actual file contents bypassing the Streamer interface
80
80
  io.sendfile(my_temp_file)
data/Rakefile CHANGED
@@ -18,8 +18,8 @@ Jeweler::Tasks.new do |gem|
18
18
  gem.homepage = "http://github.com/wetransfer/zip_tricks"
19
19
  gem.license = "MIT"
20
20
  gem.version = ZipTricks::VERSION
21
- gem.summary = %Q{Makes rubyzip stream, for real}
22
- gem.description = %Q{Makes rubyzip stream, for real}
21
+ gem.summary = 'Stream out ZIP files from Ruby'
22
+ gem.description = 'Stream out ZIP files from Ruby'
23
23
  gem.email = "me@julik.nl"
24
24
  gem.authors = ["Julik Tarkhanov"]
25
25
  gem.files.exclude "testing/**/*"
data/lib/zip_tricks.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  module ZipTricks
2
- VERSION = '4.0.0'
2
+ VERSION = '4.1.0'
3
3
 
4
4
  # Require all the sub-components except myself
5
5
  Dir.glob(__dir__ + '/**/*.rb').sort.each {|p| require p unless p == __FILE__ }
@@ -41,11 +41,20 @@ require 'stringio'
41
41
  #
42
42
  # ## Mode of operation
43
43
  #
44
- # Basically, `FileReader` _ignores_ the data in local file headers (as it is often unreliable).
44
+ # By default, `FileReader` _ignores_ the data in local file headers (as it is often unreliable).
45
45
  # It reads the ZIP file "from the tail", finds the end-of-central-directory signatures, then
46
46
  # reads the central directory entries, reconstitutes the entries with their filenames, attributes
47
47
  # and so on, and sets these entries up with the absolute _offsets_ into the source file/IO object.
48
48
  # These offsets can then be used to extract the actual compressed data of the files and to expand it.
49
+ #
50
+ # ## Recovering damaged or incomplete ZIP files
51
+ #
52
+ # If the ZIP file you are trying to read does not contain the central directory records `read_zip_structure`
53
+ # will not work, since it starts the read process from the EOCD marker at the end of the central directory
54
+ # and then crawls "back" in the IO to figure out the rest. You can explicitly apply a fallback for reading the
55
+ # archive "straight ahead" instead using `read_zip_straight_ahead` - the method will instead scan your IO from
56
+ # the very start, skipping over the actual entry data. This is less efficient than central directory parsing since
57
+ # it involves a much larger number of reads (1 read from the IO per entry in the ZIP).
49
58
  class ZipTricks::FileReader
50
59
  require_relative 'file_reader/stored_reader'
51
60
  require_relative 'file_reader/inflating_reader'
@@ -58,6 +67,11 @@ class ZipTricks::FileReader
58
67
  "The compressed data offset is not available (local header has not been read)"
59
68
  end
60
69
  end
70
+ MissingEOCD = Class.new(StandardError) do
71
+ def message
72
+ "Could not find the EOCD signature in the buffer - maybe a malformed ZIP file"
73
+ end
74
+ end
61
75
 
62
76
  private_constant :StoredReader, :InflatingReader
63
77
 
@@ -139,6 +153,12 @@ class ZipTricks::FileReader
139
153
  !@compressed_data_offset.nil?
140
154
  end
141
155
 
156
+ # Tells whether the entry uses a data descriptor (this is defined
157
+ # by bit 3 in the GP flags).
158
+ def uses_data_descriptor?
159
+ (gp_flags & 0x0008) == 0x0008
160
+ end
161
+
142
162
  # Sets the offset at which the compressed data for this file starts in the ZIP.
143
163
  # By default, the value will be set by the Reader for you. If you use delayed
144
164
  # reading, you need to set it by using the `get_compressed_data_offset` on the Reader:
@@ -165,7 +185,7 @@ class ZipTricks::FileReader
165
185
  # (since the reads have not been performed yet). As a rule, this option can be left in its
166
186
  # default setting (`true`) unless you want to _only_ read the central directory, or you need
167
187
  # to limit the number of HTTP requests.
168
- # @return [Array<Entry>] an array of entries within the ZIP being parsed
188
+ # @return [Array<ZipEntry>] an array of entries within the ZIP being parsed
169
189
  def read_zip_structure(io:, read_local_headers: true)
170
190
  zip_file_size = io.size
171
191
  eocd_offset = get_eocd_offset(io, zip_file_size)
@@ -194,52 +214,130 @@ class ZipTricks::FileReader
194
214
  entries
195
215
  end
196
216
 
197
- # Get the offset in the IO at which the actual compressed data of the file starts within the ZIP.
217
+ # Sometimes you might encounter truncated ZIP files, which do not contain any central directory
218
+ # whatsoever - or where the central directory is truncated. In that case, employing the technique
219
+ # of reading the ZIP "from the end" is impossible, and the only recourse is reading each local file header
220
+ # in succession. If the entries in such a ZIP use data descriptors, you would need to scan after the entry until
221
+ # you encounter the data descriptor signature - and that might be unreliable at best. Therefore, this reading
222
+ # technique does not support data descriptors. It can however recover the entries you still can read if these
223
+ # entries contain all the necessary information about the contained file.
224
+ #
225
+ # @param io[#tell, #read, #seek] the IO-ish object to read the local file headers from
226
+ # @return [Array<ZipEntry>] an array of entries that could be recovered before hitting EOF
227
+ def read_zip_straight_ahead(io:)
228
+ entries = []
229
+ loop do
230
+ cur_offset = io.tell
231
+ entry = read_local_file_header(io: io)
232
+ if entry.uses_data_descriptor?
233
+ raise UnsupportedFeature, "The local file header at #{cur_offset} uses a data descriptor and the start of next entry cannot be found"
234
+ end
235
+ entries << entry
236
+ next_local_header_offset = entry.compressed_data_offset + entry.compressed_size
237
+ log { 'Recovered a local file file header at offset %d, seeking to the next at %d' % [cur_offset, next_local_header_offset] }
238
+ seek(io, next_local_header_offset) # Seek to the next entry, and raise if seek is impossible
239
+ end
240
+ entries
241
+ rescue ReadError
242
+ log { 'Got a read/seek error after reaching %d, no more entries can be recovered' % cur_offset }
243
+ entries
244
+ end
245
+
246
+ # Parse the local header entry and get the offset in the IO at which the actual compressed data of the
247
+ # file starts within the ZIP.
198
248
  # The method will eager-read the entire local header for the file (the maximum size the local header may use),
199
249
  # starting at the given offset, and will then compute its size. That size plus the local header offset
200
250
  # given will be the compressed data offset of the entry (read starting at this offset to get the data).
201
251
  #
202
- # @param io[#seek, #read] an IO-ish object the ZIP file can be read from
203
- # @param local_header_offset[Fixnum] absolute offset (0-based) where the local file header is supposed to begin
204
- # @return [Fixnum] absolute offset (0-based) of where the compressed data begins for this file within the ZIP
205
- def get_compressed_data_offset(io:, local_file_header_offset:)
206
- seek(io, local_file_header_offset)
207
-
252
+ # @param io[#tell, #read] an IO-ish object the ZIP file can be read from
253
+ # @return [ZipEntry] the parsed local header entry, with its compressed data offset set
254
+ def read_local_file_header(io:)
255
+ local_file_header_offset = io.tell
256
+
208
257
  # Reading in bulk is cheaper - grab the maximum length of the local header,
209
- # including any headroom
258
+ # including any headroom for extra fields etc.
210
259
  local_file_header_str_plus_headroom = io.read(MAX_LOCAL_HEADER_SIZE)
260
+ raise ReadError if local_file_header_str_plus_headroom.nil? # reached EOF
261
+
211
262
  io_starting_at_local_header = StringIO.new(local_file_header_str_plus_headroom)
212
263
 
213
264
  assert_signature(io_starting_at_local_header, 0x04034b50)
214
-
215
- # The rest is unreliable, and we have that information from the central directory already.
216
- # So just skip over it to get at the offset where the compressed data begins
217
- skip_ahead_2(io_starting_at_local_header) # Version needed to extract
218
- skip_ahead_2(io_starting_at_local_header) # gp flags
219
- skip_ahead_2(io_starting_at_local_header) # storage mode
220
- skip_ahead_2(io_starting_at_local_header) # dos time
221
- skip_ahead_2(io_starting_at_local_header) # dos date
222
- skip_ahead_4(io_starting_at_local_header) # CRC32
223
-
224
- skip_ahead_4(io_starting_at_local_header) # Comp size
225
- skip_ahead_4(io_starting_at_local_header) # Uncomp size
265
+ e = ZipEntry.new
266
+ e.version_needed_to_extract = read_2b(io_starting_at_local_header) # Version needed to extract
267
+ e.gp_flags = read_2b(io_starting_at_local_header) # gp flags
268
+ e.storage_mode = read_2b(io_starting_at_local_header) # storage mode
269
+ e.dos_time = read_2b(io_starting_at_local_header) # dos time
270
+ e.dos_date = read_2b(io_starting_at_local_header) # dos date
271
+ e.crc32 = read_4b(io_starting_at_local_header) # CRC32
272
+ e.compressed_size = read_4b(io_starting_at_local_header) # Comp size
273
+ e.uncompressed_size = read_4b(io_starting_at_local_header) # Uncomp size
226
274
 
227
275
  filename_size = read_2b(io_starting_at_local_header)
228
276
  extra_size = read_2b(io_starting_at_local_header)
277
+ e.filename = read_n(io_starting_at_local_header, filename_size)
278
+ extra_fields_str = read_n(io_starting_at_local_header, extra_size)
279
+
280
+ # Parse out the extra fields
281
+ extra_table = parse_out_extra_fields(extra_fields_str)
282
+
283
+ # ...of which we really only need the Zip64 extra
284
+ if zip64_extra_contents = extra_table[1]
285
+ # If the Zip64 extra is present, we let it override all
286
+ # the values fetched from the conventional header
287
+ zip64_extra = StringIO.new(zip64_extra_contents)
288
+ log { 'Will read Zip64 extra data from local header field for %s, %d bytes' % [e.filename, zip64_extra.size] }
289
+ # Now here be dragons. The APPNOTE specifies that
290
+ #
291
+ # > The order of the fields in the ZIP64 extended
292
+ # > information record is fixed, but the fields will
293
+ # > only appear if the corresponding Local or Central
294
+ # > directory record field is set to 0xFFFF or 0xFFFFFFFF.
295
+ #
296
+ # It means that before we read this stuff we need to check if the previously-read
297
+ # values are at overflow, and only _then_ proceed to read them. Bah.
298
+ e.uncompressed_size = read_8b(zip64_extra) if e.uncompressed_size == 0xFFFFFFFF
299
+ e.compressed_size = read_8b(zip64_extra) if e.compressed_size == 0xFFFFFFFF
300
+ end
229
301
 
230
- skip_ahead_n(io_starting_at_local_header, filename_size)
231
- skip_ahead_n(io_starting_at_local_header, extra_size)
302
+ offset = local_file_header_offset + io_starting_at_local_header.tell
303
+ e.compressed_data_offset = offset
232
304
 
233
- local_file_header_offset + io_starting_at_local_header.tell
305
+ e
234
306
  end
235
-
236
- # Parse an IO handle to a ZIP archive into an array of Entry objects.
307
+
308
+ # Get the offset in the IO at which the actual compressed data of the file starts within the ZIP.
309
+ # The method will eager-read the entire local header for the file (the maximum size the local header may use),
310
+ # starting at the given offset, and will then compute its size. That size plus the local header offset
311
+ # given will be the compressed data offset of the entry (read starting at this offset to get the data).
312
+ #
313
+ # @param io[#seek, #read] an IO-ish object the ZIP file can be read from
314
+ # @param local_file_header_offset[Fixnum] absolute offset (0-based) where the local file header is supposed to begin
315
+ # @return [Fixnum] absolute offset (0-based) of where the compressed data begins for this file within the ZIP
316
+ def get_compressed_data_offset(io:, local_file_header_offset:)
317
+ seek(io, local_file_header_offset)
318
+ entry_recovered_from_local_file_header = read_local_file_header(io: io)
319
+ entry_recovered_from_local_file_header.compressed_data_offset
320
+ end
321
+
322
+ # Parse an IO handle to a ZIP archive into an array of Entry objects, reading from the end
323
+ # of the IO object.
237
324
  #
325
+ # @see {#read_zip_structure}
238
326
  # @param options[Hash] any options the instance method of the same name accepts
239
- # @return [Array<Entry>] an array of entries within the ZIP being parsed
327
+ # @return [Array<ZipEntry>] an array of entries within the ZIP being parsed
240
328
  def self.read_zip_structure(**options)
241
329
  new.read_zip_structure(**options)
242
330
  end
331
+
332
+ # Parse an IO handle to a ZIP archive into an array of Entry objects, reading from the start of
333
+ # the file and parsing local file headers one-by-one
334
+ #
335
+ # @see {#read_zip_straight_ahead}
336
+ # @param options[Hash] any options the instance method of the same name accepts
337
+ # @return [Array<ZipEntry>] an array of entries within the ZIP being parsed
338
+ def self.read_zip_straight_ahead(**options)
339
+ new.read_zip_straight_ahead(**options)
340
+ end
243
341
 
244
342
  private
245
343
 
@@ -334,14 +432,7 @@ class ZipTricks::FileReader
334
432
  e.comment = read_n(io, comment_len)
335
433
 
336
434
  # Parse out the extra fields
337
- extra_table = {}
338
- extras_buf = StringIO.new(extras)
339
- until extras_buf.eof? do
340
- extra_id = read_2b(extras_buf)
341
- extra_size = read_2b(extras_buf)
342
- extra_contents = read_n(extras_buf, extra_size)
343
- extra_table[extra_id] = extra_contents
344
- end
435
+ extra_table = parse_out_extra_fields(extras)
345
436
 
346
437
  # ...of which we really only need the Zip64 extra
347
438
  if zip64_extra_contents = extra_table[1]
@@ -378,7 +469,7 @@ class ZipTricks::FileReader
378
469
  str_containing_eocd_record = file_io.read(MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE)
379
470
  eocd_idx_in_buf = locate_eocd_signature(str_containing_eocd_record)
380
471
 
381
- raise "Could not find the EOCD signature in the buffer - maybe a malformed ZIP file" unless eocd_idx_in_buf
472
+ raise MissingEOCD unless eocd_idx_in_buf
382
473
 
383
474
  eocd_offset = implied_position_of_eocd_record + eocd_idx_in_buf
384
475
  log { 'Found EOCD signature at offset %d' % eocd_offset }
@@ -546,4 +637,16 @@ class ZipTricks::FileReader
546
637
  # The most minimal implementation for the method is just this:
547
638
  # $stderr.puts(yield)
548
639
  end
640
+
641
+ def parse_out_extra_fields(extra_fields_str)
642
+ extra_table = {}
643
+ extras_buf = StringIO.new(extra_fields_str)
644
+ until extras_buf.eof? do
645
+ extra_id = read_2b(extras_buf)
646
+ extra_size = read_2b(extras_buf)
647
+ extra_contents = read_n(extras_buf, extra_size)
648
+ extra_table[extra_id] = extra_contents
649
+ end
650
+ extra_table
651
+ end
549
652
  end
@@ -1,6 +1,78 @@
1
1
  require 'spec_helper'
2
2
  describe ZipTricks::FileReader do
3
3
 
4
+ describe 'with a file without EOCD' do
5
+ it 'raises the MissingEOCD exception and refuses to read' do
6
+ f = StringIO.new
7
+ 10.times { f << ('A' * 1024 ) }
8
+ f.rewind
9
+
10
+ expect {
11
+ described_class.read_zip_structure(io: f)
12
+ }.to raise_error(described_class::MissingEOCD)
13
+ end
14
+ end
15
+
16
+ describe 'read_zip_straight_ahead' do
17
+ it 'returns all the entries it can recover' do
18
+ zipfile = StringIO.new
19
+ war_and_peace = File.read(__dir__ + '/war-and-peace.txt')
20
+ ZipTricks::Streamer.open(zipfile) do |zip|
21
+ zip.add_stored_entry filename: 'text1.txt', crc32: Zlib.crc32(war_and_peace), size: war_and_peace.bytesize
22
+ zip << war_and_peace
23
+ zip.add_stored_entry filename: 'text2.txt', crc32: Zlib.crc32(war_and_peace), size: war_and_peace.bytesize
24
+ zip << war_and_peace
25
+ zip.add_stored_entry filename: 'text3.txt', crc32: Zlib.crc32(war_and_peace), size: war_and_peace.bytesize
26
+ zip << war_and_peace
27
+ end
28
+ zipfile.rewind
29
+
30
+ recovered_entries = described_class.read_zip_straight_ahead(io: zipfile)
31
+ expect(recovered_entries.length).to eq(3)
32
+ recovered_entries.each do |entry|
33
+ expect(entry.storage_mode).to eq(0)
34
+ expect(entry.compressed_size).to eq(496006)
35
+ expect(entry.uncompressed_size).to eq(496006)
36
+ end
37
+
38
+ first, second, third = recovered_entries
39
+ expect(first.compressed_data_offset).to eq(39)
40
+ expect(second.compressed_data_offset).to eq(496084)
41
+ expect(third.compressed_data_offset).to eq(992129)
42
+
43
+ recovered_entries.each do |entry|
44
+ zipfile.seek(entry.compressed_data_offset)
45
+ expect(zipfile.read(5)).to eq(war_and_peace[0...5])
46
+ end
47
+ end
48
+
49
+ it 'recovers an entry that uses Zip64 extra fields' do
50
+ zipfile = StringIO.new
51
+ w = ZipTricks::ZipWriter.new
52
+ w.write_local_file_header(io: zipfile, filename: 'big.bin', compressed_size: 0xFFFFFFFFFF, uncompressed_size: 0xFFFFFFFFF,
53
+ crc32: 0, gp_flags: 0, mtime: Time.now, storage_mode: 0)
54
+ zipfile.rewind
55
+ recovered_entries = described_class.read_zip_straight_ahead(io: zipfile)
56
+ expect(recovered_entries.length).to eq(1)
57
+ entry = recovered_entries.shift
58
+ expect(entry.compressed_size).to eq(0xFFFFFFFFFF)
59
+ end
60
+
61
+ it 'raises when an entry uses a data descriptor' do
62
+ zipfile = StringIO.new
63
+ ZipTricks::Streamer.open(zipfile) do |zip|
64
+ zip.write_deflated_file('war-and-peace.txt') do |sink|
65
+ sink << File.read(__dir__ + '/war-and-peace.txt')
66
+ end
67
+ end
68
+ zipfile.rewind
69
+
70
+ expect {
71
+ described_class.read_zip_straight_ahead(io: zipfile)
72
+ }.to raise_error(described_class::UnsupportedFeature)
73
+ end
74
+ end
75
+
4
76
  describe 'with an end-to-end ZIP file to read' do
5
77
  it 'reads and uncompresses the file written deflated with data descriptors' do
6
78
  zipfile = StringIO.new
data/zip_tricks.gemspec CHANGED
@@ -2,17 +2,17 @@
2
2
  # DO NOT EDIT THIS FILE DIRECTLY
3
3
  # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
4
  # -*- encoding: utf-8 -*-
5
- # stub: zip_tricks 4.0.0 ruby lib
5
+ # stub: zip_tricks 4.1.0 ruby lib
6
6
 
7
7
  Gem::Specification.new do |s|
8
8
  s.name = "zip_tricks"
9
- s.version = "4.0.0"
9
+ s.version = "4.1.0"
10
10
 
11
11
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
12
12
  s.require_paths = ["lib"]
13
13
  s.authors = ["Julik Tarkhanov"]
14
- s.date = "2016-08-19"
15
- s.description = "Makes rubyzip stream, for real"
14
+ s.date = "2016-09-14"
15
+ s.description = "Stream out ZIP files from Ruby"
16
16
  s.email = "me@julik.nl"
17
17
  s.extra_rdoc_files = [
18
18
  "LICENSE.txt",
@@ -70,8 +70,8 @@ Gem::Specification.new do |s|
70
70
  ]
71
71
  s.homepage = "http://github.com/wetransfer/zip_tricks"
72
72
  s.licenses = ["MIT"]
73
- s.rubygems_version = "2.2.2"
74
- s.summary = "Makes rubyzip stream, for real"
73
+ s.rubygems_version = "2.5.1"
74
+ s.summary = "Stream out ZIP files from Ruby"
75
75
 
76
76
  if s.respond_to? :specification_version then
77
77
  s.specification_version = 4
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: zip_tricks
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.0.0
4
+ version: 4.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Julik Tarkhanov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-08-19 00:00:00.000000000 Z
11
+ date: 2016-09-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rubyzip
@@ -162,7 +162,7 @@ dependencies:
162
162
  - - "~>"
163
163
  - !ruby/object:Gem::Version
164
164
  version: 2.0.1
165
- description: Makes rubyzip stream, for real
165
+ description: Stream out ZIP files from Ruby
166
166
  email: me@julik.nl
167
167
  executables: []
168
168
  extensions: []
@@ -238,8 +238,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
238
238
  version: '0'
239
239
  requirements: []
240
240
  rubyforge_project:
241
- rubygems_version: 2.2.2
241
+ rubygems_version: 2.5.1
242
242
  signing_key:
243
243
  specification_version: 4
244
- summary: Makes rubyzip stream, for real
244
+ summary: Stream out ZIP files from Ruby
245
245
  test_files: []