zip_tricks 5.1.1 → 5.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -91,8 +91,9 @@ class ZipTricks::Streamer
91
91
  InvalidOutput = Class.new(ArgumentError)
92
92
  Overflow = Class.new(StandardError)
93
93
  UnknownMode = Class.new(StandardError)
94
+ OffsetOutOfSync = Class.new(StandardError)
94
95
 
95
- private_constant :DeflatedWriter, :StoredWriter, :STORED, :DEFLATED
96
+ private_constant :STORED, :DEFLATED
96
97
 
97
98
  # Creates a new Streamer on top of the given IO-ish object and yields it. Once the given block
98
99
  # returns, the Streamer will have its `close` method called, which will write out the central
@@ -130,28 +131,26 @@ class ZipTricks::Streamer
130
131
  # end
131
132
  #
132
133
  # @param kwargs_for_new [Hash] keyword arguments for {Streamer.new}
133
- # @return [Enumerator] the enumerator you can read bytestrings of the ZIP from using `each`
134
+ # @return [ZipTricks::OutputEnumerator] the enumerator you can read bytestrings of the ZIP from by calling `each`
134
135
  def self.output_enum(**kwargs_for_new, &zip_streamer_block)
135
136
  ZipTricks::OutputEnumerator.new(**kwargs_for_new, &zip_streamer_block)
136
137
  end
137
138
 
138
139
  # Creates a new Streamer on top of the given IO-ish object.
139
140
  #
140
- # @param stream[IO] the destination IO for the ZIP. Anything that responds to `<<` can be used.
141
+ # @param writable[#<<] the destination IO for the ZIP. Anything that responds to `<<` can be used.
141
142
  # @param writer[ZipTricks::ZipWriter] the object to be used as the writer.
142
143
  # Defaults to an instance of ZipTricks::ZipWriter, normally you won't need to override it
143
144
  # @param auto_rename_duplicate_filenames[Boolean] whether duplicate filenames, when encountered,
144
145
  # should be suffixed with (1), (2) etc. Default value is `false` - if
145
146
  # duplicate names are used an exception will be raised
146
- def initialize(stream, writer: create_writer, auto_rename_duplicate_filenames: false)
147
- raise InvalidOutput, 'The stream must respond to #<<' unless stream.respond_to?(:<<)
148
-
149
- @dedupe_filenames = auto_rename_duplicate_filenames
150
- @out = ZipTricks::WriteAndTell.new(stream)
147
+ def initialize(writable, writer: create_writer, auto_rename_duplicate_filenames: false)
148
+ raise InvalidOutput, 'The writable must respond to #<<' unless writable.respond_to?(:<<)
149
+ @out = ZipTricks::WriteAndTell.new(writable)
151
150
  @files = []
152
- @local_header_offsets = []
153
151
  @path_set = ZipTricks::PathSet.new
154
152
  @writer = writer
153
+ @dedupe_filenames = auto_rename_duplicate_filenames
155
154
  end
156
155
 
157
156
  # Writes a part of a zip entry body (actual binary data of the entry) into the output stream.
@@ -213,9 +212,6 @@ class ZipTricks::Streamer
213
212
  @out.tell
214
213
  end
215
214
 
216
- # Will be phased out in ZipTricks 5.x
217
- alias_method :add_compressed_entry, :add_deflated_entry
218
-
219
215
  # Writes out the local header for an entry (file in the ZIP) that is using
220
216
  # the stored storage model (is stored as-is).
221
217
  # Once this method is called, the `<<` method has to be called one or more
@@ -363,14 +359,16 @@ class ZipTricks::Streamer
363
359
  #
364
360
  # @return [Integer] the offset the output IO is at after closing the archive
365
361
  def close
362
+ # Make sure offsets are in order
363
+ verify_offsets!
364
+
366
365
  # Record the central directory offset, so that it can be written into the EOCD record
367
366
  cdir_starts_at = @out.tell
368
367
 
369
368
  # Write out the central directory entries, one for each file
370
- @files.each_with_index do |entry, i|
371
- header_loc = @local_header_offsets.fetch(i)
369
+ @files.each do |entry|
372
370
  @writer.write_central_directory_file_header(io: @out,
373
- local_file_header_location: header_loc,
371
+ local_file_header_location: entry.local_header_offset,
374
372
  gp_flags: entry.gp_flags,
375
373
  storage_mode: entry.storage_mode,
376
374
  compressed_size: entry.compressed_size,
@@ -423,15 +421,40 @@ class ZipTricks::Streamer
423
421
  last_entry.compressed_size = compressed_size
424
422
  last_entry.uncompressed_size = uncompressed_size
425
423
 
424
+ offset_before_data_descriptor = @out.tell
426
425
  @writer.write_data_descriptor(io: @out,
427
426
  crc32: last_entry.crc32,
428
427
  compressed_size: last_entry.compressed_size,
429
428
  uncompressed_size: last_entry.uncompressed_size)
429
+ last_entry.bytes_used_for_data_descriptor = @out.tell - offset_before_data_descriptor
430
+
430
431
  @out.tell
431
432
  end
432
433
 
433
434
  private
434
435
 
436
+ def verify_offsets!
437
+ # We need to check whether the offsets noted for the entries actually make sense
438
+ computed_offset = @files.map(&:total_bytes_used).inject(0, &:+)
439
+ actual_offset = @out.tell
440
+ if computed_offset != actual_offset
441
+ message = <<-EMS
442
+ The offset of the Streamer output IO is out of sync with the expected value. All entries written so far,
443
+ including their compressed bodies, local headers and data descriptors, add up to a certain offset,
444
+ but this offset does not match the actual offset of the IO.
445
+
446
+ Entries add up to #{computed_offset} bytes and the IO is at #{actual_offset} bytes.
447
+
448
+ This can happen if you write local headers for an entry, write the "body" of the entry directly to the IO
449
+ object which is your destination, but do not adjust the offset known to the Streamer object. To adjust
450
+ the offfset you need to call `Streamer#simulate_write(body_size)` after outputting the entry. Otherwise
451
+ the local header offsets of the entries you write are going to be incorrect and some ZIP applications
452
+ are going to have problems opening your archive.
453
+ EMS
454
+ raise OffsetOutOfSync, message
455
+ end
456
+ end
457
+
435
458
  def add_file_and_write_local_header(
436
459
  filename:,
437
460
  modification_time:,
@@ -464,16 +487,18 @@ class ZipTricks::Streamer
464
487
  uncompressed_size = 0
465
488
  end
466
489
 
490
+ local_header_starts_at = @out.tell
491
+
467
492
  e = Entry.new(filename,
468
493
  crc32,
469
494
  compressed_size,
470
495
  uncompressed_size,
471
496
  storage_mode,
472
497
  modification_time,
473
- use_data_descriptor)
474
-
475
- @files << e
476
- @local_header_offsets << @out.tell
498
+ use_data_descriptor,
499
+ _local_file_header_offset = local_header_starts_at,
500
+ _bytes_used_for_local_header = 0,
501
+ _bytes_used_for_data_descriptor = 0)
477
502
 
478
503
  @writer.write_local_file_header(io: @out,
479
504
  gp_flags: e.gp_flags,
@@ -483,6 +508,9 @@ class ZipTricks::Streamer
483
508
  mtime: e.mtime,
484
509
  filename: e.filename,
485
510
  storage_mode: e.storage_mode)
511
+ e.bytes_used_for_local_header = @out.tell - e.local_header_offset
512
+
513
+ @files << e
486
514
  end
487
515
 
488
516
  def remove_backslash(filename)
@@ -4,13 +4,6 @@
4
4
  # registers data passing through it in a CRC32 checksum calculator. Is made to be completely
5
5
  # interchangeable with the StoredWriter in terms of interface.
6
6
  class ZipTricks::Streamer::DeflatedWriter
7
- # After how many bytes of incoming data the deflater for the
8
- # contents must be flushed. This is done to prevent unreasonable
9
- # memory use when archiving large files, and to ensure we write to
10
- # the socket often enough while still maintaining acceptable
11
- # compression
12
- FLUSH_EVERY_N_BYTES = 1024 * 1024 * 5
13
-
14
7
  # The amount of bytes we will buffer before computing the intermediate
15
8
  # CRC32 checksums. Benchmarks show that the optimum is 64KB (see
16
9
  # `bench/buffered_crc32_bench.rb`), if that is exceeded Zlib is going
@@ -18,11 +11,10 @@ class ZipTricks::Streamer::DeflatedWriter
18
11
  CRC32_BUFFER_SIZE = 64 * 1024
19
12
 
20
13
  def initialize(io)
21
- @compressed_io = ZipTricks::WriteAndTell.new(io)
22
- @uncompressed_size = 0
14
+ @compressed_io = io
23
15
  @deflater = ::Zlib::Deflate.new(Zlib::DEFAULT_COMPRESSION, -::Zlib::MAX_WBITS)
24
- @crc = ZipTricks::WriteBuffer.new(ZipTricks::StreamCRC32.new, CRC32_BUFFER_SIZE)
25
- @bytes_since_last_flush = 0
16
+ @crc = ZipTricks::StreamCRC32.new
17
+ @crc_buf = ZipTricks::WriteBuffer.new(@crc, CRC32_BUFFER_SIZE)
26
18
  end
27
19
 
28
20
  # Writes the given data into the deflater, and flushes the deflater
@@ -31,13 +23,8 @@ class ZipTricks::Streamer::DeflatedWriter
31
23
  # @param data[String] data to be written
32
24
  # @return self
33
25
  def <<(data)
34
- @uncompressed_size += data.bytesize
35
- @bytes_since_last_flush += data.bytesize
36
- @compressed_io << @deflater.deflate(data)
37
- @crc << data
38
-
39
- interim_flush
40
-
26
+ @deflater.deflate(data) { |chunk| @compressed_io << chunk }
27
+ @crc_buf << data
41
28
  self
42
29
  end
43
30
 
@@ -45,18 +32,12 @@ class ZipTricks::Streamer::DeflatedWriter
45
32
  # compressed data written and the CRC32 checksum. The return value
46
33
  # can be directly used as the argument to {Streamer#update_last_entry_and_write_data_descriptor}
47
34
  #
48
- # @param data[String] data to be written
49
35
  # @return [Hash] a hash of `{crc32, compressed_size, uncompressed_size}`
50
36
  def finish
51
37
  @compressed_io << @deflater.finish until @deflater.finished?
52
- {crc32: @crc.to_i, compressed_size: @compressed_io.tell, uncompressed_size: @uncompressed_size}
53
- end
54
-
55
- private
56
-
57
- def interim_flush
58
- return if @bytes_since_last_flush < FLUSH_EVERY_N_BYTES
59
- @compressed_io << @deflater.flush
60
- @bytes_since_last_flush = 0
38
+ @crc_buf.flush
39
+ {crc32: @crc.to_i, compressed_size: @deflater.total_out, uncompressed_size: @deflater.total_in}
40
+ ensure
41
+ @deflater.close
61
42
  end
62
43
  end
@@ -4,7 +4,7 @@
4
4
  # Normally you will not have to use this class directly
5
5
  class ZipTricks::Streamer::Entry < Struct.new(:filename, :crc32, :compressed_size,
6
6
  :uncompressed_size, :storage_mode, :mtime,
7
- :use_data_descriptor)
7
+ :use_data_descriptor, :local_header_offset, :bytes_used_for_local_header, :bytes_used_for_data_descriptor)
8
8
  def initialize(*)
9
9
  super
10
10
  filename.force_encoding(Encoding::UTF_8)
@@ -15,6 +15,10 @@ class ZipTricks::Streamer::Entry < Struct.new(:filename, :crc32, :compressed_siz
15
15
  end)
16
16
  end
17
17
 
18
+ def total_bytes_used
19
+ bytes_used_for_local_header + compressed_size + bytes_used_for_data_descriptor
20
+ end
21
+
18
22
  # Set the general purpose flags for the entry. What we care about is the EFS
19
23
  # bit (bit 11) which should be set if the filename is UTF8. If it is, we need to set the
20
24
  # bit so that the unarchiving application knows that the filename in the archive is UTF-8
@@ -12,7 +12,8 @@ class ZipTricks::Streamer::StoredWriter
12
12
 
13
13
  def initialize(io)
14
14
  @io = ZipTricks::WriteAndTell.new(io)
15
- @crc = ZipTricks::WriteBuffer.new(ZipTricks::StreamCRC32.new, CRC32_BUFFER_SIZE)
15
+ @crc_compute = ZipTricks::StreamCRC32.new
16
+ @crc = ZipTricks::WriteBuffer.new(@crc_compute, CRC32_BUFFER_SIZE)
16
17
  end
17
18
 
18
19
  # Writes the given data to the contained IO object.
@@ -28,9 +29,9 @@ class ZipTricks::Streamer::StoredWriter
28
29
  # Returns the amount of data written and the CRC32 checksum. The return value
29
30
  # can be directly used as the argument to {Streamer#update_last_entry_and_write_data_descriptor}
30
31
  #
31
- # @param data[String] data to be written
32
32
  # @return [Hash] a hash of `{crc32, compressed_size, uncompressed_size}`
33
33
  def finish
34
- {crc32: @crc.to_i, compressed_size: @io.tell, uncompressed_size: @io.tell}
34
+ @crc.flush
35
+ {crc32: @crc_compute.to_i, compressed_size: @io.tell, uncompressed_size: @io.tell}
35
36
  end
36
37
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ZipTricks
4
- VERSION = '5.1.1'
4
+ VERSION = '5.5.0'
5
5
  end
@@ -10,9 +10,8 @@ class ZipTricks::WriteAndTell
10
10
 
11
11
  def <<(bytes)
12
12
  return self if bytes.nil?
13
- binary_bytes = binary(bytes)
14
- @io << binary_bytes
15
- @pos += binary_bytes.bytesize
13
+ @io << bytes.b
14
+ @pos += bytes.bytesize
16
15
  self
17
16
  end
18
17
 
@@ -23,13 +22,4 @@ class ZipTricks::WriteAndTell
23
22
  def tell
24
23
  @pos
25
24
  end
26
-
27
- private
28
-
29
- def binary(str)
30
- return str if str.encoding == Encoding::BINARY
31
- str.force_encoding(Encoding::BINARY)
32
- rescue RuntimeError # the string is frozen
33
- str.dup.force_encoding(Encoding::BINARY)
34
- end
35
25
  end
@@ -7,13 +7,34 @@
7
7
  # CRC32 combine operations - and this adds up. Since the CRC32 value
8
8
  # is usually not needed until the complete output has completed
9
9
  # we can buffer at least some amount of data before computing CRC32 over it.
10
+ # We also use this buffer for output via Rack, where some amount of buffering
11
+ # helps reduce the number of syscalls made by the webserver. ZipTricks performs
12
+ # lots of very small writes, and some degree of speedup (about 20%) can be achieved
13
+ # with a buffer of a few KB.
14
+ #
15
+ # Note that there is no guarantee that the write buffer is going to flush at or above
16
+ # the given `buffer_size`, because for writes which exceed the buffer size it will
17
+ # first `flush` and then write through the oversized chunk, without buffering it. This
18
+ # helps conserve memory. Also note that the buffer will *not* duplicate strings for you
19
+ # and *will* yield the same buffer String over and over, so if you are storing it in an
20
+ # Array you might need to duplicate it.
21
+ #
22
+ # Note also that the WriteBuffer assumes that the object it `<<`-writes into is going
23
+ # to **consume** in some way the string that it passes in. After the `<<` method returns,
24
+ # the WriteBuffer will be cleared, and it passes the same String reference on every call
25
+ # to `<<`. Therefore, if you need to retain the output of the WriteBuffer in, say, an Array,
26
+ # you might need to `.dup` the `String` it gives you.
10
27
  class ZipTricks::WriteBuffer
11
28
  # Creates a new WriteBuffer bypassing into a given writable object
12
29
  #
13
- # @param writable[#<<] An object that responds to `#<<` with string as argument
30
+ # @param writable[#<<] An object that responds to `#<<` with a String as argument
14
31
  # @param buffer_size[Integer] How many bytes to buffer
15
32
  def initialize(writable, buffer_size)
16
- @buf = StringIO.new
33
+ # Allocating the buffer using a zero-padded String as a variation
34
+ # on using capacity:, which JRuby apparently does not like very much. The
35
+ # desire here is that the buffer doesn't have to be resized during the lifetime
36
+ # of the object.
37
+ @buf = ("\0".b * (buffer_size * 2)).clear
17
38
  @buffer_size = buffer_size
18
39
  @writable = writable
19
40
  end
@@ -24,28 +45,27 @@ class ZipTricks::WriteBuffer
24
45
  # @param data[String] data to be written
25
46
  # @return self
26
47
  def <<(data)
27
- @buf << data
28
- flush! if @buf.size > @buffer_size
48
+ if data.bytesize >= @buffer_size
49
+ flush unless @buf.empty? # <- this is where we can output less than @buffer_size
50
+ @writable << data
51
+ else
52
+ @buf << data
53
+ flush if @buf.bytesize >= @buffer_size
54
+ end
29
55
  self
30
56
  end
31
57
 
32
58
  # Explicitly flushes the buffer if it contains anything
33
59
  #
34
60
  # @return self
35
- def flush!
36
- @writable << @buf.string if @buf.size > 0
37
- @buf.truncate(0)
38
- @buf.rewind
61
+ def flush
62
+ unless @buf.empty?
63
+ @writable << @buf
64
+ @buf.clear
65
+ end
39
66
  self
40
67
  end
41
68
 
42
- # Flushes the buffer and returns the result of `#to_i` of the contained `writable`.
43
- # Primarily facilitates working with StreamCRC32 objects where you finish the
44
- # computation by retrieving the CRC as an integer
45
- #
46
- # @return [Integer] the return value of `writable#to_i`
47
- def to_i
48
- flush!
49
- @writable.to_i
50
- end
69
+ # `flush!` was renamed to `flush` but we preserve this method for backwards compatibility
70
+ alias_method :flush!, :flush
51
71
  end
@@ -57,7 +57,7 @@ class ZipTricks::ZipWriter
57
57
  C_UINT2 = 'v' # Encode a 2-byte unsigned little-endian uint
58
58
  C_UINT8 = 'Q<' # Encode an 8-byte unsigned little-endian uint
59
59
  C_CHAR = 'C' # For bit-encoded strings
60
- C_INT4 = 'N' # Encode a 4-byte signed little-endian int
60
+ C_INT4 = 'l<' # Encode a 4-byte signed little-endian int
61
61
 
62
62
  private_constant :FOUR_BYTE_MAX_UINT,
63
63
  :TWO_BYTE_MAX_UINT,
@@ -195,7 +195,7 @@ class ZipTricks::ZipWriter
195
195
  [TWO_BYTE_MAX_UINT].pack(C_UINT2)
196
196
  else
197
197
  [0].pack(C_UINT2)
198
- end
198
+ end
199
199
  io << [0].pack(C_UINT2) # internal file attributes 2 bytes
200
200
 
201
201
  # Because the add_empty_directory method will create a directory with a trailing "/",
@@ -385,7 +385,7 @@ class ZipTricks::ZipWriter
385
385
  0x5455, C_UINT2, # tag for this extra block type ("UT")
386
386
  (1 + 4), C_UINT2, # the size of this block (1 byte used for the Flag + 3 longs used for the timestamp)
387
387
  flags, C_CHAR, # encode a single byte
388
- mtime.utc.to_i, C_INT4, # Use a signed long, not the unsigned one used by the rest of the ZIP spec.
388
+ mtime.utc.to_i, C_INT4, # Use a signed int, not the unsigned one used by the rest of the ZIP spec.
389
389
  ]
390
390
  # The atime and ctime can be omitted if not present
391
391
  pack_array(data_and_packspecs)
@@ -11,7 +11,7 @@ Gem::Specification.new do |spec|
11
11
  spec.licenses = ['MIT (Hippocratic)']
12
12
  spec.summary = 'Stream out ZIP files from Ruby'
13
13
  spec.description = 'Stream out ZIP files from Ruby'
14
- spec.homepage = 'http://github.com/wetransfer/zip_tricks'
14
+ spec.homepage = 'https://github.com/wetransfer/zip_tricks'
15
15
 
16
16
  # Prevent pushing this gem to RubyGems.org.
17
17
  # To allow pushes either set the 'allowed_push_host'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: zip_tricks
3
3
  version: !ruby/object:Gem::Version
4
- version: 5.1.1
4
+ version: 5.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Julik Tarkhanov
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: exe
13
13
  cert_chain: []
14
- date: 2020-05-07 00:00:00.000000000 Z
14
+ date: 2020-11-23 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: bundler
@@ -262,17 +262,8 @@ files:
262
262
  - lib/zip_tricks/write_and_tell.rb
263
263
  - lib/zip_tricks/write_buffer.rb
264
264
  - lib/zip_tricks/zip_writer.rb
265
- - qa/README_QA.md
266
- - qa/generate_test_files.rb
267
- - qa/in/VTYL8830.jpg
268
- - qa/in/war-and-peace.txt
269
- - qa/support.rb
270
- - qa/test-report-2016-07-28.txt
271
- - qa/test-report-2016-12-12.txt
272
- - qa/test-report-2017-04-2.txt
273
- - qa/test-report.txt
274
265
  - zip_tricks.gemspec
275
- homepage: http://github.com/wetransfer/zip_tricks
266
+ homepage: https://github.com/wetransfer/zip_tricks
276
267
  licenses:
277
268
  - MIT (Hippocratic)
278
269
  metadata:
@@ -1,16 +0,0 @@
1
- ## Manual testing harness for ZipTricks
2
-
3
- These tests will generate **very large** files that test various edge cases of ZIP generation. The idea is to generate
4
- these files and to then try to open them with the unarchiver applications we support. The workflow is as follows:
5
-
6
-
7
- 1. Configure your storage to have `zip_tricks` directory linked into your virtual machines and to be on a fast volume (SSD RAID0 is recommended)
8
- 2. Run `generate_test_files.rb`. This will take some time and produce a number of large ZIP files.
9
- 3. Open them with the following ZIP unarchivers:
10
- * A recent version of `zipinfo` with the `-tlhvz` flags - to see the information about the file
11
- * ArchiveUtility on OSX
12
- * The Unarchiver on OSX
13
- * Built-in Explorer on Windows 7
14
- * 7Zip 9.20 on Windows 7
15
- * Any other unarchivers you consider necessary
16
- * Write down your observations in `test-report.txt` and, when cutting a release, timestamp a copy of that file.