zip_tricks 5.1.1 → 5.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -91,8 +91,9 @@ class ZipTricks::Streamer
91
91
  InvalidOutput = Class.new(ArgumentError)
92
92
  Overflow = Class.new(StandardError)
93
93
  UnknownMode = Class.new(StandardError)
94
+ OffsetOutOfSync = Class.new(StandardError)
94
95
 
95
- private_constant :DeflatedWriter, :StoredWriter, :STORED, :DEFLATED
96
+ private_constant :STORED, :DEFLATED
96
97
 
97
98
  # Creates a new Streamer on top of the given IO-ish object and yields it. Once the given block
98
99
  # returns, the Streamer will have its `close` method called, which will write out the central
@@ -130,28 +131,26 @@ class ZipTricks::Streamer
130
131
  # end
131
132
  #
132
133
  # @param kwargs_for_new [Hash] keyword arguments for {Streamer.new}
133
- # @return [Enumerator] the enumerator you can read bytestrings of the ZIP from using `each`
134
+ # @return [ZipTricks::OutputEnumerator] the enumerator you can read bytestrings of the ZIP from by calling `each`
134
135
  def self.output_enum(**kwargs_for_new, &zip_streamer_block)
135
136
  ZipTricks::OutputEnumerator.new(**kwargs_for_new, &zip_streamer_block)
136
137
  end
137
138
 
138
139
  # Creates a new Streamer on top of the given IO-ish object.
139
140
  #
140
- # @param stream[IO] the destination IO for the ZIP. Anything that responds to `<<` can be used.
141
+ # @param writable[#<<] the destination IO for the ZIP. Anything that responds to `<<` can be used.
141
142
  # @param writer[ZipTricks::ZipWriter] the object to be used as the writer.
142
143
  # Defaults to an instance of ZipTricks::ZipWriter, normally you won't need to override it
143
144
  # @param auto_rename_duplicate_filenames[Boolean] whether duplicate filenames, when encountered,
144
145
  # should be suffixed with (1), (2) etc. Default value is `false` - if
145
146
  # duplicate names are used an exception will be raised
146
- def initialize(stream, writer: create_writer, auto_rename_duplicate_filenames: false)
147
- raise InvalidOutput, 'The stream must respond to #<<' unless stream.respond_to?(:<<)
148
-
149
- @dedupe_filenames = auto_rename_duplicate_filenames
150
- @out = ZipTricks::WriteAndTell.new(stream)
147
+ def initialize(writable, writer: create_writer, auto_rename_duplicate_filenames: false)
148
+ raise InvalidOutput, 'The writable must respond to #<<' unless writable.respond_to?(:<<)
149
+ @out = ZipTricks::WriteAndTell.new(writable)
151
150
  @files = []
152
- @local_header_offsets = []
153
151
  @path_set = ZipTricks::PathSet.new
154
152
  @writer = writer
153
+ @dedupe_filenames = auto_rename_duplicate_filenames
155
154
  end
156
155
 
157
156
  # Writes a part of a zip entry body (actual binary data of the entry) into the output stream.
@@ -213,9 +212,6 @@ class ZipTricks::Streamer
213
212
  @out.tell
214
213
  end
215
214
 
216
- # Will be phased out in ZipTricks 5.x
217
- alias_method :add_compressed_entry, :add_deflated_entry
218
-
219
215
  # Writes out the local header for an entry (file in the ZIP) that is using
220
216
  # the stored storage model (is stored as-is).
221
217
  # Once this method is called, the `<<` method has to be called one or more
@@ -363,14 +359,16 @@ class ZipTricks::Streamer
363
359
  #
364
360
  # @return [Integer] the offset the output IO is at after closing the archive
365
361
  def close
362
+ # Make sure offsets are in order
363
+ verify_offsets!
364
+
366
365
  # Record the central directory offset, so that it can be written into the EOCD record
367
366
  cdir_starts_at = @out.tell
368
367
 
369
368
  # Write out the central directory entries, one for each file
370
- @files.each_with_index do |entry, i|
371
- header_loc = @local_header_offsets.fetch(i)
369
+ @files.each do |entry|
372
370
  @writer.write_central_directory_file_header(io: @out,
373
- local_file_header_location: header_loc,
371
+ local_file_header_location: entry.local_header_offset,
374
372
  gp_flags: entry.gp_flags,
375
373
  storage_mode: entry.storage_mode,
376
374
  compressed_size: entry.compressed_size,
@@ -423,15 +421,40 @@ class ZipTricks::Streamer
423
421
  last_entry.compressed_size = compressed_size
424
422
  last_entry.uncompressed_size = uncompressed_size
425
423
 
424
+ offset_before_data_descriptor = @out.tell
426
425
  @writer.write_data_descriptor(io: @out,
427
426
  crc32: last_entry.crc32,
428
427
  compressed_size: last_entry.compressed_size,
429
428
  uncompressed_size: last_entry.uncompressed_size)
429
+ last_entry.bytes_used_for_data_descriptor = @out.tell - offset_before_data_descriptor
430
+
430
431
  @out.tell
431
432
  end
432
433
 
433
434
  private
434
435
 
436
+ def verify_offsets!
437
+ # We need to check whether the offsets noted for the entries actually make sense
438
+ computed_offset = @files.map(&:total_bytes_used).inject(0, &:+)
439
+ actual_offset = @out.tell
440
+ if computed_offset != actual_offset
441
+ message = <<-EMS
442
+ The offset of the Streamer output IO is out of sync with the expected value. All entries written so far,
443
+ including their compressed bodies, local headers and data descriptors, add up to a certain offset,
444
+ but this offset does not match the actual offset of the IO.
445
+
446
+ Entries add up to #{computed_offset} bytes and the IO is at #{actual_offset} bytes.
447
+
448
+ This can happen if you write local headers for an entry, write the "body" of the entry directly to the IO
449
+ object which is your destination, but do not adjust the offset known to the Streamer object. To adjust
450
+ the offfset you need to call `Streamer#simulate_write(body_size)` after outputting the entry. Otherwise
451
+ the local header offsets of the entries you write are going to be incorrect and some ZIP applications
452
+ are going to have problems opening your archive.
453
+ EMS
454
+ raise OffsetOutOfSync, message
455
+ end
456
+ end
457
+
435
458
  def add_file_and_write_local_header(
436
459
  filename:,
437
460
  modification_time:,
@@ -464,16 +487,18 @@ class ZipTricks::Streamer
464
487
  uncompressed_size = 0
465
488
  end
466
489
 
490
+ local_header_starts_at = @out.tell
491
+
467
492
  e = Entry.new(filename,
468
493
  crc32,
469
494
  compressed_size,
470
495
  uncompressed_size,
471
496
  storage_mode,
472
497
  modification_time,
473
- use_data_descriptor)
474
-
475
- @files << e
476
- @local_header_offsets << @out.tell
498
+ use_data_descriptor,
499
+ _local_file_header_offset = local_header_starts_at,
500
+ _bytes_used_for_local_header = 0,
501
+ _bytes_used_for_data_descriptor = 0)
477
502
 
478
503
  @writer.write_local_file_header(io: @out,
479
504
  gp_flags: e.gp_flags,
@@ -483,6 +508,9 @@ class ZipTricks::Streamer
483
508
  mtime: e.mtime,
484
509
  filename: e.filename,
485
510
  storage_mode: e.storage_mode)
511
+ e.bytes_used_for_local_header = @out.tell - e.local_header_offset
512
+
513
+ @files << e
486
514
  end
487
515
 
488
516
  def remove_backslash(filename)
@@ -4,13 +4,6 @@
4
4
  # registers data passing through it in a CRC32 checksum calculator. Is made to be completely
5
5
  # interchangeable with the StoredWriter in terms of interface.
6
6
  class ZipTricks::Streamer::DeflatedWriter
7
- # After how many bytes of incoming data the deflater for the
8
- # contents must be flushed. This is done to prevent unreasonable
9
- # memory use when archiving large files, and to ensure we write to
10
- # the socket often enough while still maintaining acceptable
11
- # compression
12
- FLUSH_EVERY_N_BYTES = 1024 * 1024 * 5
13
-
14
7
  # The amount of bytes we will buffer before computing the intermediate
15
8
  # CRC32 checksums. Benchmarks show that the optimum is 64KB (see
16
9
  # `bench/buffered_crc32_bench.rb`), if that is exceeded Zlib is going
@@ -18,11 +11,10 @@ class ZipTricks::Streamer::DeflatedWriter
18
11
  CRC32_BUFFER_SIZE = 64 * 1024
19
12
 
20
13
  def initialize(io)
21
- @compressed_io = ZipTricks::WriteAndTell.new(io)
22
- @uncompressed_size = 0
14
+ @compressed_io = io
23
15
  @deflater = ::Zlib::Deflate.new(Zlib::DEFAULT_COMPRESSION, -::Zlib::MAX_WBITS)
24
- @crc = ZipTricks::WriteBuffer.new(ZipTricks::StreamCRC32.new, CRC32_BUFFER_SIZE)
25
- @bytes_since_last_flush = 0
16
+ @crc = ZipTricks::StreamCRC32.new
17
+ @crc_buf = ZipTricks::WriteBuffer.new(@crc, CRC32_BUFFER_SIZE)
26
18
  end
27
19
 
28
20
  # Writes the given data into the deflater, and flushes the deflater
@@ -31,13 +23,8 @@ class ZipTricks::Streamer::DeflatedWriter
31
23
  # @param data[String] data to be written
32
24
  # @return self
33
25
  def <<(data)
34
- @uncompressed_size += data.bytesize
35
- @bytes_since_last_flush += data.bytesize
36
- @compressed_io << @deflater.deflate(data)
37
- @crc << data
38
-
39
- interim_flush
40
-
26
+ @deflater.deflate(data) { |chunk| @compressed_io << chunk }
27
+ @crc_buf << data
41
28
  self
42
29
  end
43
30
 
@@ -45,18 +32,12 @@ class ZipTricks::Streamer::DeflatedWriter
45
32
  # compressed data written and the CRC32 checksum. The return value
46
33
  # can be directly used as the argument to {Streamer#update_last_entry_and_write_data_descriptor}
47
34
  #
48
- # @param data[String] data to be written
49
35
  # @return [Hash] a hash of `{crc32, compressed_size, uncompressed_size}`
50
36
  def finish
51
37
  @compressed_io << @deflater.finish until @deflater.finished?
52
- {crc32: @crc.to_i, compressed_size: @compressed_io.tell, uncompressed_size: @uncompressed_size}
53
- end
54
-
55
- private
56
-
57
- def interim_flush
58
- return if @bytes_since_last_flush < FLUSH_EVERY_N_BYTES
59
- @compressed_io << @deflater.flush
60
- @bytes_since_last_flush = 0
38
+ @crc_buf.flush
39
+ {crc32: @crc.to_i, compressed_size: @deflater.total_out, uncompressed_size: @deflater.total_in}
40
+ ensure
41
+ @deflater.close
61
42
  end
62
43
  end
@@ -4,7 +4,7 @@
4
4
  # Normally you will not have to use this class directly
5
5
  class ZipTricks::Streamer::Entry < Struct.new(:filename, :crc32, :compressed_size,
6
6
  :uncompressed_size, :storage_mode, :mtime,
7
- :use_data_descriptor)
7
+ :use_data_descriptor, :local_header_offset, :bytes_used_for_local_header, :bytes_used_for_data_descriptor)
8
8
  def initialize(*)
9
9
  super
10
10
  filename.force_encoding(Encoding::UTF_8)
@@ -15,6 +15,10 @@ class ZipTricks::Streamer::Entry < Struct.new(:filename, :crc32, :compressed_siz
15
15
  end)
16
16
  end
17
17
 
18
+ def total_bytes_used
19
+ bytes_used_for_local_header + compressed_size + bytes_used_for_data_descriptor
20
+ end
21
+
18
22
  # Set the general purpose flags for the entry. What we care about is the EFS
19
23
  # bit (bit 11) which should be set if the filename is UTF8. If it is, we need to set the
20
24
  # bit so that the unarchiving application knows that the filename in the archive is UTF-8
@@ -12,7 +12,8 @@ class ZipTricks::Streamer::StoredWriter
12
12
 
13
13
  def initialize(io)
14
14
  @io = ZipTricks::WriteAndTell.new(io)
15
- @crc = ZipTricks::WriteBuffer.new(ZipTricks::StreamCRC32.new, CRC32_BUFFER_SIZE)
15
+ @crc_compute = ZipTricks::StreamCRC32.new
16
+ @crc = ZipTricks::WriteBuffer.new(@crc_compute, CRC32_BUFFER_SIZE)
16
17
  end
17
18
 
18
19
  # Writes the given data to the contained IO object.
@@ -28,9 +29,9 @@ class ZipTricks::Streamer::StoredWriter
28
29
  # Returns the amount of data written and the CRC32 checksum. The return value
29
30
  # can be directly used as the argument to {Streamer#update_last_entry_and_write_data_descriptor}
30
31
  #
31
- # @param data[String] data to be written
32
32
  # @return [Hash] a hash of `{crc32, compressed_size, uncompressed_size}`
33
33
  def finish
34
- {crc32: @crc.to_i, compressed_size: @io.tell, uncompressed_size: @io.tell}
34
+ @crc.flush
35
+ {crc32: @crc_compute.to_i, compressed_size: @io.tell, uncompressed_size: @io.tell}
35
36
  end
36
37
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ZipTricks
4
- VERSION = '5.1.1'
4
+ VERSION = '5.5.0'
5
5
  end
@@ -10,9 +10,8 @@ class ZipTricks::WriteAndTell
10
10
 
11
11
  def <<(bytes)
12
12
  return self if bytes.nil?
13
- binary_bytes = binary(bytes)
14
- @io << binary_bytes
15
- @pos += binary_bytes.bytesize
13
+ @io << bytes.b
14
+ @pos += bytes.bytesize
16
15
  self
17
16
  end
18
17
 
@@ -23,13 +22,4 @@ class ZipTricks::WriteAndTell
23
22
  def tell
24
23
  @pos
25
24
  end
26
-
27
- private
28
-
29
- def binary(str)
30
- return str if str.encoding == Encoding::BINARY
31
- str.force_encoding(Encoding::BINARY)
32
- rescue RuntimeError # the string is frozen
33
- str.dup.force_encoding(Encoding::BINARY)
34
- end
35
25
  end
@@ -7,13 +7,34 @@
7
7
  # CRC32 combine operations - and this adds up. Since the CRC32 value
8
8
  # is usually not needed until the complete output has completed
9
9
  # we can buffer at least some amount of data before computing CRC32 over it.
10
+ # We also use this buffer for output via Rack, where some amount of buffering
11
+ # helps reduce the number of syscalls made by the webserver. ZipTricks performs
12
+ # lots of very small writes, and some degree of speedup (about 20%) can be achieved
13
+ # with a buffer of a few KB.
14
+ #
15
+ # Note that there is no guarantee that the write buffer is going to flush at or above
16
+ # the given `buffer_size`, because for writes which exceed the buffer size it will
17
+ # first `flush` and then write through the oversized chunk, without buffering it. This
18
+ # helps conserve memory. Also note that the buffer will *not* duplicate strings for you
19
+ # and *will* yield the same buffer String over and over, so if you are storing it in an
20
+ # Array you might need to duplicate it.
21
+ #
22
+ # Note also that the WriteBuffer assumes that the object it `<<`-writes into is going
23
+ # to **consume** in some way the string that it passes in. After the `<<` method returns,
24
+ # the WriteBuffer will be cleared, and it passes the same String reference on every call
25
+ # to `<<`. Therefore, if you need to retain the output of the WriteBuffer in, say, an Array,
26
+ # you might need to `.dup` the `String` it gives you.
10
27
  class ZipTricks::WriteBuffer
11
28
  # Creates a new WriteBuffer bypassing into a given writable object
12
29
  #
13
- # @param writable[#<<] An object that responds to `#<<` with string as argument
30
+ # @param writable[#<<] An object that responds to `#<<` with a String as argument
14
31
  # @param buffer_size[Integer] How many bytes to buffer
15
32
  def initialize(writable, buffer_size)
16
- @buf = StringIO.new
33
+ # Allocating the buffer using a zero-padded String as a variation
34
+ # on using capacity:, which JRuby apparently does not like very much. The
35
+ # desire here is that the buffer doesn't have to be resized during the lifetime
36
+ # of the object.
37
+ @buf = ("\0".b * (buffer_size * 2)).clear
17
38
  @buffer_size = buffer_size
18
39
  @writable = writable
19
40
  end
@@ -24,28 +45,27 @@ class ZipTricks::WriteBuffer
24
45
  # @param data[String] data to be written
25
46
  # @return self
26
47
  def <<(data)
27
- @buf << data
28
- flush! if @buf.size > @buffer_size
48
+ if data.bytesize >= @buffer_size
49
+ flush unless @buf.empty? # <- this is where we can output less than @buffer_size
50
+ @writable << data
51
+ else
52
+ @buf << data
53
+ flush if @buf.bytesize >= @buffer_size
54
+ end
29
55
  self
30
56
  end
31
57
 
32
58
  # Explicitly flushes the buffer if it contains anything
33
59
  #
34
60
  # @return self
35
- def flush!
36
- @writable << @buf.string if @buf.size > 0
37
- @buf.truncate(0)
38
- @buf.rewind
61
+ def flush
62
+ unless @buf.empty?
63
+ @writable << @buf
64
+ @buf.clear
65
+ end
39
66
  self
40
67
  end
41
68
 
42
- # Flushes the buffer and returns the result of `#to_i` of the contained `writable`.
43
- # Primarily facilitates working with StreamCRC32 objects where you finish the
44
- # computation by retrieving the CRC as an integer
45
- #
46
- # @return [Integer] the return value of `writable#to_i`
47
- def to_i
48
- flush!
49
- @writable.to_i
50
- end
69
+ # `flush!` was renamed to `flush` but we preserve this method for backwards compatibility
70
+ alias_method :flush!, :flush
51
71
  end
@@ -57,7 +57,7 @@ class ZipTricks::ZipWriter
57
57
  C_UINT2 = 'v' # Encode a 2-byte unsigned little-endian uint
58
58
  C_UINT8 = 'Q<' # Encode an 8-byte unsigned little-endian uint
59
59
  C_CHAR = 'C' # For bit-encoded strings
60
- C_INT4 = 'N' # Encode a 4-byte signed little-endian int
60
+ C_INT4 = 'l<' # Encode a 4-byte signed little-endian int
61
61
 
62
62
  private_constant :FOUR_BYTE_MAX_UINT,
63
63
  :TWO_BYTE_MAX_UINT,
@@ -195,7 +195,7 @@ class ZipTricks::ZipWriter
195
195
  [TWO_BYTE_MAX_UINT].pack(C_UINT2)
196
196
  else
197
197
  [0].pack(C_UINT2)
198
- end
198
+ end
199
199
  io << [0].pack(C_UINT2) # internal file attributes 2 bytes
200
200
 
201
201
  # Because the add_empty_directory method will create a directory with a trailing "/",
@@ -385,7 +385,7 @@ class ZipTricks::ZipWriter
385
385
  0x5455, C_UINT2, # tag for this extra block type ("UT")
386
386
  (1 + 4), C_UINT2, # the size of this block (1 byte used for the Flag + one 4-byte int used for the mtime timestamp)
387
387
  flags, C_CHAR, # encode a single byte
388
- mtime.utc.to_i, C_INT4, # Use a signed long, not the unsigned one used by the rest of the ZIP spec.
388
+ mtime.utc.to_i, C_INT4, # Use a signed int, not the unsigned one used by the rest of the ZIP spec.
389
389
  ]
390
390
  # The atime and ctime can be omitted if not present
391
391
  pack_array(data_and_packspecs)
@@ -11,7 +11,7 @@ Gem::Specification.new do |spec|
11
11
  spec.licenses = ['MIT (Hippocratic)']
12
12
  spec.summary = 'Stream out ZIP files from Ruby'
13
13
  spec.description = 'Stream out ZIP files from Ruby'
14
- spec.homepage = 'http://github.com/wetransfer/zip_tricks'
14
+ spec.homepage = 'https://github.com/wetransfer/zip_tricks'
15
15
 
16
16
  # Prevent pushing this gem to RubyGems.org.
17
17
  # To allow pushes either set the 'allowed_push_host'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: zip_tricks
3
3
  version: !ruby/object:Gem::Version
4
- version: 5.1.1
4
+ version: 5.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Julik Tarkhanov
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: exe
13
13
  cert_chain: []
14
- date: 2020-05-07 00:00:00.000000000 Z
14
+ date: 2020-11-23 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: bundler
@@ -262,17 +262,8 @@ files:
262
262
  - lib/zip_tricks/write_and_tell.rb
263
263
  - lib/zip_tricks/write_buffer.rb
264
264
  - lib/zip_tricks/zip_writer.rb
265
- - qa/README_QA.md
266
- - qa/generate_test_files.rb
267
- - qa/in/VTYL8830.jpg
268
- - qa/in/war-and-peace.txt
269
- - qa/support.rb
270
- - qa/test-report-2016-07-28.txt
271
- - qa/test-report-2016-12-12.txt
272
- - qa/test-report-2017-04-2.txt
273
- - qa/test-report.txt
274
265
  - zip_tricks.gemspec
275
- homepage: http://github.com/wetransfer/zip_tricks
266
+ homepage: https://github.com/wetransfer/zip_tricks
276
267
  licenses:
277
268
  - MIT (Hippocratic)
278
269
  metadata:
@@ -1,16 +0,0 @@
1
- ## Manual testing harness for ZipTricks
2
-
3
- These tests will generate **very large** files that test various edge cases of ZIP generation. The idea is to generate
4
- these files and to then try to open them with the unarchiver applications we support. The workflow is as follows:
5
-
6
-
7
- 1. Configure your storage to have `zip_tricks` directory linked into your virtual machines and to be on a fast volume (SSD RAID0 is recommended)
8
- 2. Run `generate_test_files.rb`. This will take some time and produce a number of large ZIP files.
9
- 3. Open them with the following ZIP unarchivers:
10
- * A recent version of `zipinfo` with the `-tlhvz` flags - to see the information about the file
11
- * ArchiveUtility on OSX
12
- * The Unarchiver on OSX
13
- * Built-in Explorer on Windows 7
14
- * 7Zip 9.20 on Windows 7
15
- * Any other unarchivers you consider necessary
16
- * Write down your observations in `test-report.txt` and, when cutting a release, timestamp a copy of that file.