RubyGems - zip_tricks - Versions diffs - 5.1.1 → 5.5.0 - Mend

zip_tricks 5.1.1 → 5.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

checksums.yaml +4 -4
data/.rubocop.yml +2 -0
data/.travis.yml +0 -3
data/CHANGELOG.md +36 -0
data/README.md +26 -15
data/lib/zip_tricks/block_write.rb +26 -21
data/lib/zip_tricks/file_reader.rb +8 -6
data/lib/zip_tricks/null_writer.rb +1 -1
data/lib/zip_tricks/output_enumerator.rb +48 -27
data/lib/zip_tricks/path_set.rb +4 -0
data/lib/zip_tricks/rails_streaming.rb +6 -7
data/lib/zip_tricks/size_estimator.rb +0 -3
data/lib/zip_tricks/stream_crc32.rb +2 -2
data/lib/zip_tricks/streamer.rb +47 -19
data/lib/zip_tricks/streamer/deflated_writer.rb +9 -28
data/lib/zip_tricks/streamer/entry.rb +5 -1
data/lib/zip_tricks/streamer/stored_writer.rb +4 -3
data/lib/zip_tricks/version.rb +1 -1
data/lib/zip_tricks/write_and_tell.rb +2 -12
data/lib/zip_tricks/write_buffer.rb +37 -17
data/lib/zip_tricks/zip_writer.rb +3 -3
data/zip_tricks.gemspec +1 -1
metadata +3 -12
data/qa/README_QA.md +0 -16
data/qa/generate_test_files.rb +0 -126
data/qa/in/VTYL8830.jpg +0 -0
data/qa/in/war-and-peace.txt +0 -10810
data/qa/support.rb +0 -88
data/qa/test-report-2016-07-28.txt +0 -156
data/qa/test-report-2016-12-12.txt +0 -156
data/qa/test-report-2017-04-2.txt +0 -168
data/qa/test-report.txt +0 -28

data/lib/zip_tricks/streamer.rb CHANGED

@@ -91,8 +91,9 @@ class ZipTricks::Streamer
   InvalidOutput = Class.new(ArgumentError)
   Overflow = Class.new(StandardError)
   UnknownMode = Class.new(StandardError)
+  OffsetOutOfSync = Class.new(StandardError)
-  private_constant :DeflatedWriter, :StoredWriter, :STORED, :DEFLATED
+  private_constant :STORED, :DEFLATED
   # Creates a new Streamer on top of the given IO-ish object and yields it. Once the given block
   # returns, the Streamer will have its `close` method called, which will write out the central
@@ -130,28 +131,26 @@ class ZipTricks::Streamer
   #     end
   #
   # @param kwargs_for_new [Hash] keyword arguments for {Streamer.new}
-  # @return [Enumerator] the enumerator you can read bytestrings of the ZIP from using `each`
+  # @return [ZipTricks::OutputEnumerator] the enumerator you can read bytestrings of the ZIP from by calling `each`
   def self.output_enum(**kwargs_for_new, &zip_streamer_block)
     ZipTricks::OutputEnumerator.new(**kwargs_for_new, &zip_streamer_block)
   end
   # Creates a new Streamer on top of the given IO-ish object.
   #
-  # @param stream[IO] the destination IO for the ZIP. Anything that responds to `<<` can be used.
+  # @param writable[#<<] the destination IO for the ZIP. Anything that responds to `<<` can be used.
   # @param writer[ZipTricks::ZipWriter] the object to be used as the writer.
   #    Defaults to an instance of ZipTricks::ZipWriter, normally you won't need to override it
   # @param auto_rename_duplicate_filenames[Boolean] whether duplicate filenames, when encountered,
   #    should be suffixed with (1), (2) etc. Default value is `false` - if
   #    duplicate names are used an exception will be raised
-  def initialize(stream, writer: create_writer, auto_rename_duplicate_filenames: false)
-    raise InvalidOutput, 'The stream must respond to #<<' unless stream.respond_to?(:<<)
-    @dedupe_filenames = auto_rename_duplicate_filenames
-    @out = ZipTricks::WriteAndTell.new(stream)
+  def initialize(writable, writer: create_writer, auto_rename_duplicate_filenames: false)
+    raise InvalidOutput, 'The writable must respond to #<<' unless writable.respond_to?(:<<)
+    @out = ZipTricks::WriteAndTell.new(writable)
     @files = []
-    @local_header_offsets = []
     @path_set = ZipTricks::PathSet.new
     @writer = writer
+    @dedupe_filenames = auto_rename_duplicate_filenames
   end
   # Writes a part of a zip entry body (actual binary data of the entry) into the output stream.
@@ -213,9 +212,6 @@ class ZipTricks::Streamer
     @out.tell
   end
-  # Will be phased out in ZipTricks 5.x
-  alias_method :add_compressed_entry, :add_deflated_entry
   # Writes out the local header for an entry (file in the ZIP) that is using
   # the stored storage model (is stored as-is).
   # Once this method is called, the `<<` method has to be called one or more
@@ -363,14 +359,16 @@ class ZipTricks::Streamer
   #
   # @return [Integer] the offset the output IO is at after closing the archive
   def close
+    # Make sure offsets are in order
+    verify_offsets!
     # Record the central directory offset, so that it can be written into the EOCD record
     cdir_starts_at = @out.tell
     # Write out the central directory entries, one for each file
-    @files.each_with_index do |entry, i|
-      header_loc = @local_header_offsets.fetch(i)
+    @files.each do |entry|
       @writer.write_central_directory_file_header(io: @out,
-                                                  local_file_header_location: header_loc,
+                                                  local_file_header_location: entry.local_header_offset,
                                                   gp_flags: entry.gp_flags,
                                                   storage_mode: entry.storage_mode,
                                                   compressed_size: entry.compressed_size,
@@ -423,15 +421,40 @@ class ZipTricks::Streamer
     last_entry.compressed_size = compressed_size
     last_entry.uncompressed_size = uncompressed_size
+    offset_before_data_descriptor = @out.tell
     @writer.write_data_descriptor(io: @out,
                                   crc32: last_entry.crc32,
                                   compressed_size: last_entry.compressed_size,
                                   uncompressed_size: last_entry.uncompressed_size)
+    last_entry.bytes_used_for_data_descriptor = @out.tell - offset_before_data_descriptor
     @out.tell
   end
   private
+  def verify_offsets!
+    # We need to check whether the offsets noted for the entries actually make sense
+    computed_offset = @files.map(&:total_bytes_used).inject(0, &:+)
+    actual_offset = @out.tell
+    if computed_offset != actual_offset
+      message = <<-EMS
+The offset of the Streamer output IO is out of sync with the expected value. All entries written so far,
+including their compressed bodies, local headers and data descriptors, add up to a certain offset,
+but this offset does not match the actual offset of the IO.
+Entries add up to #{computed_offset} bytes and the IO is at #{actual_offset} bytes.
+This can happen if you write local headers for an entry, write the "body" of the entry directly to the IO
+object which is your destination, but do not adjust the offset known to the Streamer object. To adjust
+the offfset you need to call `Streamer#simulate_write(body_size)` after outputting the entry. Otherwise
+the local header offsets of the entries you write are going to be incorrect and some ZIP applications
+are going to have problems opening your archive.
+EMS
+      raise OffsetOutOfSync, message
+    end
+  end
   def add_file_and_write_local_header(
       filename:,
       modification_time:,
@@ -464,16 +487,18 @@ class ZipTricks::Streamer
       uncompressed_size = 0
     end
+    local_header_starts_at = @out.tell
     e = Entry.new(filename,
                   crc32,
                   compressed_size,
                   uncompressed_size,
                   storage_mode,
                   modification_time,
-                  use_data_descriptor)
-    @files << e
-    @local_header_offsets << @out.tell
+                  use_data_descriptor,
+                  _local_file_header_offset = local_header_starts_at,
+                  _bytes_used_for_local_header = 0,
+                  _bytes_used_for_data_descriptor = 0)
     @writer.write_local_file_header(io: @out,
                                     gp_flags: e.gp_flags,
@@ -483,6 +508,9 @@ class ZipTricks::Streamer
                                     mtime: e.mtime,
                                     filename: e.filename,
                                     storage_mode: e.storage_mode)
+    e.bytes_used_for_local_header = @out.tell - e.local_header_offset
+    @files << e
   end
   def remove_backslash(filename)

data/lib/zip_tricks/streamer/deflated_writer.rb CHANGED

@@ -4,13 +4,6 @@
 # registers data passing through it in a CRC32 checksum calculator. Is made to be completely
 # interchangeable with the StoredWriter in terms of interface.
 class ZipTricks::Streamer::DeflatedWriter
-  # After how many bytes of incoming data the deflater for the
-  # contents must be flushed. This is done to prevent unreasonable
-  # memory use when archiving large files, and to ensure we write to
-  # the socket often enough while still maintaining acceptable
-  # compression
-  FLUSH_EVERY_N_BYTES = 1024 * 1024 * 5
   # The amount of bytes we will buffer before computing the intermediate
   # CRC32 checksums. Benchmarks show that the optimum is 64KB (see
   # `bench/buffered_crc32_bench.rb`), if that is exceeded Zlib is going
@@ -18,11 +11,10 @@ class ZipTricks::Streamer::DeflatedWriter
   CRC32_BUFFER_SIZE = 64 * 1024
   def initialize(io)
-    @compressed_io = ZipTricks::WriteAndTell.new(io)
-    @uncompressed_size = 0
+    @compressed_io = io
     @deflater = ::Zlib::Deflate.new(Zlib::DEFAULT_COMPRESSION, -::Zlib::MAX_WBITS)
-    @crc = ZipTricks::WriteBuffer.new(ZipTricks::StreamCRC32.new, CRC32_BUFFER_SIZE)
-    @bytes_since_last_flush = 0
+    @crc = ZipTricks::StreamCRC32.new
+    @crc_buf = ZipTricks::WriteBuffer.new(@crc, CRC32_BUFFER_SIZE)
   end
   # Writes the given data into the deflater, and flushes the deflater
@@ -31,13 +23,8 @@ class ZipTricks::Streamer::DeflatedWriter
   # @param data[String] data to be written
   # @return self
   def <<(data)
-    @uncompressed_size += data.bytesize
-    @bytes_since_last_flush += data.bytesize
-    @compressed_io << @deflater.deflate(data)
-    @crc << data
-    interim_flush
+    @deflater.deflate(data) { |chunk| @compressed_io << chunk }
+    @crc_buf << data
     self
   end
@@ -45,18 +32,12 @@ class ZipTricks::Streamer::DeflatedWriter
   # compressed data written and the CRC32 checksum. The return value
   # can be directly used as the argument to {Streamer#update_last_entry_and_write_data_descriptor}
   #
-  # @param data[String] data to be written
   # @return [Hash] a hash of `{crc32, compressed_size, uncompressed_size}`
   def finish
     @compressed_io << @deflater.finish until @deflater.finished?
-    {crc32: @crc.to_i, compressed_size: @compressed_io.tell, uncompressed_size: @uncompressed_size}
-  end
-  private
-  def interim_flush
-    return if @bytes_since_last_flush < FLUSH_EVERY_N_BYTES
-    @compressed_io << @deflater.flush
-    @bytes_since_last_flush = 0
+    @crc_buf.flush
+    {crc32: @crc.to_i, compressed_size: @deflater.total_out, uncompressed_size: @deflater.total_in}
+  ensure
+    @deflater.close
   end
 end

data/lib/zip_tricks/streamer/entry.rb CHANGED

@@ -4,7 +4,7 @@
 # Normally you will not have to use this class directly
 class ZipTricks::Streamer::Entry < Struct.new(:filename, :crc32, :compressed_size,
                                               :uncompressed_size, :storage_mode, :mtime,
-                                              :use_data_descriptor)
+                                              :use_data_descriptor, :local_header_offset, :bytes_used_for_local_header, :bytes_used_for_data_descriptor)
   def initialize(*)
     super
     filename.force_encoding(Encoding::UTF_8)
@@ -15,6 +15,10 @@ class ZipTricks::Streamer::Entry < Struct.new(:filename, :crc32, :compressed_siz
                            end)
   end
+  def total_bytes_used
+    bytes_used_for_local_header + compressed_size + bytes_used_for_data_descriptor
+  end
   # Set the general purpose flags for the entry. What we care about is the EFS
   # bit (bit 11) which should be set if the filename is UTF8. If it is, we need to set the
   # bit so that the unarchiving application knows that the filename in the archive is UTF-8

data/lib/zip_tricks/streamer/stored_writer.rb CHANGED

@@ -12,7 +12,8 @@ class ZipTricks::Streamer::StoredWriter
   def initialize(io)
     @io = ZipTricks::WriteAndTell.new(io)
-    @crc = ZipTricks::WriteBuffer.new(ZipTricks::StreamCRC32.new, CRC32_BUFFER_SIZE)
+    @crc_compute = ZipTricks::StreamCRC32.new
+    @crc = ZipTricks::WriteBuffer.new(@crc_compute, CRC32_BUFFER_SIZE)
   end
   # Writes the given data to the contained IO object.
@@ -28,9 +29,9 @@ class ZipTricks::Streamer::StoredWriter
   # Returns the amount of data written and the CRC32 checksum. The return value
   # can be directly used as the argument to {Streamer#update_last_entry_and_write_data_descriptor}
   #
-  # @param data[String] data to be written
   # @return [Hash] a hash of `{crc32, compressed_size, uncompressed_size}`
   def finish
-    {crc32: @crc.to_i, compressed_size: @io.tell, uncompressed_size: @io.tell}
+    @crc.flush
+    {crc32: @crc_compute.to_i, compressed_size: @io.tell, uncompressed_size: @io.tell}
   end
 end

data/lib/zip_tricks/version.rb CHANGED

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module ZipTricks
-  VERSION = '5.1.1'
+  VERSION = '5.5.0'
 end

data/lib/zip_tricks/write_and_tell.rb CHANGED

@@ -10,9 +10,8 @@ class ZipTricks::WriteAndTell
   def <<(bytes)
     return self if bytes.nil?
-    binary_bytes = binary(bytes)
-    @io << binary_bytes
-    @pos += binary_bytes.bytesize
+    @io << bytes.b
+    @pos += bytes.bytesize
     self
   end
@@ -23,13 +22,4 @@ class ZipTricks::WriteAndTell
   def tell
     @pos
   end
-  private
-  def binary(str)
-    return str if str.encoding == Encoding::BINARY
-    str.force_encoding(Encoding::BINARY)
-  rescue RuntimeError # the string is frozen
-    str.dup.force_encoding(Encoding::BINARY)
-  end
 end

data/lib/zip_tricks/write_buffer.rb CHANGED

@@ -7,13 +7,34 @@
 # CRC32 combine operations - and this adds up. Since the CRC32 value
 # is usually not needed until the complete output has completed
 # we can buffer at least some amount of data before computing CRC32 over it.
+# We also use this buffer for output via Rack, where some amount of buffering
+# helps reduce the number of syscalls made by the webserver. ZipTricks performs
+# lots of very small writes, and some degree of speedup (about 20%) can be achieved
+# with a buffer of a few KB.
+#
+# Note that there is no guarantee that the write buffer is going to flush at or above
+# the given `buffer_size`, because for writes which exceed the buffer size it will
+# first `flush` and then write through the oversized chunk, without buffering it. This
+# helps conserve memory. Also note that the buffer will *not* duplicate strings for you
+# and *will* yield the same buffer String over and over, so if you are storing it in an
+# Array you might need to duplicate it.
+#
+# Note also that the WriteBuffer assumes that the object it `<<`-writes into is going
+# to **consume** in some way the string that it passes in. After the `<<` method returns,
+# the WriteBuffer will be cleared, and it passes the same String reference on every call
+# to `<<`. Therefore, if you need to retain the output of the WriteBuffer in, say, an Array,
+# you might need to `.dup` the `String` it gives you.
 class ZipTricks::WriteBuffer
   # Creates a new WriteBuffer bypassing into a given writable object
   #
-  # @param writable[#<<] An object that responds to `#<<` with string as argument
+  # @param writable[#<<] An object that responds to `#<<` with a String as argument
   # @param buffer_size[Integer] How many bytes to buffer
   def initialize(writable, buffer_size)
-    @buf = StringIO.new
+    # Allocating the buffer using a zero-padded String as a variation
+    # on using capacity:, which JRuby apparently does not like very much. The
+    # desire here is that the buffer doesn't have to be resized during the lifetime
+    # of the object.
+    @buf = ("\0".b * (buffer_size * 2)).clear
     @buffer_size = buffer_size
     @writable = writable
   end
@@ -24,28 +45,27 @@ class ZipTricks::WriteBuffer
   # @param data[String] data to be written
   # @return self
   def <<(data)
-    @buf << data
-    flush! if @buf.size > @buffer_size
+    if data.bytesize >= @buffer_size
+      flush unless @buf.empty? # <- this is where we can output less than @buffer_size
+      @writable << data
+    else
+      @buf << data
+      flush if @buf.bytesize >= @buffer_size
+    end
     self
   end
   # Explicitly flushes the buffer if it contains anything
   #
   # @return self
-  def flush!
-    @writable << @buf.string if @buf.size > 0
-    @buf.truncate(0)
-    @buf.rewind
+  def flush
+    unless @buf.empty?
+      @writable << @buf
+      @buf.clear
+    end
     self
   end
-  # Flushes the buffer and returns the result of `#to_i` of the contained `writable`.
-  # Primarily facilitates working with StreamCRC32 objects where you finish the
-  # computation by retrieving the CRC as an integer
-  #
-  # @return [Integer] the return value of `writable#to_i`
-  def to_i
-    flush!
-    @writable.to_i
-  end
+  # `flush!` was renamed to `flush` but we preserve this method for backwards compatibility
+  alias_method :flush!, :flush
 end

data/lib/zip_tricks/zip_writer.rb CHANGED

@@ -57,7 +57,7 @@ class ZipTricks::ZipWriter
   C_UINT2 = 'v'    # Encode a 2-byte unsigned little-endian uint
   C_UINT8 = 'Q<'  # Encode an 8-byte unsigned little-endian uint
   C_CHAR = 'C' # For bit-encoded strings
-  C_INT4 = 'N' # Encode a 4-byte signed little-endian int
+  C_INT4 = 'l<' # Encode a 4-byte signed little-endian int
   private_constant :FOUR_BYTE_MAX_UINT,
                    :TWO_BYTE_MAX_UINT,
@@ -195,7 +195,7 @@ class ZipTricks::ZipWriter
       [TWO_BYTE_MAX_UINT].pack(C_UINT2)
     else
       [0].pack(C_UINT2)
-          end
+    end
     io << [0].pack(C_UINT2)                                # internal file attributes        2 bytes
     # Because the add_empty_directory method will create a directory with a trailing "/",
@@ -385,7 +385,7 @@ class ZipTricks::ZipWriter
       0x5455, C_UINT2,  # tag for this extra block type ("UT")
       (1 + 4), C_UINT2, # the size of this block (1 byte used for the Flag + 4 bytes used for the mtime; atime and ctime are omitted)
       flags, C_CHAR,   # encode a single byte
-      mtime.utc.to_i, C_INT4, # Use a signed long, not the unsigned one used by the rest of the ZIP spec.
+      mtime.utc.to_i, C_INT4, # Use a signed int, not the unsigned one used by the rest of the ZIP spec.
     ]
     # The atime and ctime can be omitted if not present
     pack_array(data_and_packspecs)

data/zip_tricks.gemspec CHANGED

@@ -11,7 +11,7 @@ Gem::Specification.new do |spec|
   spec.licenses       = ['MIT (Hippocratic)']
   spec.summary        = 'Stream out ZIP files from Ruby'
   spec.description    = 'Stream out ZIP files from Ruby'
-  spec.homepage       = 'http://github.com/wetransfer/zip_tricks'
+  spec.homepage       = 'https://github.com/wetransfer/zip_tricks'
   # Prevent pushing this gem to RubyGems.org.
   # To allow pushes either set the 'allowed_push_host'

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: zip_tricks
 version: !ruby/object:Gem::Version
-  version: 5.1.1
+  version: 5.5.0
 platform: ruby
 authors:
 - Julik Tarkhanov
@@ -11,7 +11,7 @@ authors:
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2020-05-07 00:00:00.000000000 Z
+date: 2020-11-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -262,17 +262,8 @@ files:
 - lib/zip_tricks/write_and_tell.rb
 - lib/zip_tricks/write_buffer.rb
 - lib/zip_tricks/zip_writer.rb
-- qa/README_QA.md
-- qa/generate_test_files.rb
-- qa/in/VTYL8830.jpg
-- qa/in/war-and-peace.txt
-- qa/support.rb
-- qa/test-report-2016-07-28.txt
-- qa/test-report-2016-12-12.txt
-- qa/test-report-2017-04-2.txt
-- qa/test-report.txt
 - zip_tricks.gemspec
-homepage: http://github.com/wetransfer/zip_tricks
+homepage: https://github.com/wetransfer/zip_tricks
 licenses:
 - MIT (Hippocratic)
 metadata:

data/qa/README_QA.md DELETED

@@ -1,16 +0,0 @@
-## Manual testing harness for ZipTricks
-These tests will generate **very large** files that test various edge cases of ZIP generation. The idea is to generate
-these files and to then try to open them with the unarchiver applications we support. The workflow is as follows:
-1. Configure your storage to have `zip_tricks` directory linked into your virtual machines and to be on a fast volume (SSD RAID0 is recommended)
-2. Run `generate_test_files.rb`. This will take some time and produce a number of large ZIP files.
-3. Open them with the following ZIP unarchivers:
-  * A recent version of `zipinfo` with the `-tlhvz` flags - to see the information about the file
-  * ArchiveUtility on OSX
-  * The Unarchiver on OSX
-  * Built-in Explorer on Windows 7
-  * 7Zip 9.20 on Windows 7
-  * Any other unarchivers you consider necessary
-* Write down your observations in `test-report.txt` and, when cutting a release, timestamp a copy of that file.