RubyGems - zip_tricks - Versions diffs - 5.4.0 → 5.5.0 - Mend

zip_tricks 5.4.0 → 5.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +6 -0
data/README.md +12 -8
data/lib/zip_tricks/output_enumerator.rb +48 -27
data/lib/zip_tricks/streamer.rb +6 -7
data/lib/zip_tricks/streamer/deflated_writer.rb +4 -2
data/lib/zip_tricks/streamer/stored_writer.rb +4 -2
data/lib/zip_tricks/version.rb +1 -1
data/lib/zip_tricks/write_buffer.rb +37 -17
metadata +2 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 05e8eea8ecf1ad0b9b9cb132c54dde1abd60dc003afd1e5a1ce989786ce89608
-  data.tar.gz: fbdc2172fc3becefa4dd8720713acb11d838900465ac3cb42de38fec76fd0d90
+  metadata.gz: 50ba2d6a0b5bde1cf51443c7ff4228a7e99a1cc1ac843e3ef23c0d197878a04b
+  data.tar.gz: 5fdb377cc34fd6d9edb4e7932e79ebe28bc2dc91aa71348e386b8b601e4f06f1
 SHA512:
-  metadata.gz: 449c59e898d2b54a089b60d7aebe7633d9f65ad64a5ce014a7a44e1683f6d6cec4997f732fd06746edeec96594d5041b68efcf19571ef92c9798acc05ffda7bd
-  data.tar.gz: 5ed26109e12373acfb9866531ef03c4302141bbb8a312f759ff768295c8d7926f650745f3f2624ad652333e57cd5bc5f98f932ace41504aeca45a668ef7a4f4a
+  metadata.gz: 4c2ae765d7e6c584632b7606d66b970c100b9679cb6b503209be1179bc9582f83727b13154bfce5281dd8dc105f1f5112d2794df7f97ebb987a1e224f67f4840
+  data.tar.gz: e06306028f18fe1eb16abe12c48cd93182d11451298cde6068b9ccb37b78846be469b6bd48848507f38f8796b00281f27391589d20d9163b3845cba4112411c5

data/CHANGELOG.md CHANGED

@@ -1,3 +1,9 @@
+## 5.5.0
+* In `OutputEnumerator` apply some amount of buffering to be within a UNIX socket size for metadata writes. This
+  speeds up usage with Puma by about 20 percent, as there won't be as many `syswrite` calls on the socket.
+* Make `StoredWriter` and `DeflatedWriter` public constants so that standalone tests can be written for them
 ## 5.4.0
 * Use block form for zlib Deflater calls to conserve memory

data/README.md CHANGED

@@ -35,9 +35,9 @@ class ZipsController < ActionController::Base
     zip_tricks_stream do |zip|
       zip.write_deflated_file('report1.csv') do |sink|
         CSV(sink) do |csv_write|
-          csv << Person.column_names
+          csv_write << Person.column_names
           Person.all.find_each do |person|
-            csv << person.attributes.values
+            csv_write << person.attributes.values
           end
         end
       end
@@ -75,12 +75,15 @@ since you do not know how large the compressed data segments are going to be.
 ## Send a ZIP from a Rack response
-Create a `RackBody` object and give it's constructor a block that adds files.
-The block will only be called when actually sending the response to the client
+To "pull" data from ZipTricks you can create an `OutputEnumerator` object which will yield the binary chunks piece
+by piece, and apply some amount of buffering as well. Since this `OutputEnumerator` responds to `#each` and yields
+Strings it also can (and should!) be used as a Rack response body. Return it to your webserver and you will
+have your ZIP streamed. The block that you give to the `OutputEnumerator` will only start executing once your
+response body starts getting iterated over - when actually sending the response to the client
 (unless you are using a buffering Rack webserver, such as Webrick).
 ```ruby
-body = ZipTricks::RackBody.new do | zip |
+body = ZipTricks::Streamer.output_enum do | zip |
   zip.write_stored_file('mov.mp4') do |sink| # Those MPEG4 files do not compress that well
     File.open('mov.mp4', 'rb'){|source| IO.copy_stream(source, sink) }
   end
@@ -127,11 +130,12 @@ ZipTricks::Streamer.open(io) do | zip |
   # Write the local file header first..
   zip.add_stored_entry(filename: "first-file.bin", size: raw_file.size, crc32: raw_file_crc32)
-  # then send the actual file contents bypassing the Streamer interface
+  # Adjust the ZIP offsets within the Streamer
+  zip.simulate_write(my_temp_file.size)
+  # ...and then send the actual file contents bypassing the Streamer interface
   io.sendfile(my_temp_file)
-  # ...and then adjust the ZIP offsets within the Streamer
-  zip.simulate_write(my_temp_file.size)
 end
 ```

data/lib/zip_tricks/output_enumerator.rb CHANGED

@@ -1,43 +1,64 @@
 # frozen_string_literal: true
-# Can be used as a Rack response body directly. Will yield
-# a {ZipTricks::Streamer} for adding entries to the archive and writing
-# zip entry bodies.
+# The output enumerator makes it possible to "pull" from a ZipTricks streamer
+# object instead of having it "push" writes to you. It will "stash" the block which
+# writes the ZIP archive through the streamer, and when you call `each` on the Enumerator
+# it will yield you the bytes the block writes. Since it is an enumerator you can
+# use `next` to take chunks written by the ZipTricks streamer one by one. It can be very
+# convenient when you need to segment your ZIP output into bigger chunks for, say,
+# uploading them to a cloud storage provider such as S3.
+#
+# Another use of the output enumerator is outputting a ZIP archive from Rails or Rack,
+# where an object responding to `each` is required which yields Strings. For instance,
+# you can return a ZIP archive from Rack like so:
+#
+#     iterable_zip_body = ZipTricks::OutputEnumerator.new do | streamer |
+#       streamer.write_deflated_file('big.csv') do |sink|
+#         CSV(sink) do |csv_writer|
+#           csv_writer << Person.column_names
+#           Person.all.find_each do |person|
+#             csv_writer << person.attributes.values
+#           end
+#         end
+#       end
+#     end
+#
+#     [200, {'Content-Type' => 'binary/octet-stream'}, iterable_zip_body]
 class ZipTricks::OutputEnumerator
-  # Prepares a new Rack response body with a Zip output stream.
-  # The block given to the constructor will be called when the response
-  # body will be read by the webserver, and will receive a {ZipTricks::Streamer}
-  # as it's block argument. You can then add entries to the Streamer as usual.
-  # The archive will be automatically closed at the end of the block.
+  DEFAULT_WRITE_BUFFER_SIZE = 64 * 1024
+  # Creates a new OutputEnumerator.
   #
-  #     # Precompute the Content-Length ahead of time
-  #     content_length = ZipTricks::SizeEstimator.estimate do | estimator |
-  #       estimator.add_stored_entry(filename: 'large.tif', size: 1289894)
-  #     end
-  #
-  #     # Prepare the response body.
-  #     # The block will only be called when the
-  #     # response starts to be written.
-  #     body = ZipTricks::OutputEnumerator.new do | streamer |
-  #       streamer.add_stored_entry(filename: 'large.tif', size: 1289894, crc32: 198210)
-  #       streamer << large_file.read(1024*1024) until large_file.eof?
-  #       ...
-  #     end
-  #
-  #     return [200, {'Content-Type' => 'binary/octet-stream',
-  #     'Content-Length' => content_length.to_s}, body]
-  def initialize(**streamer_options, &blk)
+  # @param streamer_options[Hash] options for Streamer, see {ZipTricks::Streamer.new}
+  # @param write_buffer_size[Integer] By default all ZipTricks writes are unbuffered. For output to sockets
+  #     it is beneficial to bulkify those writes so that they are roughly sized to a socket buffer chunk. This
+  #     object will bulkify writes for you in this way (so `each` will yield not on every call to `<<` from the Streamer
+  #     but at block size boundaries or greater). Set it to 0 for unbuffered writes.
+  # @param blk a block that will receive the Streamer object when executing. The block will not be executed
+  #     immediately but only once `each` is called on the OutputEnumerator
+  def initialize(write_buffer_size: DEFAULT_WRITE_BUFFER_SIZE, **streamer_options, &blk)
     @streamer_options = streamer_options.to_h
+    @bufsize = write_buffer_size.to_i
     @archiving_block = blk
   end
   # Executes the block given to the constructor with a {ZipTricks::Streamer}
   # and passes each written chunk to the block given to the method. This allows one
-  # to "take" output of the ZIP piecewise.
+  # to "take" output of the ZIP piecewise. If called without a block, it will return an Enumerator
+  # that you can pull data from using `next`.
+  #
+  # **NOTE** Because the `WriteBuffer` inside this object can reuse the buffer, it is important
+  #    that the `String` that is yielded gets consumed eagerly (either written byte-by-byte somewhere, or `#dup`-ed)
+  #    since the write buffer will clear it after your block returns. If you expand this Enumerator
+  #    eagerly into an Array you might notice that a lot of the segments of your ZIP output are
+  #    empty - this means that you need to duplicate them.
+  #
+  # @yield [String] a chunk of the ZIP output in binary encoding
   def each
     if block_given?
       block_write = ZipTricks::BlockWrite.new { |chunk| yield(chunk) }
-      ZipTricks::Streamer.open(block_write, **@streamer_options, &@archiving_block)
+      buffer = ZipTricks::WriteBuffer.new(block_write, @bufsize)
+      ZipTricks::Streamer.open(buffer, **@streamer_options, &@archiving_block)
+      buffer.flush
     else
       enum_for(:each)
     end

data/lib/zip_tricks/streamer.rb CHANGED

@@ -93,7 +93,7 @@ class ZipTricks::Streamer
   UnknownMode = Class.new(StandardError)
   OffsetOutOfSync = Class.new(StandardError)
-  private_constant :DeflatedWriter, :StoredWriter, :STORED, :DEFLATED
+  private_constant :STORED, :DEFLATED
   # Creates a new Streamer on top of the given IO-ish object and yields it. Once the given block
   # returns, the Streamer will have its `close` method called, which will write out the central
@@ -138,20 +138,19 @@ class ZipTricks::Streamer
   # Creates a new Streamer on top of the given IO-ish object.
   #
-  # @param stream[IO] the destination IO for the ZIP. Anything that responds to `<<` can be used.
+  # @param writable[#<<] the destination IO for the ZIP. Anything that responds to `<<` can be used.
   # @param writer[ZipTricks::ZipWriter] the object to be used as the writer.
   #    Defaults to an instance of ZipTricks::ZipWriter, normally you won't need to override it
   # @param auto_rename_duplicate_filenames[Boolean] whether duplicate filenames, when encountered,
   #    should be suffixed with (1), (2) etc. Default value is `false` - if
   #    duplicate names are used an exception will be raised
-  def initialize(stream, writer: create_writer, auto_rename_duplicate_filenames: false)
-    raise InvalidOutput, 'The stream must respond to #<<' unless stream.respond_to?(:<<)
-    @dedupe_filenames = auto_rename_duplicate_filenames
-    @out = ZipTricks::WriteAndTell.new(stream)
+  def initialize(writable, writer: create_writer, auto_rename_duplicate_filenames: false)
+    raise InvalidOutput, 'The writable must respond to #<<' unless writable.respond_to?(:<<)
+    @out = ZipTricks::WriteAndTell.new(writable)
     @files = []
     @path_set = ZipTricks::PathSet.new
     @writer = writer
+    @dedupe_filenames = auto_rename_duplicate_filenames
   end
   # Writes a part of a zip entry body (actual binary data of the entry) into the output stream.

data/lib/zip_tricks/streamer/deflated_writer.rb CHANGED

@@ -13,7 +13,8 @@ class ZipTricks::Streamer::DeflatedWriter
   def initialize(io)
     @compressed_io = io
     @deflater = ::Zlib::Deflate.new(Zlib::DEFAULT_COMPRESSION, -::Zlib::MAX_WBITS)
-    @crc = ZipTricks::WriteBuffer.new(ZipTricks::StreamCRC32.new, CRC32_BUFFER_SIZE)
+    @crc = ZipTricks::StreamCRC32.new
+    @crc_buf = ZipTricks::WriteBuffer.new(@crc, CRC32_BUFFER_SIZE)
   end
   # Writes the given data into the deflater, and flushes the deflater
@@ -23,7 +24,7 @@ class ZipTricks::Streamer::DeflatedWriter
   # @return self
   def <<(data)
     @deflater.deflate(data) { |chunk| @compressed_io << chunk }
-    @crc << data
+    @crc_buf << data
     self
   end
@@ -34,6 +35,7 @@ class ZipTricks::Streamer::DeflatedWriter
   # @return [Hash] a hash of `{crc32, compressed_size, uncompressed_size}`
   def finish
     @compressed_io << @deflater.finish until @deflater.finished?
+    @crc_buf.flush
     {crc32: @crc.to_i, compressed_size: @deflater.total_out, uncompressed_size: @deflater.total_in}
   ensure
     @deflater.close

data/lib/zip_tricks/streamer/stored_writer.rb CHANGED

@@ -12,7 +12,8 @@ class ZipTricks::Streamer::StoredWriter
   def initialize(io)
     @io = ZipTricks::WriteAndTell.new(io)
-    @crc = ZipTricks::WriteBuffer.new(ZipTricks::StreamCRC32.new, CRC32_BUFFER_SIZE)
+    @crc_compute = ZipTricks::StreamCRC32.new
+    @crc = ZipTricks::WriteBuffer.new(@crc_compute, CRC32_BUFFER_SIZE)
   end
   # Writes the given data to the contained IO object.
@@ -30,6 +31,7 @@ class ZipTricks::Streamer::StoredWriter
   #
   # @return [Hash] a hash of `{crc32, compressed_size, uncompressed_size}`
   def finish
-    {crc32: @crc.to_i, compressed_size: @io.tell, uncompressed_size: @io.tell}
+    @crc.flush
+    {crc32: @crc_compute.to_i, compressed_size: @io.tell, uncompressed_size: @io.tell}
   end
 end

data/lib/zip_tricks/version.rb CHANGED

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module ZipTricks
-  VERSION = '5.4.0'
+  VERSION = '5.5.0'
 end

data/lib/zip_tricks/write_buffer.rb CHANGED

@@ -7,13 +7,34 @@
 # CRC32 combine operations - and this adds up. Since the CRC32 value
 # is usually not needed until the complete output has completed
 # we can buffer at least some amount of data before computing CRC32 over it.
+# We also use this buffer for output via Rack, where some amount of buffering
+# helps reduce the number of syscalls made by the webserver. ZipTricks performs
+# lots of very small writes, and some degree of speedup (about 20%) can be achieved
+# with a buffer of a few KB.
+#
+# Note that there is no guarantee that the write buffer is going to flush at or above
+# the given `buffer_size`, because for writes which exceed the buffer size it will
+# first `flush` and then write through the oversized chunk, without buffering it. This
+# helps conserve memory. Also note that the buffer will *not* duplicate strings for you
+# and *will* yield the same buffer String over and over, so if you are storing it in an
+# Array you might need to duplicate it.
+#
+# Note also that the WriteBuffer assumes that the object it `<<`-writes into is going
+# to **consume** in some way the string that it passes in. After the `<<` method returns,
+# the WriteBuffer will be cleared, and it passes the same String reference on every call
+# to `<<`. Therefore, if you need to retain the output of the WriteBuffer in, say, an Array,
+# you might need to `.dup` the `String` it gives you.
 class ZipTricks::WriteBuffer
   # Creates a new WriteBuffer bypassing into a given writable object
   #
-  # @param writable[#<<] An object that responds to `#<<` with string as argument
+  # @param writable[#<<] An object that responds to `#<<` with a String as argument
   # @param buffer_size[Integer] How many bytes to buffer
   def initialize(writable, buffer_size)
-    @buf = StringIO.new
+    # Allocating the buffer using a zero-padded String as a variation
+    # on using capacity:, which JRuby apparently does not like very much. The
+    # desire here is that the buffer doesn't have to be resized during the lifetime
+    # of the object.
+    @buf = ("\0".b * (buffer_size * 2)).clear
     @buffer_size = buffer_size
     @writable = writable
   end
@@ -24,28 +45,27 @@ class ZipTricks::WriteBuffer
   # @param data[String] data to be written
   # @return self
   def <<(data)
-    @buf << data
-    flush! if @buf.size > @buffer_size
+    if data.bytesize >= @buffer_size
+      flush unless @buf.empty? # <- this is where we can output less than @buffer_size
+      @writable << data
+    else
+      @buf << data
+      flush if @buf.bytesize >= @buffer_size
+    end
     self
   end
   # Explicitly flushes the buffer if it contains anything
   #
   # @return self
-  def flush!
-    @writable << @buf.string if @buf.size > 0
-    @buf.truncate(0)
-    @buf.rewind
+  def flush
+    unless @buf.empty?
+      @writable << @buf
+      @buf.clear
+    end
     self
   end
-  # Flushes the buffer and returns the result of `#to_i` of the contained `writable`.
-  # Primarily facilitates working with StreamCRC32 objects where you finish the
-  # computation by retrieving the CRC as an integer
-  #
-  # @return [Integer] the return value of `writable#to_i`
-  def to_i
-    flush!
-    @writable.to_i
-  end
+  # `flush!` was renamed to `flush` but we preserve this method for backwards compatibility
+  alias_method :flush!, :flush
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: zip_tricks
 version: !ruby/object:Gem::Version
-  version: 5.4.0
+  version: 5.5.0
 platform: ruby
 authors:
 - Julik Tarkhanov
@@ -11,7 +11,7 @@ authors:
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2020-11-19 00:00:00.000000000 Z
+date: 2020-11-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler