zip_kit 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. checksums.yaml +7 -0
  2. data/.codeclimate.yml +7 -0
  3. data/.document +5 -0
  4. data/.github/workflows/ci.yml +29 -0
  5. data/.gitignore +61 -0
  6. data/.rspec +1 -0
  7. data/.standard.yml +8 -0
  8. data/.yardopts +1 -0
  9. data/CHANGELOG.md +255 -0
  10. data/CODE_OF_CONDUCT.md +46 -0
  11. data/CONTRIBUTING.md +153 -0
  12. data/Gemfile +4 -0
  13. data/IMPLEMENTATION_DETAILS.md +97 -0
  14. data/LICENSE.txt +20 -0
  15. data/README.md +234 -0
  16. data/Rakefile +21 -0
  17. data/bench/buffered_crc32_bench.rb +109 -0
  18. data/examples/archive_size_estimate.rb +15 -0
  19. data/examples/config.ru +7 -0
  20. data/examples/deferred_write.rb +58 -0
  21. data/examples/parallel_compression_with_block_deflate.rb +86 -0
  22. data/examples/rack_application.rb +63 -0
  23. data/examples/s3_upload.rb +23 -0
  24. data/lib/zip_kit/block_deflate.rb +130 -0
  25. data/lib/zip_kit/block_write.rb +47 -0
  26. data/lib/zip_kit/file_reader/inflating_reader.rb +36 -0
  27. data/lib/zip_kit/file_reader/stored_reader.rb +35 -0
  28. data/lib/zip_kit/file_reader.rb +740 -0
  29. data/lib/zip_kit/null_writer.rb +12 -0
  30. data/lib/zip_kit/output_enumerator.rb +150 -0
  31. data/lib/zip_kit/path_set.rb +163 -0
  32. data/lib/zip_kit/rack_chunked_body.rb +32 -0
  33. data/lib/zip_kit/rack_tempfile_body.rb +61 -0
  34. data/lib/zip_kit/rails_streaming.rb +37 -0
  35. data/lib/zip_kit/remote_io.rb +114 -0
  36. data/lib/zip_kit/remote_uncap.rb +22 -0
  37. data/lib/zip_kit/size_estimator.rb +84 -0
  38. data/lib/zip_kit/stream_crc32.rb +60 -0
  39. data/lib/zip_kit/streamer/deflated_writer.rb +45 -0
  40. data/lib/zip_kit/streamer/entry.rb +37 -0
  41. data/lib/zip_kit/streamer/filler.rb +9 -0
  42. data/lib/zip_kit/streamer/heuristic.rb +68 -0
  43. data/lib/zip_kit/streamer/stored_writer.rb +39 -0
  44. data/lib/zip_kit/streamer/writable.rb +36 -0
  45. data/lib/zip_kit/streamer.rb +614 -0
  46. data/lib/zip_kit/uniquify_filename.rb +39 -0
  47. data/lib/zip_kit/version.rb +5 -0
  48. data/lib/zip_kit/write_and_tell.rb +40 -0
  49. data/lib/zip_kit/write_buffer.rb +71 -0
  50. data/lib/zip_kit/write_shovel.rb +22 -0
  51. data/lib/zip_kit/zip_writer.rb +436 -0
  52. data/lib/zip_kit.rb +24 -0
  53. data/zip_kit.gemspec +41 -0
  54. metadata +335 -0
@@ -0,0 +1,63 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../lib/zip_kit"
4
+
5
+ # An example of how you can create a Rack endpoint for your ZIP downloads.
6
+ # NEVER run this in production - it is a huge security risk.
7
+ # What this app will do is pick PATH_INFO (your request URL path)
8
+ # and grab a file located at this path on your filesystem. The file will then
9
+ # be added to a ZIP archive created completely programmatically. No data will
10
+ # be cached on disk and the contents of the ZIP file will _not_ be buffered in
11
+ # its entirety before sending. Unless you use a buffering Rack server of
12
+ # course (WEBrick or Thin).
13
# Example Rack application which sends one file from the local filesystem to
# the client as a ZIP archive containing a single stored (uncompressed) entry.
# The path of the file is taken verbatim from PATH_INFO, which is why this
# must never be run in production.
class ZipDownload
  # Builds the Rack response for a single download.
  #
  # @param env [Hash] the Rack environment; `PATH_INFO` is used as the path
  #   of the file to send
  # @return [Array] the Rack response triplet (status, headers, body)
  def call(env)
    file_path = env["PATH_INFO"] # Should be the absolute path on the filesystem

    # Open the file for binary reading
    f = File.open(file_path, "rb")

    begin
      filename = File.basename(file_path)

      # Compute the CRC32 upfront. We do not use local footers for post-computing
      # the CRC32, so you _do_ have to precompute it beforehand. Ideally, you
      # would do that before storing the files you will be sending out later on.
      crc32 = ZipKit::StreamCRC32.from_io(f)
      f.rewind
      file_size = f.size

      # Compute the size of the download, so that a
      # real Content-Length header can be sent. Also, if your download
      # stops at some point, the downloading browser will be able to tell
      # the user that the download stalled or was aborted in-flight.
      # Note that using the size estimator here does _not_ read or compress
      # your original file, so it is very fast.
      # The estimator takes keyword arguments, like the streamer call below.
      size = ZipKit::SizeEstimator.estimate { |ar|
        ar.add_stored_entry(filename: filename, size: file_size)
      }
    rescue
      # The response body below is what normally closes the file. If we fail
      # before the body is built nobody else will close it, so do it here.
      f.close
      raise
    end

    # Create a suitable Rack response body, that will support each(),
    # close() and all the other methods. We can then return it up the stack.
    zip_response_body = ZipKit::OutputEnumerator.new do |zip|
      # We are adding only one file to the ZIP here, but you could do that
      # with an arbitrary number of files of course.
      zip.add_stored_entry(filename: filename, size: file_size, crc32: crc32)
      # Write the contents of the file. It is stored, so the writes go
      # directly to the Rack output, bypassing any RubyZip
      # deflaters/compressors. In fact you are yielding the "blob" string
      # here directly to the Rack server handler.
      IO.copy_stream(f, zip)
    ensure
      f.close # Make sure the opened file we read from gets closed
    end

    # Add a Content-Disposition so that the download has a .zip extension
    # (this will not work well with UTF-8 filenames on Windows, but hey!)
    content_disposition = "attachment; filename=%<filename>s.zip" % {filename: filename}

    # and return the response, adding the Content-Length we have computed earlier
    [
      200,
      {"Content-Length" => size.to_s, "Content-Disposition" => content_disposition},
      zip_response_body
    ]
  end
end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../lib/zip_kit"
4
+
5
+ # Any writable object can be used as a destination for the Streamer.
6
+ # For example, you can write to an S3 bucket. Newer versions of the S3 SDK
7
+ # support a method called `upload_stream` which allows streaming uploads. The
8
+ # SDK will split your streamed bytes into appropriately-sized multipart upload
9
+ # parts and PUT them onto S3.
10
# `CSV(...)` below comes from the csv library, which this example does not
# load anywhere else - require it here so the script does not fail with a
# NameError once the upload starts.
require "csv"

# NOTE(review): Aws::S3 is assumed to be loaded already (it is provided by the
# AWS SDK for S3); this example does not require it itself.
bucket = Aws::S3::Bucket.new("mybucket")
obj = bucket.object("big.zip")

# Everything written into `write_stream` is uploaded by the SDK as it arrives.
obj.upload_stream do |write_stream|
  ZipKit::Streamer.open(write_stream) do |zip|
    # `sink` accepts the bytes of the "large.csv" entry of the archive
    zip.write_file("large.csv") do |sink|
      CSV(sink) do |csv|
        csv << ["Line", "Item"]
        20_000.times do |n|
          csv << [n, "Item number #{n}"]
        end
      end
    end
  end
end
@@ -0,0 +1,130 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "zlib"
4
+
5
+ # Permits Deflate compression in independent blocks. The workflow is as follows:
6
+ #
7
+ # * Run every block to compress through deflate_chunk, remove the header,
8
+ # footer and adler32 from the result
9
+ # * Write out the compressed block bodies (the ones deflate_chunk returns)
10
+ # to your output, in sequence
11
+ # * Write out the footer (\03\00)
12
+ #
13
+ # The resulting stream is guaranteed to be handled properly by all zip
14
+ # unarchiving tools, including the BOMArchiveHelper/ArchiveUtility on OSX.
15
+ #
16
+ # You could also build a compressor for Rubyzip using this module quite easily,
17
+ # even though this is outside the scope of the library.
18
+ #
19
+ # When you deflate the chunks separately, you need to write the end marker
20
+ # yourself (using `write_terminator`).
21
+ # If you just want to deflate a large IO's contents, use
22
+ # `deflate_in_blocks_and_terminate` to have the end marker written out for you.
23
+ #
24
+ # Basic usage to compress a file in parts:
25
+ #
26
+ # source_file = File.open('12_gigs.bin', 'rb')
27
+ # compressed = Tempfile.new
28
+ # # Will not compress everything in memory, but do it per chunk to spare
29
+ # # memory. `compressed`
30
+ # # will be written to at the end of each chunk.
31
+ # ZipKit::BlockDeflate.deflate_in_blocks_and_terminate(source_file,
32
+ # compressed)
33
+ #
34
+ # You can also do the same to parts that you will later concatenate together
35
+ # elsewhere, in that case you need to skip the end marker:
36
+ #
37
+ # compressed = Tempfile.new
38
+ # ZipKit::BlockDeflate.deflate_in_blocks(File.open('part1.bin', 'rb'),
39
+ # compressed)
40
+ # ZipKit::BlockDeflate.deflate_in_blocks(File.open('part2.bin', 'rb'),
41
+ # compressed)
42
+ # ZipKit::BlockDeflate.deflate_in_blocks(File.open('partN.bin', 'rb'),
43
+ # compressed)
44
+ # ZipKit::BlockDeflate.write_terminator(compressed)
45
+ #
46
+ # You can also elect to just compress strings in memory (to splice them later):
47
+ #
48
+ # compressed_string = ZipKit::BlockDeflate.deflate_chunk(big_string)
49
+
50
class ZipKit::BlockDeflate
  # Number of bytes read from the input per block unless a different
  # `block_size` is given (5 MiB).
  DEFAULT_BLOCKSIZE = 1_024 * 1024 * 5

  # The two bytes (\x3\x0) which get written as the deflate end marker.
  END_MARKER = [3, 0].pack("C*")

  # The compression levels accepted by {deflate_chunk}: everything from
  # Zlib::DEFAULT_COMPRESSION up to and including Zlib::BEST_COMPRESSION
  # (a range which also contains Zlib::NO_COMPRESSION).
  VALID_COMPRESSIONS = (Zlib::DEFAULT_COMPRESSION..Zlib::BEST_COMPRESSION).to_a.freeze

  # Write the end marker (\x3\x0) to the given IO.
  #
  # `output_io` can also be a {ZipKit::Streamer} to expedite ops.
  #
  # @param output_io [IO] the stream to write to (should respond to `:<<`)
  # @return [Integer] number of bytes written to `output_io`
  def self.write_terminator(output_io)
    output_io << END_MARKER
    END_MARKER.bytesize
  end

  # Compress a given binary string and flush the deflate stream at byte boundary.
  # The returned string can be spliced into another deflate stream.
  #
  # @param bytes [String] Bytes to compress
  # @param level [Integer] Zlib compression level (defaults to `Zlib::DEFAULT_COMPRESSION`)
  # @return [String] compressed bytes
  # @raise [RuntimeError] if `level` is not one of `VALID_COMPRESSIONS`
  def self.deflate_chunk(bytes, level: Zlib::DEFAULT_COMPRESSION)
    raise "Invalid Zlib compression level #{level}" unless VALID_COMPRESSIONS.include?(level)

    z = Zlib::Deflate.new(level)
    begin
      compressed_blob = z.deflate(bytes, Zlib::SYNC_FLUSH)
      compressed_blob << z.finish
    ensure
      # Release the deflater even when compressing raises
      z.close
    end

    # Remove the header (2 bytes), the [3,0] end marker and the adler (4 bytes)
    compressed_blob[2...-6]
  end

  # Compress the contents of input_io into output_io, in blocks
  # of block_size. Aligns the parts so that they can be concatenated later.
  # Writes deflate end marker (\x3\x0) into `output_io` as the final step, so
  # the contents of `output_io` can be spliced verbatim into a ZIP archive.
  #
  # Once the write completes, no more parts for concatenation should be written to
  # the same stream.
  #
  # `output_io` can also be a {ZipKit::Streamer} to expedite ops.
  #
  # @param input_io [IO] the stream to read from (should respond to `:read`)
  # @param output_io [IO] the stream to write to (should respond to `:<<`)
  # @param level [Integer] Zlib compression level (defaults to `Zlib::DEFAULT_COMPRESSION`)
  # @param block_size [Integer] The block size to use (defaults to `DEFAULT_BLOCKSIZE`)
  # @return [Integer] number of bytes written to `output_io`
  def self.deflate_in_blocks_and_terminate(input_io,
    output_io,
    level: Zlib::DEFAULT_COMPRESSION,
    block_size: DEFAULT_BLOCKSIZE)
    bytes_written = deflate_in_blocks(input_io, output_io, level: level, block_size: block_size)
    bytes_written + write_terminator(output_io)
  end

  # Compress the contents of input_io into output_io, in blocks
  # of block_size. Align the parts so that they can be concatenated later.
  # Will not write the deflate end marker (\x3\x0) so more parts can be written
  # later and successfully read back in, provided the end marker will be written
  # after the last part.
  #
  # `output_io` can also be a {ZipKit::Streamer} to expedite ops.
  #
  # @param input_io [IO] the stream to read from (should respond to `:read`)
  # @param output_io [IO] the stream to write to (should respond to `:<<`)
  # @param level [Integer] Zlib compression level (defaults to `Zlib::DEFAULT_COMPRESSION`)
  # @param block_size [Integer] The block size to use (defaults to `DEFAULT_BLOCKSIZE`)
  # @return [Integer] number of bytes written to `output_io`
  def self.deflate_in_blocks(input_io,
    output_io,
    level: Zlib::DEFAULT_COMPRESSION,
    block_size: DEFAULT_BLOCKSIZE)
    bytes_written = 0
    # The loop ends once `read` stops returning data
    while (block = input_io.read(block_size))
      deflated = deflate_chunk(block, level: level)
      output_io << deflated
      bytes_written += deflated.bytesize
    end
    bytes_written
  end
end
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Acts as a converter between callers which send data to the `#<<` method (such as all the ZipKit
4
+ # writer methods, which push onto anything), and a given block. Every time `#<<` gets called on the BlockWrite,
5
+ # the block given to the constructor will be called with the same argument. ZipKit uses this object
6
+ # when integrating with Rack and in the OutputEnumerator. Normally you wouldn't need to use it manually but
7
+ # you always can. BlockWrite will also ensure the binary string encoding is forced onto any string
8
+ # that passes through it.
9
+ #
10
+ # For example, you can create a Rack response body like so:
11
+ #
12
+ # class MyRackResponse
13
+ # def each
14
+ # writer = ZipKit::BlockWrite.new {|chunk| yield(chunk) }
15
+ # writer << "Hello" << "world" << "!"
16
+ # end
17
+ # end
18
+ # [200, {}, MyRackResponse.new]
19
class ZipKit::BlockWrite
  # Creates a new BlockWrite.
  #
  # @param block The block that will be called when this object receives the `<<` message
  def initialize(&block)
    @block = block
  end

  # Make sure those methods raise outright
  %i[seek pos= to_s].each do |m|
    define_method(m) do |*_args|
      raise "#{m} not supported - this IO adapter is non-rewindable"
    end
  end

  # Sends a string through to the block stored in the BlockWrite.
  #
  # @param buf [String, nil] the string to write. Note that `nil` and a
  #   zero-length String will not be forwarded to the block, as zero-size
  #   output has special meaning when used with chunked encoding (it
  #   indicates the end of the stream).
  # @return self
  def <<(buf)
    # Zero-size output has a special meaning when using chunked encoding.
    # Return self for skipped writes as well, so that a chain of `<<` calls
    # keeps working when one of the pieces is empty.
    return self if buf.nil? || buf.bytesize.zero?

    # `String#b` gives the block a copy of the string in binary encoding
    @block.call(buf.b)
    self
  end
end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Rubocop: convention: Missing top-level class documentation comment.
4
# Reads the compressed data of an entry from an IO, piece by piece, and runs
# it through a Zlib inflater.
class ZipKit::FileReader::InflatingReader
  # @param from_io [#read] the IO to read the compressed data from
  # @param compressed_data_size [Integer] how many bytes of compressed data may be read from `from_io`
  def initialize(from_io, compressed_data_size)
    @io = from_io
    @compressed_data_size = compressed_data_size
    @already_read = 0
    # Negative window bits make zlib expect raw deflate data without a zlib header
    @zlib_inflater = ::Zlib::Inflate.new(-Zlib::MAX_WBITS)
  end

  # Reads up to `n_bytes` of compressed data and returns it inflated.
  #
  # @param n_bytes [Integer, nil] how many compressed bytes to read at most;
  #   when omitted, all the compressed bytes which have not been read yet
  # @return [String, nil] the inflated data, an empty String when 0 bytes
  #   were asked for, or `nil` when the inflater has finished, no compressed
  #   bytes remain, or the IO returned nothing
  def extract(n_bytes = nil)
    return if eof?

    remaining = @compressed_data_size - @already_read
    return if remaining.zero?

    # Never read past the end of the compressed data
    read_size = n_bytes || remaining
    read_size = remaining if read_size > remaining
    return "" if read_size.zero?

    chunk = @io.read(read_size)
    return unless chunk

    @already_read += chunk.bytesize
    @zlib_inflater.inflate(chunk)
  end

  # @return [Boolean] whether the inflater has seen the end of the deflate stream
  def eof?
    @zlib_inflater.finished?
  end
end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Rubocop: convention: Missing top-level class documentation comment.
4
# Reads the data of an entry from an IO, piece by piece, and returns it
# exactly as it was read.
class ZipKit::FileReader::StoredReader
  # @param from_io [#read] the IO to read the entry data from
  # @param compressed_data_size [Integer] how many bytes may be read from `from_io`
  def initialize(from_io, compressed_data_size)
    @io = from_io
    @compressed_data_size = compressed_data_size
    @already_read = 0
  end

  # Reads up to `n_bytes` of the entry data.
  #
  # @param n_bytes [Integer, nil] how many bytes to read at most; when
  #   omitted, all the bytes which have not been read yet
  # @return [String, nil] the bytes read, an empty String when 0 bytes were
  #   asked for, or `nil` when everything has been read already or the IO
  #   returned nothing
  def extract(n_bytes = nil)
    return if eof?

    remaining = @compressed_data_size - @already_read
    return if remaining.zero?

    # Never read past the end of the entry
    read_size = n_bytes || remaining
    read_size = remaining if read_size > remaining
    return "" if read_size.zero?

    chunk = @io.read(read_size)
    return unless chunk

    @already_read += chunk.bytesize
    chunk
  end

  # @return [Boolean] whether all of the entry data has been read
  def eof?
    @already_read >= @compressed_data_size
  end
end