zip_kit 6.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.codeclimate.yml +7 -0
- data/.document +5 -0
- data/.github/workflows/ci.yml +29 -0
- data/.gitignore +61 -0
- data/.rspec +1 -0
- data/.standard.yml +8 -0
- data/.yardopts +1 -0
- data/CHANGELOG.md +255 -0
- data/CODE_OF_CONDUCT.md +46 -0
- data/CONTRIBUTING.md +153 -0
- data/Gemfile +4 -0
- data/IMPLEMENTATION_DETAILS.md +97 -0
- data/LICENSE.txt +20 -0
- data/README.md +234 -0
- data/Rakefile +21 -0
- data/bench/buffered_crc32_bench.rb +109 -0
- data/examples/archive_size_estimate.rb +15 -0
- data/examples/config.ru +7 -0
- data/examples/deferred_write.rb +58 -0
- data/examples/parallel_compression_with_block_deflate.rb +86 -0
- data/examples/rack_application.rb +63 -0
- data/examples/s3_upload.rb +23 -0
- data/lib/zip_kit/block_deflate.rb +130 -0
- data/lib/zip_kit/block_write.rb +47 -0
- data/lib/zip_kit/file_reader/inflating_reader.rb +36 -0
- data/lib/zip_kit/file_reader/stored_reader.rb +35 -0
- data/lib/zip_kit/file_reader.rb +740 -0
- data/lib/zip_kit/null_writer.rb +12 -0
- data/lib/zip_kit/output_enumerator.rb +150 -0
- data/lib/zip_kit/path_set.rb +163 -0
- data/lib/zip_kit/rack_chunked_body.rb +32 -0
- data/lib/zip_kit/rack_tempfile_body.rb +61 -0
- data/lib/zip_kit/rails_streaming.rb +37 -0
- data/lib/zip_kit/remote_io.rb +114 -0
- data/lib/zip_kit/remote_uncap.rb +22 -0
- data/lib/zip_kit/size_estimator.rb +84 -0
- data/lib/zip_kit/stream_crc32.rb +60 -0
- data/lib/zip_kit/streamer/deflated_writer.rb +45 -0
- data/lib/zip_kit/streamer/entry.rb +37 -0
- data/lib/zip_kit/streamer/filler.rb +9 -0
- data/lib/zip_kit/streamer/heuristic.rb +68 -0
- data/lib/zip_kit/streamer/stored_writer.rb +39 -0
- data/lib/zip_kit/streamer/writable.rb +36 -0
- data/lib/zip_kit/streamer.rb +614 -0
- data/lib/zip_kit/uniquify_filename.rb +39 -0
- data/lib/zip_kit/version.rb +5 -0
- data/lib/zip_kit/write_and_tell.rb +40 -0
- data/lib/zip_kit/write_buffer.rb +71 -0
- data/lib/zip_kit/write_shovel.rb +22 -0
- data/lib/zip_kit/zip_writer.rb +436 -0
- data/lib/zip_kit.rb +24 -0
- data/zip_kit.gemspec +41 -0
- metadata +335 -0
@@ -0,0 +1,60 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# A simple stateful class for keeping track of a CRC32 value through multiple writes
|
4
|
+
class ZipKit::StreamCRC32
  include ZipKit::WriteShovel

  # Probed once when the class is loaded: `String.new` raises an ArgumentError
  # when it does not understand the `capacity:` keyword.
  STRINGS_HAVE_CAPACITY_SUPPORT = begin
    String.new("", capacity: 1)
    true
  rescue ArgumentError
    false
  end
  # How many bytes `from_io` asks the IO for with every `read` call
  CRC_BUF_SIZE = 1024 * 512
  private_constant :STRINGS_HAVE_CAPACITY_SUPPORT, :CRC_BUF_SIZE

  # Reads the given IO until `eof?` reports true and computes the CRC32 of
  # everything that was read. The IO must respond to `read` and `eof?`.
  #
  # @param io[IO] the IO to read the data from
  # @return [Integer] the computed CRC32 value
  def self.from_io(io)
    # One String is reused as the read buffer for every `read` call. When the
    # capacity can be given upfront the buffer does not need to be resized
    # while reading.
    read_buffer = if STRINGS_HAVE_CAPACITY_SUPPORT
      String.new("", capacity: CRC_BUF_SIZE)
    else
      +""
    end
    checksum = new
    until io.eof?
      checksum << io.read(CRC_BUF_SIZE, read_buffer)
    end
    checksum.to_i
  end

  # Creates a new streaming CRC32 calculator, starting from the
  # value `Zlib.crc32` returns when called without data
  def initialize
    @crc = Zlib.crc32
  end

  # Feeds more data into the checksum. The contained CRC32 value is
  # updated in place.
  #
  # @param blob[String] the string to compute the CRC32 from
  # @return [self]
  def <<(blob)
    @crc = Zlib.crc32(blob, @crc)
    self
  end

  # Gives the CRC32 value accumulated up to this point
  #
  # @return [Integer] the CRC32 value for all the blobs so far
  def to_i
    @crc
  end

  # Combines an already known CRC32 value (computed elsewhere, over
  # `blob_size` bytes of data) with the contained CRC32 value, in place.
  #
  # @param crc32[Integer] the CRC32 value to append
  # @param blob_size[Integer] the size of the data the `crc32` is computed from
  # @return [Integer] the updated CRC32 value for all the blobs so far
  def append(crc32, blob_size)
    @crc = Zlib.crc32_combine(@crc, crc32, blob_size)
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Sends writes to the given `io` compressed using a `Zlib::Deflate`. Also
|
4
|
+
# registers data passing through it in a CRC32 checksum calculator. Is made to be completely
|
5
|
+
# interchangeable with the StoredWriter in terms of interface.
|
6
|
+
class ZipKit::Streamer::DeflatedWriter
  include ZipKit::WriteShovel

  # The amount of bytes we will buffer before computing the intermediate
  # CRC32 checksums. Benchmarks show that the optimum is 64KB (see
  # `bench/buffered_crc32_bench.rb`), if that is exceeded Zlib is going
  # to perform internal CRC combine calls which will make the speed go down again.
  CRC32_BUFFER_SIZE = 64 * 1024

  # @param io[#<<] the destination that receives the compressed bytes
  def initialize(io)
    @compressed_io = io
    # Negative window bits ask Zlib for a raw deflate stream
    @deflater = ::Zlib::Deflate.new(Zlib::DEFAULT_COMPRESSION, -::Zlib::MAX_WBITS)
    @crc = ZipKit::StreamCRC32.new
    # Uncompressed data goes to the CRC32 calculator through a buffer
    @crc_buf = ZipKit::WriteBuffer.new(@crc, CRC32_BUFFER_SIZE)
  end

  # Feeds the given data into the deflater and forwards every compressed
  # chunk the deflater yields to the destination IO. The uncompressed data
  # is also sent to the buffered CRC32 calculator.
  #
  # @param data[String] data to be written
  # @return self
  def <<(data)
    @deflater.deflate(data) do |compressed_chunk|
      @compressed_io << compressed_chunk
    end
    @crc_buf << data
    self
  end

  # Finishes the deflate stream, writes out the remaining compressed data and
  # flushes the CRC32 buffer. Returns the amount of data received for writing,
  # the amount of compressed data written and the CRC32 checksum. The return value
  # can be directly used as the argument to {Streamer#update_last_entry_and_write_data_descriptor}
  #
  # The deflater is closed on the way out, also if an exception is raised.
  #
  # @return [Hash] a hash of `{crc32, compressed_size, uncompressed_size}`
  def finish
    until @deflater.finished?
      @compressed_io << @deflater.finish
    end
    @crc_buf.flush
    {
      crc32: @crc.to_i,
      compressed_size: @deflater.total_out,
      uncompressed_size: @deflater.total_in
    }
  ensure
    @deflater.close
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Is used internally by Streamer to keep track of entries in the archive during writing.
|
4
|
+
# Normally you will not have to use this class directly
|
5
|
+
class ZipKit::Streamer::Entry < Struct.new(:filename, :crc32, :compressed_size,
  :uncompressed_size, :storage_mode, :mtime,
  :use_data_descriptor, :local_header_offset, :bytes_used_for_local_header, :bytes_used_for_data_descriptor, :unix_permissions)
  # Accepts the Struct members positionally, tags the filename as UTF-8 and
  # records whether the filename contains anything besides ASCII.
  def initialize(*)
    super
    # A frozen String can not be re-tagged in place (force_encoding would raise
    # a FrozenError), so switch to a copy - but only when that is needed.
    self.filename = filename.dup if filename.frozen?
    filename.force_encoding(Encoding::UTF_8)
    # The EFS flag is needed for any filename that is not pure ASCII. This also
    # covers byte sequences which are not valid UTF-8, as those are not ASCII-only.
    @requires_efs_flag = !filename.ascii_only?
  end

  # Sums up the sizes recorded for the local header, the compressed
  # data and the data descriptor of this entry.
  #
  # @return [Integer] the sum of the three byte counts
  def total_bytes_used
    bytes_used_for_local_header + compressed_size + bytes_used_for_data_descriptor
  end

  # Set the general purpose flags for the entry. The first flag we care about is the EFS
  # bit (bit 11) which should be set if the filename is UTF8. If it is, we need to set the
  # bit so that the unarchiving application knows that the filename in the archive is UTF-8
  # encoded, and not some DOS default. For ASCII entries it does not matter.
  # Additionally, we care about bit 3 which toggles the use of the postfix data descriptor.
  #
  # @return [Integer] the flags, combined into one number
  def gp_flags
    flag = 0b00000000000
    flag |= 0b100000000000 if @requires_efs_flag # bit 11
    flag |= 0x0008 if use_data_descriptor # bit 3
    flag
  end

  # Always false for an Entry (a Filler answers true to the same question)
  def filler?
    false
  end
end
|
@@ -0,0 +1,9 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Is used internally by Streamer to keep track of entries in the archive during writing.
|
4
|
+
# Normally you will not have to use this class directly
|
5
|
+
class ZipKit::Streamer::Filler < Struct.new(:total_bytes_used)
  # Always true for a Filler (an Entry answers false to the same question).
  # The only value a Filler carries is its `total_bytes_used`.
  def filler?
    true
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Will be used to pick whether to store a file in the `stored` or
|
4
|
+
# `deflated` mode, by compressing the first N bytes of the file and
|
5
|
+
# comparing the stored and deflated data sizes. If deflate produces
|
6
|
+
# a sizable compression gain for this data, it will create a deflated
|
7
|
+
# file inside the ZIP archive. If the file doesn't compress well, it
|
8
|
+
# will use the "stored" mode for the entry. About 128KB of the
|
9
|
+
# file will be buffered to pick the appropriate storage mode. The
|
10
|
+
# Heuristic will call either `write_stored_file` or `write_deflated_file`
|
11
|
+
# on the Streamer passed into it once it knows which compression
|
12
|
+
# method should be applied
|
13
|
+
class ZipKit::Streamer::Heuristic
  include ZipKit::WriteShovel

  # Once more than this many bytes have been buffered, the storage mode gets picked
  BYTES_WRITTEN_THRESHOLD = 128 * 1024
  # Deflate gets used when the deflated size divided by the original size
  # of the sampled data is at or below this value
  MINIMUM_VIABLE_COMPRESSION = 0.75

  # @param streamer[ZipKit::Streamer] the Streamer which will receive either
  #   `write_stored_file` or `write_deflated_file`
  # @param filename[String] the filename passed on to that method
  # @param write_file_options[Hash] keyword arguments passed on to that method
  def initialize(streamer, filename, **write_file_options)
    @streamer = streamer
    @filename = filename
    @write_file_options = write_file_options

    # Holds the uncompressed data until the storage mode has been picked
    @buf = StringIO.new.binmode
    # Only used for measuring - the compressed output itself is discarded
    @deflater = ::Zlib::Deflate.new(Zlib::DEFAULT_COMPRESSION, -::Zlib::MAX_WBITS)
    @bytes_deflated = 0

    # The writable returned by the Streamer, set once the decision has been made
    @winner = nil
  end

  # Accepts data for the file. Before the storage mode is picked the data is
  # buffered and test-compressed. Afterwards it goes straight to the writable
  # the Streamer has returned.
  #
  # @param bytes[String] the data to write
  # @return [self]
  def <<(bytes)
    if @winner
      @winner << bytes
    else
      @buf << bytes
      @deflater.deflate(bytes) { |chunk| @bytes_deflated += chunk.bytesize }
      decide if @buf.size > BYTES_WRITTEN_THRESHOLD
    end
    self
  end

  # Same as `<<`, but returns the number of bytes accepted
  #
  # @param bytes[String] the data to write
  # @return [Integer] the bytesize of `bytes`
  def write(bytes)
    self << bytes
    bytes.bytesize
  end

  # Picks the storage mode if that has not happened yet (less data than the
  # threshold was written) and closes the writable the Streamer has returned.
  def close
    decide unless @winner
    @winner.close
  end

  # Picks between the deflated and the stored mode based on the data sampled
  # so far, opens the matching writable on the Streamer and replays the
  # buffered data into it.
  private def decide
    # Finish and then close the deflater - it has likely buffered some data.
    # Closing releases the zlib stream right away instead of leaving that to
    # the garbage collector. The deflater is not used after this point. The
    # `closed?` check keeps a repeated `decide` working (for example when the
    # Streamer call below raised on the previous attempt).
    unless @deflater.closed?
      @bytes_deflated += @deflater.finish.bytesize until @deflater.finished?
      @deflater.close
    end
    # If the deflated version is sufficiently smaller than the stored one
    # - use deflate, otherwise stored
    ratio = @bytes_deflated / @buf.size.to_f
    @winner = if ratio <= MINIMUM_VIABLE_COMPRESSION
      @streamer.write_deflated_file(@filename, **@write_file_options)
    else
      @streamer.write_stored_file(@filename, **@write_file_options)
    end
    # Copy the buffered uncompressed data into the newly initialized writable
    @buf.rewind
    IO.copy_stream(@buf, @winner)
    @buf.truncate(0)
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Sends writes to the given `io`, and also registers all the data passing
|
4
|
+
# through it in a CRC32 checksum calculator. Is made to be completely
|
5
|
+
# interchangeable with the DeflatedWriter in terms of interface.
|
6
|
+
class ZipKit::Streamer::StoredWriter
  include ZipKit::WriteShovel

  # The amount of bytes we will buffer before computing the intermediate
  # CRC32 checksums. Benchmarks show that the optimum is 64KB (see
  # `bench/buffered_crc32_bench.rb`), if that is exceeded Zlib is going
  # to perform internal CRC combine calls which will make the speed go down again.
  CRC32_BUFFER_SIZE = 64 * 1024

  # @param io[#<<] the destination that receives the data as-is
  def initialize(io)
    # The wrapper is what `finish` later asks (via `tell`) for the written size
    @io = ZipKit::WriteAndTell.new(io)
    @crc_compute = ZipKit::StreamCRC32.new
    # Data goes to the CRC32 calculator through a buffer
    @crc = ZipKit::WriteBuffer.new(@crc_compute, CRC32_BUFFER_SIZE)
  end

  # Passes the given data on to the contained IO object and
  # to the buffered CRC32 calculator.
  #
  # @param data[String] data to be written
  # @return self
  def <<(data)
    @io << data
    @crc << data
    self
  end

  # Flushes the CRC32 buffer and returns the amount of data written and the
  # CRC32 checksum. In stored mode both sizes are the same. The return value
  # can be directly used as the argument to {Streamer#update_last_entry_and_write_data_descriptor}
  #
  # @return [Hash] a hash of `{crc32, compressed_size, uncompressed_size}`
  def finish
    @crc.flush
    {
      crc32: @crc_compute.to_i,
      compressed_size: @io.tell,
      uncompressed_size: @io.tell
    }
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Gets yielded from the writing methods of the Streamer
|
4
|
+
# and accepts the data being written into the ZIP for deflate
|
5
|
+
# or stored modes. Can be used as a destination for `IO.copy_stream`
|
6
|
+
#
|
7
|
+
# IO.copy_stream(File.open('source.bin', 'rb'), writable)
|
8
|
+
class ZipKit::Streamer::Writable
  include ZipKit::WriteShovel

  # Sets up a Writable which hands all writes to the given writer object.
  # Normally you would not need to use this method directly
  #
  # @param streamer the Streamer to notify once the Writable gets closed
  # @param writer the object receiving the data, it must respond to `<<` and `finish`
  def initialize(streamer, writer)
    @streamer = streamer
    @writer = writer
    @closed = false
  end

  # Passes the given data on to the writer
  #
  # @param d[String] the binary string to write (part of the uncompressed file)
  # @return [self]
  # @raise [RuntimeError] if the Writable has been closed already
  def <<(d)
    if @closed
      raise "Trying to write to a closed Writable"
    end
    @writer << d
    self
  end

  # Finishes the writer to obtain the CRC32/size values and hands them to
  # `update_last_entry_and_write_data_descriptor` on the given Streamer.
  # Does nothing if the Writable is closed already.
  def close
    unless @closed
      @streamer.update_last_entry_and_write_data_descriptor(**@writer.finish)
      # Only mark as closed once the Streamer call has gone through
      @closed = true
    end
  end
end
|