zip_kit 6.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.codeclimate.yml +7 -0
- data/.document +5 -0
- data/.github/workflows/ci.yml +29 -0
- data/.gitignore +61 -0
- data/.rspec +1 -0
- data/.standard.yml +8 -0
- data/.yardopts +1 -0
- data/CHANGELOG.md +255 -0
- data/CODE_OF_CONDUCT.md +46 -0
- data/CONTRIBUTING.md +153 -0
- data/Gemfile +4 -0
- data/IMPLEMENTATION_DETAILS.md +97 -0
- data/LICENSE.txt +20 -0
- data/README.md +234 -0
- data/Rakefile +21 -0
- data/bench/buffered_crc32_bench.rb +109 -0
- data/examples/archive_size_estimate.rb +15 -0
- data/examples/config.ru +7 -0
- data/examples/deferred_write.rb +58 -0
- data/examples/parallel_compression_with_block_deflate.rb +86 -0
- data/examples/rack_application.rb +63 -0
- data/examples/s3_upload.rb +23 -0
- data/lib/zip_kit/block_deflate.rb +130 -0
- data/lib/zip_kit/block_write.rb +47 -0
- data/lib/zip_kit/file_reader/inflating_reader.rb +36 -0
- data/lib/zip_kit/file_reader/stored_reader.rb +35 -0
- data/lib/zip_kit/file_reader.rb +740 -0
- data/lib/zip_kit/null_writer.rb +12 -0
- data/lib/zip_kit/output_enumerator.rb +150 -0
- data/lib/zip_kit/path_set.rb +163 -0
- data/lib/zip_kit/rack_chunked_body.rb +32 -0
- data/lib/zip_kit/rack_tempfile_body.rb +61 -0
- data/lib/zip_kit/rails_streaming.rb +37 -0
- data/lib/zip_kit/remote_io.rb +114 -0
- data/lib/zip_kit/remote_uncap.rb +22 -0
- data/lib/zip_kit/size_estimator.rb +84 -0
- data/lib/zip_kit/stream_crc32.rb +60 -0
- data/lib/zip_kit/streamer/deflated_writer.rb +45 -0
- data/lib/zip_kit/streamer/entry.rb +37 -0
- data/lib/zip_kit/streamer/filler.rb +9 -0
- data/lib/zip_kit/streamer/heuristic.rb +68 -0
- data/lib/zip_kit/streamer/stored_writer.rb +39 -0
- data/lib/zip_kit/streamer/writable.rb +36 -0
- data/lib/zip_kit/streamer.rb +614 -0
- data/lib/zip_kit/uniquify_filename.rb +39 -0
- data/lib/zip_kit/version.rb +5 -0
- data/lib/zip_kit/write_and_tell.rb +40 -0
- data/lib/zip_kit/write_buffer.rb +71 -0
- data/lib/zip_kit/write_shovel.rb +22 -0
- data/lib/zip_kit/zip_writer.rb +436 -0
- data/lib/zip_kit.rb +24 -0
- data/zip_kit.gemspec +41 -0
- metadata +335 -0
@@ -0,0 +1,12 @@ (data/lib/zip_kit/null_writer.rb)

# frozen_string_literal: true

# Used when you need to supply a destination IO for some
# write operations, but want to discard the data (like when
# estimating the size of a ZIP)
module ZipKit::NullWriter
  # @param _[String] the data to write
  # @return [self]
  def self.<<(_)
    self
  end
end
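A quick illustration of the contract above: `<<` returns the module itself, so writes chain and the data simply disappears. A minimal sketch:

```ruby
require "zip_kit"

# NullWriter accepts writes and discards them - useful wherever a
# destination IO is required but only byte counts matter.
ZipKit::NullWriter << "some bytes" << "more bytes" # => ZipKit::NullWriter
```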
@@ -0,0 +1,150 @@ (data/lib/zip_kit/output_enumerator.rb)

# frozen_string_literal: true

require "time" # for .httpdate

# The output enumerator makes it possible to "pull" from a ZipKit streamer
# object instead of having it "push" writes to you. It will "stash" the block which
# writes the ZIP archive through the streamer, and when you call `each` on the Enumerator
# it will yield you the bytes the block writes. Since it is an enumerator you can
# use `next` to take chunks written by the ZipKit streamer one by one. It can be very
# convenient when you need to segment your ZIP output into bigger chunks for, say,
# uploading them to a cloud storage provider such as S3.
#
# Another use of the `OutputEnumerator` is as a Rack response body - a Rack
# response body object must support `#each` yielding successive binary strings,
# which is exactly what `OutputEnumerator` does.
#
# The enumerator also provides some conveniences for HTTP output - correct streaming
# headers and a body with chunked transfer encoding.
#
#     iterable_zip_body = ZipKit::OutputEnumerator.new do |streamer|
#       streamer.write_file('big.csv') do |sink|
#         CSV(sink) do |csv_writer|
#           csv_writer << Person.column_names
#           Person.all.find_each do |person|
#             csv_writer << person.attributes.values
#           end
#         end
#       end
#     end
#
# Either as a `Transfer-Encoding: chunked` response (if your webserver supports it),
# which will give you true streaming capability:
#
#     headers, chunked_or_presized_rack_body = iterable_zip_body.to_headers_and_rack_response_body(env)
#     [200, headers, chunked_or_presized_rack_body]
#
# or it will wrap your output in a `RackTempfileBody` object which buffers the ZIP before output. Buffering has
# benefits if your webserver does not support anything beyond HTTP/1.0, and also engages automatically
# in unit tests (since rack-test and Rails tests do not do streaming HTTP/1.1).
class ZipKit::OutputEnumerator
  DEFAULT_WRITE_BUFFER_SIZE = 64 * 1024

  # Creates a new OutputEnumerator. The enumerator can be read from using `each`,
  # and the creation of the ZIP is in lockstep with the caller calling `each` on the returned
  # output enumerator object. This can be used when the calling program wants to stream the
  # output of the ZIP archive and throttle that output, or split it into chunks, or use it
  # as a generator.
  #
  # For example:
  #
  #     # The block given to the constructor won't be executed immediately - rather it
  #     # will only start to execute when the caller starts to read from the output
  #     # by calling `each`
  #     body = ::ZipKit::OutputEnumerator.new(writer: CustomWriter) do |streamer|
  #       streamer.add_stored_entry(filename: 'large.tif', size: 1289894, crc32: 198210)
  #       streamer << large_file.read(1024 * 1024) until large_file.eof?
  #       ...
  #     end
  #
  #     body.each do |bin_string|
  #       # Send the output somewhere, buffer it in a file etc.
  #       # The block passed into `initialize` will only start executing once `#each`
  #       # is called
  #       ...
  #     end
  #
  # @param streamer_options[Hash] options for Streamer, see {ZipKit::Streamer.new}
  # @param write_buffer_size[Integer] By default all ZipKit writes are unbuffered. For output to sockets
  #   it is beneficial to bulkify those writes so that they are roughly sized to a socket buffer chunk. This
  #   object will bulkify writes for you in this way (so `each` will yield not on every call to `<<` from the Streamer
  #   but at block size boundaries or greater). Set it to 0 for unbuffered writes.
  # @param blk a block that will receive the Streamer object when executing. The block will not be executed
  #   immediately but only once `each` is called on the OutputEnumerator
  # @return [ZipKit::OutputEnumerator] the enumerator you can read bytestrings of the ZIP from by calling `each`
  def initialize(write_buffer_size: DEFAULT_WRITE_BUFFER_SIZE, **streamer_options, &blk)
    @streamer_options = streamer_options.to_h
    @bufsize = write_buffer_size.to_i
    @archiving_block = blk
  end

  # Executes the block given to the constructor with a {ZipKit::Streamer}
  # and passes each written chunk to the block given to the method. This allows one
  # to "take" output of the ZIP piecewise. If called without a block will return an Enumerator
  # that you can pull data from using `next`.
  #
  # **NOTE** Because the `WriteBuffer` inside this object reuses its buffer, it is important
  # that the yielded `String` gets consumed eagerly (written somewhere, or `#dup`-ed),
  # since the write buffer will clear it after your block returns. If you expand this Enumerator
  # eagerly into an Array you might notice that a lot of the segments of your ZIP output are
  # empty - this means that you need to duplicate them.
  #
  # @yield [String] a chunk of the ZIP output in binary encoding
  def each
    if block_given?
      block_write = ZipKit::BlockWrite.new { |chunk| yield(chunk) }
      buffer = ZipKit::WriteBuffer.new(block_write, @bufsize)
      ZipKit::Streamer.open(buffer, **@streamer_options, &@archiving_block)
      buffer.flush
    else
      enum_for(:each)
    end
  end

  # Returns a tuple of `headers, body` - headers are a `Hash` and the body is
  # an object that can be used as a Rack response body. The method will automatically
  # switch the wrapping of the output depending on whether the response can be pre-sized,
  # and whether your downstream webserver (like nginx) is configured to support
  # the HTTP/1.1 protocol version.
  #
  # @param rack_env[Hash] the Rack env, which the method may need to mutate (adding a Tempfile for cleanup)
  # @param content_length[Integer] the amount of bytes that the archive will contain. If given, no chunked encoding gets applied.
  # @return [Array]
  def to_headers_and_rack_response_body(rack_env, content_length: nil)
    headers = {
      # We need to ensure Rack::ETag does not suddenly start buffering us, see
      # https://github.com/rack/rack/issues/1619#issuecomment-606315714
      # Set this even when not streaming for consistency. The fact that there would be
      # a weak ETag generated would mean that the middleware buffers, so we have tests for that.
      "Last-Modified" => Time.now.httpdate,
      # Make sure Rack::Deflater does not touch our response body either, see
      # https://github.com/felixbuenemann/xlsxtream/issues/14#issuecomment-529569548
      "Content-Encoding" => "identity",
      # Disable buffering for both nginx and Google Load Balancer, see
      # https://cloud.google.com/appengine/docs/flexible/how-requests-are-handled?tab=python#x-accel-buffering
      "X-Accel-Buffering" => "no"
    }

    if content_length
      # If we know the size of the body, transfer encoding is not required at all - so the enumerator itself
      # can function as the Rack body. This also would apply in HTTP/2 contexts where chunked encoding would
      # no longer be required - then the enumerator could get returned "bare".
      body = self
      headers["Content-Length"] = content_length.to_i.to_s
    elsif rack_env["HTTP_VERSION"] == "HTTP/1.0"
      # Since HTTP 1.0 does not support chunked responses we need to revert to buffering.
      # This reversion happens silently and it is usually not clear at all why streaming
      # does not work - the Rails integration (RailsStreaming) logs a warning for this case.
      body = ZipKit::RackTempfileBody.new(rack_env, self)
      headers["Content-Length"] = body.size.to_s
    else
      body = ZipKit::RackChunkedBody.new(self)
      headers["Transfer-Encoding"] = "chunked"
    end

    [headers, body]
  end
end
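Putting the pieces together, here is a minimal sketch of a Rack app that serves a streamed ZIP, following the pattern from the documentation above (the gem ships a fuller version in `examples/rack_application.rb`; the file name and contents here are made up):

```ruby
# config.ru
require "zip_kit"

run ->(env) {
  body = ZipKit::OutputEnumerator.new do |streamer|
    # write_file picks stored vs. deflated storage heuristically
    streamer.write_file("hello.txt") do |sink|
      sink << "Hello from a streamed ZIP"
    end
  end
  headers, rack_body = body.to_headers_and_rack_response_body(env)
  [200, headers, rack_body]
}
```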
@@ -0,0 +1,163 @@ (data/lib/zip_kit/path_set.rb)

# frozen_string_literal: true

# A ZIP archive contains a flat list of entries. These entries can implicitly
# create directories when the archive is expanded. For example, an entry with
# the filename of "some folder/file.docx" will make the unarchiving application
# create a directory called "some folder" automatically, and then deposit the
# file "file.docx" in that directory. These "implicit" directories can be
# arbitrarily nested, and create a tree structure of directories. That structure
# however is implicit as the archive contains a flat list.
#
# This creates opportunities for conflicts. For example, imagine the following
# structure:
#
# * `something/` - specifies an empty directory with the name "something"
# * `something` - specifies a file, creates a conflict
#
# This can be prevented with filename uniqueness checks. It does get funkier however
# as the rabbit hole goes down:
#
# * `dir/subdir/another_subdir/yet_another_subdir/file.bin` - declares a file and directories
# * `dir/subdir/another_subdir/yet_another_subdir` - declares a file at one of the levels, creates a conflict
#
# The results of this ZIP structure aren't very easy to predict as they depend on the
# application that opens the archive. For example, BOMArchiveHelper on macOS will expand files
# as they are declared in the ZIP, but once a conflict occurs it will fail with "error -21". It
# is not very transparent to the user why unarchiving fails, and the conflict can only be
# reliably prevented when the archive gets created.
#
# Unfortunately that conflicts with another "magical" feature of ZipKit which automatically
# "fixes" duplicate filenames - filenames (paths) which have already been added to the archive.
# This fix is performed by appending (1), then (2) and so forth to the filename so that the
# conflict is avoided. This is not possible to apply to directories, because when one of the
# path components is reused in multiple filenames it means those entities should end up in
# the same directory (subdirectory) once the archive is opened.
#
# The `PathSet` keeps track of entries as they get added using 2 Sets (cheap presence checks),
# one for directories and one for files. It will raise a `Conflict` exception if there are
# files clobbering one another, or in case files collide with directories.
class ZipKit::PathSet
  class Conflict < StandardError
  end

  class FileClobbersDirectory < Conflict
  end

  class DirectoryClobbersFile < Conflict
  end

  def initialize
    @known_directories = Set.new
    @known_files = Set.new
  end

  # Adds a directory path to the set of known paths, including
  # all the directories that contain it. So, calling
  #     add_directory_path("dir/dir2/dir3")
  # will add "dir", "dir/dir2", "dir/dir2/dir3".
  #
  # @param path[String] the path to the directory to add
  # @return [void]
  def add_directory_path(path)
    path_and_ancestors(path).each do |parent_directory_path|
      if @known_files.include?(parent_directory_path)
        error_message = <<~ERR
          The path #{parent_directory_path.inspect} which has to be added
          as a directory is already used for a file.

          The directory at this path would get created implicitly
          to produce #{path.inspect} during decompression.

          This would make some archive utilities refuse to open
          the ZIP.
        ERR
        raise DirectoryClobbersFile, error_message
      end
      @known_directories << parent_directory_path
    end
  end

  # Adds a file path to the set of known paths, including
  # all the directories that contain it. Once a file has been added,
  # it is no longer possible to add a directory having the same path
  # as this would cause a conflict.
  #
  # The operation also adds all the containing directories for the file, so
  #     add_file_path("dir/dir2/file.doc")
  # will add "dir" and "dir/dir2" as directories, and "dir/dir2/file.doc" as a file.
  #
  # @param file_path[String] the path to the file to add
  # @return [void]
  def add_file_path(file_path)
    if @known_files.include?(file_path)
      error_message = <<~ERR
        The file at #{file_path.inspect} has already been included
        in the archive. Adding it the second time would cause
        the first file to be overwritten during unarchiving, and
        could also get the archive flagged as invalid.
      ERR
      raise Conflict, error_message
    end

    if @known_directories.include?(file_path)
      error_message = <<~ERR
        The path #{file_path.inspect} is already used for
        a directory, but you are trying to add it as a file.

        This would make some archive utilities refuse
        to open the ZIP.
      ERR
      raise FileClobbersDirectory, error_message
    end

    # Add all the directories which this file is contained in
    *dir_components, _file_name = non_empty_path_components(file_path)
    add_directory_path(dir_components.join("/"))

    # ...and then the file itself
    @known_files << file_path
  end

  # Tells whether a specific full path is already known to the PathSet.
  # Can be a path for a directory or for a file.
  #
  # @param path_in_archive[String] the path to check for inclusion
  # @return [Boolean]
  def include?(path_in_archive)
    @known_files.include?(path_in_archive) || @known_directories.include?(path_in_archive)
  end

  # Clears the contained sets
  # @return [void]
  def clear
    @known_files.clear
    @known_directories.clear
  end

  # Adds the directory or file path to the path set
  #
  # @return [void]
  def add_directory_or_file_path(path_in_archive)
    if path_in_archive.end_with?("/")
      add_directory_path(path_in_archive)
    else
      add_file_path(path_in_archive)
    end
  end

  private

  def non_empty_path_components(path)
    path.split("/").reject(&:empty?)
  end

  def path_and_ancestors(path)
    path_components = non_empty_path_components(path)
    path_components.each_with_object([]) do |component, seen|
      seen << [seen.last, component].compact.join("/")
    end
  end
end
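The conflict detection can be exercised directly. A minimal sketch (path names are made up):

```ruby
require "zip_kit"

paths = ZipKit::PathSet.new
paths.add_file_path("docs/report.pdf") # also registers "docs" as a directory
paths.include?("docs")                 # => true

# Registering the same path as a directory now raises,
# since "docs/report.pdf" is already known as a file:
begin
  paths.add_directory_path("docs/report.pdf")
rescue ZipKit::PathSet::DirectoryClobbersFile => e
  puts e.message
end
```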
@@ -0,0 +1,32 @@ (data/lib/zip_kit/rack_chunked_body.rb)

# frozen_string_literal: true

# A body wrapper that emits chunked responses, creating a valid
# Transfer-Encoding: chunked HTTP response body. This is copied from Rack::Chunked::Body,
# because Rack is not going to include that class after version 3.x.
# Rails has a substitute class for this inside ActionController::Streaming,
# but that module is a private constant in the Rails codebase, and is thus
# considered "private" from the Rails standpoint. It is not that much code to
# carry, so we copy it into our code.
class ZipKit::RackChunkedBody
  TERM = "\r\n"
  TAIL = "0#{TERM}"

  # @param body[#each] the enumerable that yields bytes, usually an `OutputEnumerator`
  def initialize(body)
    @body = body
  end

  # For each string yielded by the response body, yield
  # the element in chunked encoding - and finish off with a terminator
  def each
    term = TERM
    @body.each do |chunk|
      size = chunk.bytesize
      next if size == 0

      yield [size.to_s(16), term, chunk.b, term].join
    end
    yield TAIL
    yield term
  end
end
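Since `each` only needs an enumerable of strings, the wire format is easy to inspect. A minimal sketch:

```ruby
require "zip_kit"

# Each chunk is framed as "<size in hex>\r\n<bytes>\r\n", with a "0\r\n\r\n" tail
body = ZipKit::RackChunkedBody.new(["Hello", "", "chunked world"])
body.each { |part| puts part.inspect }
# "5\r\nHello\r\n"
# "d\r\nchunked world\r\n"   (the empty chunk is skipped; 13 bytes => "d" in hex)
# "0\r\n"
# "\r\n"
```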
@@ -0,0 +1,61 @@ (data/lib/zip_kit/rack_tempfile_body.rb)

# frozen_string_literal: true

# Contains a file handle which can be closed once the response finishes sending.
# It supports `to_path` so that `Rack::Sendfile` can intercept it.
class ZipKit::RackTempfileBody
  TEMPFILE_NAME_PREFIX = "zip-tricks-tf-body-"
  attr_reader :tempfile

  # @param body[#each] the enumerable that yields bytes, usually an `OutputEnumerator`.
  #   The `body` will be read in full and buffered into the tempfile before the first byte is served.
  def initialize(env, body)
    @tempfile = Tempfile.new(TEMPFILE_NAME_PREFIX)
    # Rack::TempfileReaper calls close! on tempfiles which get buffered.
    # We will assume that it works fine with Rack::Sendfile (i.e. the path
    # to the file getting served gets used before we unlink the tempfile)
    env["rack.tempfiles"] ||= []
    env["rack.tempfiles"] << @tempfile

    @tempfile.binmode
    @body = body
    @did_flush = false
  end

  # Returns the size of the contained `Tempfile` so that a correct
  # Content-Length header can be set
  #
  # @return [Integer]
  def size
    flush
    @tempfile.size
  end

  # Returns the path to the `Tempfile`, so that Rack::Sendfile can send this response
  # using the downstream webserver
  #
  # @return [String]
  def to_path
    flush
    @tempfile.to_path
  end

  # Stream the file's contents if `Rack::Sendfile` isn't present.
  #
  # @return [void]
  def each
    flush
    while (chunk = @tempfile.read(16384))
      yield chunk
    end
  end

  private

  def flush
    if !@did_flush
      @body.each { |bytes| @tempfile << bytes }
      @did_flush = true
    end
    @tempfile.rewind
  end
end
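A minimal sketch of the buffering behavior, using a plain array in place of an `OutputEnumerator`:

```ruby
require "zip_kit"
require "tempfile"

env = {} # a Rack env; the tempfile gets registered for cleanup
body = ZipKit::RackTempfileBody.new(env, ["abc", "def"])
body.size                    # => 6, usable for the Content-Length header
env["rack.tempfiles"].length # => 1, Rack::TempfileReaper will close! it
```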
@@ -0,0 +1,37 @@ (data/lib/zip_kit/rails_streaming.rb)

# frozen_string_literal: true

# Should be included into a Rails controller for easy ZIP output from any action.
module ZipKit::RailsStreaming
  # Opens a {ZipKit::Streamer} and yields it to the caller. The output of the streamer
  # gets automatically forwarded to the Rails response stream. When the output completes,
  # the Rails response stream is going to be closed automatically.
  # @param filename[String] name of the file for the Content-Disposition header
  # @param type[String] the content type (MIME type) of the archive being output
  # @param zip_streamer_options[Hash] options that will be passed to the Streamer.
  #   See {ZipKit::Streamer#initialize} for the full list of options.
  # @yield [Streamer] the streamer that can be written to
  # @return [ZipKit::OutputEnumerator] the output enumerator assigned to the response body
  def zip_kit_stream(filename: "download.zip", type: "application/zip", **zip_streamer_options, &zip_streaming_blk)
    # The output enumerator yields chunks of bytes generated from ZipKit. Instantiating it
    # first will also validate the Streamer options.
    chunk_yielder = ZipKit::OutputEnumerator.new(**zip_streamer_options, &zip_streaming_blk)

    # We want some common headers for file sending. Rails will also set
    # self.sending_file = true for us when we call send_file_headers!
    send_file_headers!(type: type, filename: filename)

    # Check for the proxy configuration first. This is the first common misconfiguration which destroys streaming -
    # since HTTP 1.0 does not support chunked responses we need to revert to buffering. The issue though is that
    # this reversion happens silently and it is usually not clear at all why streaming does not work. So let's at
    # the very least print it to the Rails log.
    if request.get_header("HTTP_VERSION") == "HTTP/1.0"
      logger&.warn { "The downstream HTTP proxy/LB insists on HTTP/1.0 protocol, ZIP response will be buffered." }
    end

    headers, rack_body = chunk_yielder.to_headers_and_rack_response_body(request.env)

    # Set the streaming-specific headers
    response.headers.merge!(headers)
    self.response_body = rack_body
  end
end
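A minimal sketch of a controller using the mixin; the controller, action, and file names are made up:

```ruby
class ReportsController < ApplicationController
  include ZipKit::RailsStreaming

  def download
    # The block runs lazily, in lockstep with the response being read
    zip_kit_stream(filename: "reports.zip") do |zip|
      zip.write_file("summary.txt") do |sink|
        sink << "Totals for today"
      end
    end
  end
end
```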
@@ -0,0 +1,114 @@ (data/lib/zip_kit/remote_io.rb)

# frozen_string_literal: true

# An object that fakes just-enough of an IO to be dangerous
# - or, more precisely, to be useful as a source for the FileReader
# central directory parser. Effectively we substitute an IO object
# for an object that fetches parts of the remote file over HTTP using `Range:`
# headers. The `RemoteIO` acts as an adapter between an object that performs the
# actual fetches over HTTP and an object that expects a handful of IO methods to be
# available.
class ZipKit::RemoteIO
  # @param url[String, URI] the HTTP/HTTPS URL of the object to be retrieved
  def initialize(url)
    @pos = 0
    @uri = URI(url)
    @remote_size = nil
  end

  # Emulates IO#seek
  # @param offset[Integer] absolute offset in the remote resource to seek to
  # @param mode[Integer] the seek mode (only SEEK_SET is supported)
  def seek(offset, mode = IO::SEEK_SET)
    raise "Unsupported seek mode #{mode}" unless mode == IO::SEEK_SET
    @remote_size ||= request_object_size
    @pos = clamp(0, offset, @remote_size)
    0 # always return 0!
  end

  # Emulates IO#size.
  #
  # @return [Integer] the size of the remote resource
  def size
    @remote_size ||= request_object_size
  end

  # Emulates IO#read. The read will be limited to the
  # size of the remote resource relative to the current offset in the IO,
  # so if you are at offset 0 in the IO of size 10, doing a `read(20)`
  # will only return you 10 bytes of result, and not raise any exceptions.
  #
  # @param n_bytes[Fixnum, nil] how many bytes to read, or `nil` to read all the way to the end
  # @return [String] the read bytes
  def read(n_bytes = nil)
    # If the resource is empty there is nothing to read
    return if size.zero?

    maximum_available = size - @pos
    n_bytes ||= maximum_available # nil == read to the end of file
    return "" if n_bytes.zero?
    raise ArgumentError, "No negative reads(#{n_bytes})" if n_bytes < 0

    n_bytes = clamp(0, n_bytes, maximum_available)

    http_range = (@pos..(@pos + n_bytes - 1))
    request_range(http_range).tap do |data|
      raise "Remote read returned #{data.bytesize} bytes instead of #{n_bytes} as requested" if data.bytesize != n_bytes
      @pos = clamp(0, @pos + data.bytesize, size)
    end
  end

  # Returns the current pointer position within the IO
  #
  # @return [Fixnum]
  def tell
    @pos
  end

  protected

  # Only used internally when reading the remote ZIP.
  #
  # @param range[Range] the HTTP range of data to fetch from remote
  # @return [String] the response body of the ranged request
  def request_range(range)
    http = Net::HTTP.start(@uri.hostname, @uri.port)
    request = Net::HTTP::Get.new(@uri)
    request.range = range
    response = http.request(request)
    case response.code
    when "206", "200"
      response.body
    else
      raise "Remote at #{@uri} replied with code #{response.code}"
    end
  end

  # For working with S3 it is a better idea to perform a GET request for one byte, since doing a HEAD
  # request needs a different permission - and standard GET presigned URLs are not allowed to perform it
  #
  # @return [Integer] the size of the remote resource, parsed either from the Content-Length or the Content-Range header
  def request_object_size
    http = Net::HTTP.start(@uri.hostname, @uri.port)
    request = Net::HTTP::Get.new(@uri)
    request.range = 0..0
    response = http.request(request)
    case response.code
    when "206"
      content_range_header_value = response["Content-Range"]
      content_range_header_value.split("/").last.to_i
    when "200"
      response["Content-Length"].to_i
    else
      raise "Remote at #{@uri} replied with code #{response.code}"
    end
  end

  private

  def clamp(a, b, c)
    return a if b < a
    return c if b > c
    b
  end
end
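A minimal sketch of the seek-then-read pattern the FileReader relies on; the URL is hypothetical and the server must honor `Range:` requests:

```ruby
require "zip_kit"
require "net/http"

io = ZipKit::RemoteIO.new("https://example.com/large.zip")
io.seek(io.size - 4) # position near the end, where the ZIP EOCD record lives
io.read(4)           # fetches only those 4 bytes via a ranged GET
```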
@@ -0,0 +1,22 @@ (data/lib/zip_kit/remote_uncap.rb)

# frozen_string_literal: true

# Allows reading the central directory of a remote ZIP file without
# downloading the entire file. The central directory provides the
# offsets at which the actual file contents are located. You can then
# use the `Range:` HTTP headers to download those entries separately.
#
# Please read the security warning in `FileReader` _VERY CAREFULLY_
# before you use this module.
module ZipKit::RemoteUncap
  # @param uri[String] the HTTP(S) URL to read the ZIP footer from
  # @param reader_class[Class] which class to use for reading
  # @param options_for_zip_reader[Hash] any additional options to give to
  #   {ZipKit::FileReader} when reading
  # @return [Array<ZipKit::FileReader::ZipEntry>] metadata about the
  #   files within the remote archive
  def self.files_within_zip_at(uri, reader_class: ZipKit::FileReader, **options_for_zip_reader)
    fake_io = ZipKit::RemoteIO.new(uri)
    reader = reader_class.new
    reader.read_zip_structure(io: fake_io, **options_for_zip_reader)
  end
end
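A minimal sketch; the URL is hypothetical, and the `filename` and `uncompressed_size` readers are assumed to exist on `ZipKit::FileReader::ZipEntry`:

```ruby
require "zip_kit"
require "net/http"

entries = ZipKit::RemoteUncap.files_within_zip_at("https://example.com/archive.zip")
entries.each do |entry|
  puts format("%s (%d bytes uncompressed)", entry.filename, entry.uncompressed_size)
end
```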
@@ -0,0 +1,84 @@ (data/lib/zip_kit/size_estimator.rb)

# frozen_string_literal: true

# Helps to estimate archive sizes
class ZipKit::SizeEstimator
  require_relative "streamer"

  # Creates a new estimator with a Streamer object. Normally you should use
  # `estimate` instead and not use this method directly.
  def initialize(streamer)
    @streamer = streamer
  end
  private :initialize

  # Performs the estimate using fake archiving. It needs to know the sizes of the
  # entries upfront. Usage:
  #
  #     expected_zip_size = SizeEstimator.estimate do |estimator|
  #       estimator.add_stored_entry(filename: "file.doc", size: 898291)
  #       estimator.add_deflated_entry(filename: "family.tif",
  #         uncompressed_size: 89281911, compressed_size: 121908)
  #     end
  #
  # @param kwargs_for_streamer_new Any options to pass to Streamer, see {Streamer#initialize}
  # @return [Integer] the size of the resulting archive, in bytes
  # @yield [SizeEstimator] the estimator
  def self.estimate(**kwargs_for_streamer_new)
    streamer = ZipKit::Streamer.new(ZipKit::NullWriter, **kwargs_for_streamer_new)
    estimator = new(streamer)
    yield(estimator)
    streamer.close # Returns the .tell of the contained IO
  end

  # Add a fake entry to the archive, to see how big it is going to be in the end.
  #
  # @param filename [String] the name of the file (filenames are variable-width in the ZIP)
  # @param size [Fixnum] size of the uncompressed entry
  # @param use_data_descriptor[Boolean] whether the entry uses a postfix
  #   data descriptor to specify size
  # @return self
  def add_stored_entry(filename:, size:, use_data_descriptor: false)
    @streamer.add_stored_entry(filename: filename,
      crc32: 0,
      size: size,
      use_data_descriptor: use_data_descriptor)
    @streamer.simulate_write(size)
    if use_data_descriptor
      @streamer.update_last_entry_and_write_data_descriptor(crc32: 0, compressed_size: size, uncompressed_size: size)
    end
    self
  end

  # Add a fake entry to the archive, to see how big it is going to be in the end.
  #
  # @param filename [String] the name of the file (filenames are variable-width in the ZIP)
  # @param uncompressed_size [Fixnum] size of the uncompressed entry
  # @param compressed_size [Fixnum] size of the compressed entry
  # @param use_data_descriptor[Boolean] whether the entry uses a postfix data
  #   descriptor to specify size
  # @return self
  def add_deflated_entry(filename:, uncompressed_size:, compressed_size:, use_data_descriptor: false)
    @streamer.add_deflated_entry(filename: filename,
      crc32: 0,
      compressed_size: compressed_size,
      uncompressed_size: uncompressed_size,
      use_data_descriptor: use_data_descriptor)

    @streamer.simulate_write(compressed_size)
    if use_data_descriptor
      @streamer.update_last_entry_and_write_data_descriptor(crc32: 0,
        compressed_size: compressed_size,
        uncompressed_size: uncompressed_size)
    end
    self
  end

  # Add an empty directory to the archive.
  #
  # @param dirname [String] the name of the directory
  # @return self
  def add_empty_directory_entry(dirname:)
    @streamer.add_empty_directory(dirname: dirname)
    self
  end
end
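The estimator pairs naturally with `OutputEnumerator`: estimate first, then presize the response so no chunked encoding is needed. A minimal sketch (sizes and filenames are made up; `rack_env` is your Rack env hash):

```ruby
require "zip_kit"

# The estimate runs the real Streamer against NullWriter, so it accounts
# for headers, the central directory, and the entries you declare
zip_size = ZipKit::SizeEstimator.estimate do |estimator|
  estimator.add_stored_entry(filename: "report.pdf", size: 9_817_123)
end

enum = ZipKit::OutputEnumerator.new do |streamer|
  # ... add the same entries, with the same sizes, as in the estimate ...
end
headers, body = enum.to_headers_and_rack_response_body(rack_env, content_length: zip_size)
```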