zip_kit 6.0.0
- checksums.yaml +7 -0
- data/.codeclimate.yml +7 -0
- data/.document +5 -0
- data/.github/workflows/ci.yml +29 -0
- data/.gitignore +61 -0
- data/.rspec +1 -0
- data/.standard.yml +8 -0
- data/.yardopts +1 -0
- data/CHANGELOG.md +255 -0
- data/CODE_OF_CONDUCT.md +46 -0
- data/CONTRIBUTING.md +153 -0
- data/Gemfile +4 -0
- data/IMPLEMENTATION_DETAILS.md +97 -0
- data/LICENSE.txt +20 -0
- data/README.md +234 -0
- data/Rakefile +21 -0
- data/bench/buffered_crc32_bench.rb +109 -0
- data/examples/archive_size_estimate.rb +15 -0
- data/examples/config.ru +7 -0
- data/examples/deferred_write.rb +58 -0
- data/examples/parallel_compression_with_block_deflate.rb +86 -0
- data/examples/rack_application.rb +63 -0
- data/examples/s3_upload.rb +23 -0
- data/lib/zip_kit/block_deflate.rb +130 -0
- data/lib/zip_kit/block_write.rb +47 -0
- data/lib/zip_kit/file_reader/inflating_reader.rb +36 -0
- data/lib/zip_kit/file_reader/stored_reader.rb +35 -0
- data/lib/zip_kit/file_reader.rb +740 -0
- data/lib/zip_kit/null_writer.rb +12 -0
- data/lib/zip_kit/output_enumerator.rb +150 -0
- data/lib/zip_kit/path_set.rb +163 -0
- data/lib/zip_kit/rack_chunked_body.rb +32 -0
- data/lib/zip_kit/rack_tempfile_body.rb +61 -0
- data/lib/zip_kit/rails_streaming.rb +37 -0
- data/lib/zip_kit/remote_io.rb +114 -0
- data/lib/zip_kit/remote_uncap.rb +22 -0
- data/lib/zip_kit/size_estimator.rb +84 -0
- data/lib/zip_kit/stream_crc32.rb +60 -0
- data/lib/zip_kit/streamer/deflated_writer.rb +45 -0
- data/lib/zip_kit/streamer/entry.rb +37 -0
- data/lib/zip_kit/streamer/filler.rb +9 -0
- data/lib/zip_kit/streamer/heuristic.rb +68 -0
- data/lib/zip_kit/streamer/stored_writer.rb +39 -0
- data/lib/zip_kit/streamer/writable.rb +36 -0
- data/lib/zip_kit/streamer.rb +614 -0
- data/lib/zip_kit/uniquify_filename.rb +39 -0
- data/lib/zip_kit/version.rb +5 -0
- data/lib/zip_kit/write_and_tell.rb +40 -0
- data/lib/zip_kit/write_buffer.rb +71 -0
- data/lib/zip_kit/write_shovel.rb +22 -0
- data/lib/zip_kit/zip_writer.rb +436 -0
- data/lib/zip_kit.rb +24 -0
- data/zip_kit.gemspec +41 -0
- metadata +335 -0
data/lib/zip_kit/null_writer.rb
@@ -0,0 +1,12 @@
# frozen_string_literal: true

# Used when you need to supply a destination IO for some
# write operations, but want to discard the data (like when
# estimating the size of a ZIP)
module ZipKit::NullWriter
  # @param _[String] the data to write
  # @return [self]
  def self.<<(_)
    self
  end
end
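
Since the writer only has to respond to `<<`, it can stand in for any destination IO whose output should be thrown away. A minimal sketch (the `SizeEstimator` later in this release uses the same trick, passing `ZipKit::NullWriter` to `ZipKit::Streamer.new`):

    # `<<` returns the writer itself, so chained writes work as with a real IO
    ZipKit::NullWriter << "local file header" << "file body"

    # Use it as the Streamer destination when only the resulting byte count matters
    # (this is what ZipKit::SizeEstimator does internally)
    streamer = ZipKit::Streamer.new(ZipKit::NullWriter)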
data/lib/zip_kit/output_enumerator.rb
@@ -0,0 +1,150 @@
# frozen_string_literal: true

require "time" # for .httpdate

# The output enumerator makes it possible to "pull" from a ZipKit streamer
# object instead of having it "push" writes to you. It will "stash" the block which
# writes the ZIP archive through the streamer, and when you call `each` on the Enumerator
# it will yield you the bytes the block writes. Since it is an enumerator you can
# use `next` to take chunks written by the ZipKit streamer one by one. It can be very
# convenient when you need to segment your ZIP output into bigger chunks for, say,
# uploading them to a cloud storage provider such as S3.
#
# Another use of the `OutputEnumerator` is as a Rack response body - since a Rack
# response body object must support `#each` yielding successive binary strings.
# Which is exactly what `OutputEnumerator` does.
#
# The enumerator can provide you some more conveniences for HTTP output - correct streaming
# headers and a body with chunked transfer encoding.
#
#     iterable_zip_body = ZipKit::OutputEnumerator.new do |streamer|
#       streamer.write_file('big.csv') do |sink|
#         CSV(sink) do |csv_writer|
#           csv_writer << Person.column_names
#           Person.all.find_each do |person|
#             csv_writer << person.attributes.values
#           end
#         end
#       end
#     end
#
# Either as a `Transfer-Encoding: chunked` response (if your webserver supports it),
# which will give you true streaming capability:
#
#     headers, chunked_or_presized_rack_body = iterable_zip_body.to_headers_and_rack_response_body(env)
#     [200, headers, chunked_or_presized_rack_body]
#
# or it will wrap your output in a `TempfileBody` object which buffers the ZIP before output. Buffering has
# benefits if your webserver does not support anything beyond HTTP/1.0, and also engages automatically
# in unit tests (since rack-test and Rails tests do not do streaming HTTP/1.1).
class ZipKit::OutputEnumerator
  DEFAULT_WRITE_BUFFER_SIZE = 64 * 1024

  # Creates a new OutputEnumerator enumerator. The enumerator can be read from using `each`,
  # and the creation of the ZIP is in lockstep with the caller calling `each` on the returned
  # output enumerator object. This can be used when the calling program wants to stream the
  # output of the ZIP archive and throttle that output, or split it into chunks, or use it
  # as a generator.
  #
  # For example:
  #
  #     # The block given to {output_enum} won't be executed immediately - rather it
  #     # will only start to execute when the caller starts to read from the output
  #     # by calling `each`
  #     body = ::ZipKit::OutputEnumerator.new(writer: CustomWriter) do |streamer|
  #       streamer.add_stored_entry(filename: 'large.tif', size: 1289894, crc32: 198210)
  #       streamer << large_file.read(1024*1024) until large_file.eof?
  #       ...
  #     end
  #
  #     body.each do |bin_string|
  #       # Send the output somewhere, buffer it in a file etc.
  #       # The block passed into `initialize` will only start executing once `#each`
  #       # is called
  #       ...
  #     end
  #
  # @return [ZipKit::OutputEnumerator] the enumerator you can read bytestrings of the ZIP from by calling `each`
  #
  # @param streamer_options[Hash] options for Streamer, see {ZipKit::Streamer.new}
  # @param write_buffer_size[Integer] By default all ZipKit writes are unbuffered. For output to sockets
  #   it is beneficial to bulkify those writes so that they are roughly sized to a socket buffer chunk. This
  #   object will bulkify writes for you in this way (so `each` will yield not on every call to `<<` from the Streamer
  #   but at block size boundaries or greater). Set it to 0 for unbuffered writes.
  # @param blk a block that will receive the Streamer object when executing. The block will not be executed
  #   immediately but only once `each` is called on the OutputEnumerator
  def initialize(write_buffer_size: DEFAULT_WRITE_BUFFER_SIZE, **streamer_options, &blk)
    @streamer_options = streamer_options.to_h
    @bufsize = write_buffer_size.to_i
    @archiving_block = blk
  end

  # Executes the block given to the constructor with a {ZipKit::Streamer}
  # and passes each written chunk to the block given to the method. This allows one
  # to "take" output of the ZIP piecewise. If called without a block will return an Enumerator
  # that you can pull data from using `next`.
  #
  # **NOTE** Because the `WriteBuffer` inside this object can reuse the buffer, it is important
  # that the `String` that is yielded **either** gets consumed eagerly (written byte-by-byte somewhere)
  # **or** gets `#dup`-ed, since the write buffer will clear it after your block returns. If you expand this Enumerator
  # eagerly into an Array you might notice that a lot of the segments of your ZIP output are
  # empty - this means that you need to duplicate them.
  #
  # @yield [String] a chunk of the ZIP output in binary encoding
  def each
    if block_given?
      block_write = ZipKit::BlockWrite.new { |chunk| yield(chunk) }
      buffer = ZipKit::WriteBuffer.new(block_write, @bufsize)
      ZipKit::Streamer.open(buffer, **@streamer_options, &@archiving_block)
      buffer.flush
    else
      enum_for(:each)
    end
  end

  # Returns a tuple of `headers, body` - headers are a `Hash` and the body is
  # an object that can be used as a Rack response body. The method will automatically
  # switch the wrapping of the output depending on whether the response can be pre-sized,
  # and whether your downstream webserver (like nginx) is configured to support
  # the HTTP/1.1 protocol version.
  #
  # @param rack_env[Hash] the Rack env, which the method may need to mutate (adding a Tempfile for cleanup)
  # @param content_length[Integer] the amount of bytes that the archive will contain. If given, no Chunked encoding gets applied.
  # @return [Array]
  def to_headers_and_rack_response_body(rack_env, content_length: nil)
    headers = {
      # We need to ensure Rack::ETag does not suddenly start buffering us, see
      # https://github.com/rack/rack/issues/1619#issuecomment-606315714
      # Set this even when not streaming for consistency. The fact that there would be
      # a weak ETag generated would mean that the middleware buffers, so we have tests for that.
      "Last-Modified" => Time.now.httpdate,
      # Make sure Rack::Deflater does not touch our response body either, see
      # https://github.com/felixbuenemann/xlsxtream/issues/14#issuecomment-529569548
      "Content-Encoding" => "identity",
      # Disable buffering for both nginx and Google Load Balancer, see
      # https://cloud.google.com/appengine/docs/flexible/how-requests-are-handled?tab=python#x-accel-buffering
      "X-Accel-Buffering" => "no"
    }

    if content_length
      # If we know the size of the body, transfer encoding is not required at all - so the enumerator itself
      # can function as the Rack body. This also would apply in HTTP/2 contexts where chunked encoding would
      # no longer be required - then the enumerator could get returned "bare".
      body = self
      headers["Content-Length"] = content_length.to_i.to_s
    elsif rack_env["HTTP_VERSION"] == "HTTP/1.0"
      # Check for the proxy configuration first. This is the first common misconfiguration which destroys streaming -
      # since HTTP 1.0 does not support chunked responses we need to revert to buffering. The issue though is that
      # this reversion happens silently and it is usually not clear at all why streaming does not work. So let's at
      # the very least print it to the Rails log.
      body = ZipKit::RackTempfileBody.new(rack_env, self)
      headers["Content-Length"] = body.size.to_s
    else
      body = ZipKit::RackChunkedBody.new(self)
      headers["Transfer-Encoding"] = "chunked"
    end

    [headers, body]
  end
end
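
As a concrete counterpart to the class documentation above, here is a sketch of a bare-bones Rack application streaming a ZIP generated on the fly. The entry name and its contents are made up for the example; `write_file` is the Streamer method referenced in the comments.

    # config.ru - a minimal sketch of streaming a ZIP from plain Rack
    require "zip_kit"

    run ->(env) {
      zip_body = ZipKit::OutputEnumerator.new do |streamer|
        streamer.write_file("greeting.txt") { |sink| sink << "Hello from ZipKit" }
      end
      headers, rack_body = zip_body.to_headers_and_rack_response_body(env)
      headers["Content-Disposition"] = 'attachment; filename="greeting.zip"'
      [200, headers, rack_body]
    }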
data/lib/zip_kit/path_set.rb
@@ -0,0 +1,163 @@
# frozen_string_literal: true

# A ZIP archive contains a flat list of entries. These entries can implicitly
# create directories when the archive is expanded. For example, an entry with
# the filename of "some folder/file.docx" will make the unarchiving application
# create a directory called "some folder" automatically, and then deposit the
# file "file.docx" in that directory. These "implicit" directories can be
# arbitrarily nested, and create a tree structure of directories. That structure
# however is implicit as the archive contains a flat list.
#
# This creates opportunities for conflicts. For example, imagine the following
# structure:
#
# * `something/` - specifies an empty directory with the name "something"
# * `something` - specifies a file, creates a conflict
#
# This can be prevented with filename uniqueness checks. It does get funkier however
# as the rabbit hole goes down:
#
# * `dir/subdir/another_subdir/yet_another_subdir/file.bin` - declares a file and directories
# * `dir/subdir/another_subdir/yet_another_subdir` - declares a file at one of the levels, creates a conflict
#
# The results of this ZIP structure aren't very easy to predict as they depend on the
# application that opens the archive. For example, BOMArchiveHelper on macOS will expand files
# as they are declared in the ZIP, but once a conflict occurs it will fail with "error -21". It
# is not very transparent to the user why unarchiving fails, and it has to - and can reliably - only
# be prevented when the archive gets created.
#
# Unfortunately that conflicts with another "magical" feature of ZipKit which automatically
# "fixes" duplicate filenames - filenames (paths) which have already been added to the archive.
# This fix is performed by appending (1), then (2) and so forth to the filename so that the
# conflict is avoided. This is not possible to apply to directories, because when one of the
# path components is reused in multiple filenames it means those entities should end up in
# the same directory (subdirectory) once the archive is opened.
#
# The `PathSet` keeps track of entries as they get added using 2 Sets (cheap presence checks),
# one for directories and one for files. It will raise a `Conflict` exception if there are
# files clobbering one another, or in case files collide with directories.
class ZipKit::PathSet
  class Conflict < StandardError
  end

  class FileClobbersDirectory < Conflict
  end

  class DirectoryClobbersFile < Conflict
  end

  def initialize
    @known_directories = Set.new
    @known_files = Set.new
  end

  # Adds a directory path to the set of known paths, including
  # all the directories that contain it. So, calling
  #     add_directory_path("dir/dir2/dir3")
  # will add "dir", "dir/dir2", "dir/dir2/dir3".
  #
  # @param path[String] the path to the directory to add
  # @return [void]
  def add_directory_path(path)
    path_and_ancestors(path).each do |parent_directory_path|
      if @known_files.include?(parent_directory_path)
        # Have to use the old-fashioned heredocs because ZipKit
        # aims to be compatible with MRI 2.1+ syntax, and squiggly
        # heredoc is only available starting 2.3+
        error_message = <<~ERR
          The path #{parent_directory_path.inspect} which has to be added
          as a directory is already used for a file.

          The directory at this path would get created implicitly
          to produce #{path.inspect} during decompression.

          This would make some archive utilities refuse to open
          the ZIP.
        ERR
        raise DirectoryClobbersFile, error_message
      end
      @known_directories << parent_directory_path
    end
  end

  # Adds a file path to the set of known paths, including
  # all the directories that contain it. Once a file has been added,
  # it is no longer possible to add a directory having the same path
  # as this would cause a conflict.
  #
  # The operation also adds all the containing directories for the file, so
  #     add_file_path("dir/dir2/file.doc")
  # will add "dir" and "dir/dir2" as directories, and "dir/dir2/file.doc" as a file.
  #
  # @param file_path[String] the path to the file to add
  # @return [void]
  def add_file_path(file_path)
    if @known_files.include?(file_path)
      error_message = <<~ERR
        The file at #{file_path.inspect} has already been included
        in the archive. Adding it the second time would cause
        the first file to be overwritten during unarchiving, and
        could also get the archive flagged as invalid.
      ERR
      raise Conflict, error_message
    end

    if @known_directories.include?(file_path)
      error_message = <<~ERR
        The path #{file_path.inspect} is already used for
        a directory, but you are trying to add it as a file.

        This would make some archive utilities refuse
        to open the ZIP.
      ERR
      raise FileClobbersDirectory, error_message
    end

    # Add all the directories which this file is contained in
    *dir_components, _file_name = non_empty_path_components(file_path)
    add_directory_path(dir_components.join("/"))

    # ...and then the file itself
    @known_files << file_path
  end

  # Tells whether a specific full path is already known to the PathSet.
  # Can be a path for a directory or for a file.
  #
  # @param path_in_archive[String] the path to check for inclusion
  # @return [Boolean]
  def include?(path_in_archive)
    @known_files.include?(path_in_archive) || @known_directories.include?(path_in_archive)
  end

  # Clears the contained sets
  # @return [void]
  def clear
    @known_files.clear
    @known_directories.clear
  end

  # Adds the directory or file path to the path set
  #
  # @return [void]
  def add_directory_or_file_path(path_in_archive)
    if path_in_archive.end_with?("/")
      add_directory_path(path_in_archive)
    else
      add_file_path(path_in_archive)
    end
  end

  private

  def non_empty_path_components(path)
    path.split("/").reject(&:empty?)
  end

  def path_and_ancestors(path)
    path_components = non_empty_path_components(path)
    path_components.each_with_object([]) do |component, seen|
      seen << [seen.last, component].compact.join("/")
    end
  end
end
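
A short sketch of the behaviour the comments describe, using hypothetical paths:

    set = ZipKit::PathSet.new
    set.add_file_path("docs/report.pdf") # also registers "docs" as a directory
    set.include?("docs")                 # => true
    set.include?("docs/report.pdf")      # => true

    # Either of these would raise a PathSet::Conflict subclass:
    #   set.add_directory_path("docs/report.pdf")  # DirectoryClobbersFile
    #   set.add_file_path("docs")                  # FileClobbersDirectory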
data/lib/zip_kit/rack_chunked_body.rb
@@ -0,0 +1,32 @@
# frozen_string_literal: true

# A body wrapper that emits chunked responses, creating a valid
# `Transfer-Encoding: chunked` HTTP response body. This is copied from Rack::Chunked::Body,
# because Rack is not going to include that class after version 3.x.
# Rails has a substitute class for this inside ActionController::Streaming,
# but that module is a private constant in the Rails codebase, and is thus
# considered "private" from the Rails standpoint. It is not that much code to
# carry, so we copy it into our code.
class ZipKit::RackChunkedBody
  TERM = "\r\n"
  TAIL = "0#{TERM}"

  # @param body[#each] the enumerable that yields bytes, usually an `OutputEnumerator`
  def initialize(body)
    @body = body
  end

  # For each string yielded by the response body, yield
  # the element in chunked encoding - and finish off with a terminator
  def each
    term = TERM
    @body.each do |chunk|
      size = chunk.bytesize
      next if size == 0

      yield [size.to_s(16), term, chunk.b, term].join
    end
    yield TAIL
    yield term
  end
end
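
To make the framing concrete, here is a sketch with two hypothetical chunks (a plain Array works as the body because it responds to `#each`). Each chunk comes out as "<hex size>\r\n<bytes>\r\n", followed by the "0\r\n\r\n" terminator:

    body = ZipKit::RackChunkedBody.new(["Hello", ", world"])
    body.each { |part| p part }
    # "5\r\nHello\r\n"
    # "7\r\n, world\r\n"
    # "0\r\n"
    # "\r\n"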
data/lib/zip_kit/rack_tempfile_body.rb
@@ -0,0 +1,61 @@
# frozen_string_literal: true

# Contains a file handle which can be closed once the response finishes sending.
# It supports `to_path` so that `Rack::Sendfile` can intercept it
class ZipKit::RackTempfileBody
  TEMPFILE_NAME_PREFIX = "zip-tricks-tf-body-"
  attr_reader :tempfile

  # @param body[#each] the enumerable that yields bytes, usually an `OutputEnumerator`.
  #   The `body` will be read in full immediately and closed.
  def initialize(env, body)
    @tempfile = Tempfile.new(TEMPFILE_NAME_PREFIX)
    # Rack::TempfileReaper calls close! on tempfiles which get buffered
    # We will assume that it works fine with Rack::Sendfile (i.e. the path
    # to the file getting served gets used before we unlink the tempfile)
    env["rack.tempfiles"] ||= []
    env["rack.tempfiles"] << @tempfile

    @tempfile.binmode
    @body = body
    @did_flush = false
  end

  # Returns the size of the contained `Tempfile` so that a correct
  # Content-Length header can be set
  #
  # @return [Integer]
  def size
    flush
    @tempfile.size
  end

  # Returns the path to the `Tempfile`, so that Rack::Sendfile can send this response
  # using the downstream webserver
  #
  # @return [String]
  def to_path
    flush
    @tempfile.to_path
  end

  # Stream the file's contents if `Rack::Sendfile` isn't present.
  #
  # @return [void]
  def each
    flush
    while (chunk = @tempfile.read(16384))
      yield chunk
    end
  end

  private

  def flush
    if !@did_flush
      @body.each { |bytes| @tempfile << bytes }
      @did_flush = true
    end
    @tempfile.rewind
  end
end
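
A small sketch of how the buffering body behaves, assuming a hypothetical Rack env hash and a tiny enumerable body (the real caller is `OutputEnumerator#to_headers_and_rack_response_body`):

    require "tempfile"

    env = {} # the real Rack env; "rack.tempfiles" gets added to it for cleanup
    body = ZipKit::RackTempfileBody.new(env, ["buffered ", "output"])
    body.size    # => 15, usable for the Content-Length header
    body.to_path # => path to the Tempfile, usable by Rack::Sendfile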
data/lib/zip_kit/rails_streaming.rb
@@ -0,0 +1,37 @@
# frozen_string_literal: true

# Should be included into a Rails controller for easy ZIP output from any action.
module ZipKit::RailsStreaming
  # Opens a {ZipKit::Streamer} and yields it to the caller. The output of the streamer
  # gets automatically forwarded to the Rails response stream. When the output completes,
  # the Rails response stream is going to be closed automatically.
  # @param filename[String] name of the file for the Content-Disposition header
  # @param type[String] the content type (MIME type) of the archive being output
  # @param zip_streamer_options[Hash] options that will be passed to the Streamer.
  #     See {ZipKit::Streamer#initialize} for the full list of options.
  # @yield [Streamer] the streamer that can be written to
  # @return [ZipKit::OutputEnumerator] The output enumerator assigned to the response body
  def zip_kit_stream(filename: "download.zip", type: "application/zip", **zip_streamer_options, &zip_streaming_blk)
    # The output enumerator yields chunks of bytes generated from ZipKit. Instantiating it
    # first will also validate the Streamer options.
    chunk_yielder = ZipKit::OutputEnumerator.new(**zip_streamer_options, &zip_streaming_blk)

    # We want some common headers for file sending. Rails will also set
    # self.sending_file = true for us when we call send_file_headers!
    send_file_headers!(type: type, filename: filename)

    # Check for the proxy configuration first. This is the first common misconfiguration which destroys streaming -
    # since HTTP 1.0 does not support chunked responses we need to revert to buffering. The issue though is that
    # this reversion happens silently and it is usually not clear at all why streaming does not work. So let's at
    # the very least print it to the Rails log.
    if request.get_header("HTTP_VERSION") == "HTTP/1.0"
      logger&.warn { "The downstream HTTP proxy/LB insists on HTTP/1.0 protocol, ZIP response will be buffered." }
    end

    headers, rack_body = chunk_yielder.to_headers_and_rack_response_body(request.env)

    # Set the "particular" streaming headers
    response.headers.merge!(headers)
    self.response_body = rack_body
  end
end
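
A sketch of a hypothetical Rails controller using the module. The controller, action, entry name and its contents are made up for the example; `write_file` comes from the Streamer API referenced elsewhere in this release.

    class ReportsController < ApplicationController
      include ZipKit::RailsStreaming

      def download
        # Each write to the yielded streamer is forwarded to the response
        zip_kit_stream(filename: "reports.zip") do |zip|
          zip.write_file("summary.csv") { |sink| sink << "id,total\n1,100\n" }
        end
      end
    end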
data/lib/zip_kit/remote_io.rb
@@ -0,0 +1,114 @@
# frozen_string_literal: true

# An object that fakes just-enough of an IO to be dangerous
# - or, more precisely, to be useful as a source for the FileReader
# central directory parser. Effectively we substitute an IO object
# for an object that fetches parts of the remote file over HTTP using `Range:`
# headers. The `RemoteIO` acts as an adapter between an object that performs the
# actual fetches over HTTP and an object that expects a handful of IO methods to be
# available.
class ZipKit::RemoteIO
  # @param url[String, URI] the HTTP/HTTPS URL of the object to be retrieved
  def initialize(url)
    @pos = 0
    @uri = URI(url)
    @remote_size = nil
  end

  # Emulates IO#seek
  # @param offset[Integer] absolute offset in the remote resource to seek to
  # @param mode[Integer] The seek mode (only SEEK_SET is supported)
  def seek(offset, mode = IO::SEEK_SET)
    raise "Unsupported read mode #{mode}" unless mode == IO::SEEK_SET
    @remote_size ||= request_object_size
    @pos = clamp(0, offset, @remote_size)
    0 # always return 0!
  end

  # Emulates IO#size.
  #
  # @return [Integer] the size of the remote resource
  def size
    @remote_size ||= request_object_size
  end

  # Emulates IO#read, but requires the number of bytes to read.
  # The read will be limited to the
  # size of the remote resource relative to the current offset in the IO,
  # so if you are at offset 0 in the IO of size 10, doing a `read(20)`
  # will only return you 10 bytes of result, and not raise any exceptions.
  #
  # @param n_bytes[Fixnum, nil] how many bytes to read, or `nil` to read all the way to the end
  # @return [String] the read bytes
  def read(n_bytes = nil)
    # If the resource is empty there is nothing to read
    return if size.zero?

    maximum_available = size - @pos
    n_bytes ||= maximum_available # nil == read to the end of file
    return "" if n_bytes.zero?
    raise ArgumentError, "No negative reads(#{n_bytes})" if n_bytes < 0

    n_bytes = clamp(0, n_bytes, maximum_available)

    http_range = (@pos..(@pos + n_bytes - 1))
    request_range(http_range).tap do |data|
      raise "Remote read returned #{data.bytesize} bytes instead of #{n_bytes} as requested" if data.bytesize != n_bytes
      @pos = clamp(0, @pos + data.bytesize, size)
    end
  end

  # Returns the current pointer position within the IO
  #
  # @return [Fixnum]
  def tell
    @pos
  end

  protected

  # Only used internally when reading the remote ZIP.
  #
  # @param range[Range] the HTTP range of data to fetch from remote
  # @return [String] the response body of the ranged request
  def request_range(range)
    http = Net::HTTP.start(@uri.hostname, @uri.port)
    request = Net::HTTP::Get.new(@uri)
    request.range = range
    response = http.request(request)
    case response.code
    when "206", "200"
      response.body
    else
      raise "Remote at #{@uri} replied with code #{response.code}"
    end
  end

  # For working with S3 it is a better idea to perform a GET request for one byte, since doing a HEAD
  # request needs a different permission - and standard GET presigned URLs are not allowed to perform it
  #
  # @return [Integer] the size of the remote resource, parsed either from Content-Length or Content-Range header
  def request_object_size
    http = Net::HTTP.start(@uri.hostname, @uri.port)
    request = Net::HTTP::Get.new(@uri)
    request.range = 0..0
    response = http.request(request)
    case response.code
    when "206"
      content_range_header_value = response["Content-Range"]
      content_range_header_value.split("/").last.to_i
    when "200"
      response["Content-Length"].to_i
    else
      raise "Remote at #{@uri} replied with code #{response.code}"
    end
  end

  private

  def clamp(a, b, c)
    return a if b < a
    return c if b > c
    b
  end
end
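
A sketch of the adapter in isolation, assuming a hypothetical URL and that `net/http` and `uri` are loaded. Only the byte ranges actually read get downloaded, via `Range:` requests:

    io = ZipKit::RemoteIO.new("https://example.com/large.zip")
    io.size              # one-byte ranged GET, size parsed from Content-Range
    io.seek(io.size - 4) # no HTTP request beyond the size lookup
    io.read(4)           # fetches just those 4 bytes over HTTP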
data/lib/zip_kit/remote_uncap.rb
@@ -0,0 +1,22 @@
# frozen_string_literal: true

# Allows reading the central directory of a remote ZIP file without
# downloading the entire file. The central directory provides the
# offsets at which the actual file contents are located. You can then
# use the `Range:` HTTP headers to download those entries separately.
#
# Please read the security warning in `FileReader` _VERY CAREFULLY_
# before you use this module.
module ZipKit::RemoteUncap
  # @param uri[String] the HTTP(S) URL to read the ZIP footer from
  # @param reader_class[Class] which class to use for reading
  # @param options_for_zip_reader[Hash] any additional options to give to
  #     {ZipKit::FileReader} when reading
  # @return [Array<ZipKit::FileReader::ZipEntry>] metadata about the
  #     files within the remote archive
  def self.files_within_zip_at(uri, reader_class: ZipKit::FileReader, **options_for_zip_reader)
    fake_io = ZipKit::RemoteIO.new(uri)
    reader = reader_class.new
    reader.read_zip_structure(io: fake_io, **options_for_zip_reader)
  end
end
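
A sketch of listing a remote archive, assuming a hypothetical URL. The entry attributes referenced here (`filename`, `uncompressed_size`) are defined on `FileReader::ZipEntry`, which is not shown in this excerpt:

    entries = ZipKit::RemoteUncap.files_within_zip_at("https://example.com/batch.zip")
    entries.each do |entry|
      puts [entry.filename, entry.uncompressed_size].join("\t")
    end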
data/lib/zip_kit/size_estimator.rb
@@ -0,0 +1,84 @@
# frozen_string_literal: true

# Helps to estimate archive sizes
class ZipKit::SizeEstimator
  require_relative "streamer"

  # Creates a new estimator with a Streamer object. Normally you should use
  # `estimate` instead and not use this method directly.
  def initialize(streamer)
    @streamer = streamer
  end
  private :initialize

  # Performs the estimate using fake archiving. It needs to know the sizes of the
  # entries upfront. Usage:
  #
  #     expected_zip_size = SizeEstimator.estimate do |estimator|
  #       estimator.add_stored_entry(filename: "file.doc", size: 898291)
  #       estimator.add_deflated_entry(filename: "family.tif",
  #         uncompressed_size: 89281911, compressed_size: 121908)
  #     end
  #
  # @param kwargs_for_streamer_new Any options to pass to Streamer, see {Streamer#initialize}
  # @return [Integer] the size of the resulting archive, in bytes
  # @yield [SizeEstimator] the estimator
  def self.estimate(**kwargs_for_streamer_new)
    streamer = ZipKit::Streamer.new(ZipKit::NullWriter, **kwargs_for_streamer_new)
    estimator = new(streamer)
    yield(estimator)
    streamer.close # Returns the .tell of the contained IO
  end

  # Add a fake entry to the archive, to see how big it is going to be in the end.
  #
  # @param filename [String] the name of the file (filenames are variable-width in the ZIP)
  # @param size [Fixnum] size of the uncompressed entry
  # @param use_data_descriptor[Boolean] whether the entry uses a postfix
  #     data descriptor to specify size
  # @return self
  def add_stored_entry(filename:, size:, use_data_descriptor: false)
    @streamer.add_stored_entry(filename: filename,
      crc32: 0,
      size: size,
      use_data_descriptor: use_data_descriptor)
    @streamer.simulate_write(size)
    if use_data_descriptor
      @streamer.update_last_entry_and_write_data_descriptor(crc32: 0, compressed_size: size, uncompressed_size: size)
    end
    self
  end

  # Add a fake entry to the archive, to see how big it is going to be in the end.
  #
  # @param filename [String] the name of the file (filenames are variable-width in the ZIP)
  # @param uncompressed_size [Fixnum] size of the uncompressed entry
  # @param compressed_size [Fixnum] size of the compressed entry
  # @param use_data_descriptor[Boolean] whether the entry uses a postfix data
  #     descriptor to specify size
  # @return self
  def add_deflated_entry(filename:, uncompressed_size:, compressed_size:, use_data_descriptor: false)
    @streamer.add_deflated_entry(filename: filename,
      crc32: 0,
      compressed_size: compressed_size,
      uncompressed_size: uncompressed_size,
      use_data_descriptor: use_data_descriptor)

    @streamer.simulate_write(compressed_size)
    if use_data_descriptor
      @streamer.update_last_entry_and_write_data_descriptor(crc32: 0,
        compressed_size: compressed_size,
        uncompressed_size: uncompressed_size)
    end
    self
  end

  # Add an empty directory to the archive.
  #
  # @param dirname [String] the name of the directory
  # @return self
  def add_empty_directory_entry(dirname:)
    @streamer.add_empty_directory(dirname: dirname)
    self
  end
end
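
A sketch of how the estimate can be put to use, assuming the entry sizes are known upfront and using made-up filenames:

    expected_size = ZipKit::SizeEstimator.estimate do |estimator|
      estimator.add_stored_entry(filename: "video.mp4", size: 458_912_387)
      estimator.add_empty_directory_entry(dirname: "thumbnails")
    end

    # The estimate can then be passed as `content_length:` to
    # OutputEnumerator#to_headers_and_rack_response_body so that the response
    # gets a Content-Length instead of chunked transfer encoding.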