zip_kit 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. checksums.yaml +7 -0
  2. data/.codeclimate.yml +7 -0
  3. data/.document +5 -0
  4. data/.github/workflows/ci.yml +29 -0
  5. data/.gitignore +61 -0
  6. data/.rspec +1 -0
  7. data/.standard.yml +8 -0
  8. data/.yardopts +1 -0
  9. data/CHANGELOG.md +255 -0
  10. data/CODE_OF_CONDUCT.md +46 -0
  11. data/CONTRIBUTING.md +153 -0
  12. data/Gemfile +4 -0
  13. data/IMPLEMENTATION_DETAILS.md +97 -0
  14. data/LICENSE.txt +20 -0
  15. data/README.md +234 -0
  16. data/Rakefile +21 -0
  17. data/bench/buffered_crc32_bench.rb +109 -0
  18. data/examples/archive_size_estimate.rb +15 -0
  19. data/examples/config.ru +7 -0
  20. data/examples/deferred_write.rb +58 -0
  21. data/examples/parallel_compression_with_block_deflate.rb +86 -0
  22. data/examples/rack_application.rb +63 -0
  23. data/examples/s3_upload.rb +23 -0
  24. data/lib/zip_kit/block_deflate.rb +130 -0
  25. data/lib/zip_kit/block_write.rb +47 -0
  26. data/lib/zip_kit/file_reader/inflating_reader.rb +36 -0
  27. data/lib/zip_kit/file_reader/stored_reader.rb +35 -0
  28. data/lib/zip_kit/file_reader.rb +740 -0
  29. data/lib/zip_kit/null_writer.rb +12 -0
  30. data/lib/zip_kit/output_enumerator.rb +150 -0
  31. data/lib/zip_kit/path_set.rb +163 -0
  32. data/lib/zip_kit/rack_chunked_body.rb +32 -0
  33. data/lib/zip_kit/rack_tempfile_body.rb +61 -0
  34. data/lib/zip_kit/rails_streaming.rb +37 -0
  35. data/lib/zip_kit/remote_io.rb +114 -0
  36. data/lib/zip_kit/remote_uncap.rb +22 -0
  37. data/lib/zip_kit/size_estimator.rb +84 -0
  38. data/lib/zip_kit/stream_crc32.rb +60 -0
  39. data/lib/zip_kit/streamer/deflated_writer.rb +45 -0
  40. data/lib/zip_kit/streamer/entry.rb +37 -0
  41. data/lib/zip_kit/streamer/filler.rb +9 -0
  42. data/lib/zip_kit/streamer/heuristic.rb +68 -0
  43. data/lib/zip_kit/streamer/stored_writer.rb +39 -0
  44. data/lib/zip_kit/streamer/writable.rb +36 -0
  45. data/lib/zip_kit/streamer.rb +614 -0
  46. data/lib/zip_kit/uniquify_filename.rb +39 -0
  47. data/lib/zip_kit/version.rb +5 -0
  48. data/lib/zip_kit/write_and_tell.rb +40 -0
  49. data/lib/zip_kit/write_buffer.rb +71 -0
  50. data/lib/zip_kit/write_shovel.rb +22 -0
  51. data/lib/zip_kit/zip_writer.rb +436 -0
  52. data/lib/zip_kit.rb +24 -0
  53. data/zip_kit.gemspec +41 -0
  54. metadata +335 -0
data/lib/zip_kit/null_writer.rb
@@ -0,0 +1,12 @@
+ # frozen_string_literal: true
+
+ # Used when you need to supply a destination IO for some
+ # write operations, but want to discard the data (like when
+ # estimating the size of a ZIP)
+ module ZipKit::NullWriter
+   # @param _[String] the data to write
+   # @return [self]
+   def self.<<(_)
+     self
+   end
+ end
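
A minimal sketch of how `ZipKit::NullWriter` behaves (illustration only, not part of the packaged files): every write is accepted and discarded, and `<<` returns `self`, so writes can be chained.

    require "zip_kit"

    sink = ZipKit::NullWriter
    sink << "PK\x03\x04" << "more bytes" # accepted and discarded, `<<` chains
    # The SizeEstimator further down uses this sink to "write" an archive
    # while only counting the bytes.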
data/lib/zip_kit/output_enumerator.rb
@@ -0,0 +1,150 @@
+ # frozen_string_literal: true
+
+ require "time" # for .httpdate
+
+ # The output enumerator makes it possible to "pull" from a ZipKit streamer
+ # object instead of having it "push" writes to you. It will "stash" the block which
+ # writes the ZIP archive through the streamer, and when you call `each` on the Enumerator
+ # it will yield you the bytes the block writes. Since it is an enumerator you can
+ # use `next` to take chunks written by the ZipKit streamer one by one. It can be very
+ # convenient when you need to segment your ZIP output into bigger chunks for, say,
+ # uploading them to a cloud storage provider such as S3.
+ #
+ # Another use of the `OutputEnumerator` is as a Rack response body - since a Rack
+ # response body object must support `#each` yielding successive binary strings,
+ # which is exactly what `OutputEnumerator` does.
+ #
+ # The enumerator can provide you some more conveniences for HTTP output - correct streaming
+ # headers and a body with chunked transfer encoding.
+ #
+ #     iterable_zip_body = ZipKit::OutputEnumerator.new do |streamer|
+ #       streamer.write_file('big.csv') do |sink|
+ #         CSV(sink) do |csv_writer|
+ #           csv_writer << Person.column_names
+ #           Person.all.find_each do |person|
+ #             csv_writer << person.attributes.values
+ #           end
+ #         end
+ #       end
+ #     end
+ #
+ # Either as a `Transfer-Encoding: chunked` response (if your webserver supports it),
+ # which will give you true streaming capability:
+ #
+ #     headers, chunked_or_presized_rack_body = iterable_zip_body.to_headers_and_rack_response_body(env)
+ #     [200, headers, chunked_or_presized_rack_body]
+ #
+ # or it will wrap your output in a `RackTempfileBody` object which buffers the ZIP before output. Buffering has
+ # benefits if your webserver does not support anything beyond HTTP/1.0, and also engages automatically
+ # in unit tests (since rack-test and Rails tests do not do streaming HTTP/1.1).
+ class ZipKit::OutputEnumerator
+   DEFAULT_WRITE_BUFFER_SIZE = 64 * 1024
+
+   # Creates a new OutputEnumerator. The enumerator can be read from using `each`,
+   # and the creation of the ZIP is in lockstep with the caller calling `each` on the returned
+   # output enumerator object. This can be used when the calling program wants to stream the
+   # output of the ZIP archive and throttle that output, or split it into chunks, or use it
+   # as a generator.
+   #
+   # For example:
+   #
+   #     # The block given to the constructor won't be executed immediately - rather it
+   #     # will only start to execute when the caller starts to read from the output
+   #     # by calling `each`
+   #     body = ::ZipKit::OutputEnumerator.new(writer: CustomWriter) do |streamer|
+   #       streamer.add_stored_entry(filename: 'large.tif', size: 1289894, crc32: 198210)
+   #       streamer << large_file.read(1024*1024) until large_file.eof?
+   #       ...
+   #     end
+   #
+   #     body.each do |bin_string|
+   #       # Send the output somewhere, buffer it in a file etc.
+   #       # The block passed into `initialize` will only start executing once `#each`
+   #       # is called
+   #       ...
+   #     end
+   #
+   # @param streamer_options[Hash] options for Streamer, see {ZipKit::Streamer.new}
+   # @param write_buffer_size[Integer] By default all ZipKit writes are unbuffered. For output to sockets
+   #   it is beneficial to bulkify those writes so that they are roughly sized to a socket buffer chunk. This
+   #   object will bulkify writes for you in this way (so `each` will yield not on every call to `<<` from the Streamer
+   #   but at block size boundaries or greater). Set it to 0 for unbuffered writes.
+   # @param blk a block that will receive the Streamer object when executing. The block will not be executed
+   #   immediately but only once `each` is called on the OutputEnumerator
+   # @return [ZipKit::OutputEnumerator] the enumerator you can read bytestrings of the ZIP from by calling `each`
+   def initialize(write_buffer_size: DEFAULT_WRITE_BUFFER_SIZE, **streamer_options, &blk)
+     @streamer_options = streamer_options.to_h
+     @bufsize = write_buffer_size.to_i
+     @archiving_block = blk
+   end
+
+   # Executes the block given to the constructor with a {ZipKit::Streamer}
+   # and passes each written chunk to the block given to the method. This allows one
+   # to "take" output of the ZIP piecewise. If called without a block will return an Enumerator
+   # that you can pull data from using `next`.
+   #
+   # **NOTE** Because the `WriteBuffer` inside this object can reuse the buffer, it is important
+   # that the `String` that is yielded **either** gets consumed eagerly (written somewhere byte-by-byte)
+   # **or** gets `#dup`-ed, since the write buffer will clear it after your block returns. If you expand this Enumerator
+   # eagerly into an Array you might notice that a lot of the segments of your ZIP output are
+   # empty - this means that you need to duplicate them.
+   #
+   # @yield [String] a chunk of the ZIP output in binary encoding
+   def each
+     if block_given?
+       block_write = ZipKit::BlockWrite.new { |chunk| yield(chunk) }
+       buffer = ZipKit::WriteBuffer.new(block_write, @bufsize)
+       ZipKit::Streamer.open(buffer, **@streamer_options, &@archiving_block)
+       buffer.flush
+     else
+       enum_for(:each)
+     end
+   end
+
+   # Returns a tuple of `headers, body` - headers are a `Hash` and the body is
+   # an object that can be used as a Rack response body. The method will automatically
+   # switch the wrapping of the output depending on whether the response can be pre-sized,
+   # and whether your downstream webserver (like nginx) is configured to support
+   # the HTTP/1.1 protocol version.
+   #
+   # @param rack_env[Hash] the Rack env, which the method may need to mutate (adding a Tempfile for cleanup)
+   # @param content_length[Integer] the number of bytes that the archive will contain. If given, no Chunked encoding gets applied.
+   # @return [Array]
+   def to_headers_and_rack_response_body(rack_env, content_length: nil)
+     headers = {
+       # We need to ensure Rack::ETag does not suddenly start buffering us, see
+       # https://github.com/rack/rack/issues/1619#issuecomment-606315714
+       # Set this even when not streaming for consistency. The fact that there would be
+       # a weak ETag generated would mean that the middleware buffers, so we have tests for that.
+       "Last-Modified" => Time.now.httpdate,
+       # Make sure Rack::Deflater does not touch our response body either, see
+       # https://github.com/felixbuenemann/xlsxtream/issues/14#issuecomment-529569548
+       "Content-Encoding" => "identity",
+       # Disable buffering for both nginx and Google Load Balancer, see
+       # https://cloud.google.com/appengine/docs/flexible/how-requests-are-handled?tab=python#x-accel-buffering
+       "X-Accel-Buffering" => "no"
+     }
+
+     if content_length
+       # If we know the size of the body, transfer encoding is not required at all - so the enumerator itself
+       # can function as the Rack body. This also would apply in HTTP/2 contexts where chunked encoding would
+       # no longer be required - then the enumerator could get returned "bare".
+       body = self
+       headers["Content-Length"] = content_length.to_i.to_s
+     elsif rack_env["HTTP_VERSION"] == "HTTP/1.0"
+       # An HTTP/1.0 downstream proxy is a common misconfiguration which destroys streaming -
+       # since HTTP/1.0 does not support chunked responses we need to revert to buffering.
+       # This reversion happens silently, which is why {RailsStreaming} warns about it in the Rails log.
+       body = ZipKit::RackTempfileBody.new(rack_env, self)
+       headers["Content-Length"] = body.size.to_s
+     else
+       body = ZipKit::RackChunkedBody.new(self)
+       headers["Transfer-Encoding"] = "chunked"
+     end
+
+     [headers, body]
+   end
+ end
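
To see how the pieces above fit together, here is a minimal Rack application sketch (illustration only, not part of the packaged files); `write_file` is the heuristic writer on `ZipKit::Streamer` listed in this release:

    # config.ru - a minimal sketch
    require "zip_kit"

    run ->(env) {
      body = ZipKit::OutputEnumerator.new do |streamer|
        streamer.write_file("hello.txt") { |sink| sink << "Hello from a streamed ZIP" }
      end
      # Chunked body on HTTP/1.1, buffered tempfile body on HTTP/1.0,
      # bare enumerator if a content_length: is given
      headers, rack_body = body.to_headers_and_rack_response_body(env)
      [200, headers, rack_body]
    }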
data/lib/zip_kit/path_set.rb
@@ -0,0 +1,163 @@
+ # frozen_string_literal: true
+
+ # A ZIP archive contains a flat list of entries. These entries can implicitly
+ # create directories when the archive is expanded. For example, an entry with
+ # the filename of "some folder/file.docx" will make the unarchiving application
+ # create a directory called "some folder" automatically, and then deposit the
+ # file "file.docx" in that directory. These "implicit" directories can be
+ # arbitrarily nested, and create a tree structure of directories. That structure
+ # however is implicit as the archive contains a flat list.
+ #
+ # This creates opportunities for conflicts. For example, imagine the following
+ # structure:
+ #
+ # * `something/` - specifies an empty directory with the name "something"
+ # * `something` - specifies a file, creates a conflict
+ #
+ # This can be prevented with filename uniqueness checks. It does get funkier however
+ # as the rabbit hole goes down:
+ #
+ # * `dir/subdir/another_subdir/yet_another_subdir/file.bin` - declares a file and directories
+ # * `dir/subdir/another_subdir/yet_another_subdir` - declares a file at one of the levels, creates a conflict
+ #
+ # The results of this ZIP structure aren't very easy to predict as they depend on the
+ # application that opens the archive. For example, BOMArchiveHelper on macOS will expand files
+ # as they are declared in the ZIP, but once a conflict occurs it will fail with "error -21". It
+ # is not very transparent to the user why unarchiving fails, and it can only be reliably
+ # prevented when the archive gets created.
+ #
+ # Unfortunately that conflicts with another "magical" feature of ZipKit which automatically
+ # "fixes" duplicate filenames - filenames (paths) which have already been added to the archive.
+ # This fix is performed by appending (1), then (2) and so forth to the filename so that the
+ # conflict is avoided. This is not possible to apply to directories, because when one of the
+ # path components is reused in multiple filenames it means those entities should end up in
+ # the same directory (subdirectory) once the archive is opened.
+ #
+ # The `PathSet` keeps track of entries as they get added using 2 Sets (cheap presence checks),
+ # one for directories and one for files. It will raise a `Conflict` exception if there are
+ # files clobbering one another, or in case files collide with directories.
+ class ZipKit::PathSet
+   class Conflict < StandardError
+   end
+
+   class FileClobbersDirectory < Conflict
+   end
+
+   class DirectoryClobbersFile < Conflict
+   end
+
+   def initialize
+     @known_directories = Set.new
+     @known_files = Set.new
+   end
+
+   # Adds a directory path to the set of known paths, including
+   # all the directories that contain it. So, calling
+   #     add_directory_path("dir/dir2/dir3")
+   # will add "dir", "dir/dir2", "dir/dir2/dir3".
+   #
+   # @param path[String] the path to the directory to add
+   # @return [void]
+   def add_directory_path(path)
+     path_and_ancestors(path).each do |parent_directory_path|
+       if @known_files.include?(parent_directory_path)
+         error_message = <<~ERR
+           The path #{parent_directory_path.inspect} which has to be added
+           as a directory is already used for a file.
+
+           The directory at this path would get created implicitly
+           to produce #{path.inspect} during decompression.
+
+           This would make some archive utilities refuse to open
+           the ZIP.
+         ERR
+         raise DirectoryClobbersFile, error_message
+       end
+       @known_directories << parent_directory_path
+     end
+   end
+
+   # Adds a file path to the set of known paths, including
+   # all the directories that contain it. Once a file has been added,
+   # it is no longer possible to add a directory having the same path
+   # as this would cause a conflict.
+   #
+   # The operation also adds all the containing directories for the file, so
+   #     add_file_path("dir/dir2/file.doc")
+   # will add "dir" and "dir/dir2" as directories.
+   #
+   # @param file_path[String] the path to the file to add
+   # @return [void]
+   def add_file_path(file_path)
+     if @known_files.include?(file_path)
+       error_message = <<~ERR
+         The file at #{file_path.inspect} has already been included
+         in the archive. Adding it the second time would cause
+         the first file to be overwritten during unarchiving, and
+         could also get the archive flagged as invalid.
+       ERR
+       raise Conflict, error_message
+     end
+
+     if @known_directories.include?(file_path)
+       error_message = <<~ERR
+         The path #{file_path.inspect} is already used for
+         a directory, but you are trying to add it as a file.
+
+         This would make some archive utilities refuse
+         to open the ZIP.
+       ERR
+       raise FileClobbersDirectory, error_message
+     end
+
+     # Add all the directories which this file is contained in
+     *dir_components, _file_name = non_empty_path_components(file_path)
+     add_directory_path(dir_components.join("/"))
+
+     # ...and then the file itself
+     @known_files << file_path
+   end
+
+   # Tells whether a specific full path is already known to the PathSet.
+   # Can be a path for a directory or for a file.
+   #
+   # @param path_in_archive[String] the path to check for inclusion
+   # @return [Boolean]
+   def include?(path_in_archive)
+     @known_files.include?(path_in_archive) || @known_directories.include?(path_in_archive)
+   end
+
+   # Clears the contained sets
+   # @return [void]
+   def clear
+     @known_files.clear
+     @known_directories.clear
+   end
+
+   # Adds the directory or file path to the path set
+   #
+   # @return [void]
+   def add_directory_or_file_path(path_in_archive)
+     if path_in_archive.end_with?("/")
+       add_directory_path(path_in_archive)
+     else
+       add_file_path(path_in_archive)
+     end
+   end
+
+   private
+
+   def non_empty_path_components(path)
+     path.split("/").reject(&:empty?)
+   end
+
+   def path_and_ancestors(path)
+     path_components = non_empty_path_components(path)
+     path_components.each_with_object([]) do |component, seen|
+       seen << [seen.last, component].compact.join("/")
+     end
+   end
+ end
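
A short sketch of the conflict detection above (illustration only, not part of the packaged files):

    require "zip_kit"
    require "set"

    paths = ZipKit::PathSet.new
    paths.add_file_path("docs/report.pdf") # also registers "docs" as a directory
    paths.include?("docs")                 # => true

    begin
      paths.add_file_path("docs")          # "docs" is already a directory
    rescue ZipKit::PathSet::FileClobbersDirectory => e
      puts e.message
    end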
data/lib/zip_kit/rack_chunked_body.rb
@@ -0,0 +1,32 @@
+ # frozen_string_literal: true
+
+ # A body wrapper that emits chunked responses, creating a valid
+ # `Transfer-Encoding: chunked` HTTP response body. This is copied from Rack::Chunked::Body,
+ # because Rack is not going to include that class after version 3.x
+ # Rails has a substitute class for this inside ActionController::Streaming,
+ # but that module is a private constant in the Rails codebase, and is thus
+ # considered "private" from the Rails standpoint. It is not that much code to
+ # carry, so we copy it into our code.
+ class ZipKit::RackChunkedBody
+   TERM = "\r\n"
+   TAIL = "0#{TERM}"
+
+   # @param body[#each] the enumerable that yields bytes, usually an `OutputEnumerator`
+   def initialize(body)
+     @body = body
+   end
+
+   # For each string yielded by the response body, yield
+   # the element in chunked encoding - and finish off with a terminator
+   def each
+     term = TERM
+     @body.each do |chunk|
+       size = chunk.bytesize
+       next if size == 0
+
+       yield [size.to_s(16), term, chunk.b, term].join
+     end
+     yield TAIL
+     yield term
+   end
+ end
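
For reference, the wire format the body above produces - each chunk is its byte size in hexadecimal, CRLF, the bytes, CRLF, with a zero-sized chunk as terminator (illustration only, not part of the packaged files):

    require "zip_kit"

    body = ZipKit::RackChunkedBody.new(["abc", "", "de"])
    body.each { |part| p part }
    # "3\r\nabc\r\n"  (size in hex, CRLF, payload, CRLF)
    # "2\r\nde\r\n"   (the empty string is skipped)
    # "0\r\n"         (zero-size tail...)
    # "\r\n"          (...followed by the final CRLF)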
data/lib/zip_kit/rack_tempfile_body.rb
@@ -0,0 +1,61 @@
+ # frozen_string_literal: true
+
+ # Contains a file handle which can be closed once the response finishes sending.
+ # It supports `to_path` so that `Rack::Sendfile` can intercept it.
+ class ZipKit::RackTempfileBody
+   TEMPFILE_NAME_PREFIX = "zip-tricks-tf-body-"
+   attr_reader :tempfile
+
+   # @param body[#each] the enumerable that yields bytes, usually an `OutputEnumerator`.
+   #   The `body` will be read in full and buffered into the tempfile before
+   #   the first byte gets served.
+   def initialize(env, body)
+     @tempfile = Tempfile.new(TEMPFILE_NAME_PREFIX)
+     # Rack::TempfileReaper calls close! on tempfiles which get buffered
+     # We will assume that it works fine with Rack::Sendfile (i.e. the path
+     # to the file getting served gets used before we unlink the tempfile)
+     env["rack.tempfiles"] ||= []
+     env["rack.tempfiles"] << @tempfile
+
+     @tempfile.binmode
+     @body = body
+     @did_flush = false
+   end
+
+   # Returns the size of the contained `Tempfile` so that a correct
+   # Content-Length header can be set
+   #
+   # @return [Integer]
+   def size
+     flush
+     @tempfile.size
+   end
+
+   # Returns the path to the `Tempfile`, so that Rack::Sendfile can send this response
+   # using the downstream webserver
+   #
+   # @return [String]
+   def to_path
+     flush
+     @tempfile.to_path
+   end
+
+   # Streams the file's contents if `Rack::Sendfile` isn't present.
+   #
+   # @return [void]
+   def each
+     flush
+     while (chunk = @tempfile.read(16384))
+       yield chunk
+     end
+   end
+
+   private
+
+   def flush
+     if !@did_flush
+       @body.each { |bytes| @tempfile << bytes }
+       @did_flush = true
+     end
+     @tempfile.rewind
+   end
+ end
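
A sketch of the buffering behaviour (illustration only, not part of the packaged files; the plain Hash stands in for a real Rack env):

    require "zip_kit"
    require "tempfile"

    env = {} # stands in for the Rack env
    body = ZipKit::RackTempfileBody.new(env, ["zip ", "bytes"])
    body.size              # => 9 - buffers everything into the tempfile first
    env["rack.tempfiles"]  # => [Tempfile] - Rack::TempfileReaper will clean it up
    body.each { |chunk| print chunk } # prints "zip bytes"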
data/lib/zip_kit/rails_streaming.rb
@@ -0,0 +1,37 @@
+ # frozen_string_literal: true
+
+ # Should be included into a Rails controller for easy ZIP output from any action.
+ module ZipKit::RailsStreaming
+   # Opens a {ZipKit::Streamer} and yields it to the caller. The output of the streamer
+   # gets automatically forwarded to the Rails response stream. When the output completes,
+   # the Rails response stream is going to be closed automatically.
+   # @param filename[String] name of the file for the Content-Disposition header
+   # @param type[String] the content type (MIME type) of the archive being output
+   # @param zip_streamer_options[Hash] options that will be passed to the Streamer.
+   #   See {ZipKit::Streamer#initialize} for the full list of options.
+   # @yield [Streamer] the streamer that can be written to
+   # @return [Object] the body object assigned to the Rails response
+   def zip_kit_stream(filename: "download.zip", type: "application/zip", **zip_streamer_options, &zip_streaming_blk)
+     # The output enumerator yields chunks of bytes generated from ZipKit. Instantiating it
+     # first will also validate the Streamer options.
+     chunk_yielder = ZipKit::OutputEnumerator.new(**zip_streamer_options, &zip_streaming_blk)
+
+     # We want some common headers for file sending. Rails will also set
+     # self.sending_file = true for us when we call send_file_headers!
+     send_file_headers!(type: type, filename: filename)
+
+     # Check for the proxy configuration first. This is the first common misconfiguration which destroys streaming -
+     # since HTTP 1.0 does not support chunked responses we need to revert to buffering. The issue though is that
+     # this reversion happens silently and it is usually not clear at all why streaming does not work. So let's at
+     # the very least print it to the Rails log.
+     if request.get_header("HTTP_VERSION") == "HTTP/1.0"
+       logger&.warn { "The downstream HTTP proxy/LB insists on HTTP/1.0 protocol, ZIP response will be buffered." }
+     end
+
+     headers, rack_body = chunk_yielder.to_headers_and_rack_response_body(request.env)
+
+     # Set the "particular" streaming headers
+     response.headers.merge!(headers)
+     self.response_body = rack_body
+   end
+ end
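
A minimal controller sketch (illustration only, not part of the packaged files; the controller name and file contents are made up):

    class ReportsController < ApplicationController
      include ZipKit::RailsStreaming

      def download
        zip_kit_stream(filename: "reports.zip") do |zip|
          zip.write_file("summary.txt") { |sink| sink << "All good." }
        end
      end
    end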
data/lib/zip_kit/remote_io.rb
@@ -0,0 +1,114 @@
+ # frozen_string_literal: true
+
+ # An object that fakes just-enough of an IO to be dangerous
+ # - or, more precisely, to be useful as a source for the FileReader
+ # central directory parser. Effectively we substitute an IO object
+ # for an object that fetches parts of the remote file over HTTP using `Range:`
+ # headers. The `RemoteIO` acts as an adapter between an object that performs the
+ # actual fetches over HTTP and an object that expects a handful of IO methods to be
+ # available.
+ class ZipKit::RemoteIO
+   # @param url[String, URI] the HTTP/HTTPS URL of the object to be retrieved
+   def initialize(url)
+     @pos = 0
+     @uri = URI(url)
+     @remote_size = nil
+   end
+
+   # Emulates IO#seek
+   # @param offset[Integer] absolute offset in the remote resource to seek to
+   # @param mode[Integer] The seek mode (only SEEK_SET is supported)
+   def seek(offset, mode = IO::SEEK_SET)
+     raise "Unsupported seek mode #{mode}" unless mode == IO::SEEK_SET
+     @remote_size ||= request_object_size
+     @pos = clamp(0, offset, @remote_size)
+     0 # always return 0!
+   end
+
+   # Emulates IO#size.
+   #
+   # @return [Integer] the size of the remote resource
+   def size
+     @remote_size ||= request_object_size
+   end
+
+   # Emulates IO#read. The read will be limited to the
+   # size of the remote resource relative to the current offset in the IO,
+   # so if you are at offset 0 in the IO of size 10, doing a `read(20)`
+   # will only return you 10 bytes of result, and not raise any exceptions.
+   #
+   # @param n_bytes[Fixnum, nil] how many bytes to read, or `nil` to read all the way to the end
+   # @return [String] the read bytes
+   def read(n_bytes = nil)
+     # If the resource is empty there is nothing to read
+     return if size.zero?
+
+     maximum_available = size - @pos
+     n_bytes ||= maximum_available # nil == read to the end of file
+     return "" if n_bytes.zero?
+     raise ArgumentError, "No negative reads(#{n_bytes})" if n_bytes < 0
+
+     n_bytes = clamp(0, n_bytes, maximum_available)
+
+     http_range = (@pos..(@pos + n_bytes - 1))
+     request_range(http_range).tap do |data|
+       raise "Remote read returned #{data.bytesize} bytes instead of #{n_bytes} as requested" if data.bytesize != n_bytes
+       @pos = clamp(0, @pos + data.bytesize, size)
+     end
+   end
+
+   # Returns the current pointer position within the IO
+   #
+   # @return [Fixnum]
+   def tell
+     @pos
+   end
+
+   protected
+
+   # Only used internally when reading the remote ZIP.
+   #
+   # @param range[Range] the HTTP range of data to fetch from remote
+   # @return [String] the response body of the ranged request
+   def request_range(range)
+     http = Net::HTTP.start(@uri.hostname, @uri.port)
+     request = Net::HTTP::Get.new(@uri)
+     request.range = range
+     response = http.request(request)
+     case response.code
+     when "206", "200"
+       response.body
+     else
+       raise "Remote at #{@uri} replied with code #{response.code}"
+     end
+   end
+
+   # For working with S3 it is a better idea to perform a GET request for one byte, since doing a HEAD
+   # request needs a different permission - and standard GET presigned URLs are not allowed to perform it
+   #
+   # @return [Integer] the size of the remote resource, parsed either from Content-Length or Content-Range header
+   def request_object_size
+     http = Net::HTTP.start(@uri.hostname, @uri.port)
+     request = Net::HTTP::Get.new(@uri)
+     request.range = 0..0
+     response = http.request(request)
+     case response.code
+     when "206"
+       content_range_header_value = response["Content-Range"]
+       content_range_header_value.split("/").last.to_i
+     when "200"
+       response["Content-Length"].to_i
+     else
+       raise "Remote at #{@uri} replied with code #{response.code}"
+     end
+   end
+
+   private
+
+   def clamp(a, b, c)
+     return a if b < a
+     return c if b > c
+     b
+   end
+ end
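
A sketch of reading a byte range through the IO facade (illustration only, not part of the packaged files; the URL is hypothetical):

    require "zip_kit"
    require "net/http"

    io = ZipKit::RemoteIO.new("https://example.com/big-archive.zip")
    io.seek(io.size - 4) # seek close to the end of the remote file
    tail = io.read(4)    # fetched with a single ranged GET request
    io.tell              # => io.size - the position advanced by the read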
data/lib/zip_kit/remote_uncap.rb
@@ -0,0 +1,22 @@
+ # frozen_string_literal: true
+
+ # Allows reading the central directory of a remote ZIP file without
+ # downloading the entire file. The central directory provides the
+ # offsets at which the actual file contents are located. You can then
+ # use the `Range:` HTTP headers to download those entries separately.
+ #
+ # Please read the security warning in `FileReader` _VERY CAREFULLY_
+ # before you use this module.
+ module ZipKit::RemoteUncap
+   # @param uri[String] the HTTP(S) URL to read the ZIP footer from
+   # @param reader_class[Class] which class to use for reading
+   # @param options_for_zip_reader[Hash] any additional options to give to
+   #   {ZipKit::FileReader} when reading
+   # @return [Array<ZipKit::FileReader::ZipEntry>] metadata about the
+   #   files within the remote archive
+   def self.files_within_zip_at(uri, reader_class: ZipKit::FileReader, **options_for_zip_reader)
+     fake_io = ZipKit::RemoteIO.new(uri)
+     reader = reader_class.new
+     reader.read_zip_structure(io: fake_io, **options_for_zip_reader)
+   end
+ end
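
Usage sketch (illustration only, not part of the packaged files; the URL is hypothetical and the `filename`/`compressed_size` accessors are assumed from `FileReader::ZipEntry`):

    require "zip_kit"
    require "net/http"

    entries = ZipKit::RemoteUncap.files_within_zip_at("https://example.com/big-archive.zip")
    entries.each do |entry|
      # Only the central directory gets downloaded, not the entire archive
      puts format("%s (%d bytes compressed)", entry.filename, entry.compressed_size)
    end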
data/lib/zip_kit/size_estimator.rb
@@ -0,0 +1,84 @@
+ # frozen_string_literal: true
+
+ # Helps to estimate archive sizes
+ class ZipKit::SizeEstimator
+   require_relative "streamer"
+
+   # Creates a new estimator with a Streamer object. Normally you should use
+   # `estimate` instead of calling this method directly.
+   def initialize(streamer)
+     @streamer = streamer
+   end
+   private :initialize
+
+   # Performs the estimate using fake archiving. It needs to know the sizes of the
+   # entries upfront. Usage:
+   #
+   #     expected_zip_size = SizeEstimator.estimate do |estimator|
+   #       estimator.add_stored_entry(filename: "file.doc", size: 898291)
+   #       estimator.add_deflated_entry(filename: "family.tif",
+   #         uncompressed_size: 89281911, compressed_size: 121908)
+   #     end
+   #
+   # @param kwargs_for_streamer_new Any options to pass to Streamer, see {Streamer#initialize}
+   # @return [Integer] the size of the resulting archive, in bytes
+   # @yield [SizeEstimator] the estimator
+   def self.estimate(**kwargs_for_streamer_new)
+     streamer = ZipKit::Streamer.new(ZipKit::NullWriter, **kwargs_for_streamer_new)
+     estimator = new(streamer)
+     yield(estimator)
+     streamer.close # Returns the .tell of the contained IO
+   end
+
+   # Add a fake entry to the archive, to see how big it is going to be in the end.
+   #
+   # @param filename [String] the name of the file (filenames are variable-width in the ZIP)
+   # @param size [Fixnum] size of the uncompressed entry
+   # @param use_data_descriptor[Boolean] whether the entry uses a postfix
+   #   data descriptor to specify size
+   # @return self
+   def add_stored_entry(filename:, size:, use_data_descriptor: false)
+     @streamer.add_stored_entry(filename: filename,
+       crc32: 0,
+       size: size,
+       use_data_descriptor: use_data_descriptor)
+     @streamer.simulate_write(size)
+     if use_data_descriptor
+       @streamer.update_last_entry_and_write_data_descriptor(crc32: 0, compressed_size: size, uncompressed_size: size)
+     end
+     self
+   end
+
+   # Add a fake entry to the archive, to see how big it is going to be in the end.
+   #
+   # @param filename [String] the name of the file (filenames are variable-width in the ZIP)
+   # @param uncompressed_size [Fixnum] size of the uncompressed entry
+   # @param compressed_size [Fixnum] size of the compressed entry
+   # @param use_data_descriptor[Boolean] whether the entry uses a postfix data
+   #   descriptor to specify size
+   # @return self
+   def add_deflated_entry(filename:, uncompressed_size:, compressed_size:, use_data_descriptor: false)
+     @streamer.add_deflated_entry(filename: filename,
+       crc32: 0,
+       compressed_size: compressed_size,
+       uncompressed_size: uncompressed_size,
+       use_data_descriptor: use_data_descriptor)
+
+     @streamer.simulate_write(compressed_size)
+     if use_data_descriptor
+       @streamer.update_last_entry_and_write_data_descriptor(crc32: 0,
+         compressed_size: compressed_size,
+         uncompressed_size: uncompressed_size)
+     end
+     self
+   end
+
+   # Add an empty directory to the archive.
+   #
+   # @param dirname [String] the name of the directory
+   # @return self
+   def add_empty_directory_entry(dirname:)
+     @streamer.add_empty_directory(dirname: dirname)
+     self
+   end
+ end
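
Finally, a sketch that ties the estimator to the pre-sized streaming path of `OutputEnumerator` (illustration only, not part of the packaged files; the sizes and CRC32 value are made up):

    require "zip_kit"

    entry_size = 458_678_121
    zip_size = ZipKit::SizeEstimator.estimate do |estimator|
      estimator.add_stored_entry(filename: "video.mp4", size: entry_size)
    end

    enumerator = ZipKit::OutputEnumerator.new do |streamer|
      streamer.add_stored_entry(filename: "video.mp4", size: entry_size, crc32: 0x21751a6e)
      # ...write exactly entry_size bytes into the streamer here
    end
    env = {} # stands in for the Rack env
    # With content_length given, no chunked encoding gets applied and
    # the client sees a Content-Length upfront
    headers, body = enumerator.to_headers_and_rack_response_body(env, content_length: zip_size)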