zip_kit 6.0.0

Files changed (54)
  1. checksums.yaml +7 -0
  2. data/.codeclimate.yml +7 -0
  3. data/.document +5 -0
  4. data/.github/workflows/ci.yml +29 -0
  5. data/.gitignore +61 -0
  6. data/.rspec +1 -0
  7. data/.standard.yml +8 -0
  8. data/.yardopts +1 -0
  9. data/CHANGELOG.md +255 -0
  10. data/CODE_OF_CONDUCT.md +46 -0
  11. data/CONTRIBUTING.md +153 -0
  12. data/Gemfile +4 -0
  13. data/IMPLEMENTATION_DETAILS.md +97 -0
  14. data/LICENSE.txt +20 -0
  15. data/README.md +234 -0
  16. data/Rakefile +21 -0
  17. data/bench/buffered_crc32_bench.rb +109 -0
  18. data/examples/archive_size_estimate.rb +15 -0
  19. data/examples/config.ru +7 -0
  20. data/examples/deferred_write.rb +58 -0
  21. data/examples/parallel_compression_with_block_deflate.rb +86 -0
  22. data/examples/rack_application.rb +63 -0
  23. data/examples/s3_upload.rb +23 -0
  24. data/lib/zip_kit/block_deflate.rb +130 -0
  25. data/lib/zip_kit/block_write.rb +47 -0
  26. data/lib/zip_kit/file_reader/inflating_reader.rb +36 -0
  27. data/lib/zip_kit/file_reader/stored_reader.rb +35 -0
  28. data/lib/zip_kit/file_reader.rb +740 -0
  29. data/lib/zip_kit/null_writer.rb +12 -0
  30. data/lib/zip_kit/output_enumerator.rb +150 -0
  31. data/lib/zip_kit/path_set.rb +163 -0
  32. data/lib/zip_kit/rack_chunked_body.rb +32 -0
  33. data/lib/zip_kit/rack_tempfile_body.rb +61 -0
  34. data/lib/zip_kit/rails_streaming.rb +37 -0
  35. data/lib/zip_kit/remote_io.rb +114 -0
  36. data/lib/zip_kit/remote_uncap.rb +22 -0
  37. data/lib/zip_kit/size_estimator.rb +84 -0
  38. data/lib/zip_kit/stream_crc32.rb +60 -0
  39. data/lib/zip_kit/streamer/deflated_writer.rb +45 -0
  40. data/lib/zip_kit/streamer/entry.rb +37 -0
  41. data/lib/zip_kit/streamer/filler.rb +9 -0
  42. data/lib/zip_kit/streamer/heuristic.rb +68 -0
  43. data/lib/zip_kit/streamer/stored_writer.rb +39 -0
  44. data/lib/zip_kit/streamer/writable.rb +36 -0
  45. data/lib/zip_kit/streamer.rb +614 -0
  46. data/lib/zip_kit/uniquify_filename.rb +39 -0
  47. data/lib/zip_kit/version.rb +5 -0
  48. data/lib/zip_kit/write_and_tell.rb +40 -0
  49. data/lib/zip_kit/write_buffer.rb +71 -0
  50. data/lib/zip_kit/write_shovel.rb +22 -0
  51. data/lib/zip_kit/zip_writer.rb +436 -0
  52. data/lib/zip_kit.rb +24 -0
  53. data/zip_kit.gemspec +41 -0
  54. metadata +335 -0

data/lib/zip_kit/null_writer.rb
@@ -0,0 +1,12 @@
+ # frozen_string_literal: true
+
+ # Used when you need to supply a destination IO for some
+ # write operations, but want to discard the data (like when
+ # estimating the size of a ZIP)
+ module ZipKit::NullWriter
+   # @param _[String] the data to write
+   # @return [self]
+   def self.<<(_)
+     self
+   end
+ end
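
Since `#<<` returns the module itself, writes can be chained, and the module can stand in for any destination IO that is only ever written to - this is exactly how the `SizeEstimator` (later in this diff) performs its "dry run" archiving. A minimal sketch:

    ZipKit::NullWriter << "local file header" << "file body" # => ZipKit::NullWriter

    # A Streamer writing into the void still tracks offsets and sizes
    streamer = ZipKit::Streamer.new(ZipKit::NullWriter)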

data/lib/zip_kit/output_enumerator.rb
@@ -0,0 +1,150 @@
+ # frozen_string_literal: true
+
+ require "time" # for .httpdate
+
+ # The output enumerator makes it possible to "pull" from a ZipKit streamer
+ # object instead of having it "push" writes to you. It will "stash" the block which
+ # writes the ZIP archive through the streamer, and when you call `each` on the Enumerator
+ # it will yield you the bytes the block writes. Since it is an enumerator you can
+ # use `next` to take chunks written by the ZipKit streamer one by one. This can be very
+ # convenient when you need to segment your ZIP output into bigger chunks for, say,
+ # uploading them to a cloud storage provider such as S3.
+ #
+ # Another use of the `OutputEnumerator` is as a Rack response body, since a Rack
+ # response body object must support `#each`, yielding successive binary strings -
+ # which is exactly what `OutputEnumerator` does.
+ #
+ # The enumerator can provide you some more conveniences for HTTP output - correct streaming
+ # headers and a body with chunked transfer encoding.
+ #
+ #     iterable_zip_body = ZipKit::OutputEnumerator.new do |streamer|
+ #       streamer.write_file('big.csv') do |sink|
+ #         CSV(sink) do |csv_writer|
+ #           csv_writer << Person.column_names
+ #           Person.all.find_each do |person|
+ #             csv_writer << person.attributes.values
+ #           end
+ #         end
+ #       end
+ #     end
+ #
+ # It will either produce a `Transfer-Encoding: chunked` response (if your webserver
+ # supports it), which will give you true streaming capability:
+ #
+ #     headers, chunked_or_presized_rack_body = iterable_zip_body.to_headers_and_rack_response_body(env)
+ #     [200, headers, chunked_or_presized_rack_body]
+ #
+ # or it will wrap your output in a `TempfileBody` object which buffers the ZIP before output. Buffering has
+ # benefits if your webserver does not support anything beyond HTTP/1.0, and it also engages automatically
+ # in unit tests (since rack-test and Rails tests do not do streaming HTTP/1.1).
+ class ZipKit::OutputEnumerator
+   DEFAULT_WRITE_BUFFER_SIZE = 64 * 1024
+
+   # Creates a new OutputEnumerator. The enumerator can be read from using `each`,
+   # and the creation of the ZIP is in lockstep with the caller calling `each` on the returned
+   # output enumerator object. This can be used when the calling program wants to stream the
+   # output of the ZIP archive and throttle that output, or split it into chunks, or use it
+   # as a generator.
+   #
+   # For example:
+   #
+   #     # The block given to the constructor won't be executed immediately - rather it
+   #     # will only start to execute when the caller starts to read from the output
+   #     # by calling `each`
+   #     body = ::ZipKit::OutputEnumerator.new(writer: CustomWriter) do |streamer|
+   #       streamer.add_stored_entry(filename: 'large.tif', size: 1289894, crc32: 198210)
+   #       streamer << large_file.read(1024 * 1024) until large_file.eof?
+   #       ...
+   #     end
+   #
+   #     body.each do |bin_string|
+   #       # Send the output somewhere, buffer it in a file etc.
+   #       # The block passed into `initialize` will only start executing once `#each`
+   #       # is called
+   #       ...
+   #     end
+   #
+   # @param streamer_options[Hash] options for the Streamer, see {ZipKit::Streamer.new}
+   # @param write_buffer_size[Integer] By default all ZipKit writes are unbuffered. For output to sockets
+   #   it is beneficial to bulkify those writes so that they are roughly sized to a socket buffer chunk. This
+   #   object will bulkify writes for you in this way (so `each` will yield not on every call to `<<` from the Streamer
+   #   but at block size boundaries or greater). Set it to 0 for unbuffered writes.
+   # @param blk a block that will receive the Streamer object when executing. The block will not be executed
+   #   immediately but only once `each` is called on the OutputEnumerator
+   # @return [ZipKit::OutputEnumerator] the enumerator you can read bytestrings of the ZIP from by calling `each`
+   def initialize(write_buffer_size: DEFAULT_WRITE_BUFFER_SIZE, **streamer_options, &blk)
+     @streamer_options = streamer_options.to_h
+     @bufsize = write_buffer_size.to_i
+     @archiving_block = blk
+   end
+
+   # Executes the block given to the constructor with a {ZipKit::Streamer}
+   # and passes each written chunk to the block given to the method. This allows one
+   # to "take" output of the ZIP piecewise. If called without a block, returns an Enumerator
+   # that you can pull data from using `next`.
+   #
+   # **NOTE** Because the `WriteBuffer` inside this object reuses its buffer, it is important
+   # that the yielded `String` gets consumed eagerly (written out byte-by-byte somewhere, or `#dup`-ed),
+   # since the write buffer will clear it after your block returns. If you expand this Enumerator
+   # eagerly into an Array you might notice that a lot of the segments of your ZIP output are
+   # empty - this means that you need to duplicate them.
+   #
+   # @yield [String] a chunk of the ZIP output in binary encoding
+   def each
+     if block_given?
+       block_write = ZipKit::BlockWrite.new { |chunk| yield(chunk) }
+       buffer = ZipKit::WriteBuffer.new(block_write, @bufsize)
+       ZipKit::Streamer.open(buffer, **@streamer_options, &@archiving_block)
+       buffer.flush
+     else
+       enum_for(:each)
+     end
+   end
+
+   # Returns a tuple of `headers, body` - headers are a `Hash` and the body is
+   # an object that can be used as a Rack response body. The method will automatically
+   # switch the wrapping of the output depending on whether the response can be pre-sized,
+   # and whether your downstream webserver (like nginx) is configured to support
+   # the HTTP/1.1 protocol version.
+   #
+   # @param rack_env[Hash] the Rack env, which the method may need to mutate (adding a Tempfile for cleanup)
+   # @param content_length[Integer] the amount of bytes that the archive will contain. If given, no chunked encoding gets applied.
+   # @return [Array]
+   def to_headers_and_rack_response_body(rack_env, content_length: nil)
+     headers = {
+       # We need to ensure Rack::ETag does not suddenly start buffering us, see
+       # https://github.com/rack/rack/issues/1619#issuecomment-606315714
+       # Set this even when not streaming for consistency. The fact that there would be
+       # a weak ETag generated would mean that the middleware buffers, so we have tests for that.
+       "Last-Modified" => Time.now.httpdate,
+       # Make sure Rack::Deflater does not touch our response body either, see
+       # https://github.com/felixbuenemann/xlsxtream/issues/14#issuecomment-529569548
+       "Content-Encoding" => "identity",
+       # Disable buffering for both nginx and Google Load Balancer, see
+       # https://cloud.google.com/appengine/docs/flexible/how-requests-are-handled?tab=python#x-accel-buffering
+       "X-Accel-Buffering" => "no"
+     }
+
+     if content_length
+       # If we know the size of the body, transfer encoding is not required at all - so the enumerator itself
+       # can function as the Rack body. This also would apply in HTTP/2 contexts where chunked encoding would
+       # no longer be required - then the enumerator could get returned "bare".
+       body = self
+       headers["Content-Length"] = content_length.to_i.to_s
+     elsif rack_env["HTTP_VERSION"] == "HTTP/1.0"
+       # HTTP/1.0 does not support chunked responses, so we have to buffer the entire
+       # output into a Tempfile first, which also lets us set a Content-Length.
+       body = ZipKit::RackTempfileBody.new(rack_env, self)
+       headers["Content-Length"] = body.size.to_s
+     else
+       body = ZipKit::RackChunkedBody.new(self)
+       headers["Transfer-Encoding"] = "chunked"
+     end
+
+     [headers, body]
+   end
+ end
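
To make the buffer-reuse caveat above concrete, here is a sketch of collecting a whole archive into memory (the filename and contents are illustrative; `write_stored_file` is the Streamer API used for illustration):

    enum = ZipKit::OutputEnumerator.new do |streamer|
      streamer.write_stored_file("greeting.txt") { |sink| sink << "Hello!" }
    end

    # `enum.each.to_a` could return mostly-empty strings, because the internal
    # WriteBuffer clears the yielded String once each block invocation returns.
    # Copy every chunk before retaining it:
    chunks = []
    enum.each { |bytes| chunks << bytes.dup }
    zip_blob = chunks.join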

data/lib/zip_kit/path_set.rb
@@ -0,0 +1,163 @@
+ # frozen_string_literal: true
+
+ # A ZIP archive contains a flat list of entries. These entries can implicitly
+ # create directories when the archive is expanded. For example, an entry with
+ # the filename of "some folder/file.docx" will make the unarchiving application
+ # create a directory called "some folder" automatically, and then deposit the
+ # file "file.docx" in that directory. These "implicit" directories can be
+ # arbitrarily nested, and create a tree structure of directories. That structure,
+ # however, is implicit, as the archive contains a flat list.
+ #
+ # This creates opportunities for conflicts. For example, imagine the following
+ # structure:
+ #
+ # * `something/` - specifies an empty directory with the name "something"
+ # * `something` - specifies a file, creates a conflict
+ #
+ # This can be prevented with filename uniqueness checks. It does get funkier, however,
+ # as the rabbit hole goes deeper:
+ #
+ # * `dir/subdir/another_subdir/yet_another_subdir/file.bin` - declares a file and directories
+ # * `dir/subdir/another_subdir/yet_another_subdir` - declares a file at one of the levels, creates a conflict
+ #
+ # The results of this ZIP structure aren't very easy to predict as they depend on the
+ # application that opens the archive. For example, BOMArchiveHelper on macOS will expand files
+ # as they are declared in the ZIP, but once a conflict occurs it will fail with "error -21". It
+ # is not very transparent to the user why unarchiving fails, and it can only be reliably
+ # prevented when the archive gets created.
+ #
+ # Unfortunately that conflicts with another "magical" feature of ZipKit which automatically
+ # "fixes" duplicate filenames - filenames (paths) which have already been added to the archive.
+ # This fix is performed by appending (1), then (2) and so forth to the filename so that the
+ # conflict is avoided. This is not possible to apply to directories, because when one of the
+ # path components is reused in multiple filenames it means those entities should end up in
+ # the same directory (subdirectory) once the archive is opened.
+ #
+ # The `PathSet` keeps track of entries as they get added using 2 Sets (cheap presence checks),
+ # one for directories and one for files. It will raise a `Conflict` exception if there are
+ # files clobbering one another, or in case files collide with directories.
+ class ZipKit::PathSet
+   class Conflict < StandardError
+   end
+
+   class FileClobbersDirectory < Conflict
+   end
+
+   class DirectoryClobbersFile < Conflict
+   end
+
+   def initialize
+     @known_directories = Set.new
+     @known_files = Set.new
+   end
+
+   # Adds a directory path to the set of known paths, including
+   # all the directories that contain it. So, calling
+   #     add_directory_path("dir/dir2/dir3")
+   # will add "dir", "dir/dir2", "dir/dir2/dir3".
+   #
+   # @param path[String] the path to the directory to add
+   # @return [void]
+   def add_directory_path(path)
+     path_and_ancestors(path).each do |parent_directory_path|
+       if @known_files.include?(parent_directory_path)
+         error_message = <<~ERR
+           The path #{parent_directory_path.inspect} which has to be added
+           as a directory is already used for a file.
+
+           The directory at this path would get created implicitly
+           to produce #{path.inspect} during decompression.
+
+           This would make some archive utilities refuse to open
+           the ZIP.
+         ERR
+         raise DirectoryClobbersFile, error_message
+       end
+       @known_directories << parent_directory_path
+     end
+   end
+
+   # Adds a file path to the set of known paths, including
+   # all the directories that contain it. Once a file has been added,
+   # it is no longer possible to add a directory having the same path
+   # as this would cause a conflict.
+   #
+   # The operation also adds all the containing directories for the file, so
+   #     add_file_path("dir/dir2/file.doc")
+   # will add "dir" and "dir/dir2" as directories.
+   #
+   # @param file_path[String] the path to the file to add
+   # @return [void]
+   def add_file_path(file_path)
+     if @known_files.include?(file_path)
+       error_message = <<~ERR
+         The file at #{file_path.inspect} has already been included
+         in the archive. Adding it a second time would cause
+         the first file to be overwritten during unarchiving, and
+         could also get the archive flagged as invalid.
+       ERR
+       raise Conflict, error_message
+     end
+
+     if @known_directories.include?(file_path)
+       error_message = <<~ERR
+         The path #{file_path.inspect} is already used for
+         a directory, but you are trying to add it as a file.
+
+         This would make some archive utilities refuse
+         to open the ZIP.
+       ERR
+       raise FileClobbersDirectory, error_message
+     end
+
+     # Add all the directories which this file is contained in...
+     *dir_components, _file_name = non_empty_path_components(file_path)
+     add_directory_path(dir_components.join("/"))
+
+     # ...and then the file itself
+     @known_files << file_path
+   end
+
+   # Tells whether a specific full path is already known to the PathSet.
+   # Can be a path for a directory or for a file.
+   #
+   # @param path_in_archive[String] the path to check for inclusion
+   # @return [Boolean]
+   def include?(path_in_archive)
+     @known_files.include?(path_in_archive) || @known_directories.include?(path_in_archive)
+   end
+
+   # Clears the contained sets
+   # @return [void]
+   def clear
+     @known_files.clear
+     @known_directories.clear
+   end
+
+   # Adds the directory or file path to the path set
+   #
+   # @param path_in_archive[String] the path to add; a trailing "/" marks it as a directory
+   # @return [void]
+   def add_directory_or_file_path(path_in_archive)
+     if path_in_archive.end_with?("/")
+       add_directory_path(path_in_archive)
+     else
+       add_file_path(path_in_archive)
+     end
+   end
+
+   private
+
+   def non_empty_path_components(path)
+     path.split("/").reject(&:empty?)
+   end
+
+   def path_and_ancestors(path)
+     path_components = non_empty_path_components(path)
+     path_components.each_with_object([]) do |component, seen|
+       seen << [seen.last, component].compact.join("/")
+     end
+   end
+ end
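
A short sketch of the checks in action (paths are illustrative):

    paths = ZipKit::PathSet.new
    paths.add_file_path("docs/report.pdf") # also registers "docs" as a directory
    paths.include?("docs")                 # => true

    begin
      paths.add_directory_path("docs/report.pdf")
    rescue ZipKit::PathSet::DirectoryClobbersFile => e
      # The same path cannot be used as both a file and a directory
      warn e.message
    end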

data/lib/zip_kit/rack_chunked_body.rb
@@ -0,0 +1,32 @@
+ # frozen_string_literal: true
+
+ # A body wrapper that emits chunked responses, creating a valid
+ # "Transfer-Encoding: chunked" HTTP response body. This is copied from Rack::Chunked::Body,
+ # because Rack is not going to include that class after version 3.x.
+ # Rails has a substitute class for this inside ActionController::Streaming,
+ # but that module is a private constant in the Rails codebase, and is thus
+ # considered "private" from the Rails standpoint. It is not that much code to
+ # carry, so we copy it into our code.
+ class ZipKit::RackChunkedBody
+   TERM = "\r\n"
+   TAIL = "0#{TERM}"
+
+   # @param body[#each] the enumerable that yields bytes, usually an `OutputEnumerator`
+   def initialize(body)
+     @body = body
+   end
+
+   # For each string yielded by the response body, yield
+   # the element in chunked encoding - and finish off with a terminator
+   def each
+     term = TERM
+     @body.each do |chunk|
+       size = chunk.bytesize
+       next if size == 0
+
+       yield [size.to_s(16), term, chunk.b, term].join
+     end
+     yield TAIL
+     yield term
+   end
+ end
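
The framing produced here is standard HTTP/1.1 chunked encoding: the chunk size in hexadecimal, CRLF, the chunk bytes, CRLF, and a zero-sized chunk as terminator. A sketch with a plain array standing in for the enumerable body:

    body = ZipKit::RackChunkedBody.new(["PK\x03\x04", "payload"])
    body.each { |frame| print frame }
    # Emits "4\r\nPK\x03\x04\r\n", then "7\r\npayload\r\n",
    # then the tail "0\r\n" and the final "\r\n"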

data/lib/zip_kit/rack_tempfile_body.rb
@@ -0,0 +1,61 @@
+ # frozen_string_literal: true
+
+ # Contains a file handle which can be closed once the response finishes sending.
+ # It supports `to_path` so that `Rack::Sendfile` can intercept it.
+ class ZipKit::RackTempfileBody
+   TEMPFILE_NAME_PREFIX = "zip-tricks-tf-body-"
+   attr_reader :tempfile
+
+   # @param env[Hash] the Rack env
+   # @param body[#each] the enumerable that yields bytes, usually an `OutputEnumerator`.
+   #   The `body` will be read in full into the Tempfile before the response gets sized or served.
+   def initialize(env, body)
+     @tempfile = Tempfile.new(TEMPFILE_NAME_PREFIX)
+     # Rack::TempfileReaper calls close! on tempfiles which get buffered.
+     # We will assume that it works fine with Rack::Sendfile (i.e. the path
+     # to the file getting served gets used before we unlink the tempfile)
+     env["rack.tempfiles"] ||= []
+     env["rack.tempfiles"] << @tempfile
+
+     @tempfile.binmode
+     @body = body
+     @did_flush = false
+   end
+
+   # Returns the size of the contained `Tempfile` so that a correct
+   # Content-Length header can be set
+   #
+   # @return [Integer]
+   def size
+     flush
+     @tempfile.size
+   end
+
+   # Returns the path to the `Tempfile`, so that Rack::Sendfile can send this response
+   # using the downstream webserver
+   #
+   # @return [String]
+   def to_path
+     flush
+     @tempfile.to_path
+   end
+
+   # Streams the file's contents if `Rack::Sendfile` isn't present.
+   #
+   # @return [void]
+   def each
+     flush
+     while (chunk = @tempfile.read(16384))
+       yield chunk
+     end
+   end
+
+   private
+
+   def flush
+     if !@did_flush
+       @body.each { |bytes| @tempfile << bytes }
+       @did_flush = true
+     end
+     @tempfile.rewind
+   end
+ end
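
A sketch of the buffering behaviour (a real Rack env would be supplied by the server; the empty Hash and filename here are illustrative):

    require "tempfile"

    env = {}
    enum = ZipKit::OutputEnumerator.new do |streamer|
      streamer.write_stored_file("hello.txt") { |sink| sink << "Hi there" }
    end

    body = ZipKit::RackTempfileBody.new(env, enum)
    body.size # the whole ZIP is buffered at this point, so it can be sized
    # ...serve with body.each, or let Rack::Sendfile pick up body.to_path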

data/lib/zip_kit/rails_streaming.rb
@@ -0,0 +1,37 @@
+ # frozen_string_literal: true
+
+ # Should be included into a Rails controller for easy ZIP output from any action.
+ module ZipKit::RailsStreaming
+   # Opens a {ZipKit::Streamer} and yields it to the caller. The output of the streamer
+   # gets automatically forwarded to the Rails response stream. When the output completes,
+   # the Rails response stream is going to be closed automatically.
+   #
+   # @param filename[String] name of the file for the Content-Disposition header
+   # @param type[String] the content type (MIME type) of the archive being output
+   # @param zip_streamer_options[Hash] options that will be passed to the Streamer.
+   #   See {ZipKit::Streamer#initialize} for the full list of options.
+   # @yield [Streamer] the streamer that can be written to
+   # @return [Object] the Rack-compatible body assigned to the Rails response
+   def zip_kit_stream(filename: "download.zip", type: "application/zip", **zip_streamer_options, &zip_streaming_blk)
+     # The output enumerator yields chunks of bytes generated from ZipKit. Instantiating it
+     # first will also validate the Streamer options.
+     chunk_yielder = ZipKit::OutputEnumerator.new(**zip_streamer_options, &zip_streaming_blk)
+
+     # We want some common headers for file sending. Rails will also set
+     # self.sending_file = true for us when we call send_file_headers!
+     send_file_headers!(type: type, filename: filename)
+
+     # Check for the proxy configuration first. This is the first common misconfiguration which destroys streaming -
+     # since HTTP 1.0 does not support chunked responses we need to revert to buffering. The issue though is that
+     # this reversion happens silently and it is usually not clear at all why streaming does not work. So let's at
+     # the very least print it to the Rails log.
+     if request.get_header("HTTP_VERSION") == "HTTP/1.0"
+       logger&.warn { "The downstream HTTP proxy/LB insists on HTTP/1.0 protocol, ZIP response will be buffered." }
+     end
+
+     headers, rack_body = chunk_yielder.to_headers_and_rack_response_body(request.env)
+
+     # Set the "particular" streaming headers
+     response.headers.merge!(headers)
+     self.response_body = rack_body
+   end
+ end
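
A sketch of the intended controller usage (the controller, action, and file contents are illustrative):

    class ReportsController < ApplicationController
      include ZipKit::RailsStreaming

      def download
        zip_kit_stream(filename: "reports.zip") do |zip|
          zip.write_file("summary.csv") do |sink|
            sink << "id,total\n"
            sink << "1,100\n"
          end
        end
      end
    end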

data/lib/zip_kit/remote_io.rb
@@ -0,0 +1,114 @@
+ # frozen_string_literal: true
+
+ require "net/http"
+
+ # An object that fakes just enough of an IO to be dangerous
+ # - or, more precisely, to be useful as a source for the FileReader
+ # central directory parser. Effectively we substitute an IO object
+ # with an object that fetches parts of the remote file over HTTP using `Range:`
+ # headers. The `RemoteIO` acts as an adapter between an object that performs the
+ # actual fetches over HTTP and an object that expects a handful of IO methods to be
+ # available.
+ class ZipKit::RemoteIO
+   # @param url[String, URI] the HTTP/HTTPS URL of the object to be retrieved
+   def initialize(url)
+     @pos = 0
+     @uri = URI(url)
+     @remote_size = nil
+   end
+
+   # Emulates IO#seek
+   #
+   # @param offset[Integer] absolute offset in the remote resource to seek to
+   # @param mode[Integer] the seek mode (only SEEK_SET is supported)
+   def seek(offset, mode = IO::SEEK_SET)
+     raise "Unsupported seek mode #{mode}" unless mode == IO::SEEK_SET
+     @remote_size ||= request_object_size
+     @pos = clamp(0, offset, @remote_size)
+     0 # always return 0, just like the real IO#seek
+   end
+
+   # Emulates IO#size.
+   #
+   # @return [Integer] the size of the remote resource
+   def size
+     @remote_size ||= request_object_size
+   end
+
+   # Emulates IO#read, but requires the number of bytes to read.
+   # The read will be limited to the size of the remote resource
+   # relative to the current offset in the IO, so if you are at offset 0
+   # in an IO of size 10, doing a `read(20)` will only return you 10
+   # bytes of result, and not raise any exceptions.
+   #
+   # @param n_bytes[Fixnum, nil] how many bytes to read, or `nil` to read all the way to the end
+   # @return [String] the read bytes
+   def read(n_bytes = nil)
+     # If the resource is empty there is nothing to read
+     return if size.zero?
+
+     maximum_available = size - @pos
+     n_bytes ||= maximum_available # nil == read to the end of file
+     return "" if n_bytes.zero?
+     raise ArgumentError, "No negative reads (#{n_bytes})" if n_bytes < 0
+
+     n_bytes = clamp(0, n_bytes, maximum_available)
+
+     http_range = (@pos..(@pos + n_bytes - 1))
+     request_range(http_range).tap do |data|
+       raise "Remote read returned #{data.bytesize} bytes instead of #{n_bytes} as requested" if data.bytesize != n_bytes
+       @pos = clamp(0, @pos + data.bytesize, size)
+     end
+   end
+
+   # Returns the current pointer position within the IO
+   #
+   # @return [Fixnum]
+   def tell
+     @pos
+   end
+
+   protected
+
+   # Only used internally when reading the remote ZIP.
+   #
+   # @param range[Range] the HTTP range of data to fetch from remote
+   # @return [String] the response body of the ranged request
+   def request_range(range)
+     # Use TLS when the URL calls for it, so that HTTPS URLs work as well as HTTP ones
+     http = Net::HTTP.start(@uri.hostname, @uri.port, use_ssl: @uri.scheme == "https")
+     request = Net::HTTP::Get.new(@uri)
+     request.range = range
+     response = http.request(request)
+     case response.code
+     when "206", "200"
+       response.body
+     else
+       raise "Remote at #{@uri} replied with code #{response.code}"
+     end
+   end
+
+   # For working with S3 it is a better idea to perform a GET request for one byte, since doing a HEAD
+   # request needs a different permission - and standard GET presigned URLs are not allowed to perform it
+   #
+   # @return [Integer] the size of the remote resource, parsed either from the Content-Length or the Content-Range header
+   def request_object_size
+     http = Net::HTTP.start(@uri.hostname, @uri.port, use_ssl: @uri.scheme == "https")
+     request = Net::HTTP::Get.new(@uri)
+     request.range = 0..0
+     response = http.request(request)
+     case response.code
+     when "206"
+       content_range_header_value = response["Content-Range"]
+       content_range_header_value.split("/").last.to_i
+     when "200"
+       response["Content-Length"].to_i
+     else
+       raise "Remote at #{@uri} replied with code #{response.code}"
+     end
+   end
+
+   private
+
+   def clamp(a, b, c)
+     return a if b < a
+     return c if b > c
+     b
+   end
+ end
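
A sketch of what the adapter enables - IO-style positioning backed by HTTP ranged reads (the URL is illustrative):

    io = ZipKit::RemoteIO.new("https://example.com/large-archive.zip")
    io.seek(io.size - 22) # where a minimal end-of-central-directory record would start
    tail = io.read(22)    # fetched with a single `Range:` request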

data/lib/zip_kit/remote_uncap.rb
@@ -0,0 +1,22 @@
+ # frozen_string_literal: true
+
+ # Allows reading the central directory of a remote ZIP file without
+ # downloading the entire file. The central directory provides the
+ # offsets at which the actual file contents are located. You can then
+ # use the `Range:` HTTP headers to download those entries separately.
+ #
+ # Please read the security warning in `FileReader` _VERY CAREFULLY_
+ # before you use this module.
+ module ZipKit::RemoteUncap
+   # @param uri[String] the HTTP(S) URL to read the ZIP footer from
+   # @param reader_class[Class] which class to use for reading
+   # @param options_for_zip_reader[Hash] any additional options to give to
+   #   {ZipKit::FileReader} when reading
+   # @return [Array<ZipKit::FileReader::ZipEntry>] metadata about the
+   #   files within the remote archive
+   def self.files_within_zip_at(uri, reader_class: ZipKit::FileReader, **options_for_zip_reader)
+     fake_io = ZipKit::RemoteIO.new(uri)
+     reader = reader_class.new
+     reader.read_zip_structure(io: fake_io, **options_for_zip_reader)
+   end
+ end
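
A sketch of listing a remote archive without downloading it in full (the URL is illustrative; the entry attributes are assumed to come from `FileReader::ZipEntry`):

    entries = ZipKit::RemoteUncap.files_within_zip_at("https://example.com/archive.zip")
    entries.each do |entry|
      puts "#{entry.filename} - #{entry.uncompressed_size} bytes"
    end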

data/lib/zip_kit/size_estimator.rb
@@ -0,0 +1,84 @@
+ # frozen_string_literal: true
+
+ # Helps to estimate archive sizes
+ class ZipKit::SizeEstimator
+   require_relative "streamer"
+
+   # Creates a new estimator with a Streamer object. Normally you should use
+   # `estimate` instead and not use this method directly.
+   def initialize(streamer)
+     @streamer = streamer
+   end
+   private :initialize
+
+   # Performs the estimate using fake archiving. It needs to know the sizes of the
+   # entries upfront. Usage:
+   #
+   #     expected_zip_size = SizeEstimator.estimate do |estimator|
+   #       estimator.add_stored_entry(filename: "file.doc", size: 898291)
+   #       estimator.add_deflated_entry(filename: "family.tif",
+   #         uncompressed_size: 89281911, compressed_size: 121908)
+   #     end
+   #
+   # @param kwargs_for_streamer_new any options to pass to Streamer, see {Streamer#initialize}
+   # @return [Integer] the size of the resulting archive, in bytes
+   # @yield [SizeEstimator] the estimator
+   def self.estimate(**kwargs_for_streamer_new)
+     streamer = ZipKit::Streamer.new(ZipKit::NullWriter, **kwargs_for_streamer_new)
+     estimator = new(streamer)
+     yield(estimator)
+     streamer.close # Returns the .tell of the contained IO
+   end
+
+   # Adds a fake stored entry to the archive, to see how big the archive is going to be in the end.
+   #
+   # @param filename [String] the name of the file (filenames are variable-width in the ZIP)
+   # @param size [Fixnum] size of the uncompressed entry
+   # @param use_data_descriptor[Boolean] whether the entry uses a postfix
+   #   data descriptor to specify size
+   # @return self
+   def add_stored_entry(filename:, size:, use_data_descriptor: false)
+     @streamer.add_stored_entry(filename: filename,
+       crc32: 0,
+       size: size,
+       use_data_descriptor: use_data_descriptor)
+     @streamer.simulate_write(size)
+     if use_data_descriptor
+       @streamer.update_last_entry_and_write_data_descriptor(crc32: 0, compressed_size: size, uncompressed_size: size)
+     end
+     self
+   end
+
+   # Adds a fake deflated entry to the archive, to see how big the archive is going to be in the end.
+   #
+   # @param filename [String] the name of the file (filenames are variable-width in the ZIP)
+   # @param uncompressed_size [Fixnum] size of the uncompressed entry
+   # @param compressed_size [Fixnum] size of the compressed entry
+   # @param use_data_descriptor[Boolean] whether the entry uses a postfix data
+   #   descriptor to specify size
+   # @return self
+   def add_deflated_entry(filename:, uncompressed_size:, compressed_size:, use_data_descriptor: false)
+     @streamer.add_deflated_entry(filename: filename,
+       crc32: 0,
+       compressed_size: compressed_size,
+       uncompressed_size: uncompressed_size,
+       use_data_descriptor: use_data_descriptor)
+
+     @streamer.simulate_write(compressed_size)
+     if use_data_descriptor
+       @streamer.update_last_entry_and_write_data_descriptor(crc32: 0,
+         compressed_size: compressed_size,
+         uncompressed_size: uncompressed_size)
+     end
+     self
+   end
+
+   # Adds an empty directory to the archive.
+   #
+   # @param dirname [String] the name of the directory
+   # @return self
+   def add_empty_directory_entry(dirname:)
+     @streamer.add_empty_directory(dirname: dirname)
+     self
+   end
+ end
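
Because archiving the same entries produces the same byte count, the estimate pairs well with `OutputEnumerator#to_headers_and_rack_response_body` to produce a pre-sized response (the sizes and the CRC value below are illustrative):

    size = ZipKit::SizeEstimator.estimate do |estimator|
      estimator.add_stored_entry(filename: "report.pdf", size: 898_291)
    end

    enum = ZipKit::OutputEnumerator.new do |streamer|
      streamer.add_stored_entry(filename: "report.pdf", size: 898_291, crc32: 0x1234)
      # ...write exactly 898_291 bytes of file body here
    end
    headers, body = enum.to_headers_and_rack_response_body({}, content_length: size)
    # headers now carry "Content-Length" => size.to_s, and no chunked encoding is used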