zip_kit 6.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. checksums.yaml +7 -0
  2. data/.codeclimate.yml +7 -0
  3. data/.document +5 -0
  4. data/.github/workflows/ci.yml +29 -0
  5. data/.gitignore +61 -0
  6. data/.rspec +1 -0
  7. data/.standard.yml +8 -0
  8. data/.yardopts +1 -0
  9. data/CHANGELOG.md +255 -0
  10. data/CODE_OF_CONDUCT.md +46 -0
  11. data/CONTRIBUTING.md +153 -0
  12. data/Gemfile +4 -0
  13. data/IMPLEMENTATION_DETAILS.md +97 -0
  14. data/LICENSE.txt +20 -0
  15. data/README.md +234 -0
  16. data/Rakefile +21 -0
  17. data/bench/buffered_crc32_bench.rb +109 -0
  18. data/examples/archive_size_estimate.rb +15 -0
  19. data/examples/config.ru +7 -0
  20. data/examples/deferred_write.rb +58 -0
  21. data/examples/parallel_compression_with_block_deflate.rb +86 -0
  22. data/examples/rack_application.rb +63 -0
  23. data/examples/s3_upload.rb +23 -0
  24. data/lib/zip_kit/block_deflate.rb +130 -0
  25. data/lib/zip_kit/block_write.rb +47 -0
  26. data/lib/zip_kit/file_reader/inflating_reader.rb +36 -0
  27. data/lib/zip_kit/file_reader/stored_reader.rb +35 -0
  28. data/lib/zip_kit/file_reader.rb +740 -0
  29. data/lib/zip_kit/null_writer.rb +12 -0
  30. data/lib/zip_kit/output_enumerator.rb +150 -0
  31. data/lib/zip_kit/path_set.rb +163 -0
  32. data/lib/zip_kit/rack_chunked_body.rb +32 -0
  33. data/lib/zip_kit/rack_tempfile_body.rb +61 -0
  34. data/lib/zip_kit/rails_streaming.rb +37 -0
  35. data/lib/zip_kit/remote_io.rb +114 -0
  36. data/lib/zip_kit/remote_uncap.rb +22 -0
  37. data/lib/zip_kit/size_estimator.rb +84 -0
  38. data/lib/zip_kit/stream_crc32.rb +60 -0
  39. data/lib/zip_kit/streamer/deflated_writer.rb +45 -0
  40. data/lib/zip_kit/streamer/entry.rb +37 -0
  41. data/lib/zip_kit/streamer/filler.rb +9 -0
  42. data/lib/zip_kit/streamer/heuristic.rb +68 -0
  43. data/lib/zip_kit/streamer/stored_writer.rb +39 -0
  44. data/lib/zip_kit/streamer/writable.rb +36 -0
  45. data/lib/zip_kit/streamer.rb +614 -0
  46. data/lib/zip_kit/uniquify_filename.rb +39 -0
  47. data/lib/zip_kit/version.rb +5 -0
  48. data/lib/zip_kit/write_and_tell.rb +40 -0
  49. data/lib/zip_kit/write_buffer.rb +71 -0
  50. data/lib/zip_kit/write_shovel.rb +22 -0
  51. data/lib/zip_kit/zip_writer.rb +436 -0
  52. data/lib/zip_kit.rb +24 -0
  53. data/zip_kit.gemspec +41 -0
  54. metadata +335 -0
@@ -0,0 +1,614 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "set"
4
+
5
+ # Is used to write streamed ZIP archives into the provided IO-ish object.
6
+ # The output IO is never going to be rewound or seeked, so the output
7
+ # of this object can be coupled directly to, say, a Rack output. The
8
+ # output can also be a String, Array or anything that responds to `<<`.
9
+ #
10
+ # Allows for splicing raw files (for "stored" entries without compression)
11
+ # and splicing of deflated files (for "deflated" storage mode).
12
+ #
13
+ # For stored entries, you need to know the CRC32 (as a uint) and the filesize upfront,
14
+ # before the writing of the entry body starts.
15
+ #
16
+ # Any object that responds to `<<` can be used as the Streamer target - you can use
17
+ # a String, an Array, a Socket or a File, at your leisure.
18
+ #
19
+ # ## Using the Streamer with runtime compression
20
+ #
21
+ # You can use the Streamer with data descriptors (the CRC32 and the sizes will be
22
+ # written after the file data). This allows non-rewinding on-the-fly compression.
23
+ # The streamer will pick the optimum compression method ("stored" or "deflated")
24
+ # depending on the nature of the byte stream you send into it (by using a small buffer).
25
+ # If you are compressing large files, the Deflater object that the Streamer controls
26
+ # will be regularly flushed to prevent memory inflation.
27
+ #
28
+ # ZipKit::Streamer.open(file_socket_or_string) do |zip|
29
+ # zip.write_file('mov.mp4') do |sink|
30
+ # File.open('mov.mp4', 'rb'){|source| IO.copy_stream(source, sink) }
31
+ # end
32
+ # zip.write_file('long-novel.txt') do |sink|
33
+ # File.open('novel.txt', 'rb'){|source| IO.copy_stream(source, sink) }
34
+ # end
35
+ # end
36
+ #
37
+ # The central directory will be written automatically at the end of the block.
38
+ #
39
+ # ## Using the Streamer with entries of known size and having a known CRC32 checksum
40
+ #
41
+ # Streamer allows "IO splicing" - in this mode it will only control the metadata output,
42
+ # but you can write the data to the socket/file outside of the Streamer. For example, when
43
+ # using the sendfile gem:
44
+ #
45
+ # ZipKit::Streamer.open(socket) do | zip |
46
+ # zip.add_stored_entry(filename: "myfile1.bin", size: 9090821, crc32: 12485)
47
+ # socket.sendfile(tempfile1)
48
+ # zip.simulate_write(tempfile1.size)
49
+ #
50
+ # zip.add_stored_entry(filename: "myfile2.bin", size: 458678, crc32: 89568)
51
+ # socket.sendfile(tempfile2)
52
+ # zip.simulate_write(tempfile2.size)
53
+ # end
54
+ #
55
+ # Note that you need to use `simulate_write` in this case. This needs to happen since Streamer
56
+ # writes absolute offsets into the ZIP (local file header offsets and the like),
57
+ # and it relies on the output object to tell it how many bytes have been written
58
+ # so far. When using `sendfile` the Ruby write methods get bypassed entirely, and the
59
+ # offsets in the IO will not be updated - which will result in an invalid ZIP.
60
+ #
61
+ #
62
+ # ## On-the-fly deflate -using the Streamer with async/suspended writes and data descriptors
63
+ #
64
+ # If you are unable to use the block versions of `write_deflated_file` and `write_stored_file`
65
+ # there is an option to use a separate writer object. It gets returned from `write_deflated_file`
66
+ # and `write_stored_file` if you do not provide them with a block, and will accept data writes.
67
+ # Do note that you _must_ call `#close` on that object yourself:
68
+ #
69
+ # ZipKit::Streamer.open(socket) do | zip |
70
+ # w = zip.write_stored_file('mov.mp4')
71
+ # IO.copy_stream(source_io, w)
72
+ # w.close
73
+ # end
74
+ #
75
+ # The central directory will be written automatically at the end of the `open` block. If you need
76
+ # to manage the Streamer manually, or defer the central directory write until appropriate, use
77
+ # the constructor instead and call `Streamer#close`:
78
+ #
79
+ # zip = ZipKit::Streamer.new(out_io)
80
+ # .....
81
+ # zip.close
82
+ #
83
+ # Calling {Streamer#close} **will not** call `#close` on the underlying IO object.
84
+ class ZipKit::Streamer
85
+ autoload :DeflatedWriter, File.dirname(__FILE__) + "/streamer/deflated_writer.rb"
86
+ autoload :Writable, File.dirname(__FILE__) + "/streamer/writable.rb"
87
+ autoload :StoredWriter, File.dirname(__FILE__) + "/streamer/stored_writer.rb"
88
+ autoload :Entry, File.dirname(__FILE__) + "/streamer/entry.rb"
89
+ autoload :Filler, File.dirname(__FILE__) + "/streamer/filler.rb"
90
+ autoload :Heuristic, File.dirname(__FILE__) + "/streamer/heuristic.rb"
91
+
92
+ include ZipKit::WriteShovel
93
+
94
+ STORED = 0
95
+ DEFLATED = 8
96
+
97
+ EntryBodySizeMismatch = Class.new(StandardError)
98
+ InvalidOutput = Class.new(ArgumentError)
99
+ Overflow = Class.new(StandardError)
100
+ UnknownMode = Class.new(StandardError)
101
+ OffsetOutOfSync = Class.new(StandardError)
102
+
103
+ private_constant :STORED, :DEFLATED
104
+
105
+ # Creates a new Streamer on top of the given IO-ish object and yields it. Once the given block
106
+ # returns, the Streamer will have it's `close` method called, which will write out the central
107
+ # directory of the archive to the output.
108
+ #
109
+ # @param stream [IO] the destination IO for the ZIP (should respond to `tell` and `<<`)
110
+ # @param kwargs_for_new [Hash] keyword arguments for #initialize
111
+ # @yield [Streamer] the streamer that can be written to
112
+ def self.open(stream, **kwargs_for_new)
113
+ archive = new(stream, **kwargs_for_new)
114
+ yield(archive)
115
+ archive.close
116
+ end
117
+
118
+ # Creates a new Streamer on top of the given IO-ish object.
119
+ #
120
+ # @param writable[#<<] the destination IO for the ZIP. Anything that responds to `<<` can be used.
121
+ # @param writer[ZipKit::ZipWriter] the object to be used as the writer.
122
+ # Defaults to an instance of ZipKit::ZipWriter, normally you won't need to override it
123
+ # @param auto_rename_duplicate_filenames[Boolean] whether duplicate filenames, when encountered,
124
+ # should be suffixed with (1), (2) etc. Default value is `false` - if
125
+ # dupliate names are used an exception will be raised
126
+ def initialize(writable, writer: create_writer, auto_rename_duplicate_filenames: false)
127
+ raise InvalidOutput, "The writable must respond to #<< or #write" unless writable.respond_to?(:<<) || writable.respond_to?(:write)
128
+
129
+ @out = ZipKit::WriteAndTell.new(writable)
130
+ @files = []
131
+ @path_set = ZipKit::PathSet.new
132
+ @writer = writer
133
+ @dedupe_filenames = auto_rename_duplicate_filenames
134
+ end
135
+
136
+ # Writes a part of a zip entry body (actual binary data of the entry) into the output stream.
137
+ #
138
+ # @param binary_data [String] a String in binary encoding
139
+ # @return self
140
+ def <<(binary_data)
141
+ @out << binary_data
142
+ self
143
+ end
144
+
145
+ # Advances the internal IO pointer to keep the offsets of the ZIP file in
146
+ # check. Use this if you are going to use accelerated writes to the socket
147
+ # (like the `sendfile()` call) after writing the headers, or if you
148
+ # just need to figure out the size of the archive.
149
+ #
150
+ # @param num_bytes [Integer] how many bytes are going to be written bypassing the Streamer
151
+ # @return [Integer] position in the output stream / ZIP archive
152
+ def simulate_write(num_bytes)
153
+ @out.advance_position_by(num_bytes)
154
+ @out.tell
155
+ end
156
+
157
+ # Writes out the local header for an entry (file in the ZIP) that is using
158
+ # the deflated storage model (is compressed). Once this method is called,
159
+ # the `<<` method has to be called to write the actual contents of the body.
160
+ #
161
+ # Note that the deflated body that is going to be written into the output
162
+ # has to be _precompressed_ (pre-deflated) before writing it into the
163
+ # Streamer, because otherwise it is impossible to know it's size upfront.
164
+ #
165
+ # @param filename [String] the name of the file in the entry
166
+ # @param modification_time [Time] the modification time of the file in the archive
167
+ # @param compressed_size [Integer] the size of the compressed entry that
168
+ # is going to be written into the archive
169
+ # @param uncompressed_size [Integer] the size of the entry when uncompressed, in bytes
170
+ # @param crc32 [Integer] the CRC32 checksum of the entry when uncompressed
171
+ # @param use_data_descriptor [Boolean] whether the entry body will be followed by a data descriptor
172
+ # @param unix_permissions[Fixnum?] which UNIX permissions to set, normally the default should be used
173
+ # @return [Integer] the offset the output IO is at after writing the entry header
174
+ def add_deflated_entry(filename:, modification_time: Time.now.utc, compressed_size: 0, uncompressed_size: 0, crc32: 0, unix_permissions: nil, use_data_descriptor: false)
175
+ add_file_and_write_local_header(filename: filename,
176
+ modification_time: modification_time,
177
+ crc32: crc32,
178
+ storage_mode: DEFLATED,
179
+ compressed_size: compressed_size,
180
+ uncompressed_size: uncompressed_size,
181
+ unix_permissions: unix_permissions,
182
+ use_data_descriptor: use_data_descriptor)
183
+ @out.tell
184
+ end
185
+
186
+ # Writes out the local header for an entry (file in the ZIP) that is using
187
+ # the stored storage model (is stored as-is).
188
+ # Once this method is called, the `<<` method has to be called one or more
189
+ # times to write the actual contents of the body.
190
+ #
191
+ # @param filename [String] the name of the file in the entry
192
+ # @param modification_time [Time] the modification time of the file in the archive
193
+ # @param size [Integer] the size of the file when uncompressed, in bytes
194
+ # @param crc32 [Integer] the CRC32 checksum of the entry when uncompressed
195
+ # @param use_data_descriptor [Boolean] whether the entry body will be followed by a data descriptor. When in use
196
+ # @param unix_permissions[Fixnum?] which UNIX permissions to set, normally the default should be used
197
+ # @return [Integer] the offset the output IO is at after writing the entry header
198
+ def add_stored_entry(filename:, modification_time: Time.now.utc, size: 0, crc32: 0, unix_permissions: nil, use_data_descriptor: false)
199
+ add_file_and_write_local_header(filename: filename,
200
+ modification_time: modification_time,
201
+ crc32: crc32,
202
+ storage_mode: STORED,
203
+ compressed_size: size,
204
+ uncompressed_size: size,
205
+ unix_permissions: unix_permissions,
206
+ use_data_descriptor: use_data_descriptor)
207
+ @out.tell
208
+ end
209
+
210
+ # Adds an empty directory to the archive with a size of 0 and permissions of 755.
211
+ #
212
+ # @param dirname [String] the name of the directory in the archive
213
+ # @param modification_time [Time] the modification time of the directory in the archive
214
+ # @param unix_permissions[Fixnum?] which UNIX permissions to set, normally the default should be used
215
+ # @return [Integer] the offset the output IO is at after writing the entry header
216
+ def add_empty_directory(dirname:, modification_time: Time.now.utc, unix_permissions: nil)
217
+ add_file_and_write_local_header(filename: dirname.to_s + "/",
218
+ modification_time: modification_time,
219
+ crc32: 0,
220
+ storage_mode: STORED,
221
+ compressed_size: 0,
222
+ uncompressed_size: 0,
223
+ unix_permissions: unix_permissions,
224
+ use_data_descriptor: false)
225
+ @out.tell
226
+ end
227
+
228
+ # Opens the stream for a file stored in the archive, and yields a writer
229
+ # for that file to the block.
230
+ # The writer will buffer a small amount of data and see whether compression is
231
+ # effective for the data being output. If compression turns out to work well -
232
+ # for instance, if the output is mostly text - it is going to create a deflated
233
+ # file inside the zip. If the compression benefits are negligible, it will
234
+ # create a stored file inside the zip. It will delegate either to `write_deflated_file`
235
+ # or to `write_stored_file`.
236
+ #
237
+ # Using a block, the write will be terminated with a data descriptor outright.
238
+ #
239
+ # zip.write_file("foo.txt") do |sink|
240
+ # IO.copy_stream(source_file, sink)
241
+ # end
242
+ #
243
+ # If deferred writes are desired (for example - to integrate with an API that
244
+ # does not support blocks, or to work with non-blocking environments) the method
245
+ # has to be called without a block. In that case it returns the sink instead,
246
+ # permitting to write to it in a deferred fashion. When `close` is called on
247
+ # the sink, any remanining compression output will be flushed and the data
248
+ # descriptor is going to be written.
249
+ #
250
+ # Note that even though it does not have to happen within the same call stack,
251
+ # call sequencing still must be observed. It is therefore not possible to do
252
+ # this:
253
+ #
254
+ # writer_for_file1 = zip.write_file("somefile.jpg")
255
+ # writer_for_file2 = zip.write_file("another.tif")
256
+ # writer_for_file1 << data
257
+ # writer_for_file2 << data
258
+ #
259
+ # because it is likely to result in an invalid ZIP file structure later on.
260
+ # So using this facility in async scenarios is certainly possible, but care
261
+ # and attention is recommended.
262
+ #
263
+ # @param filename[String] the name of the file in the archive
264
+ # @param modification_time [Time] the modification time of the file in the archive
265
+ # @param unix_permissions[Fixnum?] which UNIX permissions to set, normally the default should be used
266
+ # @yield
267
+ # sink[#<<, #write]
268
+ # an object that the file contents must be written to.
269
+ # Do not call `#close` on it - Streamer will do it for you. Write in chunks to achieve proper streaming
270
+ # output (using `IO.copy_stream` is a good approach).
271
+ # @return [#<<, #write, #close] an object that the file contents must be written to, has to be closed manually
272
+ def write_file(filename, modification_time: Time.now.utc, unix_permissions: nil, &blk)
273
+ writable = ZipKit::Streamer::Heuristic.new(self, filename, modification_time: modification_time, unix_permissions: unix_permissions)
274
+ yield_or_return_writable(writable, &blk)
275
+ end
276
+
277
+ # Opens the stream for a stored file in the archive, and yields a writer
278
+ # for that file to the block.
279
+ # Once the write completes, a data descriptor will be written with the
280
+ # actual compressed/uncompressed sizes and the CRC32 checksum.
281
+ #
282
+ # Using a block, the write will be terminated with a data descriptor outright.
283
+ #
284
+ # zip.write_stored_file("foo.txt") do |sink|
285
+ # IO.copy_stream(source_file, sink)
286
+ # end
287
+ #
288
+ # If deferred writes are desired (for example - to integrate with an API that
289
+ # does not support blocks, or to work with non-blocking environments) the method
290
+ # has to be called without a block. In that case it returns the sink instead,
291
+ # permitting to write to it in a deferred fashion. When `close` is called on
292
+ # the sink, any remanining compression output will be flushed and the data
293
+ # descriptor is going to be written.
294
+ #
295
+ # Note that even though it does not have to happen within the same call stack,
296
+ # call sequencing still must be observed. It is therefore not possible to do
297
+ # this:
298
+ #
299
+ # writer_for_file1 = zip.write_stored_file("somefile.jpg")
300
+ # writer_for_file2 = zip.write_stored_file("another.tif")
301
+ # writer_for_file1 << data
302
+ # writer_for_file2 << data
303
+ #
304
+ # because it is likely to result in an invalid ZIP file structure later on.
305
+ # So using this facility in async scenarios is certainly possible, but care
306
+ # and attention is recommended.
307
+ #
308
+ # If an exception is raised inside the block that is passed to the method, a `rollback!` call
309
+ # will be performed automatically and the entry just written will be omitted from the ZIP
310
+ # central directory. This can be useful if you want to rescue the exception and reattempt
311
+ # adding the ZIP file. Note that you will need to call `write_deflated_file` again to start a
312
+ # new file - you can't keep writing to the one that failed.
313
+ #
314
+ # @param filename[String] the name of the file in the archive
315
+ # @param modification_time [Time] the modification time of the file in the archive
316
+ # @param unix_permissions[Fixnum?] which UNIX permissions to set, normally the default should be used
317
+ # @yield
318
+ # sink[#<<, #write]
319
+ # an object that the file contents must be written to.
320
+ # Do not call `#close` on it - Streamer will do it for you. Write in chunks to achieve proper streaming
321
+ # output (using `IO.copy_stream` is a good approach).
322
+ # @return [#<<, #write, #close] an object that the file contents must be written to, has to be closed manually
323
+ def write_stored_file(filename, modification_time: Time.now.utc, unix_permissions: nil, &blk)
324
+ add_stored_entry(filename: filename,
325
+ modification_time: modification_time,
326
+ use_data_descriptor: true,
327
+ crc32: 0,
328
+ size: 0,
329
+ unix_permissions: unix_permissions)
330
+
331
+ writable = Writable.new(self, StoredWriter.new(@out))
332
+ yield_or_return_writable(writable, &blk)
333
+ end
334
+
335
+ # Opens the stream for a deflated file in the archive, and yields a writer
336
+ # for that file to the block. Once the write completes, a data descriptor
337
+ # will be written with the actual compressed/uncompressed sizes and the
338
+ # CRC32 checksum.
339
+ #
340
+ # Using a block, the write will be terminated with a data descriptor outright.
341
+ #
342
+ # zip.write_stored_file("foo.txt") do |sink|
343
+ # IO.copy_stream(source_file, sink)
344
+ # end
345
+ #
346
+ # If deferred writes are desired (for example - to integrate with an API that
347
+ # does not support blocks, or to work with non-blocking environments) the method
348
+ # has to be called without a block. In that case it returns the sink instead,
349
+ # permitting to write to it in a deferred fashion. When `close` is called on
350
+ # the sink, any remanining compression output will be flushed and the data
351
+ # descriptor is going to be written.
352
+ #
353
+ # Note that even though it does not have to happen within the same call stack,
354
+ # call sequencing still must be observed. It is therefore not possible to do
355
+ # this:
356
+ #
357
+ # writer_for_file1 = zip.write_deflated_file("somefile.jpg")
358
+ # writer_for_file2 = zip.write_deflated_file("another.tif")
359
+ # writer_for_file1 << data
360
+ # writer_for_file2 << data
361
+ # writer_for_file1.close
362
+ # writer_for_file2.close
363
+ #
364
+ # because it is likely to result in an invalid ZIP file structure later on.
365
+ # So using this facility in async scenarios is certainly possible, but care
366
+ # and attention is recommended.
367
+ #
368
+ # If an exception is raised inside the block that is passed to the method, a `rollback!` call
369
+ # will be performed automatically and the entry just written will be omitted from the ZIP
370
+ # central directory. This can be useful if you want to rescue the exception and reattempt
371
+ # adding the ZIP file. Note that you will need to call `write_deflated_file` again to start a
372
+ # new file - you can't keep writing to the one that failed.
373
+ #
374
+ # @param filename[String] the name of the file in the archive
375
+ # @param modification_time [Time] the modification time of the file in the archive
376
+ # @param unix_permissions[Fixnum?] which UNIX permissions to set, normally the default should be used
377
+ # @yield
378
+ # sink[#<<, #write]
379
+ # an object that the file contents must be written to.
380
+ # Do not call `#close` on it - Streamer will do it for you. Write in chunks to achieve proper streaming
381
+ # output (using `IO.copy_stream` is a good approach).
382
+ # @return [#<<, #write, #close] an object that the file contents must be written to, has to be closed manually
383
+ def write_deflated_file(filename, modification_time: Time.now.utc, unix_permissions: nil, &blk)
384
+ add_deflated_entry(filename: filename,
385
+ modification_time: modification_time,
386
+ use_data_descriptor: true,
387
+ crc32: 0,
388
+ compressed_size: 0,
389
+ uncompressed_size: 0,
390
+ unix_permissions: unix_permissions)
391
+
392
+ writable = Writable.new(self, DeflatedWriter.new(@out))
393
+ yield_or_return_writable(writable, &blk)
394
+ end
395
+
396
+ # Closes the archive. Writes the central directory, and switches the writer into
397
+ # a state where it can no longer be written to.
398
+ #
399
+ # Once this method is called, the `Streamer` should be discarded (the ZIP archive is complete).
400
+ #
401
+ # @return [Integer] the offset the output IO is at after closing the archive
402
+ def close
403
+ # Make sure offsets are in order
404
+ verify_offsets!
405
+
406
+ # Record the central directory offset, so that it can be written into the EOCD record
407
+ cdir_starts_at = @out.tell
408
+
409
+ # Write out the central directory entries, one for each file
410
+ @files.each do |entry|
411
+ # Skip fillers which are standing in for broken/incomplete files
412
+ next if entry.filler?
413
+
414
+ @writer.write_central_directory_file_header(io: @out,
415
+ local_file_header_location: entry.local_header_offset,
416
+ gp_flags: entry.gp_flags,
417
+ storage_mode: entry.storage_mode,
418
+ compressed_size: entry.compressed_size,
419
+ uncompressed_size: entry.uncompressed_size,
420
+ mtime: entry.mtime,
421
+ crc32: entry.crc32,
422
+ filename: entry.filename,
423
+ unix_permissions: entry.unix_permissions)
424
+ end
425
+
426
+ # Record the central directory size, for the EOCDR
427
+ cdir_size = @out.tell - cdir_starts_at
428
+
429
+ # Write out the EOCDR
430
+ @writer.write_end_of_central_directory(io: @out,
431
+ start_of_central_directory_location: cdir_starts_at,
432
+ central_directory_size: cdir_size,
433
+ num_files_in_archive: @files.length)
434
+
435
+ # Clear the files so that GC will not have to trace all the way to here to deallocate them
436
+ @files.clear
437
+ @path_set.clear
438
+
439
+ # and return the final offset
440
+ @out.tell
441
+ end
442
+
443
+ # Sets up the ZipWriter with wrappers if necessary. The method is called once, when the Streamer
444
+ # gets instantiated - the Writer then gets reused. This method is primarily there so that you
445
+ # can override it.
446
+ #
447
+ # @return [ZipKit::ZipWriter] the writer to perform writes with
448
+ def create_writer
449
+ ZipKit::ZipWriter.new
450
+ end
451
+
452
+ # Updates the last entry written with the CRC32 checksum and compressed/uncompressed
453
+ # sizes. For stored entries, `compressed_size` and `uncompressed_size` are the same.
454
+ # After updating the entry will immediately write the data descriptor bytes
455
+ # to the output.
456
+ #
457
+ # @param crc32 [Integer] the CRC32 checksum of the entry when uncompressed
458
+ # @param compressed_size [Integer] the size of the compressed segment within the ZIP
459
+ # @param uncompressed_size [Integer] the size of the entry once uncompressed
460
+ # @return [Integer] the offset the output IO is at after writing the data descriptor
461
+ def update_last_entry_and_write_data_descriptor(crc32:, compressed_size:, uncompressed_size:)
462
+ # Save the information into the entry for when the time comes to write
463
+ # out the central directory
464
+ last_entry = @files.fetch(-1)
465
+ last_entry.crc32 = crc32
466
+ last_entry.compressed_size = compressed_size
467
+ last_entry.uncompressed_size = uncompressed_size
468
+
469
+ offset_before_data_descriptor = @out.tell
470
+ @writer.write_data_descriptor(io: @out,
471
+ crc32: last_entry.crc32,
472
+ compressed_size: last_entry.compressed_size,
473
+ uncompressed_size: last_entry.uncompressed_size)
474
+ last_entry.bytes_used_for_data_descriptor = @out.tell - offset_before_data_descriptor
475
+
476
+ @out.tell
477
+ end
478
+
479
+ # Removes the buffered local entry for the last file written. This can be used when rescuing from exceptions
480
+ # when you want to skip the file that failed writing into the ZIP from getting written out into the
481
+ # ZIP central directory. This is useful when, for example, you encounter errors retrieving the file
482
+ # that you want to place inside the ZIP from a remote storage location and some network exception
483
+ # gets raised. `write_deflated_file` and `write_stored_file` will rollback for you automatically.
484
+ # Of course it is not possible to remove the failed entry from the ZIP file entirely, as the data
485
+ # is likely already on the wire. However, excluding the entry from the central directory of the ZIP
486
+ # file will allow better-behaved ZIP unarchivers to extract the entries which did store correctly,
487
+ # provided they read the ZIP from the central directory and not straight-ahead.
488
+ #
489
+ # @example
490
+ # zip.add_stored_entry(filename: "data.bin", size: 4.megabytes, crc32: the_crc)
491
+ # while chunk = remote.read(65*2048)
492
+ # zip << chunk
493
+ # rescue Timeout::Error
494
+ # zip.rollback!
495
+ # # and proceed to the next file
496
+ # end
497
+ # @return [Integer] position in the output stream / ZIP archive
498
+ def rollback!
499
+ removed_entry = @files.pop
500
+ return @out.tell unless removed_entry
501
+
502
+ @path_set.clear
503
+ @files.each do |e|
504
+ @path_set.add_directory_or_file_path(e.filename) unless e.filler?
505
+ end
506
+ @files << Filler.new(@out.tell - removed_entry.local_header_offset)
507
+
508
+ @out.tell
509
+ end
510
+
511
+ private
512
+
513
+ def yield_or_return_writable(writable, &block_to_pass_writable_to)
514
+ if block_to_pass_writable_to
515
+ begin
516
+ yield(writable)
517
+ writable.close
518
+ rescue
519
+ writable.close
520
+ rollback!
521
+ raise
522
+ end
523
+ end
524
+
525
+ writable
526
+ end
527
+
528
+ def verify_offsets!
529
+ # We need to check whether the offsets noted for the entries actually make sense
530
+ computed_offset = @files.map(&:total_bytes_used).inject(0, &:+)
531
+ actual_offset = @out.tell
532
+ if computed_offset != actual_offset
533
+ message = <<~EMS
534
+ The offset of the Streamer output IO is out of sync with the expected value. All entries written so far,
535
+ including their compressed bodies, local headers and data descriptors, add up to a certain offset,
536
+ but this offset does not match the actual offset of the IO.
537
+
538
+ Entries add up to #{computed_offset} bytes and the IO is at #{actual_offset} bytes.
539
+
540
+ This can happen if you write local headers for an entry, write the "body" of the entry directly to the IO
541
+ object which is your destination, but do not adjust the offset known to the Streamer object. To adjust
542
+ the offfset you need to call `Streamer#simulate_write(body_size)` after outputting the entry. Otherwise
543
+ the local header offsets of the entries you write are going to be incorrect and some ZIP applications
544
+ are going to have problems opening your archive.
545
+ EMS
546
+ raise OffsetOutOfSync, message
547
+ end
548
+ end
549
+
550
+ def add_file_and_write_local_header(
551
+ filename:,
552
+ modification_time:,
553
+ crc32:,
554
+ storage_mode:,
555
+ compressed_size:,
556
+ uncompressed_size:,
557
+ use_data_descriptor:,
558
+ unix_permissions:
559
+ )
560
+
561
+ # Clean backslashes
562
+ filename = remove_backslash(filename)
563
+ raise UnknownMode, "Unknown compression mode #{storage_mode}" unless [STORED, DEFLATED].include?(storage_mode)
564
+ raise Overflow, "Filename is too long" if filename.bytesize > 0xFFFF
565
+
566
+ # If we need to massage filenames to enforce uniqueness,
567
+ # do so before we check for file/directory conflicts
568
+ filename = ZipKit::UniquifyFilename.call(filename, @path_set) if @dedupe_filenames
569
+
570
+ # Make sure there is no file/directory clobbering (conflicts), or - if deduping is disabled -
571
+ # no duplicate filenames/paths
572
+ if filename.end_with?("/")
573
+ @path_set.add_directory_path(filename)
574
+ else
575
+ @path_set.add_file_path(filename)
576
+ end
577
+
578
+ if use_data_descriptor
579
+ crc32 = 0
580
+ compressed_size = 0
581
+ uncompressed_size = 0
582
+ end
583
+
584
+ local_header_starts_at = @out.tell
585
+
586
+ e = Entry.new(filename,
587
+ crc32,
588
+ compressed_size,
589
+ uncompressed_size,
590
+ storage_mode,
591
+ modification_time,
592
+ use_data_descriptor,
593
+ _local_file_header_offset = local_header_starts_at,
594
+ _bytes_used_for_local_header = 0,
595
+ _bytes_used_for_data_descriptor = 0,
596
+ unix_permissions)
597
+
598
+ @writer.write_local_file_header(io: @out,
599
+ gp_flags: e.gp_flags,
600
+ crc32: e.crc32,
601
+ compressed_size: e.compressed_size,
602
+ uncompressed_size: e.uncompressed_size,
603
+ mtime: e.mtime,
604
+ filename: e.filename,
605
+ storage_mode: e.storage_mode)
606
+ e.bytes_used_for_local_header = @out.tell - e.local_header_offset
607
+
608
+ @files << e
609
+ end
610
+
611
+ def remove_backslash(filename)
612
+ filename.tr("\\", "_")
613
+ end
614
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
module ZipKit::UniquifyFilename
  # Makes a given filename unique by inserting a "(n)" suffix just before
  # the filename extension, so "file.txt" gets transformed into
  # "file (1).txt". The transformation is applied repeatedly for as long as
  # the generated filename is present in the `while_included_in` object.
  #
  # @param path[String] the path to make unique
  # @param while_included_in[#include?] an object that stores the list of already used paths
  # @return [String] the path as is, or with the suffix required to make it unique
  def self.call(path, while_included_in)
    return path unless while_included_in.include?(path)

    # The "(n)" suffix goes at the end of the basename, before the extension,
    # and only gets added when a duplicate actually exists
    numbered_suffix_at_end = /\((\d+)\)$/
    segments = path.split(".")
    extension = if segments.last =~ /gz|zip/ && segments.size > 2
      # Keep compound extensions such as ".tar.gz" together
      segments.pop(2)
    elsif segments.size > 1
      segments.pop
    end
    basename = segments.pop

    counter = 1
    loop do
      # Either bump an existing "(n)" suffix or append a fresh one
      basename = if basename&.match?(numbered_suffix_at_end)
        basename.sub(numbered_suffix_at_end, "(#{counter})")
      else
        "#{basename} (#{counter})"
      end
      candidate = (segments + [basename, extension]).compact.join(".")
      return candidate unless while_included_in.include?(candidate)
      counter += 1
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ZipKit
4
+ VERSION = "6.0.0"
5
+ end