just-keep-zipping 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/README.md +56 -1
- data/lib/just-keep-zipping.rb +37 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 98bb5c411d4764cdbaa2b15a6d938443a9ee4b8ce05863338d0e234222c135a3
|
4
|
+
data.tar.gz: 3fe9a60cde27e9c204c5509dc994bc165153d478bb51ade721cb76433f8cc191
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 584cb29baaa3ffff8d7adb59703fbf853f13a14757ebd654004b626a2e705f0edd5df112ceb67c178d3fe5717d3efaa6c57e1d2bbd93e9ae86f00f392042e103
|
7
|
+
data.tar.gz: 81f45ea7a320dd08c1627103cd6730c296995084c42510c3de62acace301a51b07e967cfafdfff8fd5f9af16bbe7e11084cd77c0ac9d3328504f854d65a7629e
|
data/README.md
CHANGED
@@ -1,6 +1,20 @@
|
|
1
1
|
# just-keep-zipping
|
2
2
|
|
3
|
-
Produce a
|
3
|
+
Produce a ZIP file from many source files, in a streaming or distributed fashion.
|
4
|
+
|
5
|
+
The ZIP format is well suited for quick updates, allowing appends of new data without needing to extract and compress
|
6
|
+
the entire archive. This is possible because the ZIP header is written at the end of the file, and a new header can be
|
7
|
+
added after new data is added. However, the file must be available locally for ZIP tools to operate effectively. If the
|
8
|
+
file is remote, then the entire archive must be downloaded, updated, then uploaded--which is a heavyweight method of
|
9
|
+
adding small files to a large archive.
|
10
|
+
|
11
|
+
Memory, disk space, and CPU time are all limits when running in a cloud environment, and it does not always scale to
|
12
|
+
require the production of an entire ZIP archive to occur within a single processing unit.
|
13
|
+
|
14
|
+
Just Keep Zipping allows a large ZIP archive to be produced in parts, on one machine or many, and can be used with
|
15
|
+
Amazon S3 or Google Cloud Storage.
|
16
|
+
|
17
|
+
The instance is Marshallable, and the `progress_data` used between steps can be stored in Redis or another object store.
|
4
18
|
|
5
19
|
## Usage
|
6
20
|
|
@@ -32,3 +46,44 @@ Assemble the zip
|
|
32
46
|
|
33
47
|
data = incomplete_data + ending_data
|
34
48
|
|
49
|
+
## Amazon S3
|
50
|
+
|
51
|
+
https://docs.aws.amazon.com/sdkforruby/api/Aws/S3/Object.html#initiate_multipart_upload-instance_method
|
52
|
+
|
53
|
+
Each interval, e.g. 50-100 files, save the current data into S3. When finished, use a Multipart Upload with
|
54
|
+
`copy_part` to combine the parts into a whole.
|
55
|
+
|
56
|
+
zip = JustKeepZipping.new
|
57
|
+
zip.add 'file1.txt', 'Data to be zipped'
|
58
|
+
|
59
|
+
bucket.object('part_one').put zip.read
|
60
|
+
|
61
|
+
zip.add 'file2.txt', 'More data to be zipped'
|
62
|
+
zip.close
|
63
|
+
|
64
|
+
bucket.object('part_two').put zip.read
|
65
|
+
|
66
|
+
upload = bucket.object('archive.zip').initiate_multipart_upload
|
67
|
+
upload.part(1).copy_from copy_source: "bucket/part_one"
|
68
|
+
upload.part(2).copy_from copy_source: "bucket/part_two"
|
69
|
+
upload.complete compute_parts: true
|
70
|
+
|
71
|
+
## Google Cloud Storage
|
72
|
+
|
73
|
+
http://googleapis.github.io/google-cloud-ruby/docs/google-cloud-storage/latest/Google/Cloud/Storage/Bucket.html#compose-instance_method
|
74
|
+
|
75
|
+
Each interval, e.g. 50-100 files, save the current data into GCS. When finished, use the compose method to join the parts
|
76
|
+
into a whole (for more than 32 parts, iteratively compose the destination file as an input of the next group).
|
77
|
+
|
78
|
+
zip = JustKeepZipping.new
|
79
|
+
zip.add 'file1.txt', 'Data to be zipped'
|
80
|
+
|
81
|
+
bucket.create_file StringIO.new(zip.read), 'part_one'
|
82
|
+
|
83
|
+
zip.add 'file2.txt', 'More data to be zipped'
|
84
|
+
zip.close
|
85
|
+
|
86
|
+
bucket.create_file StringIO.new(zip.read), 'part_two'
|
87
|
+
|
88
|
+
bucket.compose ['part_one', 'part_two'], 'archive.zip'
|
89
|
+
|
data/lib/just-keep-zipping.rb
CHANGED
@@ -1,17 +1,45 @@
|
|
1
1
|
require 'zip'
|
2
2
|
|
3
|
+
# Allows the creation of large ZIP files in a streaming fashion.
|
4
|
+
#
|
5
|
+
# Example:
|
6
|
+
# zip = JustKeepZipping.new
|
7
|
+
# zip.add 'file1.txt', 'Data to be zipped'
|
8
|
+
# data1 = zip.read
|
9
|
+
# progress = Marshal.dump zip # into an object store?
|
10
|
+
#
|
11
|
+
# zip = Marshal.load progress # load from object store?
|
12
|
+
# zip.add 'file2.txt', 'More data to be zipped'
|
13
|
+
# zip.close
|
14
|
+
# data2 = zip.read
|
15
|
+
#
|
16
|
+
# complete_archive = data1 + data2
|
17
|
+
#
|
3
18
|
class JustKeepZipping
|
4
19
|
attr_reader :entries
|
5
20
|
|
21
|
+
# Use the constructor for the initial object creation.
|
22
|
+
# Use Marshal.dump and Marshal.load (e.g. with Redis) to transfer this instance between
|
23
|
+
# compute units (e.g. Sidekiq jobs).
|
24
|
+
#
|
6
25
|
def initialize
|
7
26
|
@entries = []
|
8
27
|
@data = ''
|
9
28
|
end
|
10
29
|
|
30
|
+
# The current data size. Use this as a stopping or checkpoint condition, to
|
31
|
+
# keep memory from growing too large.
|
32
|
+
#
|
11
33
|
def current_size
|
12
34
|
@data.size
|
13
35
|
end
|
14
36
|
|
37
|
+
# Add a file to the archive.
|
38
|
+
#
|
39
|
+
# Params:
|
40
|
+
# +filename+:: a string representing the name of the file as it should appear in the archive
|
41
|
+
# +body+:: a string or IO object that represents the contents of the file
|
42
|
+
#
|
15
43
|
def add(filename, body)
|
16
44
|
io = Zip::OutputStream.write_buffer do |zip|
|
17
45
|
zip.put_next_entry filename
|
@@ -31,6 +59,10 @@ class JustKeepZipping
|
|
31
59
|
nil
|
32
60
|
end
|
33
61
|
|
62
|
+
# Finalizes the archive by adding the trailing ZIP header. A final read must be called to get the data.
|
63
|
+
#
|
64
|
+
# No further files should be added after calling close.
|
65
|
+
#
|
34
66
|
def close
|
35
67
|
contents_size = 0
|
36
68
|
@entries.each do |e|
|
@@ -51,6 +83,11 @@ class JustKeepZipping
|
|
51
83
|
contents_size
|
52
84
|
end
|
53
85
|
|
86
|
+
# Get the current ZIP data, to save in an object store like S3 or GCS.
|
87
|
+
#
|
88
|
+
# Do this before persisting this instance with Marshal.dump, to avoid
|
89
|
+
# placing too much progress data into a temporary object store like Redis.
|
90
|
+
#
|
54
91
|
def read
|
55
92
|
data = @data
|
56
93
|
@data = ''
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: just-keep-zipping
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Calhoun
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-11-
|
11
|
+
date: 2018-11-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rubyzip
|
@@ -54,7 +54,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
54
54
|
version: '0'
|
55
55
|
requirements: []
|
56
56
|
rubyforge_project:
|
57
|
-
rubygems_version: 2.6
|
57
|
+
rubygems_version: 2.7.6
|
58
58
|
signing_key:
|
59
59
|
specification_version: 4
|
60
60
|
summary: Just Keep Zipping
|