just-keep-zipping 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/README.md +56 -1
- data/lib/just-keep-zipping.rb +37 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 98bb5c411d4764cdbaa2b15a6d938443a9ee4b8ce05863338d0e234222c135a3
|
4
|
+
data.tar.gz: 3fe9a60cde27e9c204c5509dc994bc165153d478bb51ade721cb76433f8cc191
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 584cb29baaa3ffff8d7adb59703fbf853f13a14757ebd654004b626a2e705f0edd5df112ceb67c178d3fe5717d3efaa6c57e1d2bbd93e9ae86f00f392042e103
|
7
|
+
data.tar.gz: 81f45ea7a320dd08c1627103cd6730c296995084c42510c3de62acace301a51b07e967cfafdfff8fd5f9af16bbe7e11084cd77c0ac9d3328504f854d65a7629e
|
data/README.md
CHANGED
@@ -1,6 +1,20 @@
|
|
1
1
|
# just-keep-zipping
|
2
2
|
|
3
|
-
Produce a
|
3
|
+
Produce a ZIP file from many source files, in a streaming or distributed fashion.
|
4
|
+
|
5
|
+
The ZIP format is well suited for quick updates, allowing appends of new data without needing to extract and compress
|
6
|
+
the entire archive. This is possible because the ZIP header is written at the end of the file, and a new header can be
|
7
|
+
added after new data is added. However, the file must be available locally for ZIP tools to operate effectively. If the
|
8
|
+
file is remote, then the entire archive must be downloaded, updated, then uploaded--which is a heavyweight method of
|
9
|
+
adding small files to a large archive.
|
10
|
+
|
11
|
+
Memory, disk space, and CPU time are all limited when running in a cloud environment, and requiring an entire ZIP
|
12
|
+
archive to be produced within a single processing unit does not always scale.
|
13
|
+
|
14
|
+
Just Keep Zipping allows a large ZIP archive to be produced in parts, on one machine or many, and can be used with
|
15
|
+
Amazon S3 or Google Cloud Storage.
|
16
|
+
|
17
|
+
The instance is Marshallable, and the `progress_data` used between steps can be stored in Redis or another object store.
|
4
18
|
|
5
19
|
## Usage
|
6
20
|
|
@@ -32,3 +46,44 @@ Assemble the zip
|
|
32
46
|
|
33
47
|
data = incomplete_data + ending_data
|
34
48
|
|
49
|
+
## Amazon S3
|
50
|
+
|
51
|
+
https://docs.aws.amazon.com/sdkforruby/api/Aws/S3/Object.html#initiate_multipart_upload-instance_method
|
52
|
+
|
53
|
+
Each interval, e.g. 50-100 files, save the current data into S3. When finished, use a Multipart Upload with
|
54
|
+
`copy_part` to combine the parts into a whole.
|
55
|
+
|
56
|
+
zip = JustKeepZipping.new
|
57
|
+
zip.add 'file1.txt', 'Data to be zipped'
|
58
|
+
|
59
|
+
bucket.object('part_one').put zip.read
|
60
|
+
|
61
|
+
zip.add 'file2.txt', 'More data to be zipped'
|
62
|
+
zip.close
|
63
|
+
|
64
|
+
bucket.object('part_two').put zip.read
|
65
|
+
|
66
|
+
upload = bucket.object('archive.zip').initiate_multipart_upload
|
67
|
+
upload.part(1).copy_from copy_source: "bucket/part_one"
|
68
|
+
upload.part(2).copy_from copy_source: "bucket/part_two"
|
69
|
+
upload.complete compute_parts: true
|
70
|
+
|
71
|
+
## Google Cloud Storage
|
72
|
+
|
73
|
+
http://googleapis.github.io/google-cloud-ruby/docs/google-cloud-storage/latest/Google/Cloud/Storage/Bucket.html#compose-instance_method
|
74
|
+
|
75
|
+
Each interval, e.g. 50-100 files, save the current data into GCS. When finished, use the compose method to join the parts
|
76
|
+
into a whole (for more than 32 parts, iteratively compose the destination file as an input of the next group).
|
77
|
+
|
78
|
+
zip = JustKeepZipping.new
|
79
|
+
zip.add 'file1.txt', 'Data to be zipped'
|
80
|
+
|
81
|
+
bucket.create_file StringIO.new(zip.read), 'part_one'
|
82
|
+
|
83
|
+
zip.add 'file2.txt', 'More data to be zipped'
|
84
|
+
zip.close
|
85
|
+
|
86
|
+
bucket.create_file StringIO.new(zip.read), 'part_two'
|
87
|
+
|
88
|
+
bucket.compose ['part_one', 'part_two'], 'archive.zip'
|
89
|
+
|
data/lib/just-keep-zipping.rb
CHANGED
@@ -1,17 +1,45 @@
|
|
1
1
|
require 'zip'
|
2
2
|
|
3
|
+
# Allows the creation of large ZIP files in a streaming fashion.
|
4
|
+
#
|
5
|
+
# Example:
|
6
|
+
# zip = JustKeepZipping.new
|
7
|
+
# zip.add 'file1.txt', 'Data to be zipped'
|
8
|
+
# data1 = zip.read
|
9
|
+
# progress = Marshal.dump zip # into an object store?
|
10
|
+
#
|
11
|
+
# zip = Marshal.load progress # load from object store?
|
12
|
+
# zip.add 'file2.txt', 'More data to be zipped'
|
13
|
+
# zip.close
|
14
|
+
# data2 = zip.read
|
15
|
+
#
|
16
|
+
# complete_archive = data1 + data2
|
17
|
+
#
|
3
18
|
class JustKeepZipping
|
4
19
|
attr_reader :entries
|
5
20
|
|
21
|
+
# Use the constructor for the initial object creation.
|
22
|
+
# Use Marshal.dump and Marshal.load (e.g. with Redis) to transfer this instance between
|
23
|
+
# compute units (e.g. Sidekiq jobs).
|
24
|
+
#
|
6
25
|
def initialize
|
7
26
|
@entries = []
|
8
27
|
@data = ''
|
9
28
|
end
|
10
29
|
|
30
|
+
# The current data size. Use this as a stopping or checkpoint condition, to
|
31
|
+
# keep memory from growing too large.
|
32
|
+
#
|
11
33
|
def current_size
|
12
34
|
@data.size
|
13
35
|
end
|
14
36
|
|
37
|
+
# Add a file to the archive.
|
38
|
+
#
|
39
|
+
# Params:
|
40
|
+
# +filename+:: a string representing the name of the file as it should appear in the archive
|
41
|
+
# +body+:: a string or IO object that represents the contents of the file
|
42
|
+
#
|
15
43
|
def add(filename, body)
|
16
44
|
io = Zip::OutputStream.write_buffer do |zip|
|
17
45
|
zip.put_next_entry filename
|
@@ -31,6 +59,10 @@ class JustKeepZipping
|
|
31
59
|
nil
|
32
60
|
end
|
33
61
|
|
62
|
+
# Finalizes the archive by adding the trailing ZIP header. A final read must be called to get the data.
|
63
|
+
#
|
64
|
+
# No further files should be added after calling close.
|
65
|
+
#
|
34
66
|
def close
|
35
67
|
contents_size = 0
|
36
68
|
@entries.each do |e|
|
@@ -51,6 +83,11 @@ class JustKeepZipping
|
|
51
83
|
contents_size
|
52
84
|
end
|
53
85
|
|
86
|
+
# Get the current ZIP data, to save in an object store like S3 or GCS.
|
87
|
+
#
|
88
|
+
# Do this before persisting this instance with Marshal.dump, to avoid
|
89
|
+
# placing too much progress data into a temporary object store like Redis.
|
90
|
+
#
|
54
91
|
def read
|
55
92
|
data = @data
|
56
93
|
@data = ''
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: just-keep-zipping
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Calhoun
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-11-
|
11
|
+
date: 2018-11-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rubyzip
|
@@ -54,7 +54,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
54
54
|
version: '0'
|
55
55
|
requirements: []
|
56
56
|
rubyforge_project:
|
57
|
-
rubygems_version: 2.6
|
57
|
+
rubygems_version: 2.7.6
|
58
58
|
signing_key:
|
59
59
|
specification_version: 4
|
60
60
|
summary: Just Keep Zipping
|