s3snapshot 0.0.10 → 0.0.11
- data/README.rdoc +4 -1
- data/lib/s3snapshot/cli.rb +3 -2
- data/lib/s3snapshot/dir_upload.rb +224 -14
- data/lib/s3snapshot/version.rb +1 -1
- data/s3snapshot.gemspec +2 -2
- metadata +11 -10
data/README.rdoc
CHANGED
@@ -1,4 +1,7 @@
 A simple backup utility to create time based snapshots similar to time machine on s3.
 All options are command line based so they can easily be scripted with cron.
 Supports multiple daily backups, as well as rolling daily and weekly.
-For daily, it keeps the newest complete backup for the day, for weekly it keeps the last day in the week.
+For daily, it keeps the newest complete backup for the day, for weekly it keeps the last day in the week.
+
+Note that this should only be run on machines with the *nix split command available. May work on windows with cygwin, but is untested.
+If a file is over 5GB, the only way to upload it to S3 is to use multipart uploads. To split the file into parts, s3snapshot relies on the split command. The file parts are uploaded sequentially, so there is no speed boost in using multipart upload. s3snapshot is developed on MRI 1.8.7 so multi thread upload does not work.
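The README note above describes the gem's split-then-multipart strategy. Below is a minimal standalone sketch of that flow using the same fog requests the new dir_upload.rb code calls (initiate_multipart_upload, upload_part, complete_multipart_upload); the bucket, key, paths and environment variable names are illustrative assumptions, not values taken from the gem.

  # Sketch only: split a large file and upload the pieces sequentially as one
  # S3 multipart upload. Bucket, key and paths are made up for illustration.
  require 'fog'
  require 'fileutils'
  require 'digest/md5'
  require 'base64'

  storage = Fog::Storage.new(:provider => 'AWS',
                             :aws_access_key_id     => ENV['AWS_ID'],
                             :aws_secret_access_key => ENV['AWS_KEY'])

  bucket  = 'my-backup-bucket'
  key     = 'node1/2012-07-09/huge.tar'
  source  = '/var/backups/huge.tar'
  workdir = '/tmp/s3snapshot-sketch/'

  FileUtils.mkdir_p(workdir)
  system("split -b 1G -a 3 -d #{source} #{workdir}") or abort "split failed"

  upload_id = storage.initiate_multipart_upload(bucket, key, 'x-amz-acl' => 'private').body['UploadId']

  etags = []
  Dir.glob("#{workdir}*").sort.each_with_index do |part, idx|
    md5 = Base64.encode64(Digest::MD5.file(part).digest).chomp
    File.open(part) do |io|
      # Parts are sent one after another, so total time matches a plain upload.
      response = storage.upload_part(bucket, key, upload_id, idx + 1, io, 'Content-MD5' => md5)
      etags << response.headers['ETag']
    end
  end

  storage.complete_multipart_upload(bucket, key, upload_id, etags)

Because each upload_part call is made in sequence, throughput is the same as a single PUT; the gain is only that files above the 5GB single-PUT limit become uploadable at all.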
data/lib/s3snapshot/cli.rb
CHANGED
@@ -14,14 +14,15 @@ module S3snapshot
     method_option :bucket, :aliases => "-b", :desc => "The aws bucket to use", :type => :string, :required => true
     method_option :directory, :aliases => "-d", :desc => "The directory to upload", :type => :string, :required => true
     method_option :prefix, :aliases => "-p", :desc => "A prefix to prepend to the path before the timestamp. Useful in cluster to specifiy a node name, or a node+directory scheme. Prefix strategies can be mixed in a bucket, they must just be unique." , :type => :string, :required => true
-
+    method_option :tmpdir, :aliases => "-t", :desc => "The tmp directory to use when creating file splits. Should not have a trailing /", :type => :string, :required => false
+
     ##
     #Uploads the directory to the s3 bucket with a prefix
     def backup
       directory = options[:directory]
       puts "You are uploading directory #{directory}"

-      s3upload = DirUpload.new(options[:awsid], options[:awskey],
+      s3upload = DirUpload.new(options[:awsid], options[:awskey], options[:bucket], options[:prefix], directory, options[:tmpdir] )
       s3upload.upload
     end

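A hedged usage sketch of the change above: the new -t/--tmpdir option is handed straight through to DirUpload as its new optional sixth argument, and an invocation would look roughly like s3snapshot backup --awsid ... --awskey ... -b my-backup-bucket -d /var/backups -p node1 -t /mnt/scratch/s3snapshot. The bucket, paths, prefix and the --awsid/--awskey flag names here are assumptions for illustration.

  # Hedged example (all values invented): how the backup task's options map onto
  # the updated DirUpload constructor shown in the diff below.
  require 's3snapshot/dir_upload'

  s3upload = S3snapshot::DirUpload.new(
    'AKIA-EXAMPLE',             # --awsid
    'secret-key',               # --awskey
    'my-backup-bucket',         # -b / --bucket
    'node1',                    # -p / --prefix
    '/var/backups',             # -d / --directory
    '/mnt/scratch/s3snapshot'   # -t / --tmpdir (optional; omit to fall back to the system tmp dir)
  )
  s3upload.upload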
data/lib/s3snapshot/dir_upload.rb
CHANGED
@@ -3,7 +3,10 @@ require 's3snapshot/sync_op'
 require 's3snapshot/time_factory'
 require 'time'
 require 'fileutils'
-
+require 'digest/md5'
+require 'base64'
+require 'date'
+require 'tmpdir'

 module S3snapshot
   class DirUpload < SyncOp
@@ -11,22 +14,37 @@ module S3snapshot
     @tmpdir = nil
     @local_dir = nil
     @prefix = nil
+
+    MAX_RETRY_COUNT = 5;

-    def initialize(aws_id, aws_key, bucket_name, prefix, local_dir )
+    def initialize(aws_id, aws_key, bucket_name, prefix, local_dir, tmp_dir = nil )
       super(aws_id, aws_key, bucket_name)
       @local_dir = local_dir
       @prefix = prefix
-
+
+      @tmpdir = tmp_dir ? tmp_dir : Dir.tmp
+      if File.exists? @tmpdir
+        puts "Temp directory #{@tmpdir} exists."
+      else
+        begin
+          FileUtils.mkdir_p @tmpdir
+          FileUtils.chmod 0777, @tmpdir
+        rescue Exception => e
+          puts "Unable to create directory #{@tmpdir} due to #{e.message}"
+          puts e.backtrace.join("\n")
+          exit 5
+        end
+      end
     end

     def upload
-
+
       start_time = TimeFactory.utc_time

       prefix_path = timepath(@prefix, start_time)

       files = get_local_files
-
+
       files.each do |file|
         file_name = file[@local_dir.length..-1];

@@ -36,16 +54,23 @@ module S3snapshot
         end

         path = "#{prefix_path}/#{file_name}"
-
-
-
-
-
+
+        split_threshhold = 1024*1024*1024*4
+        fsize = File.size file
+        puts "uploading '#{file}' [#{fsize} bytes] to '#{@bucket_name}/#{path}'"
+        # check if file is greater than 5G
+        if fsize > split_threshhold
+          upload_file_as_multipart(file, path)
+        else
+          # normal upload
+          File.open(file) do |fb|
+            bucket.files.create(:key =>path, :body => fb)
+          end
+
         end
-
+
       end

-
       puts "Writing complete marker"

       #Upload the complete marker
@@ -70,7 +95,192 @@ module S3snapshot

       return files
     end
-
-
+
+    ##
+    # Produces a hash of file paths to MD5 value.
+    ##
+    def get_file_md5s(workdir)
+      parts = {}
+
+      # Get the Base64 encoded MD5 of each file
+      Dir.entries(workdir).each do |file|
+        next if file =~ /\.\./
+        next if file =~ /\./
+
+        full_path = "#{workdir}#{file}"
+
+        md5 = Base64.encode64(Digest::MD5.file("#{full_path}").digest).chomp!
+
+        parts[full_path] = md5
+      end
+
+      return parts
+    end
+
+    ##
+    # Uploads a part of a file.
+    ##
+    def upload_file_part(file_path, file_md5, part_number, object_key, upload_id, tags)
+
+      # Reload to stop the connection timing out, useful when uploading large chunks
+      aws.reload
+
+      # Could use paralllel gem or similar here so that we get parallel
+      # upload when running on a ruby implementation that allows multithreading.
+      puts "Starting on File: #{file_path} with MD5: #{file_md5} - this is part #{part_number}"
+
+      part_upload = nil
+      # Pass fog a file object to upload
+      File.open(file_path) do |file_part|
+
+        puts "Uploading #{file_part.path} [#{part_number}] to #{@bucket_name} as #{object_key} with upload id #{upload_id}"
+
+        part_upload = aws.upload_part(@bucket_name, object_key, upload_id, part_number, file_part, { 'Content-MD5' => file_md5 } )
+
+        # You need to make sure the tags array has the tags in the correct order, else the upload won't complete
+        index = part_number-1
+        tags[index] = part_upload.headers["ETag"]
+
+        puts "Response: #{part_upload.inspect}" # This will return when the part has uploaded
+      end
+
+      return part_upload
+    end
+
+    ##
+    # Removes the workdir and all files in it.
+    ##
+    def clean_up_files(workdir)
+      # clean up the tmp files
+      puts "Cleaning up file split parts"
+      FileUtils.remove_dir workdir
+    end
+
+    ##
+    # Uploads a file in multiple parts.
+    #
+    # The disk needs to have the a minimum of the file size free.
+    # Expects object_to_upload to NOT have a leading /
+    ##
+    def upload_file_as_multipart(object_to_upload, object_key)
+
+      # method based on gist from https://gist.github.com/908875
+      # also https://gist.github.com/833374
+
+      puts "Uploading file as multipart: file=#{object_to_upload}"
+
+      # tmp dir to place the split file into
+      cur_date = DateTime.now.strftime('%F-%T')
+      workdir = "#{@tmpdir}/s3snapshot-splits/#{cur_date}/#{File.basename(object_to_upload)}/"
+      FileUtils.mkdir_p(workdir)
+
+      # Assumes we are running on unix with the split command available.
+      # Split the file into chunks, max size of 1G, the chunks are 000, 001, etc
+      # Smaller chunks increase the likelyhood of upload success.
+      split_cmd = "split -b 1G -a 3 --verbose -d #{object_to_upload} #{workdir}"
+      puts "split command: #{split_cmd}"
+      split_result = system split_cmd
+
+      if split_result
+        puts "Split file into parts in #{workdir}"
+      else
+        puts "Split FAILED! exit code = #{$?}"
+        # dont clean up, as user may need to see files to work out how to fix the split error.
+        exit 1 # exit with non 0 error code as split failed.
+      end
+
+      # Map of the file_part => md5
+      parts = get_file_md5s workdir
+
+      puts "File #{object_to_upload} has been split into #{parts.size} parts."
+      ### Now ready to perform the actual upload
+
+      # Initiate the upload and get the upload id
+      # this keeps failing, may have to retry.
+
+      multi_part_up = nil
+
+      # retry up to 5 times before failing
+      (1..MAX_RETRY_COUNT).each do |retry_count|
+        begin
+          aws.reload
+          multi_part_up = aws.initiate_multipart_upload(@bucket_name, object_key, { 'x-amz-acl' => 'private' } )
+          # initiation successful, so break.
+          break
+        rescue Exception => e
+
+          puts "multipart upload initiation FAILED: #{e.message}"
+
+          if retry_count <= MAX_RETRY_COUNT
+            puts "Retrying multipart upload initiation for the #{retry_count} time."
+          else
+            puts e.backtrace.join("\n")
+            # fail this, we cant initiated the upload
+            puts "Failed to initiate multipart upload after #{MAX_RETRY_COUNT} retries."
+            exit 2
+          end
+        end
+      end
+
+      upload_id = multi_part_up.body["UploadId"]
+
+      tags = []
+
+      # sort based on the sufix provided by the split command., eg 001
+      sorted_parts = parts.sort_by do |d|
+        d[0].split('/').last.to_i
+      end
+
+      sorted_parts.each_with_index do |entry, idx|
+        # Part numbers need to start at 1
+        part_number = idx + 1
+
+        # retry up to 5 times before failing
+        (1..MAX_RETRY_COUNT).each do |retry_count|
+          begin
+            # file_path, file_md5, part_number, object_key, upload_id, idx, tags
+            upload_file_part(entry[0], entry[1], part_number, object_key, upload_id, tags)
+            # upload of part successful, so break.
+            break
+          rescue Exception => e
+            puts "UPLOAD FILE PART FAILED: #{e.message}"
+
+            if retry_count <= MAX_RETRY_COUNT
+              puts "Retrying file part upload for #{entry[0]}"
+            else
+              puts e.backtrace.join("\n")
+              # failed to upload file part
+              puts "Failed to upload file part #{entry[0]} after #{MAX_RETRY_COUNT} retries."
+              exit 3
+            end
+          end
+        end
+
+      end
+
+      # retry up to 5 times before failing
+      (1..MAX_RETRY_COUNT).each do |retry_count|
+        begin
+          completed_upload = aws.complete_multipart_upload(@bucket_name, object_key, upload_id, tags)
+          puts "Completed Upload: #{completed_upload.inspect}"
+          # multipart completed, so break.
+          break
+        rescue Exception => e
+          puts "UPLOAD COMPLETED REQUEST FAILED: #{e.message}"
+
+          if retry_count <= MAX_RETRY_COUNT
+            puts "Retrying multipart upload complete"
+          else
+            puts e.backtrace.join("\n")
+            # failed to comple multipart upload
+            puts "Failed to complete multipart upload after #{MAX_RETRY_COUNT} retries."
+            exit 4
+          end
+        end
+      end
+
+      clean_up_files workdir
+
+    end
   end
 end
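As a small illustration of the ordering rule the new upload code comments on (the ETag for part N must land at index N-1 of the array handed to complete_multipart_upload), here is how the numeric suffixes produced by split -d sort into part numbers; the paths and digest strings below are made up.

  # Illustrative only: suffixes from "split -d -a 3" (000, 001, ...) sorted into
  # S3 part numbers, with each ETag stored at part_number - 1.
  parts = {
    "/tmp/work/002" => "md5-c",   # hypothetical paths and digests
    "/tmp/work/000" => "md5-a",
    "/tmp/work/001" => "md5-b"
  }

  sorted_parts = parts.sort_by { |path, _md5| path.split('/').last.to_i }

  tags = []
  sorted_parts.each_with_index do |(path, _md5), idx|
    part_number = idx + 1                  # S3 part numbers start at 1
    tags[part_number - 1] = "etag-#{path}" # in the gem this comes from the upload_part response headers
  end

  # tags now lists the ETags in part order, which is what complete_multipart_upload expects.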
data/lib/s3snapshot/version.rb
CHANGED
data/s3snapshot.gemspec
CHANGED
@@ -6,7 +6,7 @@ Gem::Specification.new do |s|
   s.name = "s3snapshot"
   s.version = S3snapshot::VERSION
   s.platform = Gem::Platform::RUBY
-  s.authors = ["Todd"]
+  s.authors = ["Todd Nine", "Andrew Esler"]
   s.email = ["todd@spidertracks.co.nz"]
   s.homepage = "https://github.com/spidertracks/s3snapshot"
   s.summary = %q{Uploads to s3}
@@ -18,7 +18,7 @@ Gem::Specification.new do |s|
   s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
   s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
   s.require_paths = ["lib"]
-  s.add_dependency "fog", "~>1.
+  s.add_dependency "fog", "~>1.3.1"
   s.add_dependency "thor", "~>0.14.6"
   s.add_dependency "dictionary", "~>1.0.0"
 end
metadata
CHANGED
@@ -1,21 +1,22 @@
 --- !ruby/object:Gem::Specification
 name: s3snapshot
 version: !ruby/object:Gem::Version
-  hash:
+  hash: 9
   prerelease:
   segments:
   - 0
   - 0
-  - 10
-  version: 0.0.10
+  - 11
+  version: 0.0.11
 platform: ruby
 authors:
-- Todd
+- Todd Nine
+- Andrew Esler
 autorequire:
 bindir: bin
 cert_chain: []

-date:
+date: 2012-07-09 00:00:00 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: fog
@@ -25,12 +26,12 @@ dependencies:
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        hash:
+        hash: 25
         segments:
         - 1
-        -
-        -
-        version: 1.
+        - 3
+        - 1
+        version: 1.3.1
   type: :runtime
   version_requirements: *id001
 - !ruby/object:Gem::Dependency
@@ -123,7 +124,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []

 rubyforge_project: s3snapshot
-rubygems_version: 1.8.
+rubygems_version: 1.8.10
 signing_key:
 specification_version: 3
 summary: Uploads to s3