s3snapshot 0.0.10 → 0.0.11

data/README.rdoc CHANGED
@@ -1,4 +1,7 @@
  A simple backup utility to create time based snapshots similar to time machine on s3.
  All options are command line based so they can easily be scripted with cron.
  Supports multiple daily backups, as well as rolling daily and weekly.
- For daily, it keeps the newest complete backup for the day, for weekly it keeps the last day in the week.
+ For daily, it keeps the newest complete backup for the day; for weekly, it keeps the last day in the week.
+
+ Note that this should only be run on machines with the *nix split command available. It may work on Windows with Cygwin, but this is untested.
+ If a file is over 5GB, the only way to upload it to S3 is with a multipart upload. To split the file into parts, s3snapshot relies on the split command. The file parts are uploaded sequentially, so there is no speed boost from using multipart upload. s3snapshot is developed on MRI 1.8.7, so multi-threaded upload does not work.
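The multipart path this note describes boils down to three fog (~> 1.3) calls, the same ones dir_upload.rb uses further down in this diff. A condensed sketch, with placeholder bucket, key, credentials and part list:

    require 'fog'
    require 'base64'
    require 'digest/md5'

    # Placeholder values; in s3snapshot these come from the CLI options shown below.
    bucket_name = "my-bucket"
    object_key  = "node1/2012-07-09-01:00:00/db.dump"
    part_files  = Dir.glob("/tmp/parts/*").sort          # the chunks produced by split
    storage     = Fog::Storage.new(:provider => 'AWS',
                                   :aws_access_key_id     => ENV['AWS_ACCESS_KEY_ID'],
                                   :aws_secret_access_key => ENV['AWS_SECRET_ACCESS_KEY'])

    # 1. Ask S3 for an upload id for the target key.
    upload_id = storage.initiate_multipart_upload(bucket_name, object_key,
                                                  'x-amz-acl' => 'private').body["UploadId"]

    # 2. Send each part in order, recording the ETag S3 returns for it.
    etags = []
    part_files.each_with_index do |part, idx|
      md5 = Base64.encode64(Digest::MD5.file(part).digest).chomp
      File.open(part) do |io|
        resp = storage.upload_part(bucket_name, object_key, upload_id, idx + 1, io,
                                   'Content-MD5' => md5)
        etags[idx] = resp.headers["ETag"]
      end
    end

    # 3. Tell S3 to assemble the parts; the ETag list must be in part order.
    storage.complete_multipart_upload(bucket_name, object_key, upload_id, etags)

The parts go up one after another, which is why the note above says multipart gives no speed boost here.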
@@ -14,14 +14,15 @@ module S3snapshot
  method_option :bucket, :aliases => "-b", :desc => "The aws bucket to use", :type => :string, :required => true
  method_option :directory, :aliases => "-d", :desc => "The directory to upload", :type => :string, :required => true
  method_option :prefix, :aliases => "-p", :desc => "A prefix to prepend to the path before the timestamp. Useful in a cluster to specify a node name, or a node+directory scheme. Prefix strategies can be mixed in a bucket; they just must be unique.", :type => :string, :required => true
-
+ method_option :tmpdir, :aliases => "-t", :desc => "The tmp directory to use when creating file splits. Should not have a trailing /", :type => :string, :required => false
+
  ##
  #Uploads the directory to the s3 bucket with a prefix
  def backup
  directory = options[:directory]
  puts "You are uploading directory #{directory}"
 
- s3upload = DirUpload.new(options[:awsid], options[:awskey], options[:bucket], options[:prefix], directory )
+ s3upload = DirUpload.new(options[:awsid], options[:awskey], options[:bucket], options[:prefix], directory, options[:tmpdir] )
  s3upload.upload
  end
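The new :tmpdir option is passed straight through to DirUpload. Driving the class directly looks roughly like this; the require path and the credential variables are assumptions, since neither appears in this diff:

    require 's3snapshot/dir_upload'   # assumed load path

    # The final tmp_dir argument is optional; when omitted the system temp dir is used.
    uploader = S3snapshot::DirUpload.new(ENV['AWS_ACCESS_KEY_ID'], ENV['AWS_SECRET_ACCESS_KEY'],
                                         "my-bucket", "node1",
                                         "/var/backups/db", "/mnt/scratch/s3snapshot")
    uploader.upload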
 
@@ -3,7 +3,10 @@ require 's3snapshot/sync_op'
  require 's3snapshot/time_factory'
  require 'time'
  require 'fileutils'
-
+ require 'digest/md5'
+ require 'base64'
+ require 'date'
+ require 'tmpdir'
 
  module S3snapshot
  class DirUpload < SyncOp
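The new requires exist so each part can carry an integrity check: S3's Content-MD5 header wants the Base64 encoding of the raw 16-byte digest rather than the usual hex string. For example:

    require 'digest/md5'
    require 'base64'

    data = "The quick brown fox jumps over the lazy dog"
    Digest::MD5.hexdigest(data)                      # => "9e107d9d372bb6826bd81d3542a419d6"  (what md5sum prints)
    Base64.encode64(Digest::MD5.digest(data)).chomp  # => "nhB9nTcrtoJr2B01QqQZ1g=="          (what S3 expects)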
@@ -11,22 +14,37 @@ module S3snapshot
  @tmpdir = nil
  @local_dir = nil
  @prefix = nil
+
+ MAX_RETRY_COUNT = 5
 
- def initialize(aws_id, aws_key, bucket_name, prefix, local_dir )
+ def initialize(aws_id, aws_key, bucket_name, prefix, local_dir, tmp_dir = nil )
  super(aws_id, aws_key, bucket_name)
  @local_dir = local_dir
  @prefix = prefix
-
+
+ @tmpdir = tmp_dir ? tmp_dir : Dir.tmpdir
+ if File.exists? @tmpdir
+ puts "Temp directory #{@tmpdir} exists."
+ else
+ begin
+ FileUtils.mkdir_p @tmpdir
+ FileUtils.chmod 0777, @tmpdir
+ rescue Exception => e
+ puts "Unable to create directory #{@tmpdir} due to #{e.message}"
+ puts e.backtrace.join("\n")
+ exit 5
+ end
+ end
  end
 
  def upload
-
+
  start_time = TimeFactory.utc_time
 
  prefix_path = timepath(@prefix, start_time)
 
  files = get_local_files
-
+
  files.each do |file|
  file_name = file[@local_dir.length..-1];
 
@@ -36,16 +54,23 @@ module S3snapshot
  end
 
  path = "#{prefix_path}/#{file_name}"
-
- puts "uploading '#{file}' to '#{@bucket_name}/#{path}'"
-
- File.open(file) do |file|
- bucket.files.create(:key =>path, :body => file)
+
+ split_threshold = 1024*1024*1024*4
+ fsize = File.size file
+ puts "uploading '#{file}' [#{fsize} bytes] to '#{@bucket_name}/#{path}'"
+ # split and multipart-upload anything over 4GiB (a single S3 PUT caps out at 5GB)
+ if fsize > split_threshold
+ upload_file_as_multipart(file, path)
+ else
+ # normal single-request upload
+ File.open(file) do |fb|
+ bucket.files.create(:key => path, :body => fb)
+ end
+
  end
-
+
  end
 
-
  puts "Writing complete marker"
 
  #Upload the complete marker
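The threshold works out as follows; 4 GiB stays comfortably under the 5GB single-PUT ceiling the README mentions:

    split_threshold = 1024 * 1024 * 1024 * 4   # => 4294967296 bytes (4 GiB)
    # Anything larger than this goes through upload_file_as_multipart; a plain
    # single PUT is only used for files under the threshold.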
@@ -70,7 +95,192 @@ module S3snapshot
 
  return files
  end
-
-
+
+ ##
+ # Produces a hash of file paths to MD5 values.
+ ##
+ def get_file_md5s(workdir)
+ parts = {}
+
+ # Get the Base64 encoded MD5 of each file
+ Dir.entries(workdir).each do |file|
+ next if file =~ /\.\./
+ next if file =~ /\./
+
+ full_path = "#{workdir}#{file}"
+
+ md5 = Base64.encode64(Digest::MD5.file("#{full_path}").digest).chomp!
+
+ parts[full_path] = md5
+ end
+
+ return parts
+ end
+
+ ##
+ # Uploads a part of a file.
+ ##
+ def upload_file_part(file_path, file_md5, part_number, object_key, upload_id, tags)
+
+ # Reload to stop the connection timing out, useful when uploading large chunks
+ aws.reload
+
+ # Could use the parallel gem or similar here so that we get parallel
+ # upload when running on a ruby implementation that allows multithreading.
+ puts "Starting on File: #{file_path} with MD5: #{file_md5} - this is part #{part_number}"
+
+ part_upload = nil
+ # Pass fog a file object to upload
+ File.open(file_path) do |file_part|
+
+ puts "Uploading #{file_part.path} [#{part_number}] to #{@bucket_name} as #{object_key} with upload id #{upload_id}"
+
+ part_upload = aws.upload_part(@bucket_name, object_key, upload_id, part_number, file_part, { 'Content-MD5' => file_md5 } )
+
+ # The tags array must hold the ETags in part order, or the upload won't complete
+ index = part_number - 1
+ tags[index] = part_upload.headers["ETag"]
+
+ puts "Response: #{part_upload.inspect}" # This returns once the part has uploaded
+ end
+
+ return part_upload
+ end
+
+ ##
+ # Removes the workdir and all files in it.
+ ##
+ def clean_up_files(workdir)
+ # clean up the tmp files
+ puts "Cleaning up file split parts"
+ FileUtils.remove_dir workdir
+ end
+
+ ##
+ # Uploads a file in multiple parts.
+ #
+ # The disk needs to have a minimum of the file size free.
+ # Expects object_to_upload to NOT have a leading /
+ ##
+ def upload_file_as_multipart(object_to_upload, object_key)
+
+ # method based on the gists at https://gist.github.com/908875
+ # and https://gist.github.com/833374
+
+ puts "Uploading file as multipart: file=#{object_to_upload}"
+
+ # tmp dir to place the split file into
+ cur_date = DateTime.now.strftime('%F-%T')
+ workdir = "#{@tmpdir}/s3snapshot-splits/#{cur_date}/#{File.basename(object_to_upload)}/"
+ FileUtils.mkdir_p(workdir)
+
+ # Assumes we are running on unix with the split command available.
+ # Split the file into chunks of at most 1G; the chunks are named 000, 001, etc.
+ # Smaller chunks increase the likelihood of upload success.
+ split_cmd = "split -b 1G -a 3 --verbose -d #{object_to_upload} #{workdir}"
+ puts "split command: #{split_cmd}"
+ split_result = system split_cmd
+
+ if split_result
+ puts "Split file into parts in #{workdir}"
+ else
+ puts "Split FAILED! exit code = #{$?}"
+ # don't clean up, as the user may need to see the files to work out how to fix the split error.
+ exit 1 # exit with a non-zero error code as split failed.
+ end
+
+ # Map of file_part => md5
+ parts = get_file_md5s workdir
+
+ puts "File #{object_to_upload} has been split into #{parts.size} parts."
+ ### Now ready to perform the actual upload
+
+ # Initiate the upload and get the upload id.
+ # This can fail intermittently, so it may have to be retried.
+
+ multi_part_up = nil
+
+ # retry up to 5 times before failing
+ (1..MAX_RETRY_COUNT).each do |retry_count|
+ begin
+ aws.reload
+ multi_part_up = aws.initiate_multipart_upload(@bucket_name, object_key, { 'x-amz-acl' => 'private' } )
+ # initiation successful, so break.
+ break
+ rescue Exception => e
+
+ puts "multipart upload initiation FAILED: #{e.message}"
+
+ if retry_count < MAX_RETRY_COUNT
+ puts "Retrying multipart upload initiation, attempt #{retry_count + 1}."
+ else
+ puts e.backtrace.join("\n")
+ # fail this, we can't initiate the upload
+ puts "Failed to initiate multipart upload after #{MAX_RETRY_COUNT} attempts."
+ exit 2
+ end
+ end
+ end
+
+ upload_id = multi_part_up.body["UploadId"]
+
+ tags = []
+
+ # sort based on the suffix produced by the split command, e.g. 001
+ sorted_parts = parts.sort_by do |d|
+ d[0].split('/').last.to_i
+ end
+
+ sorted_parts.each_with_index do |entry, idx|
+ # Part numbers need to start at 1
+ part_number = idx + 1
+
+ # retry up to 5 times before failing
+ (1..MAX_RETRY_COUNT).each do |retry_count|
+ begin
+ # file_path, file_md5, part_number, object_key, upload_id, tags
+ upload_file_part(entry[0], entry[1], part_number, object_key, upload_id, tags)
+ # upload of part successful, so break.
+ break
+ rescue Exception => e
+ puts "UPLOAD FILE PART FAILED: #{e.message}"
+
+ if retry_count < MAX_RETRY_COUNT
+ puts "Retrying file part upload for #{entry[0]}"
+ else
+ puts e.backtrace.join("\n")
+ # failed to upload file part
+ puts "Failed to upload file part #{entry[0]} after #{MAX_RETRY_COUNT} attempts."
+ exit 3
+ end
+ end
+ end
+
+ end
+
+ # retry up to 5 times before failing
+ (1..MAX_RETRY_COUNT).each do |retry_count|
+ begin
+ completed_upload = aws.complete_multipart_upload(@bucket_name, object_key, upload_id, tags)
+ puts "Completed Upload: #{completed_upload.inspect}"
+ # multipart completed, so break.
+ break
+ rescue Exception => e
+ puts "UPLOAD COMPLETE REQUEST FAILED: #{e.message}"
+
+ if retry_count < MAX_RETRY_COUNT
+ puts "Retrying multipart upload complete"
+ else
+ puts e.backtrace.join("\n")
+ # failed to complete multipart upload
+ puts "Failed to complete multipart upload after #{MAX_RETRY_COUNT} attempts."
+ exit 4
+ end
+ end
+ end
+
+ clean_up_files workdir
+
+ end
  end
  end
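Because split is run with -d -a 3, the chunks are named 000, 001, 002, and so on, and the sort_by above turns that suffix into an integer so parts are uploaded (and their ETags stored) in part order. A quick illustration with made-up paths:

    parts = {
      "/tmp/s3snapshot-splits/2012-07-09/db.dump/002" => "md5-c",
      "/tmp/s3snapshot-splits/2012-07-09/db.dump/000" => "md5-a",
      "/tmp/s3snapshot-splits/2012-07-09/db.dump/001" => "md5-b"
    }

    sorted = parts.sort_by { |path, md5| path.split('/').last.to_i }
    sorted.map { |path, md5| path.split('/').last }   # => ["000", "001", "002"]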
@@ -1,3 +1,3 @@
  module S3snapshot
- VERSION = "0.0.10"
+ VERSION = "0.0.11"
  end
data/s3snapshot.gemspec CHANGED
@@ -6,7 +6,7 @@ Gem::Specification.new do |s|
  s.name = "s3snapshot"
  s.version = S3snapshot::VERSION
  s.platform = Gem::Platform::RUBY
- s.authors = ["Todd"]
+ s.authors = ["Todd Nine", "Andrew Esler"]
  s.email = ["todd@spidertracks.co.nz"]
  s.homepage = "https://github.com/spidertracks/s3snapshot"
  s.summary = %q{Uploads to s3}
@@ -18,7 +18,7 @@ Gem::Specification.new do |s|
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]
- s.add_dependency "fog", "~>1.0.0"
+ s.add_dependency "fog", "~>1.3.1"
  s.add_dependency "thor", "~>0.14.6"
  s.add_dependency "dictionary", "~>1.0.0"
  end
metadata CHANGED
@@ -1,21 +1,22 @@
  --- !ruby/object:Gem::Specification
  name: s3snapshot
  version: !ruby/object:Gem::Version
- hash: 11
+ hash: 9
  prerelease:
  segments:
  - 0
  - 0
- - 10
- version: 0.0.10
+ - 11
+ version: 0.0.11
  platform: ruby
  authors:
- - Todd
+ - Todd Nine
+ - Andrew Esler
  autorequire:
  bindir: bin
  cert_chain: []
 
- date: 2011-10-11 00:00:00 Z
+ date: 2012-07-09 00:00:00 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: fog
@@ -25,12 +26,12 @@ dependencies:
  requirements:
  - - ~>
  - !ruby/object:Gem::Version
- hash: 23
+ hash: 25
  segments:
  - 1
- - 0
- - 0
- version: 1.0.0
+ - 3
+ - 1
+ version: 1.3.1
  type: :runtime
  version_requirements: *id001
  - !ruby/object:Gem::Dependency
@@ -123,7 +124,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  requirements: []
 
  rubyforge_project: s3snapshot
- rubygems_version: 1.8.2
+ rubygems_version: 1.8.10
  signing_key:
  specification_version: 3
  summary: Uploads to s3