sluice-jason 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,704 @@
+ # Copyright (c) 2012-2014 Snowplow Analytics Ltd. All rights reserved.
+ #
+ # This program is licensed to you under the Apache License Version 2.0,
+ # and you may not use this file except in compliance with the Apache License Version 2.0.
+ # You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0.
+ #
+ # Unless required by applicable law or agreed to in writing,
+ # software distributed under the Apache License Version 2.0 is distributed on an
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the Apache License Version 2.0 for the specific language governing permissions and limitations there under.
+
+ # Authors:: Alex Dean (mailto:support@snowplowanalytics.com), Michael Tibben
+ # Copyright:: Copyright (c) 2012-2014 Snowplow Analytics Ltd
+ # License:: Apache License Version 2.0
+
+ require 'tmpdir'
+ require 'fileutils' # Needed for FileUtils.mkdir_p in download_file below
+ require 'fog'
+ require 'thread'
+ require 'timeout'
+
+ require 'contracts'
+ include Contracts
+
+ module Sluice
+   module Storage
+     module S3
+
+       # TODO: figure out logging instead of puts (https://github.com/snowplow/sluice/issues/2)
+       # TODO: consider moving to OO structure (https://github.com/snowplow/sluice/issues/3)
+
+       # Constants
+       CONCURRENCY  = 10   # Threads
+       RETRIES      = 3    # Attempts
+       RETRY_WAIT   = 10   # Seconds
+       TIMEOUT_WAIT = 1800 # 30 mins should let even large files upload. +1 https://github.com/snowplow/sluice/issues/7 if this is insufficient or excessive
+
+       # Helper function to instantiate a new Fog::Storage
+       # for S3 based on our config options
+       #
+       # Parameters:
+       # +region+:: Amazon S3 region we will be working with
+       # +access_key_id+:: AWS access key ID
+       # +secret_access_key+:: AWS secret access key
+       Contract String, String, String => FogStorage
+       def new_fog_s3_from(region, access_key_id, secret_access_key)
+         fog = Fog::Storage.new({
+           :provider => 'AWS',
+           :region => region,
+           :aws_access_key_id => access_key_id,
+           :aws_secret_access_key => secret_access_key
+         })
+         fog.sync_clock
+         fog
+       end
+       module_function :new_fog_s3_from
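+
+       # Example (illustrative only; the region and ENV variable names below
+       # are placeholders, not part of this library):
+       #
+       #   s3 = Sluice::Storage::S3.new_fog_s3_from('us-east-1',
+       #          ENV['AWS_ACCESS_KEY_ID'], ENV['AWS_SECRET_ACCESS_KEY'])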
+
+       # Return an array of all Fog::Storage::AWS::File's
+       #
+       # Parameters:
+       # +s3+:: A Fog::Storage s3 connection
+       # +location+:: The location to return files from
+       #
+       # Returns array of Fog::Storage::AWS::File's
+       Contract FogStorage, Location => ArrayOf[FogFile]
+       def list_files(s3, location)
+         files_and_dirs = s3.directories.get(location.bucket, prefix: location.dir_as_path).files
+
+         files = [] # Can't use a .select because of Ruby deep copy issues (array of non-POROs)
+         files_and_dirs.each { |f|
+           if is_file?(f.key)
+             files << f.dup
+           end
+         }
+         files
+       end
+       module_function :list_files
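+
+       # Example (illustrative; assumes the sibling Location class, which
+       # wraps an s3:// URI, and the +s3+ connection from above):
+       #
+       #   loc = Sluice::Storage::S3::Location.new('s3://my-bucket/logs/')
+       #   Sluice::Storage::S3.list_files(s3, loc).each { |f| puts f.key }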
+
+       # Whether the given path is a directory or not
+       #
+       # Parameters:
+       # +path+:: S3 path in String form
+       #
+       # Returns boolean
+       Contract String => Bool
+       def is_folder?(path)
+         (path.end_with?('_$folder$') || # EMR-created
+          path.end_with?('/'))
+       end
+       module_function :is_folder?
+
+       # Whether the given path is a file or not
+       #
+       # Parameters:
+       # +path+:: S3 path in String form
+       #
+       # Returns boolean
+       Contract String => Bool
+       def is_file?(path)
+         !is_folder?(path)
+       end
+       module_function :is_file?
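+
+       # Example (illustrative):
+       #
+       #   is_folder?('logs/2014/')         # => true (trailing slash)
+       #   is_folder?('logs_$folder$')      # => true (EMR marker)
+       #   is_file?('logs/2014/part-00000') # => true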
+
+       # Returns the basename for the given path
+       #
+       # Parameters:
+       # +path+:: S3 path in String form
+       #
+       # Returns the basename, or nil if the
+       # path is to a folder
+       Contract String => Maybe[String]
+       def get_basename(path)
+         if is_folder?(path)
+           nil
+         else
+           match = path.match('([^/]+)$')
+           if match
+             match[1]
+           else
+             nil
+           end
+         end
+       end
+       module_function :get_basename
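+
+       # Example (illustrative):
+       #
+       #   get_basename('logs/2014/part-00000') # => "part-00000"
+       #   get_basename('logs/2014/')           # => nil (folder)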
+
+       # Determine if a bucket is empty
+       #
+       # Parameters:
+       # +s3+:: A Fog::Storage s3 connection
+       # +location+:: The location to check
+       Contract FogStorage, Location => Bool
+       def is_empty?(s3, location)
+         list_files(s3, location).length == 0
+       end
+       module_function :is_empty?
+
+       # Download files from an S3 location to
+       # local storage, concurrently
+       #
+       # Parameters:
+       # +s3+:: A Fog::Storage s3 connection
+       # +from_files_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, or S3Location to download files from
+       # +to_directory+:: Local directory to copy files to
+       # +match_regex+:: a regex string to match the files to download
+       def download_files(s3, from_files_or_loc, to_directory, match_regex='.+')
+
+         puts " downloading #{describe_from(from_files_or_loc)} to #{to_directory}"
+         process_files(:download, s3, from_files_or_loc, [], match_regex, to_directory)
+       end
+       module_function :download_files
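+
+       # Example (illustrative; bucket and paths are hypothetical):
+       #
+       #   loc = Sluice::Storage::S3::Location.new('s3://my-bucket/logs/')
+       #   Sluice::Storage::S3.download_files(s3, loc, '/tmp/logs/', '.+\.gz$')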
+
+       # Delete files from S3 locations concurrently
+       #
+       # Parameters:
+       # +s3+:: A Fog::Storage s3 connection
+       # +from_files_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, or S3Location to delete files from
+       # +match_regex+:: a regex string to match the files to delete
+       def delete_files(s3, from_files_or_loc, match_regex='.+')
+
+         puts " deleting #{describe_from(from_files_or_loc)}"
+         process_files(:delete, s3, from_files_or_loc, [], match_regex)
+       end
+       module_function :delete_files
+
+       # Copies files between S3 locations in two different accounts
+       #
+       # Implementation is as follows:
+       # 1. Concurrent download of all files from S3 source to local tmpdir
+       # 2. Concurrent upload of all files from local tmpdir to S3 target
+       #
+       # In other words, the download and upload are not interleaved (which is
+       # inefficient because upload speeds are much lower than download speeds)
+       #
+       # Parameters:
+       # +from_s3+:: A Fog::Storage s3 connection for accessing the from S3Location
+       # +to_s3+:: A Fog::Storage s3 connection for accessing the to S3Location
+       # +from_location+:: S3Location to copy files from
+       # +to_location+:: S3Location to copy files to
+       # +match_regex+:: a regex string to match the files to copy
+       # +alter_filename_lambda+:: lambda to alter the written filename
+       # +flatten+:: strips off any sub-folders below the from_location
+       def copy_files_inter(from_s3, to_s3, from_location, to_location, match_regex='.+', alter_filename_lambda=false, flatten=false)
+
+         puts " copying inter-account #{describe_from(from_location)} to #{to_location}"
+         processed = []
+         Dir.mktmpdir do |t|
+           tmp = Sluice::Storage.trail_slash(t)
+           processed = download_files(from_s3, from_location, tmp, match_regex)
+           upload_files(to_s3, tmp, to_location, '**/*') # Upload all files we downloaded
+         end
+
+         processed
+       end
+       module_function :copy_files_inter
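+
+       # Example (illustrative; two connections built via new_fog_s3_from,
+       # one per account; bucket names are hypothetical):
+       #
+       #   from = Sluice::Storage::S3::Location.new('s3://account-a/events/')
+       #   to   = Sluice::Storage::S3::Location.new('s3://account-b/archive/')
+       #   Sluice::Storage::S3.copy_files_inter(s3_a, s3_b, from, to)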
+
+       # Copies files between S3 locations concurrently
+       #
+       # Parameters:
+       # +s3+:: A Fog::Storage s3 connection
+       # +from_files_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, or S3Location to copy files from
+       # +to_location+:: S3Location to copy files to
+       # +match_regex+:: a regex string to match the files to copy
+       # +alter_filename_lambda+:: lambda to alter the written filename
+       # +flatten+:: strips off any sub-folders below the from_location
+       def copy_files(s3, from_files_or_loc, to_location, match_regex='.+', alter_filename_lambda=false, flatten=false)
+
+         puts " copying #{describe_from(from_files_or_loc)} to #{to_location}"
+         process_files(:copy, s3, from_files_or_loc, [], match_regex, to_location, alter_filename_lambda, flatten)
+       end
+       module_function :copy_files
+
+       # Copies files between S3 locations maintaining a manifest to
+       # avoid copying a file which was copied previously.
+       #
+       # Useful in scenarios such as:
+       # 1. You would like to do a move but only have read permission
+       #    on the source bucket
+       # 2. You would like to do a move but some other process needs
+       #    to use the files after you
+       #
+       # Parameters:
+       # +s3+:: A Fog::Storage s3 connection
+       # +manifest+:: A Sluice::Storage::S3::Manifest object
+       # +from_files_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, or S3Location to copy files from
+       # +to_location+:: S3Location to copy files to
+       # +match_regex+:: a regex string to match the files to copy
+       # +alter_filename_lambda+:: lambda to alter the written filename
+       # +flatten+:: strips off any sub-folders below the from_location
+       def copy_files_manifest(s3, manifest, from_files_or_loc, to_location, match_regex='.+', alter_filename_lambda=false, flatten=false)
+
+         puts " copying with manifest #{describe_from(from_files_or_loc)} to #{to_location}"
+         ignore = manifest.get_entries(s3) # Files to leave untouched
+         processed = process_files(:copy, s3, from_files_or_loc, ignore, match_regex, to_location, alter_filename_lambda, flatten)
+         manifest.add_entries(s3, processed)
+
+         processed
+       end
+       module_function :copy_files_manifest
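+
+       # Example (illustrative sketch; the Manifest constructor shown is an
+       # assumption about the sibling Manifest class, not verified here):
+       #
+       #   manifest = Sluice::Storage::S3::Manifest.new(to, :filename)
+       #   Sluice::Storage::S3.copy_files_manifest(s3, manifest, from, to)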
+
+       # Moves files between S3 locations in two different accounts
+       #
+       # Implementation is as follows:
+       # 1. Concurrent download of all files from S3 source to local tmpdir
+       # 2. Concurrent upload of all files from local tmpdir to S3 target
+       # 3. Concurrent deletion of all files from S3 source
+       #
+       # In other words, the three operations are not interleaved (which is
+       # inefficient because upload speeds are much lower than download speeds)
+       #
+       # Parameters:
+       # +from_s3+:: A Fog::Storage s3 connection for accessing the from S3Location
+       # +to_s3+:: A Fog::Storage s3 connection for accessing the to S3Location
+       # +from_location+:: S3Location to move files from
+       # +to_location+:: S3Location to move files to
+       # +match_regex+:: a regex string to match the files to move
+       # +alter_filename_lambda+:: lambda to alter the written filename
+       # +flatten+:: strips off any sub-folders below the from_location
+       def move_files_inter(from_s3, to_s3, from_location, to_location, match_regex='.+', alter_filename_lambda=false, flatten=false)
+
+         puts " moving inter-account #{describe_from(from_location)} to #{to_location}"
+         processed = []
+         Dir.mktmpdir do |t|
+           tmp = Sluice::Storage.trail_slash(t)
+           processed = download_files(from_s3, from_location, tmp, match_regex)
+           upload_files(to_s3, tmp, to_location, '**/*') # Upload all files we downloaded
+           delete_files(from_s3, from_location, '.+')    # Delete all files we downloaded
+         end
+
+         processed
+       end
+       module_function :move_files_inter
+
+       # Moves files between S3 locations concurrently
+       #
+       # Parameters:
+       # +s3+:: A Fog::Storage s3 connection
+       # +from_files_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, or S3Location to move files from
+       # +to_location+:: S3Location to move files to
+       # +match_regex+:: a regex string to match the files to move
+       # +alter_filename_lambda+:: lambda to alter the written filename
+       # +flatten+:: strips off any sub-folders below the from_location
+       def move_files(s3, from_files_or_loc, to_location, match_regex='.+', alter_filename_lambda=false, flatten=false)
+
+         puts " moving #{describe_from(from_files_or_loc)} to #{to_location}"
+         process_files(:move, s3, from_files_or_loc, [], match_regex, to_location, alter_filename_lambda, flatten)
+       end
+       module_function :move_files
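+
+       # Example (illustrative; shows a 1-arity rename lambda, see
+       # rename_file below for the supported arities):
+       #
+       #   add_ext = lambda { |basename| "#{basename}.processed" }
+       #   Sluice::Storage::S3.move_files(s3, from, to, '.+', add_ext)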
+
+       # Uploads files to S3 locations concurrently
+       #
+       # Parameters:
+       # +s3+:: A Fog::Storage s3 connection
+       # +from_files_or_dir+:: Local array of files or local directory to upload files from
+       # +to_location+:: S3Location to upload files to
+       # +match_glob+:: a filesystem glob to match the files to upload
+       def upload_files(s3, from_files_or_dir, to_location, match_glob='*')
+
+         puts " uploading #{describe_from(from_files_or_dir)} to #{to_location}"
+         process_files(:upload, s3, from_files_or_dir, [], match_glob, to_location)
+       end
+       module_function :upload_files
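+
+       # Example (illustrative; uploads all .gz files beneath a hypothetical
+       # local directory):
+       #
+       #   Sluice::Storage::S3.upload_files(s3, '/tmp/out/', to, '**/*.gz')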
+
+       # Upload a single file to the exact location specified.
+       # Has no intelligence around filenaming.
+       #
+       # Parameters:
+       # +s3+:: A Fog::Storage s3 connection
+       # +from_file+:: A local file path
+       # +to_bucket+:: The Fog::Directory to upload to
+       # +to_file+:: The file path to upload to
+       def upload_file(s3, from_file, to_bucket, to_file)
+
+         local_file = File.open(from_file)
+
+         dir = s3.directories.new(:key => to_bucket) # No request made
+         file = dir.files.create(
+           :key  => to_file,
+           :body => local_file
+         )
+         file.save('x-amz-server-side-encryption' => 'AES256')
+         local_file.close
+       end
+       module_function :upload_file
+
+       # Download a single file to the exact path specified.
+       # Has no intelligence around filenaming.
+       # Makes sure to create the path as needed.
+       #
+       # Parameters:
+       # +s3+:: A Fog::Storage s3 connection
+       # +from_file+:: A Fog::Storage::AWS::File to download
+       # +to_file+:: A local file path
+       def download_file(s3, from_file, to_file)
+
+         FileUtils.mkdir_p(File.dirname(to_file))
+
+         # TODO: deal with bug where Fog hangs indefinitely if network connection dies during download
+
+         local_file = File.open(to_file, "w")
+         local_file.write(from_file.body)
+         local_file.close
+       end
+       module_function :download_file
+
+       private
+
+       # Provides a string describing from_files_or_dir_or_loc
+       # for logging purposes.
+       #
+       # Parameters:
+       # +from_files_or_dir_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, local directory or S3Location to process files from
+       #
+       # Returns a log-friendly string
+       def describe_from(from_files_or_dir_or_loc)
+         if from_files_or_dir_or_loc.is_a?(Array)
+           "#{from_files_or_dir_or_loc.length} file(s)"
+         else
+           "files from #{from_files_or_dir_or_loc}"
+         end
+       end
+       module_function :describe_from
+
+       # Concurrent file operations between S3 locations. Supports:
+       # - Download
+       # - Upload
+       # - Copy
+       # - Delete
+       # - Move (= Copy + Delete)
+       #
+       # Parameters:
+       # +operation+:: Operation to perform. :download, :upload, :copy, :delete, :move supported
+       # +s3+:: A Fog::Storage s3 connection
+       # +from_files_or_dir_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, local directory or S3Location to process files from
+       # +ignore+:: Array of filenames to ignore (used by manifest code)
+       # +match_regex_or_glob+:: a regex or glob string to match the files to process
+       # +to_loc_or_dir+:: S3Location or local directory to process files to
+       # +alter_filename_lambda+:: lambda to alter the written filename
+       # +flatten+:: strips off any sub-folders below the from_loc_or_dir
+       def process_files(operation, s3, from_files_or_dir_or_loc, ignore=[], match_regex_or_glob='.+', to_loc_or_dir=nil, alter_filename_lambda=false, flatten=false)
+
+         # Validate that the file operation makes sense
+         case operation
+         when :copy, :move, :download, :upload
+           if to_loc_or_dir.nil?
+             raise StorageOperationError, "File operation %s requires the to_loc_or_dir to be set" % operation
+           end
+         when :delete
+           unless to_loc_or_dir.nil?
+             raise StorageOperationError, "File operation %s does not support the to_loc_or_dir argument" % operation
+           end
+           if alter_filename_lambda.class == Proc
+             raise StorageOperationError, "File operation %s does not support the alter_filename_lambda argument" % operation
+           end
+         else
+           raise StorageOperationError, "File operation %s is unsupported. Try :download, :upload, :copy, :delete or :move" % operation
+         end
+
+         # If we have an array of files, no additional globbing required
+         if from_files_or_dir_or_loc.is_a?(Array)
+           files_to_process = from_files_or_dir_or_loc # Could be filepaths or Fog::Storage::AWS::File's
+           globbed = true
+         # Otherwise if it's an upload, we can glob now
+         elsif operation == :upload
+           files_to_process = glob_files(from_files_or_dir_or_loc, match_regex_or_glob)
+           globbed = true
+         # Otherwise we'll do threaded globbing later...
+         else
+           files_to_process = []
+           from_loc = from_files_or_dir_or_loc # Alias
+           globbed = false
+         end
+
+         threads = []
+         mutex = Mutex.new
+         complete = false
+         marker_opts = {}
+         processed_files = [] # For manifest updating, determining if any files were moved etc.
+
+         # If an exception is thrown in a thread that isn't handled, die quickly
+         Thread.abort_on_exception = true
+
+         # Create Ruby threads to concurrently execute s3 operations
+         for i in (0...CONCURRENCY)
+
+           # Each thread pops a file off the files_to_process array, and moves it.
+           # We loop until there are no more files
+           threads << Thread.new(i) do |thread_idx|
+
+             loop do
+               file = false
+               filepath = false
+               from_bucket = false
+               from_path = false
+               match = false
+
+               # First critical section:
+               # only allow one thread to modify the array at any time
+               mutex.synchronize do
+
+                 # No need to do further globbing
+                 if globbed
+                   if files_to_process.size == 0
+                     complete = true
+                     next
+                   end
+
+                   file = files_to_process.pop
+                   # Support raw filenames and also Fog::Storage::AWS::File's
+                   if file.is_a?(Fog::Storage::AWS::File)
+                     from_bucket = file.directory.key # Bucket
+                     from_path = Sluice::Storage.trail_slash(File.dirname(file.key))
+                     filepath = file.key
+                   else
+                     from_bucket = nil # Not used
+                     if from_files_or_dir_or_loc.is_a?(Array)
+                       from_path = Sluice::Storage.trail_slash(File.dirname(file))
+                     else
+                       from_path = from_files_or_dir_or_loc # The root dir
+                     end
+                     filepath = file
+                   end
+
+                   match = true # Match is implicit in the glob
+                 else
+
+                   while !complete && !match do
+                     if files_to_process.size == 0
+                       # S3 batches 1000 files per request.
+                       # We load up our array with the files to move
+                       files_to_process = s3.directories.get(from_loc.bucket, :prefix => from_loc.dir).files.all(marker_opts).to_a
+                       # If we don't have any files after the S3 request, we're complete
+                       if files_to_process.size == 0
+                         complete = true
+                         next
+                       else
+                         marker_opts['marker'] = files_to_process.last.key
+
+                         # By reversing the array we can use pop and get FIFO behaviour
+                         # instead of the performance penalty incurred by unshift
+                         files_to_process = files_to_process.reverse
+                       end
+                     end
+
+                     file = files_to_process.pop
+                     from_bucket = from_loc.bucket
+                     from_path = from_loc.dir_as_path
+                     filepath = file.key
+
+                     # TODO: clean up following https://github.com/snowplow/sluice/issues/25
+                     match = if match_regex_or_glob.is_a? NegativeRegex
+                               !filepath.match(match_regex_or_glob.regex)
+                             else
+                               filepath.match(match_regex_or_glob)
+                             end
+                   end
+                 end
+               end
+               # End of mutex.synchronize
+
+               # Kill this thread's loop (and thus this thread) if we are complete
+               break if complete
+
+               # Skip processing for a folder or file which doesn't match our regexp or glob
+               next if is_folder?(filepath) or not match
+
+               # Name file
+               basename = get_basename(filepath)
+               next if ignore.include?(basename) # Don't process if in our leave list
+
+               filename = rename_file(filepath, basename, alter_filename_lambda)
+
+               # What are we doing? Let's determine source and target.
+               # Note that target excludes bucket name where relevant
+               case operation
+               when :upload
+                 source = "#{filepath}"
+                 target = name_file(filepath, filename, from_path, to_loc_or_dir.dir_as_path, flatten)
+                 puts "(t#{thread_idx}) UPLOAD #{source} +-> #{to_loc_or_dir.bucket}/#{target}"
+               when :download
+                 source = "#{from_bucket}/#{filepath}"
+                 target = name_file(filepath, filename, from_path, to_loc_or_dir, flatten)
+                 puts "(t#{thread_idx}) DOWNLOAD #{source} +-> #{target}"
+               when :move
+                 source = "#{from_bucket}/#{filepath}"
+                 target = name_file(filepath, filename, from_path, to_loc_or_dir.dir_as_path, flatten)
+                 puts "(t#{thread_idx}) MOVE #{source} -> #{to_loc_or_dir.bucket}/#{target}"
+               when :copy
+                 source = "#{from_bucket}/#{filepath}"
+                 target = name_file(filepath, filename, from_path, to_loc_or_dir.dir_as_path, flatten)
+                 puts "(t#{thread_idx}) COPY #{source} +-> #{to_loc_or_dir.bucket}/#{target}"
+               when :delete
+                 source = "#{from_bucket}/#{filepath}"
+                 # No target
+                 puts "(t#{thread_idx}) DELETE x #{source}"
+               end
+
+               # Upload is a standalone operation vs move/copy/delete
+               if operation == :upload
+                 retry_x(
+                   Sluice::Storage::S3,
+                   [:upload_file, s3, filepath, to_loc_or_dir.bucket, target],
+                   RETRIES,
+                   " +/> #{target}",
+                   "Problem uploading #{filepath}. Retrying.")
+               end
+
+               # Download is a standalone operation vs move/copy/delete
+               if operation == :download
+                 retry_x(
+                   Sluice::Storage::S3,
+                   [:download_file, s3, file, target],
+                   RETRIES,
+                   " +/> #{target}",
+                   "Problem downloading #{filepath}. Retrying.")
+               end
+
+               # A move or copy starts with a copy file
+               if [:move, :copy].include? operation
+                 retry_x(
+                   file,
+                   [:copy, to_loc_or_dir.bucket, target],
+                   RETRIES,
+                   " +-> #{to_loc_or_dir.bucket}/#{target}",
+                   "Problem copying #{filepath}. Retrying.")
+               end
+
+               # A move or delete ends with a delete
+               if [:move, :delete].include? operation
+                 retry_x(
+                   file,
+                   [:destroy],
+                   RETRIES,
+                   " x #{source}",
+                   "Problem destroying #{filepath}. Retrying.")
+               end
+
+               # Second critical section: we need to update
+               # processed_files in a thread-safe way
+               mutex.synchronize do
+                 processed_files << filepath
+               end
+             end
+           end
+         end
+
+         # Wait for threads to finish
+         threads.each { |aThread| aThread.join }
+
+         processed_files # Return the processed files
+       end
+       module_function :process_files
+
+       # A helper function to rename a file
+       # TODO: fixup lambda to be Maybe[Proc]
+       Contract String, Maybe[String], Or[Proc, Bool] => Maybe[String]
+       def self.rename_file(filepath, basename, rename_lambda=false)
+
+         if rename_lambda.class == Proc
+           case rename_lambda.arity
+           when 2
+             rename_lambda.call(basename, filepath)
+           when 1
+             rename_lambda.call(basename)
+           when 0
+             rename_lambda.call()
+           else
+             raise StorageOperationError, "Expect arity of 0, 1 or 2 for rename_lambda, not #{rename_lambda.arity}"
+           end
+         else
+           basename
+         end
+       end
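+
+       # Example (illustrative; a 2-arity lambda receives the basename and
+       # the full filepath):
+       #
+       #   prefix = lambda { |base, path| "archived-#{base}" }
+       #   rename_file('logs/part-00000', 'part-00000', prefix) # => "archived-part-00000"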
+
+       # A helper function to list all files
+       # recursively in a folder
+       #
+       # Parameters:
+       # +dir+:: Directory to list files from recursively
+       # +glob+:: a filesystem glob to match the files to list
+       #
+       # Returns array of files (no sub-directories)
+       def glob_files(dir, glob)
+         Dir.glob(File.join(dir, glob)).select { |f|
+           File.file?(f) # Drop sub-directories
+         }
+       end
+       module_function :glob_files
+
+       # A helper function to attempt to run a
+       # function up to +retries+ times
+       #
+       # Parameters:
+       # +object+:: Object to send our function to
+       # +send_args+:: Function plus arguments
+       # +retries+:: Number of retries to attempt
+       # +attempt_msg+:: Message to puts on each attempt
+       # +failure_msg+:: Message to puts on each failure
+       def retry_x(object, send_args, retries, attempt_msg, failure_msg)
+         i = 0
+         begin
+           Timeout::timeout(TIMEOUT_WAIT) do # In case our operation times out
+             object.send(*send_args)
+             puts attempt_msg
+           end
+         rescue
+           raise unless i < retries
+           puts failure_msg
+           sleep(RETRY_WAIT) # Give us a bit of time before retrying
+           i += 1
+           retry
+         end
+       end
+       module_function :retry_x
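+
+       # Example (illustrative; retries a single-file download up to RETRIES
+       # times, re-raising the last error if all attempts fail):
+       #
+       #   retry_x(Sluice::Storage::S3,
+       #     [:download_file, s3, file, '/tmp/part-00000'],
+       #     RETRIES, ' +/> /tmp/part-00000', 'Problem downloading. Retrying.')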
+
+       # A helper function to prepare destination
+       # filenames and paths. This is a bit weird -
+       # it needs to exist because of differences
+       # in the way that Amazon S3, Fog and Unix
+       # treat filepaths versus keys.
+       #
+       # Parameters:
+       # +filepath+:: Path to file (including old filename)
+       # +new_filename+:: Replace the filename in the path with this
+       # +remove_path+:: If this is set, strip this from the front of the path
+       # +add_path+:: If this is set, add this to the front of the path
+       # +flatten+:: strips off any sub-folders below the from_location
+       #
+       # TODO: this badly needs unit tests
+       def name_file(filepath, new_filename, remove_path=nil, add_path=nil, flatten=false)
+
+         # First, replace the filename in filepath with the new one
+         dirname = File.dirname(filepath)
+         new_filepath = (dirname == '.') ? new_filename : dirname + '/' + new_filename
+
+         # Nothing more to do
+         return new_filepath if remove_path.nil? and add_path.nil? and not flatten
+
+         shortened_filepath = if flatten
+                                # Let's revert to just the filename
+                                new_filename
+                              else
+                                # If we have a 'remove_path', it must be found at
+                                # the start of the path.
+                                # If it's not, you're probably using name_file() wrong.
+                                if !filepath.start_with?(remove_path)
+                                  raise StorageOperationError, "name_file failed. Filepath '#{filepath}' does not start with '#{remove_path}'"
+                                end
+
+                                # Okay, let's remove the filepath
+                                new_filepath[remove_path.length()..-1]
+                              end
+
+         # Nothing more to do
+         return shortened_filepath if add_path.nil?
+
+         # Add the new filepath on to the start and return
+         add_path + shortened_filepath
+       end
+       module_function :name_file
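+
+       # Examples (illustrative input/output pairs, traced from the logic
+       # above):
+       #
+       #   name_file('in/2014/f.gz', 'f.gz')                      # => "in/2014/f.gz"
+       #   name_file('in/2014/f.gz', 'f.gz', 'in/', 'out/')       # => "out/2014/f.gz"
+       #   name_file('in/2014/f.gz', 'f.gz', 'in/', 'out/', true) # => "out/f.gz"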
+
+     end
+   end
+ end