sluice-jason 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.coveralls.yml +1 -0
- data/.gitignore +20 -0
- data/.travis.yml +10 -0
- data/CHANGELOG +94 -0
- data/Gemfile +4 -0
- data/Guardfile +11 -0
- data/LICENSE-2.0.txt +202 -0
- data/README.md +83 -0
- data/Vagrantfile +23 -0
- data/lib/sluice/errors.rb +26 -0
- data/lib/sluice/storage/s3/contracts.rb +32 -0
- data/lib/sluice/storage/s3/location.rb +77 -0
- data/lib/sluice/storage/s3/manifest.rb +129 -0
- data/lib/sluice/storage/s3/s3.rb +704 -0
- data/lib/sluice/storage/storage.rb +111 -0
- data/lib/sluice/version.rb +19 -0
- data/lib/sluice.rb +21 -0
- data/sluice.gemspec +46 -0
- data/spec/spec_helper.rb +4 -0
- data/spec/storage/s3/location_spec.rb +47 -0
- data/spec/storage/s3/s3_spec.rb +42 -0
- data/vagrant/.gitignore +3 -0
- data/vagrant/ansible.hosts +2 -0
- data/vagrant/peru.yaml +14 -0
- data/vagrant/push.bash +79 -0
- data/vagrant/up.bash +50 -0
- data/vagrant/up.guidance +5 -0
- data/vagrant/up.playbooks +1 -0
- metadata +180 -0
data/lib/sluice/storage/s3/s3.rb

@@ -0,0 +1,704 @@
# Copyright (c) 2012-2014 Snowplow Analytics Ltd. All rights reserved.
#
# This program is licensed to you under the Apache License Version 2.0,
# and you may not use this file except in compliance with the Apache License Version 2.0.
# You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the Apache License Version 2.0 is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the Apache License Version 2.0 for the specific language governing permissions and limitations there under.

# Authors:: Alex Dean (mailto:support@snowplowanalytics.com), Michael Tibben
# Copyright:: Copyright (c) 2012-2014 Snowplow Analytics Ltd
# License:: Apache License Version 2.0

require 'tmpdir'
require 'fileutils' # FileUtils is used by download_file below
require 'fog'
require 'thread'
require 'timeout'

require 'contracts'
include Contracts

module Sluice
  module Storage
    module S3

      # TODO: figure out logging instead of puts (https://github.com/snowplow/sluice/issues/2)
      # TODO: consider moving to OO structure (https://github.com/snowplow/sluice/issues/3)

      # Constants
      CONCURRENCY  = 10   # Threads
      RETRIES      = 3    # Attempts
      RETRY_WAIT   = 10   # Seconds
      TIMEOUT_WAIT = 1800 # 30 mins should let even large files upload. +1 https://github.com/snowplow/sluice/issues/7 if this is insufficient or excessive

      # Helper function to instantiate a new Fog::Storage
      # for S3 based on our config options
      #
      # Parameters:
      # +region+:: Amazon S3 region we will be working with
      # +access_key_id+:: AWS access key ID
      # +secret_access_key+:: AWS secret access key
      Contract String, String, String => FogStorage
      def new_fog_s3_from(region, access_key_id, secret_access_key)
        fog = Fog::Storage.new({
          :provider => 'AWS',
          :region => region,
          :aws_access_key_id => access_key_id,
          :aws_secret_access_key => secret_access_key
        })
        fog.sync_clock
        fog
      end
      module_function :new_fog_s3_from
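
      # A minimal usage sketch (illustrative only: the region and the
      # environment variables holding the credentials are placeholders,
      # not values defined by this gem):
      #
      #   s3 = Sluice::Storage::S3.new_fog_s3_from(
      #     'us-east-1',
      #     ENV['AWS_ACCESS_KEY_ID'],
      #     ENV['AWS_SECRET_ACCESS_KEY'])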

      # Return an array of all Fog::Storage::AWS::File's
      #
      # Parameters:
      # +s3+:: A Fog::Storage s3 connection
      # +location+:: The location to return files from
      #
      # Returns array of Fog::Storage::AWS::File's
      Contract FogStorage, Location => ArrayOf[FogFile]
      def list_files(s3, location)
        files_and_dirs = s3.directories.get(location.bucket, prefix: location.dir_as_path).files

        files = [] # Can't use a .select because of Ruby deep copy issues (array of non-POROs)
        files_and_dirs.each { |f|
          if is_file?(f.key)
            files << f.dup
          end
        }
        files
      end
      module_function :list_files
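
      # Example (illustrative; the bucket and prefix are placeholders, and
      # the Location constructor is defined in
      # data/lib/sluice/storage/s3/location.rb):
      #
      #   in_location = Sluice::Storage::S3::Location.new('s3://my-bucket/events/in/')
      #   Sluice::Storage::S3.list_files(s3, in_location).each { |f| puts f.key }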

      # Whether the given path is a directory or not
      #
      # Parameters:
      # +path+:: S3 path in String form
      #
      # Returns boolean
      Contract String => Bool
      def is_folder?(path)
        (path.end_with?('_$folder$') || # EMR-created
         path.end_with?('/'))
      end
      module_function :is_folder?

      # Whether the given path is a file or not
      #
      # Parameters:
      # +path+:: S3 path in String form
      #
      # Returns boolean
      Contract String => Bool
      def is_file?(path)
        !is_folder?(path)
      end
      module_function :is_file?

      # Returns the basename for the given path
      #
      # Parameters:
      # +path+:: S3 path in String form
      #
      # Returns the basename, or nil if the
      # path is to a folder
      Contract String => Maybe[String]
      def get_basename(path)
        if is_folder?(path)
          nil
        else
          match = path.match('([^/]+)$')
          if match
            match[1]
          else
            nil
          end
        end
      end
      module_function :get_basename

      # Determine if a bucket is empty
      #
      # Parameters:
      # +s3+:: A Fog::Storage s3 connection
      # +location+:: The location to check
      Contract FogStorage, Location => Bool
      def is_empty?(s3, location)
        list_files(s3, location).length == 0
      end
      module_function :is_empty?
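
      # Behaviour sketch for the path helpers above (folder keys end with
      # '/' or the EMR-created '_$folder$' suffix; everything else is a file):
      #
      #   Sluice::Storage::S3.is_folder?('pipeline/in/')              # => true
      #   Sluice::Storage::S3.is_file?('pipeline/in/part-00000')     # => true
      #   Sluice::Storage::S3.get_basename('pipeline/in/part-00000') # => "part-00000"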

      # Download files from an S3 location to
      # local storage, concurrently
      #
      # Parameters:
      # +s3+:: A Fog::Storage s3 connection
      # +from_files_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, or S3Location to download files from
      # +to_directory+:: Local directory to copy files to
      # +match_regex+:: a regex string to match the files to download
      def download_files(s3, from_files_or_loc, to_directory, match_regex='.+')

        puts " downloading #{describe_from(from_files_or_loc)} to #{to_directory}"
        process_files(:download, s3, from_files_or_loc, [], match_regex, to_directory)
      end
      module_function :download_files

      # Delete files from S3 locations concurrently
      #
      # Parameters:
      # +s3+:: A Fog::Storage s3 connection
      # +from_files_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, or S3Location to delete files from
      # +match_regex+:: a regex string to match the files to delete
      def delete_files(s3, from_files_or_loc, match_regex='.+')

        puts " deleting #{describe_from(from_files_or_loc)}"
        process_files(:delete, s3, from_files_or_loc, [], match_regex)
      end
      module_function :delete_files
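
      # Example (illustrative; the bucket, prefix, local directory and regex
      # are placeholders):
      #
      #   raw = Sluice::Storage::S3::Location.new('s3://my-bucket/raw/')
      #   Sluice::Storage::S3.download_files(s3, raw, '/tmp/raw/', '.+\.gz$')
      #   Sluice::Storage::S3.delete_files(s3, raw, '.+\.gz$')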

      # Copies files between S3 locations in two different accounts
      #
      # Implementation is as follows:
      # 1. Concurrent download of all files from S3 source to local tmpdir
      # 2. Concurrent upload of all files from local tmpdir to S3 target
      #
      # In other words, the download and upload are not interleaved (which is
      # inefficient because upload speeds are much lower than download speeds)
      #
      # Parameters:
      # +from_s3+:: A Fog::Storage s3 connection for accessing the from S3Location
      # +to_s3+:: A Fog::Storage s3 connection for accessing the to S3Location
      # +from_location+:: S3Location to copy files from
      # +to_location+:: S3Location to copy files to
      # +match_regex+:: a regex string to match the files to copy
      # +alter_filename_lambda+:: lambda to alter the written filename
      # +flatten+:: strips off any sub-folders below the from_location
      def copy_files_inter(from_s3, to_s3, from_location, to_location, match_regex='.+', alter_filename_lambda=false, flatten=false)

        puts " copying inter-account #{describe_from(from_location)} to #{to_location}"
        processed = []
        Dir.mktmpdir do |t|
          tmp = Sluice::Storage.trail_slash(t)
          processed = download_files(from_s3, from_location, tmp, match_regex)
          upload_files(to_s3, tmp, to_location, '**/*') # Upload all files we downloaded
        end

        processed
      end
      module_function :copy_files_inter
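
      # Example (illustrative; each connection is built with new_fog_s3_from
      # above, using that account's own credentials):
      #
      #   acct_a = Sluice::Storage::S3.new_fog_s3_from('us-east-1', a_key, a_secret)
      #   acct_b = Sluice::Storage::S3.new_fog_s3_from('eu-west-1', b_key, b_secret)
      #   Sluice::Storage::S3.copy_files_inter(acct_a, acct_b, from_loc, to_loc)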

      # Copies files between S3 locations concurrently
      #
      # Parameters:
      # +s3+:: A Fog::Storage s3 connection
      # +from_files_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, or S3Location to copy files from
      # +to_location+:: S3Location to copy files to
      # +match_regex+:: a regex string to match the files to copy
      # +alter_filename_lambda+:: lambda to alter the written filename
      # +flatten+:: strips off any sub-folders below the from_location
      def copy_files(s3, from_files_or_loc, to_location, match_regex='.+', alter_filename_lambda=false, flatten=false)

        puts " copying #{describe_from(from_files_or_loc)} to #{to_location}"
        process_files(:copy, s3, from_files_or_loc, [], match_regex, to_location, alter_filename_lambda, flatten)
      end
      module_function :copy_files

      # Copies files between S3 locations maintaining a manifest to
      # avoid copying a file which was copied previously.
      #
      # Useful in scenarios such as:
      # 1. You would like to do a move but only have read permission
      #    on the source bucket
      # 2. You would like to do a move but some other process needs
      #    to use the files after you
      #
      # Parameters:
      # +s3+:: A Fog::Storage s3 connection
      # +manifest+:: A Sluice::Storage::S3::Manifest object
      # +from_files_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, or S3Location to copy files from
      # +to_location+:: S3Location to copy files to
      # +match_regex+:: a regex string to match the files to copy
      # +alter_filename_lambda+:: lambda to alter the written filename
      # +flatten+:: strips off any sub-folders below the from_location
      def copy_files_manifest(s3, manifest, from_files_or_loc, to_location, match_regex='.+', alter_filename_lambda=false, flatten=false)

        puts " copying with manifest #{describe_from(from_files_or_loc)} to #{to_location}"
        ignore = manifest.get_entries(s3) # Files to leave untouched
        processed = process_files(:copy, s3, from_files_or_loc, ignore, match_regex, to_location, alter_filename_lambda, flatten)
        manifest.add_entries(s3, processed)

        processed
      end
      module_function :copy_files_manifest
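
      # Example (illustrative; the Manifest is constructed per
      # data/lib/sluice/storage/s3/manifest.rb, which is not shown in this
      # hunk). Only files absent from the manifest are copied, and the
      # manifest is then updated with what was processed:
      #
      #   manifest = ... # a Sluice::Storage::S3::Manifest
      #   Sluice::Storage::S3.copy_files_manifest(s3, manifest, from_loc, to_loc)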

      # Moves files between S3 locations in two different accounts
      #
      # Implementation is as follows:
      # 1. Concurrent download of all files from S3 source to local tmpdir
      # 2. Concurrent upload of all files from local tmpdir to S3 target
      # 3. Concurrent deletion of all files from S3 source
      #
      # In other words, the three operations are not interleaved (which is
      # inefficient because upload speeds are much lower than download speeds)
      #
      # Parameters:
      # +from_s3+:: A Fog::Storage s3 connection for accessing the from S3Location
      # +to_s3+:: A Fog::Storage s3 connection for accessing the to S3Location
      # +from_location+:: S3Location to move files from
      # +to_location+:: S3Location to move files to
      # +match_regex+:: a regex string to match the files to move
      # +alter_filename_lambda+:: lambda to alter the written filename
      # +flatten+:: strips off any sub-folders below the from_location
      def move_files_inter(from_s3, to_s3, from_location, to_location, match_regex='.+', alter_filename_lambda=false, flatten=false)

        puts " moving inter-account #{describe_from(from_location)} to #{to_location}"
        processed = []
        Dir.mktmpdir do |t|
          tmp = Sluice::Storage.trail_slash(t)
          processed = download_files(from_s3, from_location, tmp, match_regex)
          upload_files(to_s3, tmp, to_location, '**/*') # Upload all files we downloaded
          delete_files(from_s3, from_location, '.+') # Delete all files we downloaded
        end

        processed
      end
      module_function :move_files_inter

      # Moves files between S3 locations concurrently
      #
      # Parameters:
      # +s3+:: A Fog::Storage s3 connection
      # +from_files_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, or S3Location to move files from
      # +to_location+:: S3Location to move files to
      # +match_regex+:: a regex string to match the files to move
      # +alter_filename_lambda+:: lambda to alter the written filename
      # +flatten+:: strips off any sub-folders below the from_location
      def move_files(s3, from_files_or_loc, to_location, match_regex='.+', alter_filename_lambda=false, flatten=false)

        puts " moving #{describe_from(from_files_or_loc)} to #{to_location}"
        process_files(:move, s3, from_files_or_loc, [], match_regex, to_location, alter_filename_lambda, flatten)
      end
      module_function :move_files
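
      # Example (illustrative): move only .tsv files, flattening any
      # sub-folders under the source prefix into the target:
      #
      #   Sluice::Storage::S3.move_files(s3, from_loc, to_loc, '.+\.tsv$', false, true)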

      # Uploads files to S3 locations concurrently
      #
      # Parameters:
      # +s3+:: A Fog::Storage s3 connection
      # +from_files_or_dir+:: Local array of files or local directory to upload files from
      # +to_location+:: S3Location to upload files to
      # +match_glob+:: a filesystem glob to match the files to upload
      def upload_files(s3, from_files_or_dir, to_location, match_glob='*')

        puts " uploading #{describe_from(from_files_or_dir)} to #{to_location}"
        process_files(:upload, s3, from_files_or_dir, [], match_glob, to_location)
      end
      module_function :upload_files

      # Upload a single file to the exact location specified.
      # Has no intelligence around filenaming.
      #
      # Parameters:
      # +s3+:: A Fog::Storage s3 connection
      # +from_file+:: A local file path
      # +to_bucket+:: The Fog::Directory to upload to
      # +to_file+:: The file path to upload to
      def upload_file(s3, from_file, to_bucket, to_file)

        local_file = File.open(from_file)

        dir = s3.directories.new(:key => to_bucket) # No request made
        file = dir.files.new( # new, not create: the save below performs the single, SSE-encrypted upload
          :key => to_file,
          :body => local_file
        )
        file.save('x-amz-server-side-encryption' => 'AES256')
        local_file.close
      end
      module_function :upload_file
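
      # Example (illustrative; the local directory and glob are placeholders):
      #
      #   Sluice::Storage::S3.upload_files(s3, '/tmp/out/', to_loc, '**/*.gz')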

      # Download a single file to the exact path specified.
      # Has no intelligence around filenaming.
      # Makes sure to create the path as needed.
      #
      # Parameters:
      # +s3+:: A Fog::Storage s3 connection
      # +from_file+:: A Fog::Storage::AWS::File to download
      # +to_file+:: A local file path
      def download_file(s3, from_file, to_file)

        FileUtils.mkdir_p(File.dirname(to_file))

        # TODO: deal with bug where Fog hangs indefinitely if network connection dies during download

        local_file = File.open(to_file, "w")
        local_file.write(from_file.body)
        local_file.close
      end
      module_function :download_file

      private

      # Provides string describing from_files_or_dir_or_loc
      # for logging purposes.
      #
      # Parameters:
      # +from_files_or_dir_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, local directory or S3Location to process files from
      #
      # Returns a log-friendly string
      def describe_from(from_files_or_dir_or_loc)
        if from_files_or_dir_or_loc.is_a?(Array)
          "#{from_files_or_dir_or_loc.length} file(s)"
        else
          "files from #{from_files_or_dir_or_loc}"
        end
      end
      module_function :describe_from

      # Concurrent file operations between S3 locations. Supports:
      # - Download
      # - Upload
      # - Copy
      # - Delete
      # - Move (= Copy + Delete)
      #
      # Parameters:
      # +operation+:: Operation to perform. :download, :upload, :copy, :delete, :move supported
      # +s3+:: A Fog::Storage s3 connection
      # +from_files_or_dir_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, local directory or S3Location to process files from
      # +ignore+:: Array of filenames to ignore (used by manifest code)
      # +match_regex_or_glob+:: a regex or glob string to match the files to process
      # +to_loc_or_dir+:: S3Location or local directory to process files to
      # +alter_filename_lambda+:: lambda to alter the written filename
      # +flatten+:: strips off any sub-folders below the from_loc_or_dir
      def process_files(operation, s3, from_files_or_dir_or_loc, ignore=[], match_regex_or_glob='.+', to_loc_or_dir=nil, alter_filename_lambda=false, flatten=false)

        # Validate that the file operation makes sense
        case operation
        when :copy, :move, :download, :upload
          if to_loc_or_dir.nil?
            raise StorageOperationError, "File operation %s requires the to_loc_or_dir to be set" % operation
          end
        when :delete
          unless to_loc_or_dir.nil?
            raise StorageOperationError, "File operation %s does not support the to_loc_or_dir argument" % operation
          end
          if alter_filename_lambda.class == Proc
            raise StorageOperationError, "File operation %s does not support the alter_filename_lambda argument" % operation
          end
        else
          raise StorageOperationError, "File operation %s is unsupported. Try :download, :upload, :copy, :delete or :move" % operation
        end

        # If we have an array of files, no additional globbing required
        if from_files_or_dir_or_loc.is_a?(Array)
          files_to_process = from_files_or_dir_or_loc # Could be filepaths or Fog::Storage::AWS::File's
          globbed = true
        # Otherwise if it's an upload, we can glob now
        elsif operation == :upload
          files_to_process = glob_files(from_files_or_dir_or_loc, match_regex_or_glob)
          globbed = true
        # Otherwise we'll do threaded globbing later...
        else
          files_to_process = []
          from_loc = from_files_or_dir_or_loc # Alias
          globbed = false
        end

        threads = []
        mutex = Mutex.new
        complete = false
        marker_opts = {}
        processed_files = [] # For manifest updating, determining if any files were moved etc

        # If an exception is thrown in a thread that isn't handled, die quickly
        Thread.abort_on_exception = true

        # Create Ruby threads to concurrently execute s3 operations
        for i in (0...CONCURRENCY)

          # Each thread pops a file off the files_to_process array, and moves it.
          # We loop until there are no more files
          threads << Thread.new(i) do |thread_idx|

            loop do
              file = false
              filepath = false
              from_bucket = false
              from_path = false
              match = false

              # First critical section:
              # only allow one thread to modify the array at any time
              mutex.synchronize do

                # No need to do further globbing
                if globbed
                  if files_to_process.size == 0
                    complete = true
                    next
                  end

                  file = files_to_process.pop
                  # Support raw filenames and also Fog::Storage::AWS::File's
                  if (file.is_a?(Fog::Storage::AWS::File))
                    from_bucket = file.directory.key # Bucket
                    from_path = Sluice::Storage.trail_slash(File.dirname(file.key))
                    filepath = file.key
                  else
                    from_bucket = nil # Not used
                    if from_files_or_dir_or_loc.is_a?(Array)
                      from_path = Sluice::Storage.trail_slash(File.dirname(file))
                    else
                      from_path = from_files_or_dir_or_loc # The root dir
                    end
                    filepath = file
                  end

                  match = true # Match is implicit in the glob
                else

                  while !complete && !match do
                    if files_to_process.size == 0
                      # S3 batches 1000 files per request.
                      # We load up our array with the files to move
                      files_to_process = s3.directories.get(from_loc.bucket, :prefix => from_loc.dir).files.all(marker_opts).to_a
                      # If we don't have any files after the S3 request, we're complete
                      if files_to_process.size == 0
                        complete = true
                        next
                      else
                        marker_opts['marker'] = files_to_process.last.key

                        # By reversing the array we can use pop and get FIFO behaviour
                        # instead of the performance penalty incurred by unshift
                        files_to_process = files_to_process.reverse
                      end
                    end

                    file = files_to_process.pop
                    from_bucket = from_loc.bucket
                    from_path = from_loc.dir_as_path
                    filepath = file.key

                    # TODO: clean up following https://github.com/snowplow/sluice/issues/25
                    match = if match_regex_or_glob.is_a? NegativeRegex
                              !filepath.match(match_regex_or_glob.regex)
                            else
                              filepath.match(match_regex_or_glob)
                            end

                  end
                end
              end
              # End of mutex.synchronize

              # Kill this thread's loop (and thus this thread) if we are complete
              break if complete

              # Skip processing for a folder or file which doesn't match our regexp or glob
              next if is_folder?(filepath) or not match

              # Name file
              basename = get_basename(filepath)
              next if ignore.include?(basename) # Don't process if in our leave list

              filename = rename_file(filepath, basename, alter_filename_lambda)

              # What are we doing? Let's determine source and target
              # Note that target excludes bucket name where relevant
              case operation
              when :upload
                source = "#{filepath}"
                target = name_file(filepath, filename, from_path, to_loc_or_dir.dir_as_path, flatten)
                puts "(t#{thread_idx}) UPLOAD #{source} +-> #{to_loc_or_dir.bucket}/#{target}"
              when :download
                source = "#{from_bucket}/#{filepath}"
                target = name_file(filepath, filename, from_path, to_loc_or_dir, flatten)
                puts "(t#{thread_idx}) DOWNLOAD #{source} +-> #{target}"
              when :move
                source = "#{from_bucket}/#{filepath}"
                target = name_file(filepath, filename, from_path, to_loc_or_dir.dir_as_path, flatten)
                puts "(t#{thread_idx}) MOVE #{source} -> #{to_loc_or_dir.bucket}/#{target}"
              when :copy
                source = "#{from_bucket}/#{filepath}"
                target = name_file(filepath, filename, from_path, to_loc_or_dir.dir_as_path, flatten)
                puts "(t#{thread_idx}) COPY #{source} +-> #{to_loc_or_dir.bucket}/#{target}"
              when :delete
                source = "#{from_bucket}/#{filepath}"
                # No target
                puts "(t#{thread_idx}) DELETE x #{source}"
              end

              # Upload is a standalone operation vs move/copy/delete
              if operation == :upload
                retry_x(
                  Sluice::Storage::S3,
                  [:upload_file, s3, filepath, to_loc_or_dir.bucket, target],
                  RETRIES,
                  " +/> #{target}",
                  "Problem uploading #{filepath}. Retrying.")
              end

              # Download is a standalone operation vs move/copy/delete
              if operation == :download
                retry_x(
                  Sluice::Storage::S3,
                  [:download_file, s3, file, target],
                  RETRIES,
                  " +/> #{target}",
                  "Problem downloading #{filepath}. Retrying.")
              end

              # A move or copy starts with a copy file
              if [:move, :copy].include? operation
                retry_x(
                  file,
                  [:copy, to_loc_or_dir.bucket, target],
                  RETRIES,
                  " +-> #{to_loc_or_dir.bucket}/#{target}",
                  "Problem copying #{filepath}. Retrying.")
              end

              # A move or delete ends with a delete
              if [:move, :delete].include? operation
                retry_x(
                  file,
                  [:destroy],
                  RETRIES,
                  " x #{source}",
                  "Problem destroying #{filepath}. Retrying.")
              end

              # Second critical section: we need to update
              # processed_files in a thread-safe way
              mutex.synchronize do
                processed_files << filepath
              end
            end
          end
        end

        # Wait for threads to finish
        threads.each { |aThread| aThread.join }

        processed_files # Return the processed files
      end
      module_function :process_files
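
      # A note on the reverse-then-pop trick above (illustrative): each
      # 1000-key listing page is reversed once so that threads can take the
      # oldest key with the cheap Array#pop instead of paying Array#unshift
      # costs to maintain FIFO order:
      #
      #   page = ['k1', 'k2', 'k3'].reverse # => ["k3", "k2", "k1"]
      #   page.pop                          # => "k1" (first-listed key)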

      # A helper function to rename a file
      # TODO: fixup lambda to be Maybe[Proc]
      Contract String, Maybe[String], Or[Proc, Bool] => Maybe[String]
      def self.rename_file(filepath, basename, rename_lambda=false)

        if rename_lambda.class == Proc
          case rename_lambda.arity
          when 2
            rename_lambda.call(basename, filepath)
          when 1
            rename_lambda.call(basename)
          when 0
            rename_lambda.call()
          else
            raise StorageOperationError, "Expect arity of 0, 1 or 2 for rename_lambda, not #{rename_lambda.arity}"
          end
        else
          basename
        end
      end
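
      # Example of the arity-based dispatch above (illustrative):
      #
      #   add_prefix = lambda { |basename| "archived-#{basename}" }
      #   Sluice::Storage::S3.rename_file('in/part-00000', 'part-00000', add_prefix)
      #   # => "archived-part-00000"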

      # A helper function to list all files
      # recursively in a folder
      #
      # Parameters:
      # +dir+:: Directory to list files recursively
      # +glob+:: a filesystem glob to match the files to list
      #
      # Returns array of files (no sub-directories)
      def glob_files(dir, glob)
        Dir.glob(File.join(dir, glob)).select { |f|
          File.file?(f) # Drop sub-directories
        }
      end
      module_function :glob_files

      # A helper function to attempt to run a
      # function retries times
      #
      # Parameters:
      # +object+:: Object to send our function to
      # +send_args+:: Function plus arguments
      # +retries+:: Number of retries to attempt
      # +attempt_msg+:: Message to puts on each attempt
      # +failure_msg+:: Message to puts on each failure
      def retry_x(object, send_args, retries, attempt_msg, failure_msg)
        i = 0
        begin
          Timeout::timeout(TIMEOUT_WAIT) do # In case our operation times out
            object.send(*send_args)
            puts attempt_msg
          end
        rescue
          raise unless i < retries
          puts failure_msg
          sleep(RETRY_WAIT) # Give us a bit of time before retrying
          i += 1
          retry
        end
      end
      module_function :retry_x
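
      # retry_x dispatches via Object#send, so send_args is a method name
      # followed by its arguments (illustrative; the local path is a
      # placeholder):
      #
      #   retry_x(Sluice::Storage::S3,
      #           [:download_file, s3, file, '/tmp/part-00000'],
      #           RETRIES,
      #           " +/> /tmp/part-00000",
      #           "Problem downloading. Retrying.")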

      # A helper function to prepare destination
      # filenames and paths. This is a bit weird
      # - it needs to exist because of differences
      # in the way that Amazon S3, Fog and Unix
      # treat filepaths versus keys.
      #
      # Parameters:
      # +filepath+:: Path to file (including old filename)
      # +new_filename+:: Replace the filename in the path with this
      # +remove_path+:: If this is set, strip this from the front of the path
      # +add_path+:: If this is set, add this to the front of the path
      # +flatten+:: strips off any sub-folders below the from_location
      #
      # TODO: this badly needs unit tests
      def name_file(filepath, new_filename, remove_path=nil, add_path=nil, flatten=false)

        # First, replace the filename in filepath with new one
        dirname = File.dirname(filepath)
        new_filepath = (dirname == '.') ? new_filename : dirname + '/' + new_filename

        # Nothing more to do
        return new_filepath if remove_path.nil? and add_path.nil? and not flatten

        shortened_filepath = if flatten
                               # Let's revert to just the filename
                               new_filename
                             else
                               # If we have a 'remove_path', it must be found at
                               # the start of the path.
                               # If it's not, you're probably using name_file()
                               # wrong.
                               if !filepath.start_with?(remove_path)
                                 raise StorageOperationError, "name_file failed. Filepath '#{filepath}' does not start with '#{remove_path}'"
                               end

                               # Okay, let's remove the filepath
                               new_filepath[remove_path.length()..-1]
                             end

        # Nothing more to do
        return shortened_filepath if add_path.nil?

        # Add the new filepath on to the start and return
        return add_path + shortened_filepath
      end
      module_function :name_file
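
      # Worked examples for name_file (illustrative):
      #
      #   name_file('in/sub/f.gz', 'f.gz')                      # => "in/sub/f.gz"
      #   name_file('in/sub/f.gz', 'f.gz', 'in/', 'out/')       # => "out/sub/f.gz"
      #   name_file('in/sub/f.gz', 'f.gz', 'in/', 'out/', true) # => "out/f.gz"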

    end
  end
end