sluice 0.0.6 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +2 -0
- data/lib/sluice.rb +1 -1
- data/lib/sluice/storage/s3.rb +117 -42
- metadata +33 -51
data/README.md
CHANGED
@@ -2,6 +2,8 @@
|
|
2
2
|
|
3
3
|
Sluice is a Ruby gem (built with [Bundler] [bundler]) to help you build cloud-friendly ETL (extract, transform, load) processes.
|
4
4
|
|
5
|
+
**Currently it does one thing: supports very robust, very parallel copy/delete/move of S3 files from one bucket to another.**
|
6
|
+
|
5
7
|
Sluice has been extracted from a pair of Ruby ETL applications built by the [SnowPlow Analytics] [snowplow-analytics] team, specifically:
|
6
8
|
|
7
9
|
1. [EmrEtlRunner] [emr-etl-runner], a Ruby application to run the SnowPlow ETL process on Elastic MapReduce
|
data/lib/sluice.rb
CHANGED
data/lib/sluice/storage/s3.rb
CHANGED
@@ -135,7 +135,7 @@ module Sluice
|
|
135
135
|
# Parameters:
|
136
136
|
# +s3+:: A Fog::Storage s3 connection
|
137
137
|
# +from_location+:: S3Location to move files from
|
138
|
-
# +
|
138
|
+
# +to_location+:: S3Location to move files to
|
139
139
|
# +match_regex+:: a regex string to match the files to move
|
140
140
|
# +alter_filename_lambda+:: lambda to alter the written filename
|
141
141
|
# +flatten+:: strips off any sub-folders below the from_location
|
@@ -146,7 +146,43 @@ module Sluice
|
|
146
146
|
end
|
147
147
|
module_function :move_files
|
148
148
|
|
149
|
-
#
|
149
|
+
# Uploads files to S3 locations concurrently
|
150
|
+
#
|
151
|
+
# Parameters:
|
152
|
+
# +s3+:: A Fog::Storage s3 connection
|
153
|
+
# +from_directory+:: Local directory to upload files from
|
154
|
+
# +to_location+:: S3Location to upload files to
|
155
|
+
# +match_glob+:: a filesystem glob to match the files to upload
|
156
|
+
def upload_files(s3, from_directory, to_location, match_glob='*')
|
157
|
+
|
158
|
+
puts " uploading files from #{from_directory} to #{to_location}"
|
159
|
+
process_files(:upload, s3, from_directory, match_glob, to_location)
|
160
|
+
end
|
161
|
+
module_function :upload_files
|
162
|
+
|
163
|
+
# Upload a single file to the exact location specified
|
164
|
+
# Has no intelligence around filenaming.
|
165
|
+
#
|
166
|
+
# Parameters:
|
167
|
+
# +s3+:: A Fog::Storage s3 connection
|
168
|
+
# +from_file+:: A local file path
|
169
|
+
# +to_bucket+:: The name (key) of the S3 bucket to upload to
|
170
|
+
# +to_file+:: The file path to upload to
|
171
|
+
def upload_file(s3, from_file, to_bucket, to_file)
|
172
|
+
|
173
|
+
local_file = File.open(from_file)
|
174
|
+
|
175
|
+
dir = s3.directories.new(:key => to_bucket) # No request made
|
176
|
+
file = dir.files.create(
|
177
|
+
:key => to_file,
|
178
|
+
:body => local_file
|
179
|
+
)
|
180
|
+
|
181
|
+
local_file.close
|
182
|
+
end
|
183
|
+
module_function :upload_file
|
184
|
+
|
185
|
+
# Download a single file to the exact path specified
|
150
186
|
# Has no intelligence around filenaming.
|
151
187
|
# Makes sure to create the path as needed.
|
152
188
|
#
|
@@ -169,23 +205,25 @@ module Sluice
|
|
169
205
|
private
|
170
206
|
|
171
207
|
# Concurrent file operations between S3 locations and/or local directories. Supports:
|
208
|
+
# - Download
|
209
|
+
# - Upload
|
172
210
|
# - Copy
|
173
211
|
# - Delete
|
174
212
|
# - Move (= Copy + Delete)
|
175
213
|
#
|
176
214
|
# Parameters:
|
177
|
-
# +operation+:: Operation to perform. :copy, :delete, :move supported
|
215
|
+
# +operation+:: Operation to perform. :download, :upload, :copy, :delete, :move supported
|
178
216
|
# +s3+:: A Fog::Storage s3 connection
|
179
|
-
# +
|
180
|
-
# +
|
217
|
+
# +from_loc_or_dir+:: S3Location or local directory to process files from
|
218
|
+
# +match_regex_or_glob+:: a regex or glob string to match the files to process
|
181
219
|
# +to_loc_or_dir+:: S3Location or local directory to process files to
|
182
220
|
# +alter_filename_lambda+:: lambda to alter the written filename
|
183
|
-
# +flatten+:: strips off any sub-folders below the
|
184
|
-
def process_files(operation, s3,
|
221
|
+
# +flatten+:: strips off any sub-folders below the from_loc_or_dir
|
222
|
+
def process_files(operation, s3, from_loc_or_dir, match_regex_or_glob='.+', to_loc_or_dir=nil, alter_filename_lambda=false, flatten=false)
|
185
223
|
|
186
224
|
# Validate that the file operation makes sense
|
187
225
|
case operation
|
188
|
-
when :copy, :move, :download
|
226
|
+
when :copy, :move, :download, :upload
|
189
227
|
if to_loc_or_dir.nil?
|
190
228
|
raise StorageOperationError "File operation %s requires the to_loc_or_dir to be set" % operation
|
191
229
|
end
|
@@ -197,10 +235,15 @@ module Sluice
|
|
197
235
|
raise StorageOperationError "File operation %s does not support the alter_filename_lambda argument" % operation
|
198
236
|
end
|
199
237
|
else
|
200
|
-
raise StorageOperationError "File operation %s is unsupported. Try :download, :copy, :delete or :move" % operation
|
238
|
+
raise StorageOperationError "File operation %s is unsupported. Try :download, :upload, :copy, :delete or :move" % operation
|
201
239
|
end
|
202
240
|
|
203
|
-
|
241
|
+
# If we are uploading, then we can glob the files before we thread
|
242
|
+
if operation == :upload
|
243
|
+
files_to_process = Dir.glob(File.join(from_loc_or_dir, match_regex_or_glob))
|
244
|
+
else
|
245
|
+
files_to_process = []
|
246
|
+
end
|
204
247
|
threads = []
|
205
248
|
mutex = Mutex.new
|
206
249
|
complete = false
|
@@ -217,37 +260,51 @@ module Sluice
|
|
217
260
|
threads << Thread.new do
|
218
261
|
loop do
|
219
262
|
file = false
|
263
|
+
filepath = false
|
220
264
|
match = false
|
221
265
|
|
222
266
|
# Critical section:
|
223
267
|
# only allow one thread to modify the array at any time
|
224
268
|
mutex.synchronize do
|
225
269
|
|
226
|
-
|
270
|
+
if operation == :upload
|
271
|
+
|
227
272
|
if files_to_process.size == 0
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
273
|
+
complete = true
|
274
|
+
next
|
275
|
+
end
|
276
|
+
|
277
|
+
filepath = files_to_process.pop
|
278
|
+
match = true # Match is implicit in the glob
|
279
|
+
else
|
280
|
+
|
281
|
+
while !complete && !match do
|
232
282
|
if files_to_process.size == 0
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
283
|
+
# S3 batches 1000 files per request.
|
284
|
+
# We load up our array with the files to move
|
285
|
+
files_to_process = s3.directories.get(from_loc_or_dir.bucket, :prefix => from_loc_or_dir.dir).files.all(marker_opts)
|
286
|
+
# If we don't have any files after the s3 request, we're complete
|
287
|
+
if files_to_process.size == 0
|
288
|
+
complete = true
|
289
|
+
next
|
290
|
+
else
|
291
|
+
marker_opts['marker'] = files_to_process.last.key
|
292
|
+
|
293
|
+
# By reversing the array we can use pop and get FIFO behaviour
|
294
|
+
# instead of the performance penalty incurred by unshift
|
295
|
+
files_to_process = files_to_process.reverse
|
296
|
+
end
|
241
297
|
end
|
242
|
-
end
|
243
298
|
|
244
|
-
|
299
|
+
file = files_to_process.pop
|
300
|
+
filepath = file.key
|
245
301
|
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
302
|
+
match = if match_regex_or_glob.is_a? NegativeRegex
|
303
|
+
!filepath.match(match_regex_or_glob.regex)
|
304
|
+
else
|
305
|
+
filepath.match(match_regex_or_glob)
|
306
|
+
end
|
307
|
+
end
|
251
308
|
end
|
252
309
|
end
|
253
310
|
|
@@ -255,12 +312,13 @@ module Sluice
|
|
255
312
|
break unless match
|
256
313
|
|
257
314
|
# Ignore any EMR-created _$folder$ entries
|
258
|
-
break if
|
315
|
+
break if filepath.end_with?('_$folder$')
|
259
316
|
|
260
|
-
# Match the filename, ignoring
|
261
|
-
file_match =
|
317
|
+
# Match the filename, ignoring directories
|
318
|
+
file_match = filepath.match('([^/]+)$')
|
262
319
|
break unless file_match
|
263
320
|
|
321
|
+
# Rename
|
264
322
|
if alter_filename_lambda.class == Proc
|
265
323
|
filename = alter_filename_lambda.call(file_match[1])
|
266
324
|
else
|
@@ -269,30 +327,47 @@ module Sluice
|
|
269
327
|
|
270
328
|
# What are we doing? Let's determine source and target
|
271
329
|
# Note that target excludes bucket name where relevant
|
272
|
-
source = "#{from_location.bucket}/#{file.key}"
|
273
330
|
case operation
|
331
|
+
when :upload
|
332
|
+
source = "#{filepath}"
|
333
|
+
target = name_file(filepath, filename, from_loc_or_dir, to_loc_or_dir.dir_as_path, flatten)
|
334
|
+
puts " UPLOAD #{source} +-> #{to_loc_or_dir.bucket}/#{target}"
|
274
335
|
when :download
|
275
|
-
|
336
|
+
source = "#{from_loc_or_dir.bucket}/#{filepath}"
|
337
|
+
target = name_file(filepath, filename, from_loc_or_dir.dir_as_path, to_loc_or_dir, flatten)
|
276
338
|
puts " DOWNLOAD #{source} +-> #{target}"
|
277
339
|
when :move
|
278
|
-
|
340
|
+
source = "#{from_loc_or_dir.bucket}/#{filepath}"
|
341
|
+
target = name_file(filepath, filename, from_loc_or_dir.dir_as_path, to_loc_or_dir.dir_as_path, flatten)
|
279
342
|
puts " MOVE #{source} -> #{to_loc_or_dir.bucket}/#{target}"
|
280
343
|
when :copy
|
281
|
-
|
344
|
+
source = "#{from_loc_or_dir.bucket}/#{filepath}"
|
345
|
+
target = name_file(filepath, filename, from_loc_or_dir.dir_as_path, to_loc_or_dir.dir_as_path, flatten)
|
282
346
|
puts " COPY #{source} +-> #{to_loc_or_dir.bucket}/#{target}"
|
283
347
|
when :delete
|
348
|
+
source = "#{from_loc_or_dir.bucket}/#{filepath}"
|
284
349
|
# No target
|
285
350
|
puts " DELETE x #{source}"
|
286
351
|
end
|
287
352
|
|
288
|
-
#
|
353
|
+
# Upload is a standalone operation vs move/copy/delete
|
354
|
+
if operation == :upload
|
355
|
+
retry_x(
|
356
|
+
Sluice::Storage::S3,
|
357
|
+
[:upload_file, s3, filepath, to_loc_or_dir.bucket, target],
|
358
|
+
RETRIES,
|
359
|
+
" +/> #{target}",
|
360
|
+
"Problem uploading #{filepath}. Retrying.")
|
361
|
+
end
|
362
|
+
|
363
|
+
# Download is a standalone operation vs move/copy/delete
|
289
364
|
if operation == :download
|
290
365
|
retry_x(
|
291
366
|
Sluice::Storage::S3,
|
292
367
|
[:download_file, s3, file, target],
|
293
368
|
RETRIES,
|
294
369
|
" +/> #{target}",
|
295
|
-
"Problem downloading #{
|
370
|
+
"Problem downloading #{filepath}. Retrying.")
|
296
371
|
end
|
297
372
|
|
298
373
|
# A move or copy starts with a copy file
|
@@ -302,7 +377,7 @@ module Sluice
|
|
302
377
|
[:copy, to_loc_or_dir.bucket, target],
|
303
378
|
RETRIES,
|
304
379
|
" +-> #{to_loc_or_dir.bucket}/#{target}",
|
305
|
-
"Problem copying #{
|
380
|
+
"Problem copying #{filepath}. Retrying.")
|
306
381
|
end
|
307
382
|
|
308
383
|
# A move or delete ends with a delete
|
@@ -312,7 +387,7 @@ module Sluice
|
|
312
387
|
[:destroy],
|
313
388
|
RETRIES,
|
314
389
|
" x #{source}",
|
315
|
-
"Problem destroying #{
|
390
|
+
"Problem destroying #{filepath}. Retrying.")
|
316
391
|
end
|
317
392
|
end
|
318
393
|
end
|
@@ -390,7 +465,7 @@ module Sluice
|
|
390
465
|
return shortened_filepath if add_path.nil?
|
391
466
|
|
392
467
|
# Add the new filepath on to the start and return
|
393
|
-
add_path + shortened_filepath
|
468
|
+
return add_path + shortened_filepath
|
394
469
|
end
|
395
470
|
module_function :name_file
|
396
471
|
|
metadata
CHANGED
@@ -1,49 +1,41 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: sluice
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.7
|
5
5
|
prerelease:
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 0
|
9
|
-
- 6
|
10
|
-
version: 0.0.6
|
11
6
|
platform: ruby
|
12
|
-
authors:
|
7
|
+
authors:
|
13
8
|
- Alex Dean
|
14
9
|
- Michael Tibben
|
15
10
|
autorequire:
|
16
11
|
bindir: bin
|
17
12
|
cert_chain: []
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
- !ruby/object:Gem::Dependency
|
13
|
+
date: 2013-07-11 00:00:00.000000000 Z
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
22
16
|
name: fog
|
23
|
-
|
24
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
17
|
+
requirement: !ruby/object:Gem::Requirement
|
25
18
|
none: false
|
26
|
-
requirements:
|
19
|
+
requirements:
|
27
20
|
- - ~>
|
28
|
-
- !ruby/object:Gem::Version
|
29
|
-
hash: 15
|
30
|
-
segments:
|
31
|
-
- 1
|
32
|
-
- 6
|
33
|
-
- 0
|
21
|
+
- !ruby/object:Gem::Version
|
34
22
|
version: 1.6.0
|
35
23
|
type: :runtime
|
36
|
-
|
37
|
-
|
38
|
-
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
none: false
|
27
|
+
requirements:
|
28
|
+
- - ~>
|
29
|
+
- !ruby/object:Gem::Version
|
30
|
+
version: 1.6.0
|
31
|
+
description: A Ruby gem to help you build ETL processes involving Amazon S3. Uses
|
32
|
+
Fog
|
33
|
+
email:
|
39
34
|
- support@snowplowanalytics.com
|
40
35
|
executables: []
|
41
|
-
|
42
36
|
extensions: []
|
43
|
-
|
44
37
|
extra_rdoc_files: []
|
45
|
-
|
46
|
-
files:
|
38
|
+
files:
|
47
39
|
- .gitignore
|
48
40
|
- CHANGELOG
|
49
41
|
- Gemfile
|
@@ -57,36 +49,26 @@ files:
|
|
57
49
|
- sluice.gemspec
|
58
50
|
homepage: http://snowplowanalytics.com
|
59
51
|
licenses: []
|
60
|
-
|
61
52
|
post_install_message:
|
62
53
|
rdoc_options: []
|
63
|
-
|
64
|
-
require_paths:
|
54
|
+
require_paths:
|
65
55
|
- lib
|
66
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
56
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
67
57
|
none: false
|
68
|
-
requirements:
|
69
|
-
- -
|
70
|
-
- !ruby/object:Gem::Version
|
71
|
-
|
72
|
-
|
73
|
-
- 0
|
74
|
-
version: "0"
|
75
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
76
63
|
none: false
|
77
|
-
requirements:
|
78
|
-
- -
|
79
|
-
- !ruby/object:Gem::Version
|
80
|
-
|
81
|
-
segments:
|
82
|
-
- 0
|
83
|
-
version: "0"
|
64
|
+
requirements:
|
65
|
+
- - ! '>='
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: '0'
|
84
68
|
requirements: []
|
85
|
-
|
86
69
|
rubyforge_project:
|
87
|
-
rubygems_version: 1.8.
|
70
|
+
rubygems_version: 1.8.25
|
88
71
|
signing_key:
|
89
72
|
specification_version: 3
|
90
73
|
summary: Ruby toolkit for cloud-friendly ETL
|
91
74
|
test_files: []
|
92
|
-
|