sluice 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +2 -0
- data/lib/sluice.rb +1 -1
- data/lib/sluice/storage/s3.rb +117 -42
- metadata +33 -51
data/README.md
CHANGED
@@ -2,6 +2,8 @@
|
|
2
2
|
|
3
3
|
Sluice is a Ruby gem (built with [Bundler] [bundler]) to help you build cloud-friendly ETL (extract, transform, load) processes.
|
4
4
|
|
5
|
+
**Currently it does one thing: supports very robust, very parallel copy/delete/move of S3 files from one bucket to another.**
|
6
|
+
|
5
7
|
Sluice has been extracted from a pair of Ruby ETL applications built by the [SnowPlow Analytics] [snowplow-analytics] team, specifically:
|
6
8
|
|
7
9
|
1. [EmrEtlRunner] [emr-etl-runner], a Ruby application to run the SnowPlow ETL process on Elastic MapReduce
|
data/lib/sluice.rb
CHANGED
data/lib/sluice/storage/s3.rb
CHANGED
@@ -135,7 +135,7 @@ module Sluice
|
|
135
135
|
# Parameters:
|
136
136
|
# +s3+:: A Fog::Storage s3 connection
|
137
137
|
# +from_location+:: S3Location to move files from
|
138
|
-
# +
|
138
|
+
# +to_location+:: S3Location to move files to
|
139
139
|
# +match_regex+:: a regex string to match the files to move
|
140
140
|
# +alter_filename_lambda+:: lambda to alter the written filename
|
141
141
|
# +flatten+:: strips off any sub-folders below the from_location
|
@@ -146,7 +146,43 @@ module Sluice
|
|
146
146
|
end
|
147
147
|
module_function :move_files
|
148
148
|
|
149
|
-
#
|
149
|
+
# Uploads files to S3 locations concurrently
|
150
|
+
#
|
151
|
+
# Parameters:
|
152
|
+
# +s3+:: A Fog::Storage s3 connection
|
153
|
+
# +from_directory+:: Local directory to upload files from
|
154
|
+
# +to_location+:: S3Location to upload files to
|
155
|
+
# +match_glob+:: a filesystem glob to match the files to upload
|
156
|
+
def upload_files(s3, from_directory, to_location, match_glob='*')
|
157
|
+
|
158
|
+
puts " uploading files from #{from_directory} to #{to_location}"
|
159
|
+
process_files(:upload, s3, from_directory, match_glob, to_location)
|
160
|
+
end
|
161
|
+
module_function :upload_files
|
162
|
+
|
163
|
+
# Upload a single file to the exact location specified
|
164
|
+
# Has no intelligence around filenaming.
|
165
|
+
#
|
166
|
+
# Parameters:
|
167
|
+
# +s3+:: A Fog::Storage s3 connection
|
168
|
+
# +from_file+:: A local file path
|
169
|
+
# +to_bucket+:: Name of the S3 bucket to upload to (used as the Fog directory key)
|
170
|
+
# +to_file+:: The file path to upload to
|
171
|
+
def upload_file(s3, from_file, to_bucket, to_file)
|
172
|
+
|
173
|
+
local_file = File.open(from_file)
|
174
|
+
|
175
|
+
dir = s3.directories.new(:key => to_bucket) # No request made
|
176
|
+
file = dir.files.create(
|
177
|
+
:key => to_file,
|
178
|
+
:body => local_file
|
179
|
+
)
|
180
|
+
|
181
|
+
local_file.close
|
182
|
+
end
|
183
|
+
module_function :upload_file
|
184
|
+
|
185
|
+
# Download a single file to the exact path specified
|
150
186
|
# Has no intelligence around filenaming.
|
151
187
|
# Makes sure to create the path as needed.
|
152
188
|
#
|
@@ -169,23 +205,25 @@ module Sluice
|
|
169
205
|
private
|
170
206
|
|
171
207
|
# Concurrent file operations between S3 locations and/or local directories. Supports:
|
208
|
+
# - Download
|
209
|
+
# - Upload
|
172
210
|
# - Copy
|
173
211
|
# - Delete
|
174
212
|
# - Move (= Copy + Delete)
|
175
213
|
#
|
176
214
|
# Parameters:
|
177
|
-
# +operation+:: Operation to perform. :copy, :delete, :move supported
|
215
|
+
# +operation+:: Operation to perform. :download, :upload, :copy, :delete, :move supported
|
178
216
|
# +s3+:: A Fog::Storage s3 connection
|
179
|
-
# +
|
180
|
-
# +
|
217
|
+
# +from_loc_or_dir+:: S3Location or local directory to process files from
|
218
|
+
# +match_regex_or_glob+:: a regex or glob string to match the files to process
|
181
219
|
# +to_loc_or_dir+:: S3Location or local directory to process files to
|
182
220
|
# +alter_filename_lambda+:: lambda to alter the written filename
|
183
|
-
# +flatten+:: strips off any sub-folders below the
|
184
|
-
def process_files(operation, s3,
|
221
|
+
# +flatten+:: strips off any sub-folders below the from_loc_or_dir
|
222
|
+
def process_files(operation, s3, from_loc_or_dir, match_regex_or_glob='.+', to_loc_or_dir=nil, alter_filename_lambda=false, flatten=false)
|
185
223
|
|
186
224
|
# Validate that the file operation makes sense
|
187
225
|
case operation
|
188
|
-
when :copy, :move, :download
|
226
|
+
when :copy, :move, :download, :upload
|
189
227
|
if to_loc_or_dir.nil?
|
190
228
|
raise StorageOperationError "File operation %s requires the to_loc_or_dir to be set" % operation
|
191
229
|
end
|
@@ -197,10 +235,15 @@ module Sluice
|
|
197
235
|
raise StorageOperationError "File operation %s does not support the alter_filename_lambda argument" % operation
|
198
236
|
end
|
199
237
|
else
|
200
|
-
raise StorageOperationError "File operation %s is unsupported. Try :download, :copy, :delete or :move" % operation
|
238
|
+
raise StorageOperationError "File operation %s is unsupported. Try :download, :upload, :copy, :delete or :move" % operation
|
201
239
|
end
|
202
240
|
|
203
|
-
|
241
|
+
# If we are uploading, then we can glob the files before we thread
|
242
|
+
if operation == :upload
|
243
|
+
files_to_process = Dir.glob(File.join(from_loc_or_dir, match_regex_or_glob))
|
244
|
+
else
|
245
|
+
files_to_process = []
|
246
|
+
end
|
204
247
|
threads = []
|
205
248
|
mutex = Mutex.new
|
206
249
|
complete = false
|
@@ -217,37 +260,51 @@ module Sluice
|
|
217
260
|
threads << Thread.new do
|
218
261
|
loop do
|
219
262
|
file = false
|
263
|
+
filepath = false
|
220
264
|
match = false
|
221
265
|
|
222
266
|
# Critical section:
|
223
267
|
# only allow one thread to modify the array at any time
|
224
268
|
mutex.synchronize do
|
225
269
|
|
226
|
-
|
270
|
+
if operation == :upload
|
271
|
+
|
227
272
|
if files_to_process.size == 0
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
273
|
+
complete = true
|
274
|
+
next
|
275
|
+
end
|
276
|
+
|
277
|
+
filepath = files_to_process.pop
|
278
|
+
match = true # Match is implicit in the glob
|
279
|
+
else
|
280
|
+
|
281
|
+
while !complete && !match do
|
232
282
|
if files_to_process.size == 0
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
283
|
+
# S3 batches 1000 files per request.
|
284
|
+
# We load up our array with the files to move
|
285
|
+
files_to_process = s3.directories.get(from_loc_or_dir.bucket, :prefix => from_loc_or_dir.dir).files.all(marker_opts)
|
286
|
+
# If we don't have any files after the s3 request, we're complete
|
287
|
+
if files_to_process.size == 0
|
288
|
+
complete = true
|
289
|
+
next
|
290
|
+
else
|
291
|
+
marker_opts['marker'] = files_to_process.last.key
|
292
|
+
|
293
|
+
# By reversing the array we can use pop and get FIFO behaviour
|
294
|
+
# instead of the performance penalty incurred by unshift
|
295
|
+
files_to_process = files_to_process.reverse
|
296
|
+
end
|
241
297
|
end
|
242
|
-
end
|
243
298
|
|
244
|
-
|
299
|
+
file = files_to_process.pop
|
300
|
+
filepath = file.key
|
245
301
|
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
302
|
+
match = if match_regex_or_glob.is_a? NegativeRegex
|
303
|
+
!filepath.match(match_regex_or_glob.regex)
|
304
|
+
else
|
305
|
+
filepath.match(match_regex_or_glob)
|
306
|
+
end
|
307
|
+
end
|
251
308
|
end
|
252
309
|
end
|
253
310
|
|
@@ -255,12 +312,13 @@ module Sluice
|
|
255
312
|
break unless match
|
256
313
|
|
257
314
|
# Ignore any EMR-created _$folder$ entries
|
258
|
-
break if
|
315
|
+
break if filepath.end_with?('_$folder$')
|
259
316
|
|
260
|
-
# Match the filename, ignoring
|
261
|
-
file_match =
|
317
|
+
# Match the filename, ignoring directories
|
318
|
+
file_match = filepath.match('([^/]+)$')
|
262
319
|
break unless file_match
|
263
320
|
|
321
|
+
# Rename
|
264
322
|
if alter_filename_lambda.class == Proc
|
265
323
|
filename = alter_filename_lambda.call(file_match[1])
|
266
324
|
else
|
@@ -269,30 +327,47 @@ module Sluice
|
|
269
327
|
|
270
328
|
# What are we doing? Let's determine source and target
|
271
329
|
# Note that target excludes bucket name where relevant
|
272
|
-
source = "#{from_location.bucket}/#{file.key}"
|
273
330
|
case operation
|
331
|
+
when :upload
|
332
|
+
source = "#{filepath}"
|
333
|
+
target = name_file(filepath, filename, from_loc_or_dir, to_loc_or_dir.dir_as_path, flatten)
|
334
|
+
puts " UPLOAD #{source} +-> #{to_loc_or_dir.bucket}/#{target}"
|
274
335
|
when :download
|
275
|
-
|
336
|
+
source = "#{from_loc_or_dir.bucket}/#{filepath}"
|
337
|
+
target = name_file(filepath, filename, from_loc_or_dir.dir_as_path, to_loc_or_dir, flatten)
|
276
338
|
puts " DOWNLOAD #{source} +-> #{target}"
|
277
339
|
when :move
|
278
|
-
|
340
|
+
source = "#{from_loc_or_dir.bucket}/#{filepath}"
|
341
|
+
target = name_file(filepath, filename, from_loc_or_dir.dir_as_path, to_loc_or_dir.dir_as_path, flatten)
|
279
342
|
puts " MOVE #{source} -> #{to_loc_or_dir.bucket}/#{target}"
|
280
343
|
when :copy
|
281
|
-
|
344
|
+
source = "#{from_loc_or_dir.bucket}/#{filepath}"
|
345
|
+
target = name_file(filepath, filename, from_loc_or_dir.dir_as_path, to_loc_or_dir.dir_as_path, flatten)
|
282
346
|
puts " COPY #{source} +-> #{to_loc_or_dir.bucket}/#{target}"
|
283
347
|
when :delete
|
348
|
+
source = "#{from_loc_or_dir.bucket}/#{filepath}"
|
284
349
|
# No target
|
285
350
|
puts " DELETE x #{source}"
|
286
351
|
end
|
287
352
|
|
288
|
-
#
|
353
|
+
# Upload is a standalone operation vs move/copy/delete
|
354
|
+
if operation == :upload
|
355
|
+
retry_x(
|
356
|
+
Sluice::Storage::S3,
|
357
|
+
[:upload_file, s3, filepath, to_loc_or_dir.bucket, target],
|
358
|
+
RETRIES,
|
359
|
+
" +/> #{target}",
|
360
|
+
"Problem uploading #{filepath}. Retrying.")
|
361
|
+
end
|
362
|
+
|
363
|
+
# Download is a standalone operation vs move/copy/delete
|
289
364
|
if operation == :download
|
290
365
|
retry_x(
|
291
366
|
Sluice::Storage::S3,
|
292
367
|
[:download_file, s3, file, target],
|
293
368
|
RETRIES,
|
294
369
|
" +/> #{target}",
|
295
|
-
"Problem downloading #{
|
370
|
+
"Problem downloading #{filepath}. Retrying.")
|
296
371
|
end
|
297
372
|
|
298
373
|
# A move or copy starts with a copy file
|
@@ -302,7 +377,7 @@ module Sluice
|
|
302
377
|
[:copy, to_loc_or_dir.bucket, target],
|
303
378
|
RETRIES,
|
304
379
|
" +-> #{to_loc_or_dir.bucket}/#{target}",
|
305
|
-
"Problem copying #{
|
380
|
+
"Problem copying #{filepath}. Retrying.")
|
306
381
|
end
|
307
382
|
|
308
383
|
# A move or delete ends with a delete
|
@@ -312,7 +387,7 @@ module Sluice
|
|
312
387
|
[:destroy],
|
313
388
|
RETRIES,
|
314
389
|
" x #{source}",
|
315
|
-
"Problem destroying #{
|
390
|
+
"Problem destroying #{filepath}. Retrying.")
|
316
391
|
end
|
317
392
|
end
|
318
393
|
end
|
@@ -390,7 +465,7 @@ module Sluice
|
|
390
465
|
return shortened_filepath if add_path.nil?
|
391
466
|
|
392
467
|
# Add the new filepath on to the start and return
|
393
|
-
add_path + shortened_filepath
|
468
|
+
return add_path + shortened_filepath
|
394
469
|
end
|
395
470
|
module_function :name_file
|
396
471
|
|
metadata
CHANGED
@@ -1,49 +1,41 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: sluice
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.7
|
5
5
|
prerelease:
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 0
|
9
|
-
- 6
|
10
|
-
version: 0.0.6
|
11
6
|
platform: ruby
|
12
|
-
authors:
|
7
|
+
authors:
|
13
8
|
- Alex Dean
|
14
9
|
- Michael Tibben
|
15
10
|
autorequire:
|
16
11
|
bindir: bin
|
17
12
|
cert_chain: []
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
- !ruby/object:Gem::Dependency
|
13
|
+
date: 2013-07-11 00:00:00.000000000 Z
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
22
16
|
name: fog
|
23
|
-
|
24
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
17
|
+
requirement: !ruby/object:Gem::Requirement
|
25
18
|
none: false
|
26
|
-
requirements:
|
19
|
+
requirements:
|
27
20
|
- - ~>
|
28
|
-
- !ruby/object:Gem::Version
|
29
|
-
hash: 15
|
30
|
-
segments:
|
31
|
-
- 1
|
32
|
-
- 6
|
33
|
-
- 0
|
21
|
+
- !ruby/object:Gem::Version
|
34
22
|
version: 1.6.0
|
35
23
|
type: :runtime
|
36
|
-
|
37
|
-
|
38
|
-
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
none: false
|
27
|
+
requirements:
|
28
|
+
- - ~>
|
29
|
+
- !ruby/object:Gem::Version
|
30
|
+
version: 1.6.0
|
31
|
+
description: A Ruby gem to help you build ETL processes involving Amazon S3. Uses
|
32
|
+
Fog
|
33
|
+
email:
|
39
34
|
- support@snowplowanalytics.com
|
40
35
|
executables: []
|
41
|
-
|
42
36
|
extensions: []
|
43
|
-
|
44
37
|
extra_rdoc_files: []
|
45
|
-
|
46
|
-
files:
|
38
|
+
files:
|
47
39
|
- .gitignore
|
48
40
|
- CHANGELOG
|
49
41
|
- Gemfile
|
@@ -57,36 +49,26 @@ files:
|
|
57
49
|
- sluice.gemspec
|
58
50
|
homepage: http://snowplowanalytics.com
|
59
51
|
licenses: []
|
60
|
-
|
61
52
|
post_install_message:
|
62
53
|
rdoc_options: []
|
63
|
-
|
64
|
-
require_paths:
|
54
|
+
require_paths:
|
65
55
|
- lib
|
66
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
56
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
67
57
|
none: false
|
68
|
-
requirements:
|
69
|
-
- -
|
70
|
-
- !ruby/object:Gem::Version
|
71
|
-
|
72
|
-
|
73
|
-
- 0
|
74
|
-
version: "0"
|
75
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
76
63
|
none: false
|
77
|
-
requirements:
|
78
|
-
- -
|
79
|
-
- !ruby/object:Gem::Version
|
80
|
-
|
81
|
-
segments:
|
82
|
-
- 0
|
83
|
-
version: "0"
|
64
|
+
requirements:
|
65
|
+
- - ! '>='
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: '0'
|
84
68
|
requirements: []
|
85
|
-
|
86
69
|
rubyforge_project:
|
87
|
-
rubygems_version: 1.8.
|
70
|
+
rubygems_version: 1.8.25
|
88
71
|
signing_key:
|
89
72
|
specification_version: 3
|
90
73
|
summary: Ruby toolkit for cloud-friendly ETL
|
91
74
|
test_files: []
|
92
|
-
|