sluice 0.0.8 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +4 -0
- data/README.md +9 -3
- data/lib/sluice/storage/s3.rb +79 -1
- data/lib/sluice.rb +1 -1
- metadata +1 -1
data/CHANGELOG
CHANGED
data/README.md
CHANGED
@@ -2,7 +2,13 @@
|
|
2
2
|
|
3
3
|
Sluice is a Ruby gem (built with [Bundler] [bundler]) to help you build cloud-friendly ETL (extract, transform, load) processes.
|
4
4
|
|
5
|
-
|
5
|
+
Currently Sluice provides the following very robust, very parallel S3 operations:
|
6
|
+
|
7
|
+
* File upload to S3
|
8
|
+
* File download from S3
|
9
|
+
* File delete from S3
|
10
|
+
* File move within S3 (from/to the same or different AWS accounts)
|
11
|
+
* File copy within S3 (from/to the same or different AWS accounts)
|
6
12
|
|
7
13
|
Sluice has been extracted from a pair of Ruby ETL applications built by the [SnowPlow Analytics] [snowplow-analytics] team, specifically:
|
8
14
|
|
@@ -15,7 +21,7 @@ Sluice has been extracted from a pair of Ruby ETL applications built by the [Sno
|
|
15
21
|
|
16
22
|
Or in your Gemfile:
|
17
23
|
|
18
|
-
gem 'sluice', '~> 0.0.8'
|
24
|
+
gem 'sluice', '~> 0.0.9'
|
19
25
|
|
20
26
|
## Usage
|
21
27
|
|
@@ -26,7 +32,7 @@ Rubydoc and usage examples to come.
|
|
26
32
|
To hack on Sluice locally:
|
27
33
|
|
28
34
|
$ gem build sluice.gemspec
|
29
|
-
$ sudo gem install sluice-0.0.8.gem
|
35
|
+
$ sudo gem install sluice-0.0.9.gem
|
30
36
|
|
31
37
|
To contribute:
|
32
38
|
|
data/lib/sluice/storage/s3.rb
CHANGED
@@ -13,6 +13,7 @@
|
|
13
13
|
# Copyright:: Copyright (c) 2012 SnowPlow Analytics Ltd
|
14
14
|
# License:: Apache License Version 2.0
|
15
15
|
|
16
|
+
require 'tmpdir'
|
16
17
|
require 'fog'
|
17
18
|
require 'thread'
|
18
19
|
|
@@ -180,6 +181,37 @@ module Sluice
|
|
180
181
|
end
|
181
182
|
module_function :delete_files
|
182
183
|
|
184
|
+
# Copies files between S3 locations that live in two different accounts
#
# The copy is staged through a local temporary directory:
# 1. Concurrent download of all matching files from S3 source to local tmpdir
# 2. Concurrent upload of all files from local tmpdir to S3 target
#
# In other words, the download and upload are not interleaved (which is
# inefficient because upload speeds are much lower than download speeds)
#
# The temporary directory is created by Dir.mktmpdir in block form, so it
# is removed again once the block finishes.
#
# Parameters:
# +from_s3+:: A Fog::Storage s3 connection for accessing the from S3Location
# +to_s3+:: A Fog::Storage s3 connection for accessing the to S3Location
# +from_location+:: S3Location to copy files from
# +to_location+:: S3Location to copy files to
# +match_regex+:: a regex string to match the files to copy
# +alter_filename_lambda+:: lambda to alter the written filename
#                           (accepted, but not used by this method's body)
# +flatten+:: strips off any sub-folders below the from_location
#             (accepted, but not used by this method's body)
#
# Returns whatever the final upload_files call returns
def copy_files_inter(from_s3, to_s3, from_location, to_location, match_regex='.+', alter_filename_lambda=false, flatten=false)
  puts " copying inter-account #{describe_from(from_location)} to #{to_location}"

  Dir.mktmpdir { |scratch_dir|
    staging_dir = Sluice::Storage.trail_slash(scratch_dir)

    # Phase 1: pull everything that matches down to the staging directory
    download_files(from_s3, from_location, staging_dir, match_regex)

    # Phase 2: push every staged file up to the target account
    upload_files(to_s3, staging_dir, to_location, '**/*')
  }
end
|
213
|
+
module_function :copy_files_inter
|
214
|
+
|
183
215
|
# Copies files between S3 locations concurrently
|
184
216
|
#
|
185
217
|
# Parameters:
|
@@ -196,6 +228,36 @@ module Sluice
|
|
196
228
|
end
|
197
229
|
module_function :copy_files
|
198
230
|
|
231
|
+
# Moves files between S3 locations in two different accounts
#
# Implementation is as follows:
# 1. Concurrent download of all matching files from S3 source to local tmpdir
# 2. Concurrent upload of all files from local tmpdir to S3 target
# 3. Concurrent deletion of the matching files from S3 source
#
# In other words, the three operations are not interleaved (which is
# inefficient because upload speeds are much lower than download speeds)
#
# The temporary directory is created by Dir.mktmpdir in block form, so it
# is removed again once the block finishes.
#
# Parameters:
# +from_s3+:: A Fog::Storage s3 connection for accessing the from S3Location
# +to_s3+:: A Fog::Storage s3 connection for accessing the to S3Location
# +from_location+:: S3Location to move files from
# +to_location+:: S3Location to move files to
# +match_regex+:: a regex string to match the files to move; the same
#                 regex selects what is downloaded and what is deleted
# +alter_filename_lambda+:: lambda to alter the written filename
#                           (accepted, but not used by this method's body)
# +flatten+:: strips off any sub-folders below the from_location
#             (accepted, but not used by this method's body)
#
# Returns whatever the final delete_files call returns
def move_files_inter(from_s3, to_s3, from_location, to_location, match_regex='.+', alter_filename_lambda=false, flatten=false)

  puts " moving inter-account #{describe_from(from_location)} to #{to_location}"
  Dir.mktmpdir do |t|
    tmp = Sluice::Storage.trail_slash(t)
    download_files(from_s3, from_location, tmp, match_regex)
    upload_files(to_s3, tmp, to_location, '**/*') # Upload all files we downloaded
    # Delete with the SAME regex used for the download. Deleting with '.+'
    # would also remove source files that never matched match_regex and
    # were therefore never copied to the target.
    delete_files(from_s3, from_location, match_regex)
  end

end
|
259
|
+
module_function :move_files_inter
|
260
|
+
|
199
261
|
# Moves files between S3 locations concurrently
|
200
262
|
#
|
201
263
|
# Parameters:
|
@@ -326,7 +388,7 @@ module Sluice
|
|
326
388
|
globbed = true
|
327
389
|
# Otherwise if it's an upload, we can glob now
|
328
390
|
elsif operation == :upload
|
329
|
-
files_to_process =
|
391
|
+
files_to_process = glob_files(from_files_or_dir_or_loc, match_regex_or_glob)
|
330
392
|
globbed = true
|
331
393
|
# Otherwise we'll do threaded globbing later...
|
332
394
|
else
|
@@ -503,6 +565,22 @@ module Sluice
|
|
503
565
|
end
|
504
566
|
module_function :process_files
|
505
567
|
|
568
|
+
# A helper function to list the files in a folder that match a glob
# (recursively, when the glob itself recurses, e.g. '**/*')
#
# Parameters:
# +dir+:: Directory to list files in; always treated as a literal path
# +glob+:: a glob pattern, relative to +dir+, to match the files to list
#
# Returns array of files (no sub-directories)
def glob_files(dir, glob)
  # Backslash-escape glob metacharacters in the directory part so that only
  # +glob+ is interpreted as a pattern. Without this, a directory such as
  # "run[1]" is read as a character class and silently matches nothing.
  literal_dir = File.join(dir).gsub(/[\\*?\[\]{}]/) { |meta| "\\#{meta}" }

  Dir.glob(File.join(literal_dir, glob)).select { |f|
    File.file?(f) # Drop sub-directories
  }
end
|
581
|
+
module_function :glob_files
|
582
|
+
|
583
|
+
|
506
584
|
# A helper function to attempt to run a
|
507
585
|
# function retries times
|
508
586
|
#
|
data/lib/sluice.rb
CHANGED