sluice 0.0.8 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +4 -0
- data/README.md +9 -3
- data/lib/sluice/storage/s3.rb +79 -1
- data/lib/sluice.rb +1 -1
- metadata +1 -1
data/CHANGELOG
CHANGED
data/README.md
CHANGED
@@ -2,7 +2,13 @@
|
|
2
2
|
|
3
3
|
Sluice is a Ruby gem (built with [Bundler] [bundler]) to help you build cloud-friendly ETL (extract, transform, load) processes.
|
4
4
|
|
5
|
-
|
5
|
+
Currently Sluice provides the following very robust, very parallel S3 operations:
|
6
|
+
|
7
|
+
* File upload to S3
|
8
|
+
* File download from S3
|
9
|
+
* File delete from S3
|
10
|
+
* File move within S3 (from/to the same or different AWS accounts)
|
11
|
+
* File copy within S3 (from/to the same or different AWS accounts)
|
6
12
|
|
7
13
|
Sluice has been extracted from a pair of Ruby ETL applications built by the [SnowPlow Analytics] [snowplow-analytics] team, specifically:
|
8
14
|
|
@@ -15,7 +21,7 @@ Sluice has been extracted from a pair of Ruby ETL applications built by the [Sno
|
|
15
21
|
|
16
22
|
Or in your Gemfile:
|
17
23
|
|
18
|
-
gem 'sluice', '~> 0.0.
|
24
|
+
gem 'sluice', '~> 0.0.9'
|
19
25
|
|
20
26
|
## Usage
|
21
27
|
|
@@ -26,7 +32,7 @@ Rubydoc and usage examples to come.
|
|
26
32
|
To hack on Sluice locally:
|
27
33
|
|
28
34
|
$ gem build sluice.gemspec
|
29
|
-
$ sudo gem install sluice-0.0.
|
35
|
+
$ sudo gem install sluice-0.0.9.gem
|
30
36
|
|
31
37
|
To contribute:
|
32
38
|
|
data/lib/sluice/storage/s3.rb
CHANGED
@@ -13,6 +13,7 @@
|
|
13
13
|
# Copyright:: Copyright (c) 2012 SnowPlow Analytics Ltd
|
14
14
|
# License:: Apache License Version 2.0
|
15
15
|
|
16
|
+
require 'tmpdir'
|
16
17
|
require 'fog'
|
17
18
|
require 'thread'
|
18
19
|
|
@@ -180,6 +181,37 @@ module Sluice
|
|
180
181
|
end
|
181
182
|
module_function :delete_files
|
182
183
|
|
184
|
+
# Copies files between S3 locations in two different accounts
#
# The copy is staged through a local temporary directory:
# 1. Download of all matching files from S3 source to local tmpdir
# 2. Upload of all files from local tmpdir to S3 target
#
# In other words, the download and upload are not interleaved (which is
# inefficient because upload speeds are much lower than download speeds)
#
# Parameters:
# +from_s3+:: A Fog::Storage s3 connection for accessing the from S3Location
# +to_s3+:: A Fog::Storage s3 connection for accessing the to S3Location
# +from_location+:: S3Location to copy files from
# +to_location+:: S3Location to copy files to
# +match_regex+:: a regex string to match the files to copy
# +alter_filename_lambda+:: lambda to alter the written filename
#                           (accepted, but not referenced by this method's body)
# +flatten+:: strips off any sub-folders below the from_location
#             (accepted, but not referenced by this method's body)
#
# Returns whatever the final upload_files call returns
def copy_files_inter(from_s3, to_s3, from_location, to_location, match_regex='.+', alter_filename_lambda=false, flatten=false)
  puts " copying inter-account #{describe_from(from_location)} to #{to_location}"

  # Block form of Dir.mktmpdir: the staging directory only lives for the
  # duration of the block
  Dir.mktmpdir do |scratch_dir|
    staging = Sluice::Storage.trail_slash(scratch_dir)

    # Stage 1: pull every matching file down from the source location
    download_files(from_s3, from_location, staging, match_regex)

    # Stage 2: push everything that was staged up to the target location
    upload_files(to_s3, staging, to_location, '**/*')
  end
end
|
213
|
+
module_function :copy_files_inter
|
214
|
+
|
183
215
|
# Copies files between S3 locations concurrently
|
184
216
|
#
|
185
217
|
# Parameters:
|
@@ -196,6 +228,36 @@ module Sluice
|
|
196
228
|
end
|
197
229
|
module_function :copy_files
|
198
230
|
|
231
|
+
# Moves files between S3 locations in two different accounts
#
# The move is staged through a local temporary directory:
# 1. Download of all matching files from S3 source to local tmpdir
# 2. Upload of all files from local tmpdir to S3 target
# 3. Deletion of the matching files from S3 source
#
# In other words, the three operations are not interleaved (which is
# inefficient because upload speeds are much lower than download speeds)
#
# Parameters:
# +from_s3+:: A Fog::Storage s3 connection for accessing the from S3Location
# +to_s3+:: A Fog::Storage s3 connection for accessing the to S3Location
# +from_location+:: S3Location to move files from
# +to_location+:: S3Location to move files to
# +match_regex+:: a regex string to match the files to move; the same
#                 regex selects what is downloaded and what is deleted
# +alter_filename_lambda+:: lambda to alter the written filename
#                           (accepted, but not referenced by this method's body)
# +flatten+:: strips off any sub-folders below the from_location
#             (accepted, but not referenced by this method's body)
#
# Returns whatever the final delete_files call returns
def move_files_inter(from_s3, to_s3, from_location, to_location, match_regex='.+', alter_filename_lambda=false, flatten=false)
  puts " moving inter-account #{describe_from(from_location)} to #{to_location}"

  # Block form of Dir.mktmpdir: the staging directory only lives for the
  # duration of the block
  Dir.mktmpdir do |scratch_dir|
    staging = Sluice::Storage.trail_slash(scratch_dir)

    # Stage 1: pull every matching file down from the source location
    download_files(from_s3, from_location, staging, match_regex)

    # Stage 2: push everything that was staged up to the target location
    upload_files(to_s3, staging, to_location, '**/*')

    # Stage 3: remove from the source only what was selected for the move.
    # Deleting with a catch-all regex here would also destroy source files
    # that match_regex excluded and which were therefore never copied.
    delete_files(from_s3, from_location, match_regex)
  end
end
|
259
|
+
module_function :move_files_inter
|
260
|
+
|
199
261
|
# Moves files between S3 locations concurrently
|
200
262
|
#
|
201
263
|
# Parameters:
|
@@ -326,7 +388,7 @@ module Sluice
|
|
326
388
|
globbed = true
|
327
389
|
# Otherwise if it's an upload, we can glob now
|
328
390
|
elsif operation == :upload
|
329
|
-
files_to_process =
|
391
|
+
files_to_process = glob_files(from_files_or_dir_or_loc, match_regex_or_glob)
|
330
392
|
globbed = true
|
331
393
|
# Otherwise we'll do threaded globbing later...
|
332
394
|
else
|
@@ -503,6 +565,22 @@ module Sluice
|
|
503
565
|
end
|
504
566
|
module_function :process_files
|
505
567
|
|
568
|
+
# A helper function to list the files in a folder
# which match a glob pattern
#
# Parameters:
# +dir+:: Directory to list files in; treated as a literal path, so any
#         glob metacharacters it contains are matched verbatim
# +glob+:: a glob pattern, relative to +dir+, selecting the files to
#          list (e.g. '**/*' to list all files recursively)
#
# Returns array of files (no sub-directories)
def glob_files(dir, glob)
  # Backslash-escape glob metacharacters in the directory part so that a
  # folder named e.g. "a[b]" is not interpreted as a character class
  literal_dir = dir.gsub(/[\\*?\[\]{}]/) { |ch| "\\#{ch}" }

  Dir.glob(File.join(literal_dir, glob)).select { |f|
    File.file?(f) # Drop sub-directories
  }
end
|
581
|
+
module_function :glob_files
|
582
|
+
|
583
|
+
|
506
584
|
# A helper function to attempt to run a
|
507
585
|
# function retries times
|
508
586
|
#
|
data/lib/sluice.rb
CHANGED