sluice 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +14 -0
- data/README.md +2 -2
- data/lib/sluice.rb +1 -1
- data/lib/sluice/storage/s3.rb +149 -46
- metadata +5 -4
data/CHANGELOG
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
Version 0.0.3 (2012-11-07)
|
2
|
+
--------------------------
|
3
|
+
Added parallel file download
|
4
|
+
Some simplification of existing code
|
5
|
+
Fixed a couple of bugs in the file move code (around S3 file keys versus local filepaths)
|
6
|
+
|
7
|
+
Version 0.0.2 (2012-11-06)
|
8
|
+
--------------------------
|
9
|
+
Added trail_slash
|
10
|
+
Bug fixing in regexify and files_up_to
|
11
|
+
|
12
|
+
Version 0.0.1
|
13
|
+
-------------
|
14
|
+
Initial release of Sluice
|
data/README.md
CHANGED
@@ -13,7 +13,7 @@ Sluice has been extracted from a pair of Ruby ETL applications built by the [Sno
|
|
13
13
|
|
14
14
|
Or in your Gemfile:
|
15
15
|
|
16
|
-
gem 'sluice', '~> 0.0.
|
16
|
+
gem 'sluice', '~> 0.0.3'
|
17
17
|
|
18
18
|
## Usage
|
19
19
|
|
@@ -61,4 +61,4 @@ limitations under the License.
|
|
61
61
|
[emr-etl-runner]: https://github.com/snowplow/snowplow/tree/master/3-etl/emr-etl-runner
|
62
62
|
[storage-loader]: https://github.com/snowplow/snowplow/tree/master/4-storage/storage-loader
|
63
63
|
|
64
|
-
[license]: http://www.apache.org/licenses/LICENSE-2.0
|
64
|
+
[license]: http://www.apache.org/licenses/LICENSE-2.0
|
data/lib/sluice.rb
CHANGED
data/lib/sluice/storage/s3.rb
CHANGED
@@ -86,11 +86,26 @@ module Sluice
|
|
86
86
|
end
|
87
87
|
module_function :is_empty?
|
88
88
|
|
89
|
+
# Download files from an S3 location to
|
90
|
+
# local storage, concurrently
|
91
|
+
#
|
92
|
+
# Parameters:
|
93
|
+
# +s3+:: A Fog::Storage s3 connection
|
94
|
+
# +from_location+:: S3Location to download files from
|
95
|
+
# +to_directory+:: Local directory to copy files to
|
96
|
+
# +match_regex+:: a regex string to match the files to download
|
97
|
+
def download_files(s3, from_location, to_directory, match_regex='.+')
|
98
|
+
|
99
|
+
puts " downloading files from #{from_location} to #{to_directory}"
|
100
|
+
process_files(:download, s3, from_location, match_regex, to_directory)
|
101
|
+
end
|
102
|
+
module_function :download_files
|
103
|
+
|
89
104
|
# Delete files from S3 locations concurrently
|
90
105
|
#
|
91
106
|
# Parameters:
|
92
107
|
# +s3+:: A Fog::Storage s3 connection
|
93
|
-
# +
|
108
|
+
# +from_location+:: S3Location to delete files from
|
94
109
|
# +match_regex+:: a regex string to match the files to delete
|
95
110
|
def delete_files(s3, from_location, match_regex='.+')
|
96
111
|
|
@@ -103,8 +118,8 @@ module Sluice
|
|
103
118
|
#
|
104
119
|
# Parameters:
|
105
120
|
# +s3+:: A Fog::Storage s3 connection
|
106
|
-
# +
|
107
|
-
# +
|
121
|
+
# +from_location+:: S3Location to copy files from
|
122
|
+
# +to_location+:: S3Location to copy files to
|
108
123
|
# +match_regex+:: a regex string to match the files to copy
|
109
124
|
# +alter_filename_lambda+:: lambda to alter the written filename
|
110
125
|
def copy_files(s3, from_location, to_location, match_regex='.+', alter_filename_lambda=false)
|
@@ -118,7 +133,7 @@ module Sluice
|
|
118
133
|
#
|
119
134
|
# Parameters:
|
120
135
|
# +s3+:: A Fog::Storage s3 connection
|
121
|
-
# +
|
136
|
+
# +from_location+:: S3Location to move files from
|
122
137
|
# +to+:: S3Location to move files to
|
123
138
|
# +match_regex+:: a regex string to match the files to move
|
124
139
|
# +alter_filename_lambda+:: lambda to alter the written filename
|
@@ -129,6 +144,26 @@ module Sluice
|
|
129
144
|
end
|
130
145
|
module_function :move_files
|
131
146
|
|
147
|
+
# Download a single file to the exact path specified.
|
148
|
+
# Has no intelligence around filenaming.
|
149
|
+
# Makes sure to create the path as needed.
|
150
|
+
#
|
151
|
+
# Parameters:
|
152
|
+
# +s3+:: A Fog::Storage s3 connection
|
153
|
+
# +from_file+:: A Fog::File to download
|
154
|
+
# +to_file+:: A local file path
|
155
|
+
def download_file(s3, from_file, to_file)
|
156
|
+
|
157
|
+
FileUtils.mkdir_p(File.dirname(to_file))
|
158
|
+
|
159
|
+
# TODO: deal with bug where Fog hangs indefinitely if network connection dies during download
|
160
|
+
|
161
|
+
local_file = File.open(to_file, "w")
|
162
|
+
local_file.write(from_file.body)
|
163
|
+
local_file.close
|
164
|
+
end
|
165
|
+
module_function :download_file
|
166
|
+
|
132
167
|
private
|
133
168
|
|
134
169
|
# Concurrent file operations between S3 locations. Supports:
|
@@ -139,27 +174,27 @@ module Sluice
|
|
139
174
|
# Parameters:
|
140
175
|
# +operation+:: Operation to perform. :download, :copy, :delete, :move supported
|
141
176
|
# +s3+:: A Fog::Storage s3 connection
|
142
|
-
# +
|
177
|
+
# +from_location+:: S3Location to process files from
|
143
178
|
# +match_regex+:: a regex string to match the files to process
|
144
|
-
# +
|
179
|
+
# +to_loc_or_dir+:: S3Location or local directory to process files to
|
145
180
|
# +alter_filename_lambda+:: lambda to alter the written filename
|
146
|
-
def process_files(operation, s3, from_location, match_regex='.+',
|
181
|
+
def process_files(operation, s3, from_location, match_regex='.+', to_loc_or_dir=nil, alter_filename_lambda=false)
|
147
182
|
|
148
183
|
# Validate that the file operation makes sense
|
149
184
|
case operation
|
150
|
-
when :copy, :move
|
151
|
-
if
|
152
|
-
raise StorageOperationError "File operation %s requires
|
185
|
+
when :copy, :move, :download
|
186
|
+
if to_loc_or_dir.nil?
|
187
|
+
raise StorageOperationError "File operation %s requires the to_loc_or_dir to be set" % operation
|
153
188
|
end
|
154
189
|
when :delete
|
155
|
-
unless
|
156
|
-
raise StorageOperationError "File operation %s does not support the
|
190
|
+
unless to_loc_or_dir.nil?
|
191
|
+
raise StorageOperationError "File operation %s does not support the to_loc_or_dir argument" % operation
|
157
192
|
end
|
158
193
|
if alter_filename_lambda.class == Proc
|
159
194
|
raise StorageOperationError "File operation %s does not support the alter_filename_lambda argument" % operation
|
160
195
|
end
|
161
196
|
else
|
162
|
-
raise StorageOperationError "File operation %s is unsupported. Try :copy, :delete or :move" % operation
|
197
|
+
raise StorageOperationError "File operation %s is unsupported. Try :download, :copy, :delete or :move" % operation
|
163
198
|
end
|
164
199
|
|
165
200
|
files_to_process = []
|
@@ -171,7 +206,7 @@ module Sluice
|
|
171
206
|
# If an exception is thrown in a thread that isn't handled, die quickly
|
172
207
|
Thread.abort_on_exception = true
|
173
208
|
|
174
|
-
# Create
|
209
|
+
# Create Ruby threads to concurrently execute s3 operations
|
175
210
|
for i in (0...CONCURRENCY)
|
176
211
|
|
177
212
|
# Each thread pops a file off the files_to_process array, and moves it.
|
@@ -187,10 +222,10 @@ module Sluice
|
|
187
222
|
|
188
223
|
while !complete && !match do
|
189
224
|
if files_to_process.size == 0
|
190
|
-
#
|
191
|
-
#
|
225
|
+
# S3 batches 1000 files per request.
|
226
|
+
# We load up our array with the files to process
|
192
227
|
files_to_process = s3.directories.get(from_location.bucket, :prefix => from_location.dir).files.all(marker_opts)
|
193
|
-
#
|
228
|
+
# If we don't have any files after the s3 request, we're complete
|
194
229
|
if files_to_process.size == 0
|
195
230
|
complete = true
|
196
231
|
next
|
@@ -204,6 +239,7 @@ module Sluice
|
|
204
239
|
end
|
205
240
|
|
206
241
|
file = files_to_process.pop
|
242
|
+
|
207
243
|
match = if match_regex.is_a? NegativeRegex
|
208
244
|
!file.key.match(match_regex.regex)
|
209
245
|
else
|
@@ -213,12 +249,13 @@ module Sluice
|
|
213
249
|
end
|
214
250
|
|
215
251
|
# If we don't have a match, then we must be complete
|
216
|
-
break unless match
|
252
|
+
break unless match
|
253
|
+
|
254
|
+
# Ignore any EMR-created _$folder$ entries
|
255
|
+
break if file.key.end_with?('_$folder$')
|
217
256
|
|
218
257
|
# Match the filename, ignoring directory
|
219
258
|
file_match = file.key.match('([^/]+)$')
|
220
|
-
|
221
|
-
# Silently skip any sub-directories in the list
|
222
259
|
break unless file_match
|
223
260
|
|
224
261
|
if alter_filename_lambda.class == Proc
|
@@ -227,44 +264,46 @@ module Sluice
|
|
227
264
|
filename = file_match[1]
|
228
265
|
end
|
229
266
|
|
230
|
-
# What are we doing?
|
267
|
+
# What are we doing? Let's determine source and target
|
268
|
+
# Note that target excludes bucket name where relevant
|
269
|
+
source = "#{from_location.bucket}/#{file.key}"
|
231
270
|
case operation
|
271
|
+
when :download
|
272
|
+
target = name_file(file.key, filename, from_location.dir_as_path, to_loc_or_dir)
|
273
|
+
puts " DOWNLOAD #{source} +-> #{target}"
|
232
274
|
when :move
|
233
|
-
|
275
|
+
target = name_file(file.key, filename, from_location.dir_as_path, to_loc_or_dir.dir_as_path)
|
276
|
+
puts " MOVE #{source} -> #{to_loc_or_dir.bucket}/#{target}"
|
234
277
|
when :copy
|
235
|
-
|
278
|
+
target = name_file(file.key, filename, from_location.dir_as_path, to_loc_or_dir.dir_as_path)
|
279
|
+
puts " COPY #{source} +-> #{to_loc_or_dir.bucket}/#{target}"
|
236
280
|
when :delete
|
237
|
-
|
281
|
+
# No target
|
282
|
+
puts " DELETE x #{source}"
|
283
|
+
end
|
284
|
+
|
285
|
+
# Download is a stand-alone operation vs move/copy/delete
|
286
|
+
if operation == :download
|
287
|
+
retry_x(
|
288
|
+
download_file(s3, file, target), RETRIES,
|
289
|
+
" +/> #{target}",
|
290
|
+
"Problem downloading #{file.key}. Retrying.")
|
238
291
|
end
|
239
292
|
|
240
293
|
# A move or copy starts with a copy file
|
241
294
|
if [:move, :copy].include? operation
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
rescue
|
247
|
-
raise unless i < RETRIES
|
248
|
-
puts "Problem copying #{file.key}. Retrying.", $!, $@
|
249
|
-
sleep(RETRY_WAIT) # give us a bit of time before retrying
|
250
|
-
i += 1
|
251
|
-
retry
|
252
|
-
end
|
295
|
+
retry_x(
|
296
|
+
file.copy(to_loc_or_dir.bucket, target), RETRIES,
|
297
|
+
" +-> #{to_loc_or_dir.bucket}/#{target}",
|
298
|
+
"Problem copying #{file.key}. Retrying.")
|
253
299
|
end
|
254
300
|
|
255
301
|
# A move or delete ends with a delete
|
256
302
|
if [:move, :delete].include? operation
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
rescue
|
262
|
-
raise unless i < RETRIES
|
263
|
-
puts "Problem destroying #{file.key}. Retrying.", $!, $@
|
264
|
-
sleep(RETRY_WAIT) # Give us a bit of time before retrying
|
265
|
-
i += 1
|
266
|
-
retry
|
267
|
-
end
|
303
|
+
retry_x(
|
304
|
+
file.destroy(), RETRIES,
|
305
|
+
" x #{source}",
|
306
|
+
"Problem destroying #{file.key}. Retrying.")
|
268
307
|
end
|
269
308
|
end
|
270
309
|
end
|
@@ -276,6 +315,70 @@ module Sluice
|
|
276
315
|
end
|
277
316
|
module_function :process_files
|
278
317
|
|
318
|
+
# A helper function to attempt to run a
|
319
|
+
# function retries times
|
320
|
+
#
|
321
|
+
# Parameters:
|
322
|
+
# +function+:: Function to run
|
323
|
+
# +retries+:: Number of retries to attempt
|
324
|
+
# +attempt_msg+:: Message to puts on each attempt
|
325
|
+
# +failure_msg+:: Message to puts on each failure
|
326
|
+
def retry_x(function, retries, attempt_msg, failure_msg)
|
327
|
+
i = 0
|
328
|
+
begin
|
329
|
+
function
|
330
|
+
puts attempt_msg
|
331
|
+
rescue
|
332
|
+
raise unless i < retries
|
333
|
+
puts failure_msg
|
334
|
+
sleep(RETRY_WAIT) # Give us a bit of time before retrying
|
335
|
+
i += 1
|
336
|
+
retry
|
337
|
+
end
|
338
|
+
end
|
339
|
+
module_function :retry_x
|
340
|
+
|
341
|
+
# A helper function to prepare destination
|
342
|
+
# filenames and paths. This is a bit weird
|
343
|
+
# - it needs to exist because of differences
|
344
|
+
# in the way that Amazon S3, Fog and Unix
|
345
|
+
# treat filepaths versus keys.
|
346
|
+
#
|
347
|
+
# Parameters:
|
348
|
+
# +filepath+:: Path to file (including old filename)
|
349
|
+
# +new_filename+:: Replace the filename in the path with this
|
350
|
+
# +remove_path+:: If this is set, strip this from the front of the path
|
351
|
+
# +add_path+:: If this is set, add this to the front of the path
|
352
|
+
#
|
353
|
+
# TODO: this really needs unit tests
|
354
|
+
def name_file(filepath, new_filename, remove_path=nil, add_path=nil)
|
355
|
+
|
356
|
+
# First, replace the filename in filepath with new one
|
357
|
+
dirname = File.dirname(filepath)
|
358
|
+
new_filepath = (dirname == '.') ? new_filename : dirname + '/' + new_filename
|
359
|
+
|
360
|
+
# Nothing more to do
|
361
|
+
return new_filepath if remove_path.nil?
|
362
|
+
|
363
|
+
# If we have a 'remove_path', it must be found at
|
364
|
+
# the start of the path.
|
365
|
+
# If it's not, you're probably using name_file()
|
366
|
+
# wrong.
|
367
|
+
if !filepath.start_with?(remove_path)
|
368
|
+
raise StorageOperationError, "name_file failed. Filepath '#{filepath}' does not start with '#{remove_path}'"
|
369
|
+
end
|
370
|
+
|
371
|
+
# Okay, let's strip the remove_path prefix from the front of the path
|
372
|
+
shortened_filepath = new_filepath[remove_path.length()..-1]
|
373
|
+
|
374
|
+
# Nothing more to do
|
375
|
+
return shortened_filepath if add_path.nil?
|
376
|
+
|
377
|
+
# Prepend add_path to the shortened path and return
|
378
|
+
add_path + shortened_filepath
|
379
|
+
end
|
380
|
+
module_function :name_file
|
381
|
+
|
279
382
|
end
|
280
383
|
end
|
281
384
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sluice
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 25
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 0.0.
|
9
|
+
- 3
|
10
|
+
version: 0.0.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Alex Dean
|
@@ -16,7 +16,7 @@ autorequire:
|
|
16
16
|
bindir: bin
|
17
17
|
cert_chain: []
|
18
18
|
|
19
|
-
date: 2012-11-
|
19
|
+
date: 2012-11-09 00:00:00 Z
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
22
22
|
name: fog
|
@@ -45,6 +45,7 @@ extra_rdoc_files: []
|
|
45
45
|
|
46
46
|
files:
|
47
47
|
- .gitignore
|
48
|
+
- CHANGELOG
|
48
49
|
- Gemfile
|
49
50
|
- LICENSE-2.0.txt
|
50
51
|
- README.md
|