sluice 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +14 -0
- data/README.md +2 -2
- data/lib/sluice.rb +1 -1
- data/lib/sluice/storage/s3.rb +149 -46
- metadata +5 -4
data/CHANGELOG
ADDED
@@ -0,0 +1,14 @@
+Version 0.0.3 (2012-11-07)
+--------------------------
+Added parallel file download
+Some simplification of existing code
+Fixed a couple of bugs in the file move code (around S3 file keys versus local filepaths)
+
+Version 0.0.2 (2012-11-06)
+--------------------------
+Added trail_slash
+Bug fixing in regexify and files_up_to
+
+Version 0.0.1
+-------------
+Initial release of Sluice
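The headline change in 0.0.3 is parallel file download, exposed as Sluice::Storage::S3.download_files (see the data/lib/sluice/storage/s3.rb diff below). A minimal usage sketch, not part of this diff: the credentials, bucket and paths are placeholders, and the Location constructor is assumed from the gem's source rather than shown here.

    require 'fog'
    require 'sluice'

    # Fog S3 connection (placeholder credentials)
    s3 = Fog::Storage.new(
      :provider              => 'AWS',
      :aws_access_key_id     => 'AKIAXXXXXXXXXXXXXXXX',
      :aws_secret_access_key => 'xxxxxxxx'
    )

    # Assumed constructor for the S3Location object referenced in the rdoc comments below
    from_location = Sluice::Storage::S3::Location.new('s3://my-bucket/logs/')

    # New in 0.0.3: download every matching file to a local directory, concurrently
    Sluice::Storage::S3.download_files(s3, from_location, '/tmp/logs/', '.+\.gz$')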
data/README.md
CHANGED
@@ -13,7 +13,7 @@ Sluice has been extracted from a pair of Ruby ETL applications built by the [Sno
 
 Or in your Gemfile:
 
-gem 'sluice', '~> 0.0.
+gem 'sluice', '~> 0.0.3'
 
 ## Usage
 
@@ -61,4 +61,4 @@ limitations under the License.
 [emr-etl-runner]: https://github.com/snowplow/snowplow/tree/master/3-etl/emr-etl-runner
 [storage-loader]: https://github.com/snowplow/snowplow/tree/master/4-storage/storage-loader
 
-[license]: http://www.apache.org/licenses/LICENSE-2.0
+[license]: http://www.apache.org/licenses/LICENSE-2.0
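For reference, a sketch of how that pessimistic constraint behaves in a Gemfile (the comment is explanatory, not from the README):

    # Gemfile
    gem 'sluice', '~> 0.0.3'   # allows any 0.0.x release >= 0.0.3, but not 0.1.0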
data/lib/sluice.rb
CHANGED
data/lib/sluice/storage/s3.rb
CHANGED
@@ -86,11 +86,26 @@ module Sluice
       end
       module_function :is_empty?
 
+      # Download files from an S3 location to
+      # local storage, concurrently
+      #
+      # Parameters:
+      # +s3+:: A Fog::Storage s3 connection
+      # +from_location+:: S3Location to delete files from
+      # +to_directory+:: Local directory to copy files to
+      # +match_regex+:: a regex string to match the files to delete
+      def download_files(s3, from_location, to_directory, match_regex='.+')
+
+        puts "  downloading files from #{from_location} to #{to_directory}"
+        process_files(:download, s3, from_location, match_regex, to_directory)
+      end
+      module_function :download_files
+
       # Delete files from S3 locations concurrently
       #
       # Parameters:
       # +s3+:: A Fog::Storage s3 connection
-      # +
+      # +from_location+:: S3Location to delete files from
       # +match_regex+:: a regex string to match the files to delete
       def delete_files(s3, from_location, match_regex='.+')
 
@@ -103,8 +118,8 @@ module Sluice
       #
       # Parameters:
       # +s3+:: A Fog::Storage s3 connection
-      # +
-      # +
+      # +from_location+:: S3Location to copy files from
+      # +to_location+:: S3Location to copy files to
       # +match_regex+:: a regex string to match the files to copy
       # +alter_filename_lambda+:: lambda to alter the written filename
       def copy_files(s3, from_location, to_location, match_regex='.+', alter_filename_lambda=false)
@@ -118,7 +133,7 @@ module Sluice
       #
       # Parameters:
       # +s3+:: A Fog::Storage s3 connection
-      # +
+      # +from_location+:: S3Location to move files from
       # +to+:: S3Location to move files to
       # +match_regex+:: a regex string to match the files to move
       # +alter_filename_lambda+:: lambda to alter the written filename
@@ -129,6 +144,26 @@ module Sluice
       end
       module_function :move_files
 
+      # Download a single file to the exact path specified.
+      # Has no intelligence around filenaming.
+      # Makes sure to create the path as needed.
+      #
+      # Parameters:
+      # +s3+:: A Fog::Storage s3 connection
+      # +from_file:: A Fog::File to download
+      # +to_file:: A local file path
+      def download_file(s3, from_file, to_file)
+
+        FileUtils.mkdir_p(File.dirname(to_file))
+
+        # TODO: deal with bug where Fog hangs indefinitely if network connection dies during download
+
+        local_file = File.open(to_file, "w")
+        local_file.write(from_file.body)
+        local_file.close
+      end
+      module_function :download_file
+
       private
 
       # Concurrent file operations between S3 locations. Supports:
@@ -139,27 +174,27 @@ module Sluice
       # Parameters:
       # +operation+:: Operation to perform. :copy, :delete, :move supported
       # +s3+:: A Fog::Storage s3 connection
-      # +
+      # +from_location+:: S3Location to process files from
       # +match_regex+:: a regex string to match the files to process
-      # +
+      # +to_loc_or_dir+:: S3Location or local directory to process files to
       # +alter_filename_lambda+:: lambda to alter the written filename
-      def process_files(operation, s3, from_location, match_regex='.+',
+      def process_files(operation, s3, from_location, match_regex='.+', to_loc_or_dir=nil, alter_filename_lambda=false)
 
         # Validate that the file operation makes sense
         case operation
-        when :copy, :move
-          if
-            raise StorageOperationError "File operation %s requires
+        when :copy, :move, :download
+          if to_loc_or_dir.nil?
+            raise StorageOperationError "File operation %s requires the to_loc_or_dir to be set" % operation
          end
        when :delete
-          unless
-            raise StorageOperationError "File operation %s does not support the
+          unless to_loc_or_dir.nil?
+            raise StorageOperationError "File operation %s does not support the to_loc_or_dir argument" % operation
          end
          if alter_filename_lambda.class == Proc
            raise StorageOperationError "File operation %s does not support the alter_filename_lambda argument" % operation
          end
        else
-          raise StorageOperationError "File operation %s is unsupported. Try :copy, :delete or :move" % operation
+          raise StorageOperationError "File operation %s is unsupported. Try :download, :copy, :delete or :move" % operation
        end
 
        files_to_process = []
@@ -171,7 +206,7 @@ module Sluice
        # If an exception is thrown in a thread that isn't handled, die quickly
        Thread.abort_on_exception = true
 
-        # Create
+        # Create Ruby threads to concurrently execute s3 operations
        for i in (0...CONCURRENCY)
 
          # Each thread pops a file off the files_to_process array, and moves it.
@@ -187,10 +222,10 @@ module Sluice
 
              while !complete && !match do
                if files_to_process.size == 0
-                  #
-                  #
+                  # S3 batches 1000 files per request.
+                  # We load up our array with the files to move
                  files_to_process = s3.directories.get(from_location.bucket, :prefix => from_location.dir).files.all(marker_opts)
-                  #
+                  # If we don't have any files after the s3 request, we're complete
                  if files_to_process.size == 0
                    complete = true
                    next
@@ -204,6 +239,7 @@ module Sluice
                end
 
                file = files_to_process.pop
+
                match = if match_regex.is_a? NegativeRegex
                  !file.key.match(match_regex.regex)
                else
@@ -213,12 +249,13 @@ module Sluice
                end
 
                # If we don't have a match, then we must be complete
-                break unless match
+                break unless match
+
+                # Ignore any EMR-created _$folder$ entries
+                break if file.key.end_with?('_$folder$')
 
                # Match the filename, ignoring directory
                file_match = file.key.match('([^/]+)$')
-
-                # Silently skip any sub-directories in the list
                break unless file_match
 
                if alter_filename_lambda.class == Proc
@@ -227,44 +264,46 @@ module Sluice
                  filename = file_match[1]
                end
 
-                # What are we doing?
+                # What are we doing? Let's determine source and target
+                # Note that target excludes bucket name where relevant
+                source = "#{from_location.bucket}/#{file.key}"
                case operation
+                when :download
+                  target = name_file(file.key, filename, from_location.dir_as_path, to_loc_or_dir)
+                  puts " DOWNLOAD #{source} +-> #{target}"
                when :move
-
+                  target = name_file(file.key, filename, from_location.dir_as_path, to_loc_or_dir.dir_as_path)
+                  puts " MOVE #{source} -> #{to_loc_or_dir.bucket}/#{target}"
                when :copy
-
+                  target = name_file(file.key, filename, from_location.dir_as_path, to_loc_or_dir.dir_as_path)
+                  puts " COPY #{source} +-> #{to_loc_or_dir.bucket}/#{target}"
                when :delete
-
+                  # No target
+                  puts " DELETE x #{source}"
+                end
+
+                # Download is a stand-alone operation vs move/copy/delete
+                if operation == :download
+                  retry_x(
+                    download_file(s3, file, target), RETRIES,
+                    " +/> #{target}",
+                    "Problem downloading #{file.key}. Retrying.")
                end
 
                # A move or copy starts with a copy file
                if [:move, :copy].include? operation
-
-
-
-
-                  rescue
-                    raise unless i < RETRIES
-                    puts "Problem copying #{file.key}. Retrying.", $!, $@
-                    sleep(RETRY_WAIT) # give us a bit of time before retrying
-                    i += 1
-                    retry
-                  end
+                  retry_x(
+                    file.copy(to_loc_or_dir.bucket, target), RETRIES,
+                    " +-> #{to_loc_or_dir.bucket}/#{target}",
+                    "Problem copying #{file.key}. Retrying.")
                end
 
                # A move or delete ends with a delete
                if [:move, :delete].include? operation
-
-
-
-
-                  rescue
-                    raise unless i < RETRIES
-                    puts "Problem destroying #{file.key}. Retrying.", $!, $@
-                    sleep(RETRY_WAIT) # Give us a bit of time before retrying
-                    i += 1
-                    retry
-                  end
+                  retry_x(
+                    file.destroy(), RETRIES,
+                    " x #{source}",
+                    "Problem destroying #{file.key}. Retrying.")
                end
              end
            end
@@ -276,6 +315,70 @@ module Sluice
      end
      module_function :process_files
 
+      # A helper function to attempt to run a
+      # function retries times
+      #
+      # Parameters:
+      # +function+:: Function to run
+      # +retries+:: Number of retries to attempt
+      # +attempt_msg+:: Message to puts on each attempt
+      # +failure_msg+:: Message to puts on each failure
+      def retry_x(function, retries, attempt_msg, failure_msg)
+        i = 0
+        begin
+          function
+          puts attempt_msg
+        rescue
+          raise unless i < retries
+          puts failure_msg
+          sleep(RETRY_WAIT) # Give us a bit of time before retrying
+          i += 1
+          retry
+        end
+      end
+      module_function :retry_x
+
+      # A helper function to prepare destination
+      # filenames and paths. This is a bit weird
+      # - it needs to exist because of differences
+      # in the way that Amazon S3, Fog and Unix
+      # treat filepaths versus keys.
+      #
+      # Parameters:
+      # +filepath+:: Path to file (including old filename)
+      # +new_filename+:: Replace the filename in the path with this
+      # +remove_path+:: If this is set, strip this from the front of the path
+      # +add_path+:: If this is set, add this to the front of the path
+      #
+      # TODO: this really needs unit tests
+      def name_file(filepath, new_filename, remove_path=nil, add_path=nil)
+
+        # First, replace the filename in filepath with new one
+        dirname = File.dirname(filepath)
+        new_filepath = (dirname == '.') ? new_filename : dirname + '/' + new_filename
+
+        # Nothing more to do
+        return new_filepath if remove_path.nil?
+
+        # If we have a 'remove_path', it must be found at
+        # the start of the path.
+        # If it's not, you're probably using name_file()
+        # wrong.
+        if !filepath.start_with?(remove_path)
+          raise StorageOperationError, "name_file failed. Filepath '#{filepath}' does not start with '#{remove_path}'"
+        end
+
+        # Okay, let's remove the filepath
+        shortened_filepath = new_filepath[remove_path.length()..-1]
+
+        # Nothing more to do
+        return shortened_filepath if add_path.nil?
+
+        # Add the new filepath on to the start and return
+        add_path + shortened_filepath
+      end
+      module_function :name_file
+
    end
  end
end
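To make the new name_file helper concrete, here is a small worked example of the path rewriting it performs. It is an internal helper (defined below the private marker), so this is purely illustrative: the key, filename and directory values are invented, mirroring what the :download branch of process_files passes in.

    # Hypothetical inputs: an S3 key, the bare filename matched from it,
    # the source prefix to strip, and the local directory to prepend
    Sluice::Storage::S3.name_file(
      'logs/2012-11-07/events.tsv.gz',  # filepath (file.key)
      'events.tsv.gz',                  # new_filename (file_match[1])
      'logs/',                          # remove_path (from_location.dir_as_path)
      '/tmp/downloads/'                 # add_path (the local to_directory)
    )
    # => "/tmp/downloads/2012-11-07/events.tsv.gz"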
metadata
CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: sluice
 version: !ruby/object:Gem::Version
-  hash:
+  hash: 25
   prerelease:
   segments:
   - 0
   - 0
-  -
-  version: 0.0.
+  - 3
+  version: 0.0.3
 platform: ruby
 authors:
 - Alex Dean
@@ -16,7 +16,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2012-11-
+date: 2012-11-09 00:00:00 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: fog
@@ -45,6 +45,7 @@ extra_rdoc_files: []
 
 files:
 - .gitignore
+- CHANGELOG
 - Gemfile
 - LICENSE-2.0.txt
 - README.md
|