sluice 0.0.7 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +8 -0
- data/lib/sluice/storage/s3.rb +157 -53
- data/lib/sluice.rb +1 -1
- data/sluice.gemspec +1 -1
- metadata +5 -5
data/CHANGELOG
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
Version 0.0.8 (2013-08-14)
|
2
|
+
--------------------------
|
3
|
+
Added in ability to process a list of files rather than an S3 location
|
4
|
+
|
5
|
+
Version 0.0.7 (2013-XX-XX)
|
6
|
+
--------------------------
|
7
|
+
Added in upload capability
|
8
|
+
|
1
9
|
Version 0.0.6 (2012-12-31)
|
2
10
|
--------------------------
|
3
11
|
Fixed is_empty? (was actually is_not_empty?)
|
data/lib/sluice/storage/s3.rb
CHANGED
@@ -9,7 +9,7 @@
|
|
9
9
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
10
10
|
# See the Apache License Version 2.0 for the specific language governing permissions and limitations there under.
|
11
11
|
|
12
|
-
# Authors::
|
12
|
+
# Authors:: Alex Dean (mailto:support@snowplowanalytics.com), Michael Tibben
|
13
13
|
# Copyright:: Copyright (c) 2012 SnowPlow Analytics Ltd
|
14
14
|
# License:: Apache License Version 2.0
|
15
15
|
|
@@ -67,22 +67,88 @@ module Sluice
|
|
67
67
|
# +access_key_id+:: AWS access key ID
|
68
68
|
# +secret_access_key+:: AWS secret access key
|
69
69
|
def new_fog_s3_from(region, access_key_id, secret_access_key)
|
70
|
-
Fog::Storage.new({
|
70
|
+
fog = Fog::Storage.new({
|
71
71
|
:provider => 'AWS',
|
72
72
|
:region => region,
|
73
73
|
:aws_access_key_id => access_key_id,
|
74
74
|
:aws_secret_access_key => secret_access_key
|
75
75
|
})
|
76
|
+
fog.sync_clock
|
77
|
+
fog
|
76
78
|
end
|
77
79
|
module_function :new_fog_s3_from
|
78
80
|
|
81
|
+
# Return an array of all Fog::Storage::AWS::File's
|
82
|
+
#
|
83
|
+
# Parameters:
|
84
|
+
# +s3+:: A Fog::Storage s3 connection
|
85
|
+
# +location+:: The location to return files from
|
86
|
+
#
|
87
|
+
# Returns array of Fog::Storage::AWS::File's
|
88
|
+
def list_files(s3, location)
|
89
|
+
files_and_dirs = s3.directories.get(location.bucket, prefix: location.dir).files
|
90
|
+
|
91
|
+
files = [] # Can't use a .select because of Ruby deep copy issues (array of non-POROs)
|
92
|
+
files_and_dirs.each { |f|
|
93
|
+
if is_file?(f.key)
|
94
|
+
files << f.dup
|
95
|
+
end
|
96
|
+
}
|
97
|
+
files
|
98
|
+
end
|
99
|
+
module_function :list_files
|
100
|
+
|
101
|
+
# Whether the given path is a directory or not
|
102
|
+
#
|
103
|
+
# Parameters:
|
104
|
+
# +path+:: S3 path in String form
|
105
|
+
#
|
106
|
+
# Returns boolean
|
107
|
+
def is_folder?(path)
|
108
|
+
(path.end_with?('_$folder$') || # EMR-created
|
109
|
+
path.end_with?('/'))
|
110
|
+
end
|
111
|
+
module_function :is_folder?
|
112
|
+
|
113
|
+
# Whether the given path is a file or not
|
114
|
+
#
|
115
|
+
# Parameters:
|
116
|
+
# +path+:: S3 path in String form
|
117
|
+
#
|
118
|
+
# Returns boolean
|
119
|
+
def is_file?(path)
|
120
|
+
!is_folder?(path)
|
121
|
+
end
|
122
|
+
module_function :is_file?
|
123
|
+
|
124
|
+
# Returns the basename for the given path
|
125
|
+
#
|
126
|
+
# Parameters:
|
127
|
+
# +path+:: S3 path in String form
|
128
|
+
#
|
129
|
+
# Returns the basename, or nil if the
|
130
|
+
# path is to a folder
|
131
|
+
def get_basename(path)
|
132
|
+
if is_folder?(path)
|
133
|
+
nil
|
134
|
+
else
|
135
|
+
match = path.match('([^/]+)$')
|
136
|
+
if match
|
137
|
+
match[1]
|
138
|
+
else
|
139
|
+
nil
|
140
|
+
end
|
141
|
+
end
|
142
|
+
end
|
143
|
+
module_function :get_basename
|
144
|
+
|
79
145
|
# Determine if a bucket is empty
|
80
146
|
#
|
81
147
|
# Parameters:
|
82
148
|
# +s3+:: A Fog::Storage s3 connection
|
83
149
|
# +location+:: The location to check
|
84
150
|
def is_empty?(s3, location)
|
85
|
-
s3
|
151
|
+
list_files(s3, location).length <= 1
|
86
152
|
end
|
87
153
|
module_function :is_empty?
|
88
154
|
|
@@ -91,13 +157,13 @@ module Sluice
|
|
91
157
|
#
|
92
158
|
# Parameters:
|
93
159
|
# +s3+:: A Fog::Storage s3 connection
|
94
|
-
# +
|
160
|
+
# +from_files_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, or S3Location to download files from
|
95
161
|
# +to_directory+:: Local directory to copy files to
|
96
162
|
# +match_regex+:: a regex string to match the files to download
|
97
|
-
def download_files(s3,
|
163
|
+
def download_files(s3, from_files_or_loc, to_directory, match_regex='.+')
|
98
164
|
|
99
|
-
puts " downloading
|
100
|
-
process_files(:download, s3,
|
165
|
+
puts " downloading #{describe_from(from_files_or_loc)} to #{to_directory}"
|
166
|
+
process_files(:download, s3, from_files_or_loc, match_regex, to_directory)
|
101
167
|
end
|
102
168
|
module_function :download_files
|
103
169
|
|
@@ -105,12 +171,12 @@ module Sluice
|
|
105
171
|
#
|
106
172
|
# Parameters:
|
107
173
|
# +s3+:: A Fog::Storage s3 connection
|
108
|
-
# +
|
174
|
+
# +from_files_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, or S3Location to delete files from
|
109
175
|
# +match_regex+:: a regex string to match the files to delete
|
110
|
-
def delete_files(s3,
|
176
|
+
def delete_files(s3, from_files_or_loc, match_regex='.+')
|
111
177
|
|
112
|
-
puts " deleting
|
113
|
-
process_files(:delete, s3,
|
178
|
+
puts " deleting #{describe_from(from_files_or_loc)}"
|
179
|
+
process_files(:delete, s3, from_files_or_loc, match_regex)
|
114
180
|
end
|
115
181
|
module_function :delete_files
|
116
182
|
|
@@ -118,15 +184,15 @@ module Sluice
|
|
118
184
|
#
|
119
185
|
# Parameters:
|
120
186
|
# +s3+:: A Fog::Storage s3 connection
|
121
|
-
# +
|
187
|
+
# +from_files_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, or S3Location to copy files from
|
122
188
|
# +to_location+:: S3Location to copy files to
|
123
189
|
# +match_regex+:: a regex string to match the files to copy
|
124
190
|
# +alter_filename_lambda+:: lambda to alter the written filename
|
125
191
|
# +flatten+:: strips off any sub-folders below the from_location
|
126
|
-
def copy_files(s3,
|
192
|
+
def copy_files(s3, from_files_or_loc, to_location, match_regex='.+', alter_filename_lambda=false, flatten=false)
|
127
193
|
|
128
|
-
puts " copying
|
129
|
-
process_files(:copy, s3,
|
194
|
+
puts " copying #{describe_from(from_files_or_loc)} to #{to_location}"
|
195
|
+
process_files(:copy, s3, from_files_or_loc, match_regex, to_location, alter_filename_lambda, flatten)
|
130
196
|
end
|
131
197
|
module_function :copy_files
|
132
198
|
|
@@ -134,15 +200,15 @@ module Sluice
|
|
134
200
|
#
|
135
201
|
# Parameters:
|
136
202
|
# +s3+:: A Fog::Storage s3 connection
|
137
|
-
# +
|
203
|
+
# +from_files_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, or S3Location to move files from
|
138
204
|
# +to_location+:: S3Location to move files to
|
139
205
|
# +match_regex+:: a regex string to match the files to move
|
140
206
|
# +alter_filename_lambda+:: lambda to alter the written filename
|
141
207
|
# +flatten+:: strips off any sub-folders below the from_location
|
142
|
-
def move_files(s3,
|
208
|
+
def move_files(s3, from_files_or_loc, to_location, match_regex='.+', alter_filename_lambda=false, flatten=false)
|
143
209
|
|
144
|
-
puts " moving
|
145
|
-
process_files(:move, s3,
|
210
|
+
puts " moving #{describe_from(from_files_or_loc)} to #{to_location}"
|
211
|
+
process_files(:move, s3, from_files_or_loc, match_regex, to_location, alter_filename_lambda, flatten)
|
146
212
|
end
|
147
213
|
module_function :move_files
|
148
214
|
|
@@ -150,13 +216,13 @@ module Sluice
|
|
150
216
|
#
|
151
217
|
# Parameters:
|
152
218
|
# +s3+:: A Fog::Storage s3 connection
|
153
|
-
# +
|
219
|
+
# +from_files_or_dir+:: Local array of files or local directory to upload files from
|
154
220
|
# +to_location+:: S3Location to upload files to
|
155
221
|
# +match_glob+:: a filesystem glob to match the files to upload
|
156
|
-
def upload_files(s3,
|
222
|
+
def upload_files(s3, from_files_or_dir, to_location, match_glob='*')
|
157
223
|
|
158
|
-
puts " uploading
|
159
|
-
process_files(:upload, s3,
|
224
|
+
puts " uploading #{describe_from(from_files_or_dir)} to #{to_location}"
|
225
|
+
process_files(:upload, s3, from_files_or_dir, match_glob, to_location)
|
160
226
|
end
|
161
227
|
module_function :upload_files
|
162
228
|
|
@@ -188,7 +254,7 @@ module Sluice
|
|
188
254
|
#
|
189
255
|
# Parameters:
|
190
256
|
# +s3+:: A Fog::Storage s3 connection
|
191
|
-
# +from_file:: A Fog::File to download
|
257
|
+
# +from_file+:: A Fog::Storage::AWS::File to download
|
192
258
|
# +to_file+:: A local file path
|
193
259
|
def download_file(s3, from_file, to_file)
|
194
260
|
|
@@ -204,6 +270,22 @@ module Sluice
|
|
204
270
|
|
205
271
|
private
|
206
272
|
|
273
|
+
# Provides string describing from_files_or_dir_or_loc
|
274
|
+
# for logging purposes.
|
275
|
+
#
|
276
|
+
# Parameters:
|
277
|
+
# +from_files_or_dir_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, local directory or S3Location to process files from
|
278
|
+
#
|
279
|
+
# Returns a log-friendly string
|
280
|
+
def describe_from(from_files_or_dir_or_loc)
|
281
|
+
if from_files_or_dir_or_loc.is_a?(Array)
|
282
|
+
"#{from_files_or_dir_or_loc.length} file(s)"
|
283
|
+
else
|
284
|
+
"files from #{from_files_or_dir_or_loc}"
|
285
|
+
end
|
286
|
+
end
|
287
|
+
module_function :describe_from
|
288
|
+
|
207
289
|
# Concurrent file operations between S3 locations. Supports:
|
208
290
|
# - Download
|
209
291
|
# - Upload
|
@@ -214,12 +296,12 @@ module Sluice
|
|
214
296
|
# Parameters:
|
215
297
|
# +operation+:: Operation to perform. :download, :upload, :copy, :delete, :move supported
|
216
298
|
# +s3+:: A Fog::Storage s3 connection
|
217
|
-
# +
|
299
|
+
# +from_files_or_dir_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, local directory or S3Location to process files from
|
218
300
|
# +match_regex_or_glob+:: a regex or glob string to match the files to process
|
219
301
|
# +to_loc_or_dir+:: S3Location or local directory to process files to
|
220
302
|
# +alter_filename_lambda+:: lambda to alter the written filename
|
221
303
|
# +flatten+:: strips off any sub-folders below the from_files_or_dir_or_loc
|
222
|
-
def process_files(operation, s3,
|
304
|
+
def process_files(operation, s3, from_files_or_dir_or_loc, match_regex_or_glob='.+', to_loc_or_dir=nil, alter_filename_lambda=false, flatten=false)
|
223
305
|
|
224
306
|
# Validate that the file operation makes sense
|
225
307
|
case operation
|
@@ -238,12 +320,21 @@ module Sluice
|
|
238
320
|
raise StorageOperationError "File operation %s is unsupported. Try :download, :upload, :copy, :delete or :move" % operation
|
239
321
|
end
|
240
322
|
|
241
|
-
# If we
|
242
|
-
if
|
243
|
-
files_to_process =
|
323
|
+
# If we have an array of files, no additional globbing required
|
324
|
+
if from_files_or_dir_or_loc.is_a?(Array)
|
325
|
+
files_to_process = from_files_or_dir_or_loc # Could be filepaths or Fog::Storage::AWS::File's
|
326
|
+
globbed = true
|
327
|
+
# Otherwise if it's an upload, we can glob now
|
328
|
+
elsif operation == :upload
|
329
|
+
files_to_process = Dir.glob(File.join(from_files_or_dir_or_loc, match_regex_or_glob))
|
330
|
+
globbed = true
|
331
|
+
# Otherwise we'll do threaded globbing later...
|
244
332
|
else
|
245
333
|
files_to_process = []
|
334
|
+
from_loc = from_files_or_dir_or_loc # Alias
|
335
|
+
globbed = false
|
246
336
|
end
|
337
|
+
|
247
338
|
threads = []
|
248
339
|
mutex = Mutex.new
|
249
340
|
complete = false
|
@@ -261,20 +352,37 @@ module Sluice
|
|
261
352
|
loop do
|
262
353
|
file = false
|
263
354
|
filepath = false
|
355
|
+
from_bucket = false
|
356
|
+
from_path = false
|
264
357
|
match = false
|
265
358
|
|
266
359
|
# Critical section:
|
267
360
|
# only allow one thread to modify the array at any time
|
268
361
|
mutex.synchronize do
|
269
362
|
|
270
|
-
|
271
|
-
|
363
|
+
# No need to do further globbing
|
364
|
+
if globbed
|
272
365
|
if files_to_process.size == 0
|
273
366
|
complete = true
|
274
367
|
next
|
275
368
|
end
|
276
369
|
|
277
|
-
|
370
|
+
file = files_to_process.pop
|
371
|
+
# Support raw filenames and also Fog::Storage::AWS::File's
|
372
|
+
if (file.is_a?(Fog::Storage::AWS::File))
|
373
|
+
from_bucket = file.directory.key # Bucket
|
374
|
+
from_path = Sluice::Storage.trail_slash(File.dirname(file.key))
|
375
|
+
filepath = file.key
|
376
|
+
else
|
377
|
+
from_bucket = nil # Not used
|
378
|
+
if from_files_or_dir_or_loc.is_a?(Array)
|
379
|
+
from_path = Sluice::Storage.trail_slash(File.dirname(file))
|
380
|
+
else
|
381
|
+
from_path = from_files_or_dir_or_loc # The root dir
|
382
|
+
end
|
383
|
+
filepath = file
|
384
|
+
end
|
385
|
+
|
278
386
|
match = true # Match is implicit in the glob
|
279
387
|
else
|
280
388
|
|
@@ -282,8 +390,8 @@ module Sluice
|
|
282
390
|
if files_to_process.size == 0
|
283
391
|
# S3 batches 1000 files per request.
|
284
392
|
# We load up our array with the files to move
|
285
|
-
files_to_process = s3.directories.get(
|
286
|
-
# If we don't have any files after the
|
393
|
+
files_to_process = s3.directories.get(from_loc.bucket, :prefix => from_loc.dir).files.all(marker_opts)
|
394
|
+
# If we don't have any files after the S3 request, we're complete
|
287
395
|
if files_to_process.size == 0
|
288
396
|
complete = true
|
289
397
|
next
|
@@ -297,6 +405,8 @@ module Sluice
|
|
297
405
|
end
|
298
406
|
|
299
407
|
file = files_to_process.pop
|
408
|
+
from_bucket = from_loc.bucket
|
409
|
+
from_path = from_loc.dir_as_path
|
300
410
|
filepath = file.key
|
301
411
|
|
302
412
|
match = if match_regex_or_glob.is_a? NegativeRegex
|
@@ -308,21 +418,15 @@ module Sluice
|
|
308
418
|
end
|
309
419
|
end
|
310
420
|
|
311
|
-
# If we don't have a match, then we must be complete
|
312
421
|
break unless match
|
422
|
+
break if is_folder?(filepath)
|
313
423
|
|
314
|
-
#
|
315
|
-
|
316
|
-
|
317
|
-
# Match the filename, ignoring directories
|
318
|
-
file_match = filepath.match('([^/]+)$')
|
319
|
-
break unless file_match
|
320
|
-
|
321
|
-
# Rename
|
424
|
+
# Name file
|
425
|
+
basename = get_basename(filepath)
|
322
426
|
if alter_filename_lambda.class == Proc
|
323
|
-
filename = alter_filename_lambda.call(
|
427
|
+
filename = alter_filename_lambda.call(basename)
|
324
428
|
else
|
325
|
-
filename =
|
429
|
+
filename = basename
|
326
430
|
end
|
327
431
|
|
328
432
|
# What are we doing? Let's determine source and target
|
@@ -330,22 +434,22 @@ module Sluice
|
|
330
434
|
case operation
|
331
435
|
when :upload
|
332
436
|
source = "#{filepath}"
|
333
|
-
target = name_file(filepath, filename,
|
437
|
+
target = name_file(filepath, filename, from_path, to_loc_or_dir.dir_as_path, flatten)
|
334
438
|
puts " UPLOAD #{source} +-> #{to_loc_or_dir.bucket}/#{target}"
|
335
439
|
when :download
|
336
|
-
source = "#{
|
337
|
-
target = name_file(filepath, filename,
|
440
|
+
source = "#{from_bucket}/#{filepath}"
|
441
|
+
target = name_file(filepath, filename, from_path, to_loc_or_dir, flatten)
|
338
442
|
puts " DOWNLOAD #{source} +-> #{target}"
|
339
443
|
when :move
|
340
|
-
source = "#{
|
341
|
-
target = name_file(filepath, filename,
|
444
|
+
source = "#{from_bucket}/#{filepath}"
|
445
|
+
target = name_file(filepath, filename, from_path, to_loc_or_dir.dir_as_path, flatten)
|
342
446
|
puts " MOVE #{source} -> #{to_loc_or_dir.bucket}/#{target}"
|
343
447
|
when :copy
|
344
|
-
source = "#{
|
345
|
-
target = name_file(filepath, filename,
|
448
|
+
source = "#{from_bucket}/#{filepath}"
|
449
|
+
target = name_file(filepath, filename, from_path, to_loc_or_dir.dir_as_path, flatten)
|
346
450
|
puts " COPY #{source} +-> #{to_loc_or_dir.bucket}/#{target}"
|
347
451
|
when :delete
|
348
|
-
source = "#{
|
452
|
+
source = "#{from_bucket}/#{filepath}"
|
349
453
|
# No target
|
350
454
|
puts " DELETE x #{source}"
|
351
455
|
end
|
data/lib/sluice.rb
CHANGED
data/sluice.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sluice
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.8
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2013-
|
13
|
+
date: 2013-08-14 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: fog
|
@@ -19,7 +19,7 @@ dependencies:
|
|
19
19
|
requirements:
|
20
20
|
- - ~>
|
21
21
|
- !ruby/object:Gem::Version
|
22
|
-
version: 1.
|
22
|
+
version: 1.14.0
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
25
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -27,7 +27,7 @@ dependencies:
|
|
27
27
|
requirements:
|
28
28
|
- - ~>
|
29
29
|
- !ruby/object:Gem::Version
|
30
|
-
version: 1.
|
30
|
+
version: 1.14.0
|
31
31
|
description: A Ruby gem to help you build ETL processes involving Amazon S3. Uses
|
32
32
|
Fog
|
33
33
|
email:
|
@@ -67,7 +67,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
67
67
|
version: '0'
|
68
68
|
requirements: []
|
69
69
|
rubyforge_project:
|
70
|
-
rubygems_version: 1.8.
|
70
|
+
rubygems_version: 1.8.24
|
71
71
|
signing_key:
|
72
72
|
specification_version: 3
|
73
73
|
summary: Ruby toolkit for cloud-friendly ETL
|