sluice 0.0.7 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +8 -0
- data/lib/sluice/storage/s3.rb +157 -53
- data/lib/sluice.rb +1 -1
- data/sluice.gemspec +1 -1
- metadata +5 -5
data/CHANGELOG
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
Version 0.0.8 (2013-08-14)
|
2
|
+
--------------------------
|
3
|
+
Added the ability to process a list of files rather than an S3 location
|
4
|
+
|
5
|
+
Version 0.0.7 (2013-XX-XX)
|
6
|
+
--------------------------
|
7
|
+
Added in upload capability
|
8
|
+
|
1
9
|
Version 0.0.6 (2012-12-31)
|
2
10
|
--------------------------
|
3
11
|
Fixed is_empty? (was actually is_not_empty?)
|
data/lib/sluice/storage/s3.rb
CHANGED
@@ -9,7 +9,7 @@
|
|
9
9
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
10
10
|
# See the Apache License Version 2.0 for the specific language governing permissions and limitations there under.
|
11
11
|
|
12
|
-
# Authors::
|
12
|
+
# Authors:: Alex Dean (mailto:support@snowplowanalytics.com), Michael Tibben
|
13
13
|
# Copyright:: Copyright (c) 2012 SnowPlow Analytics Ltd
|
14
14
|
# License:: Apache License Version 2.0
|
15
15
|
|
@@ -67,22 +67,88 @@ module Sluice
|
|
67
67
|
# +access_key_id+:: AWS access key ID
|
68
68
|
# +secret_access_key+:: AWS secret access key
|
69
69
|
def new_fog_s3_from(region, access_key_id, secret_access_key)
|
70
|
-
Fog::Storage.new({
|
70
|
+
fog = Fog::Storage.new({
|
71
71
|
:provider => 'AWS',
|
72
72
|
:region => region,
|
73
73
|
:aws_access_key_id => access_key_id,
|
74
74
|
:aws_secret_access_key => secret_access_key
|
75
75
|
})
|
76
|
+
fog.sync_clock
|
77
|
+
fog
|
76
78
|
end
|
77
79
|
module_function :new_fog_s3_from
|
78
80
|
|
81
|
+
# Returns an array of all Fog::Storage::AWS::File objects in the given location
|
82
|
+
#
|
83
|
+
# Parameters:
|
84
|
+
# +s3+:: A Fog::Storage s3 connection
|
85
|
+
# +location+:: The location to return files from
|
86
|
+
#
|
87
|
+
# Returns an array of Fog::Storage::AWS::File objects
|
88
|
+
def list_files(s3, location)
  # Everything stored under the location's prefix, folder markers included
  listing = s3.directories.get(location.bucket, prefix: location.dir).files

  # Accumulate by hand rather than with .select, because of Ruby deep copy
  # issues (array of non-POROs); each kept entry is duplicated
  files = []
  listing.each do |entry|
    files << entry.dup if is_file?(entry.key)
  end
  files
end
|
99
|
+
module_function :list_files
|
100
|
+
|
101
|
+
# Whether the given path is a directory or not
|
102
|
+
#
|
103
|
+
# Parameters:
|
104
|
+
# +path+:: S3 path in String form
|
105
|
+
#
|
106
|
+
# Returns boolean
|
107
|
+
def is_folder?(path)
  # A path counts as a folder when it ends with the '_$folder$' marker
  # (EMR-created) or with a trailing slash. String#end_with? takes several
  # suffixes and returns true as soon as any one of them matches.
  path.end_with?('_$folder$', '/')
end
|
111
|
+
module_function :is_folder?
|
112
|
+
|
113
|
+
# Whether the given path is a file or not
|
114
|
+
#
|
115
|
+
# Parameters:
|
116
|
+
# +path+:: S3 path in String form
|
117
|
+
#
|
118
|
+
# Returns boolean
|
119
|
+
def is_file?(path)
  # A path is a file exactly when it is not a folder
  is_folder?(path) ? false : true
end
|
122
|
+
module_function :is_file?
|
123
|
+
|
124
|
+
# Returns the basename for the given path
|
125
|
+
#
|
126
|
+
# Parameters:
|
127
|
+
# +path+:: S3 path in String form
|
128
|
+
#
|
129
|
+
# Returns the basename, or nil if the
|
130
|
+
# path is to a folder
|
131
|
+
def get_basename(path)
  # Folders have no basename
  return nil if is_folder?(path)

  # Final run of non-slash characters, or nil when the path has none
  path[%r{([^/]+)$}, 1]
end
|
143
|
+
module_function :get_basename
|
144
|
+
|
79
145
|
# Determine if a bucket is empty
|
80
146
|
#
|
81
147
|
# Parameters:
|
82
148
|
# +s3+:: A Fog::Storage s3 connection
|
83
149
|
# +location+:: The location to check
|
84
150
|
def is_empty?(s3, location)
|
85
|
-
s3
|
151
|
+
list_files(s3, location).length <= 1
|
86
152
|
end
|
87
153
|
module_function :is_empty?
|
88
154
|
|
@@ -91,13 +157,13 @@ module Sluice
|
|
91
157
|
#
|
92
158
|
# Parameters:
|
93
159
|
# +s3+:: A Fog::Storage s3 connection
|
94
|
-
# +
|
160
|
+
# +from_files_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, or S3Location to download files from
|
95
161
|
# +to_directory+:: Local directory to copy files to
|
96
162
|
# +match_regex+:: a regex string to match the files to download
|
97
|
-
def download_files(s3,
|
163
|
+
def download_files(s3, from_files_or_loc, to_directory, match_regex='.+')
|
98
164
|
|
99
|
-
puts " downloading
|
100
|
-
process_files(:download, s3,
|
165
|
+
puts " downloading #{describe_from(from_files_or_loc)} to #{to_directory}"
|
166
|
+
process_files(:download, s3, from_files_or_loc, match_regex, to_directory)
|
101
167
|
end
|
102
168
|
module_function :download_files
|
103
169
|
|
@@ -105,12 +171,12 @@ module Sluice
|
|
105
171
|
#
|
106
172
|
# Parameters:
|
107
173
|
# +s3+:: A Fog::Storage s3 connection
|
108
|
-
# +
|
174
|
+
# +from_files_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, or S3Location to delete files from
|
109
175
|
# +match_regex+:: a regex string to match the files to delete
|
110
|
-
def delete_files(s3,
|
176
|
+
def delete_files(s3, from_files_or_loc, match_regex='.+')
|
111
177
|
|
112
|
-
puts " deleting
|
113
|
-
process_files(:delete, s3,
|
178
|
+
puts " deleting #{describe_from(from_files_or_loc)}"
|
179
|
+
process_files(:delete, s3, from_files_or_loc, match_regex)
|
114
180
|
end
|
115
181
|
module_function :delete_files
|
116
182
|
|
@@ -118,15 +184,15 @@ module Sluice
|
|
118
184
|
#
|
119
185
|
# Parameters:
|
120
186
|
# +s3+:: A Fog::Storage s3 connection
|
121
|
-
# +
|
187
|
+
# +from_files_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, or S3Location to copy files from
|
122
188
|
# +to_location+:: S3Location to copy files to
|
123
189
|
# +match_regex+:: a regex string to match the files to copy
|
124
190
|
# +alter_filename_lambda+:: lambda to alter the written filename
|
125
191
|
# +flatten+:: strips off any sub-folders below the from_location
|
126
|
-
def copy_files(s3,
|
192
|
+
def copy_files(s3, from_files_or_loc, to_location, match_regex='.+', alter_filename_lambda=false, flatten=false)
|
127
193
|
|
128
|
-
puts " copying
|
129
|
-
process_files(:copy, s3,
|
194
|
+
puts " copying #{describe_from(from_files_or_loc)} to #{to_location}"
|
195
|
+
process_files(:copy, s3, from_files_or_loc, match_regex, to_location, alter_filename_lambda, flatten)
|
130
196
|
end
|
131
197
|
module_function :copy_files
|
132
198
|
|
@@ -134,15 +200,15 @@ module Sluice
|
|
134
200
|
#
|
135
201
|
# Parameters:
|
136
202
|
# +s3+:: A Fog::Storage s3 connection
|
137
|
-
# +
|
203
|
+
# +from_files_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, or S3Location to move files from
|
138
204
|
# +to_location+:: S3Location to move files to
|
139
205
|
# +match_regex+:: a regex string to match the files to move
|
140
206
|
# +alter_filename_lambda+:: lambda to alter the written filename
|
141
207
|
# +flatten+:: strips off any sub-folders below the from_location
|
142
|
-
def move_files(s3,
|
208
|
+
def move_files(s3, from_files_or_loc, to_location, match_regex='.+', alter_filename_lambda=false, flatten=false)
|
143
209
|
|
144
|
-
puts " moving
|
145
|
-
process_files(:move, s3,
|
210
|
+
puts " moving #{describe_from(from_files_or_loc)} to #{to_location}"
|
211
|
+
process_files(:move, s3, from_files_or_loc, match_regex, to_location, alter_filename_lambda, flatten)
|
146
212
|
end
|
147
213
|
module_function :move_files
|
148
214
|
|
@@ -150,13 +216,13 @@ module Sluice
|
|
150
216
|
#
|
151
217
|
# Parameters:
|
152
218
|
# +s3+:: A Fog::Storage s3 connection
|
153
|
-
# +
|
219
|
+
# +from_files_or_dir+:: Local array of files or local directory to upload files from
|
154
220
|
# +to_location+:: S3Location to upload files to
|
155
221
|
# +match_glob+:: a filesystem glob to match the files to upload
|
156
|
-
def upload_files(s3,
|
222
|
+
def upload_files(s3, from_files_or_dir, to_location, match_glob='*')
|
157
223
|
|
158
|
-
puts " uploading
|
159
|
-
process_files(:upload, s3,
|
224
|
+
puts " uploading #{describe_from(from_files_or_dir)} to #{to_location}"
|
225
|
+
process_files(:upload, s3, from_files_or_dir, match_glob, to_location)
|
160
226
|
end
|
161
227
|
module_function :upload_files
|
162
228
|
|
@@ -188,7 +254,7 @@ module Sluice
|
|
188
254
|
#
|
189
255
|
# Parameters:
|
190
256
|
# +s3+:: A Fog::Storage s3 connection
|
191
|
-
# +from_file:: A Fog::File to download
|
257
|
+
# +from_file+:: A Fog::Storage::AWS::File to download
|
192
258
|
# +to_file+:: A local file path
|
193
259
|
def download_file(s3, from_file, to_file)
|
194
260
|
|
@@ -204,6 +270,22 @@ module Sluice
|
|
204
270
|
|
205
271
|
private
|
206
272
|
|
273
|
+
# Provides string describing from_files_or_dir_or_loc
|
274
|
+
# for logging purposes.
|
275
|
+
#
|
276
|
+
# Parameters:
|
277
|
+
# +from_files_or_dir_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, local directory or S3Location to process files from
|
278
|
+
#
|
279
|
+
# Returns a log-friendly string
|
280
|
+
def describe_from(from_files_or_dir_or_loc)
  case from_files_or_dir_or_loc
  when Array
    # An explicit list of files: report how many there are
    "#{from_files_or_dir_or_loc.length} file(s)"
  else
    # A directory or location: rely on its string form
    "files from #{from_files_or_dir_or_loc}"
  end
end
|
287
|
+
module_function :describe_from
|
288
|
+
|
207
289
|
# Concurrent file operations between S3 locations. Supports:
|
208
290
|
# - Download
|
209
291
|
# - Upload
|
@@ -214,12 +296,12 @@ module Sluice
|
|
214
296
|
# Parameters:
|
215
297
|
# +operation+:: Operation to perform. :download, :upload, :copy, :delete, :move supported
|
216
298
|
# +s3+:: A Fog::Storage s3 connection
|
217
|
-
# +
|
299
|
+
# +from_files_or_dir_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, local directory or S3Location to process files from
|
218
300
|
# +match_regex_or_glob+:: a regex or glob string to match the files to process
|
219
301
|
# +to_loc_or_dir+:: S3Location or local directory to process files to
|
220
302
|
# +alter_filename_lambda+:: lambda to alter the written filename
|
221
303
|
# +flatten+:: strips off any sub-folders below the from_loc_or_dir
|
222
|
-
def process_files(operation, s3,
|
304
|
+
def process_files(operation, s3, from_files_or_dir_or_loc, match_regex_or_glob='.+', to_loc_or_dir=nil, alter_filename_lambda=false, flatten=false)
|
223
305
|
|
224
306
|
# Validate that the file operation makes sense
|
225
307
|
case operation
|
@@ -238,12 +320,21 @@ module Sluice
|
|
238
320
|
raise StorageOperationError "File operation %s is unsupported. Try :download, :upload, :copy, :delete or :move" % operation
|
239
321
|
end
|
240
322
|
|
241
|
-
# If we
|
242
|
-
if
|
243
|
-
files_to_process =
|
323
|
+
# If we have an array of files, no additional globbing required
|
324
|
+
if from_files_or_dir_or_loc.is_a?(Array)
|
325
|
+
files_to_process = from_files_or_dir_or_loc # Could be filepaths or Fog::Storage::AWS::File's
|
326
|
+
globbed = true
|
327
|
+
# Otherwise if it's an upload, we can glob now
|
328
|
+
elsif operation == :upload
|
329
|
+
files_to_process = Dir.glob(File.join(from_files_or_dir_or_loc, match_regex_or_glob))
|
330
|
+
globbed = true
|
331
|
+
# Otherwise we'll do threaded globbing later...
|
244
332
|
else
|
245
333
|
files_to_process = []
|
334
|
+
from_loc = from_files_or_dir_or_loc # Alias
|
335
|
+
globbed = false
|
246
336
|
end
|
337
|
+
|
247
338
|
threads = []
|
248
339
|
mutex = Mutex.new
|
249
340
|
complete = false
|
@@ -261,20 +352,37 @@ module Sluice
|
|
261
352
|
loop do
|
262
353
|
file = false
|
263
354
|
filepath = false
|
355
|
+
from_bucket = false
|
356
|
+
from_path = false
|
264
357
|
match = false
|
265
358
|
|
266
359
|
# Critical section:
|
267
360
|
# only allow one thread to modify the array at any time
|
268
361
|
mutex.synchronize do
|
269
362
|
|
270
|
-
|
271
|
-
|
363
|
+
# No need to do further globbing
|
364
|
+
if globbed
|
272
365
|
if files_to_process.size == 0
|
273
366
|
complete = true
|
274
367
|
next
|
275
368
|
end
|
276
369
|
|
277
|
-
|
370
|
+
file = files_to_process.pop
|
371
|
+
# Support raw filenames and also Fog::Storage::AWS::File's
|
372
|
+
if (file.is_a?(Fog::Storage::AWS::File))
|
373
|
+
from_bucket = file.directory.key # Bucket
|
374
|
+
from_path = Sluice::Storage.trail_slash(File.dirname(file.key))
|
375
|
+
filepath = file.key
|
376
|
+
else
|
377
|
+
from_bucket = nil # Not used
|
378
|
+
if from_files_or_dir_or_loc.is_a?(Array)
|
379
|
+
from_path = Sluice::Storage.trail_slash(File.dirname(file))
|
380
|
+
else
|
381
|
+
from_path = from_files_or_dir_or_loc # The root dir
|
382
|
+
end
|
383
|
+
filepath = file
|
384
|
+
end
|
385
|
+
|
278
386
|
match = true # Match is implicit in the glob
|
279
387
|
else
|
280
388
|
|
@@ -282,8 +390,8 @@ module Sluice
|
|
282
390
|
if files_to_process.size == 0
|
283
391
|
# S3 batches 1000 files per request.
|
284
392
|
# We load up our array with the files to move
|
285
|
-
files_to_process = s3.directories.get(
|
286
|
-
# If we don't have any files after the
|
393
|
+
files_to_process = s3.directories.get(from_loc.bucket, :prefix => from_loc.dir).files.all(marker_opts)
|
394
|
+
# If we don't have any files after the S3 request, we're complete
|
287
395
|
if files_to_process.size == 0
|
288
396
|
complete = true
|
289
397
|
next
|
@@ -297,6 +405,8 @@ module Sluice
|
|
297
405
|
end
|
298
406
|
|
299
407
|
file = files_to_process.pop
|
408
|
+
from_bucket = from_loc.bucket
|
409
|
+
from_path = from_loc.dir_as_path
|
300
410
|
filepath = file.key
|
301
411
|
|
302
412
|
match = if match_regex_or_glob.is_a? NegativeRegex
|
@@ -308,21 +418,15 @@ module Sluice
|
|
308
418
|
end
|
309
419
|
end
|
310
420
|
|
311
|
-
# If we don't have a match, then we must be complete
|
312
421
|
break unless match
|
422
|
+
break if is_folder?(filepath)
|
313
423
|
|
314
|
-
#
|
315
|
-
|
316
|
-
|
317
|
-
# Match the filename, ignoring directories
|
318
|
-
file_match = filepath.match('([^/]+)$')
|
319
|
-
break unless file_match
|
320
|
-
|
321
|
-
# Rename
|
424
|
+
# Name file
|
425
|
+
basename = get_basename(filepath)
|
322
426
|
if alter_filename_lambda.class == Proc
|
323
|
-
filename = alter_filename_lambda.call(
|
427
|
+
filename = alter_filename_lambda.call(basename)
|
324
428
|
else
|
325
|
-
filename =
|
429
|
+
filename = basename
|
326
430
|
end
|
327
431
|
|
328
432
|
# What are we doing? Let's determine source and target
|
@@ -330,22 +434,22 @@ module Sluice
|
|
330
434
|
case operation
|
331
435
|
when :upload
|
332
436
|
source = "#{filepath}"
|
333
|
-
target = name_file(filepath, filename,
|
437
|
+
target = name_file(filepath, filename, from_path, to_loc_or_dir.dir_as_path, flatten)
|
334
438
|
puts " UPLOAD #{source} +-> #{to_loc_or_dir.bucket}/#{target}"
|
335
439
|
when :download
|
336
|
-
source = "#{
|
337
|
-
target = name_file(filepath, filename,
|
440
|
+
source = "#{from_bucket}/#{filepath}"
|
441
|
+
target = name_file(filepath, filename, from_path, to_loc_or_dir, flatten)
|
338
442
|
puts " DOWNLOAD #{source} +-> #{target}"
|
339
443
|
when :move
|
340
|
-
source = "#{
|
341
|
-
target = name_file(filepath, filename,
|
444
|
+
source = "#{from_bucket}/#{filepath}"
|
445
|
+
target = name_file(filepath, filename, from_path, to_loc_or_dir.dir_as_path, flatten)
|
342
446
|
puts " MOVE #{source} -> #{to_loc_or_dir.bucket}/#{target}"
|
343
447
|
when :copy
|
344
|
-
source = "#{
|
345
|
-
target = name_file(filepath, filename,
|
448
|
+
source = "#{from_bucket}/#{filepath}"
|
449
|
+
target = name_file(filepath, filename, from_path, to_loc_or_dir.dir_as_path, flatten)
|
346
450
|
puts " COPY #{source} +-> #{to_loc_or_dir.bucket}/#{target}"
|
347
451
|
when :delete
|
348
|
-
source = "#{
|
452
|
+
source = "#{from_bucket}/#{filepath}"
|
349
453
|
# No target
|
350
454
|
puts " DELETE x #{source}"
|
351
455
|
end
|
data/lib/sluice.rb
CHANGED
data/sluice.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sluice
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.8
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2013-
|
13
|
+
date: 2013-08-14 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: fog
|
@@ -19,7 +19,7 @@ dependencies:
|
|
19
19
|
requirements:
|
20
20
|
- - ~>
|
21
21
|
- !ruby/object:Gem::Version
|
22
|
-
version: 1.
|
22
|
+
version: 1.14.0
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
25
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -27,7 +27,7 @@ dependencies:
|
|
27
27
|
requirements:
|
28
28
|
- - ~>
|
29
29
|
- !ruby/object:Gem::Version
|
30
|
-
version: 1.
|
30
|
+
version: 1.14.0
|
31
31
|
description: A Ruby gem to help you build ETL processes involving Amazon S3. Uses
|
32
32
|
Fog
|
33
33
|
email:
|
@@ -67,7 +67,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
67
67
|
version: '0'
|
68
68
|
requirements: []
|
69
69
|
rubyforge_project:
|
70
|
-
rubygems_version: 1.8.
|
70
|
+
rubygems_version: 1.8.24
|
71
71
|
signing_key:
|
72
72
|
specification_version: 3
|
73
73
|
summary: Ruby toolkit for cloud-friendly ETL
|