sluice 0.0.7 → 0.0.8

data/CHANGELOG CHANGED
@@ -1,3 +1,11 @@
+ Version 0.0.8 (2013-08-14)
+ --------------------------
+ Added in ability to process a list of files rather than an S3 location
+
+ Version 0.0.7 (2013-XX-XX)
+ --------------------------
+ Added in upload capability
+
  Version 0.0.6 (2012-12-31)
  --------------------------
  Fixed is_empty? (was actually is_not_empty?)
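
A hedged usage sketch of the headline 0.0.8 change: the file operations can now take an Array of filepaths or Fog::Storage::AWS::File objects instead of an S3 location. The Sluice::Storage::S3 module path, the Location class name and the credentials/bucket below are assumptions for illustration, not taken from this diff.

    require 'sluice'

    # Build a Fog connection (new_fog_s3_from now also syncs the clock, as shown further down)
    s3 = Sluice::Storage::S3.new_fog_s3_from(
      'us-east-1', ENV['AWS_ACCESS_KEY_ID'], ENV['AWS_SECRET_ACCESS_KEY'])

    in_location = Sluice::Storage::S3::Location.new('s3://my-bucket/logs/')

    # 0.0.7 style: operate on everything under an S3 location
    Sluice::Storage::S3.download_files(s3, in_location, '/tmp/logs/')

    # New in 0.0.8: fetch the listing once, filter it, and pass the Array straight in
    files = Sluice::Storage::S3.list_files(s3, in_location)
    Sluice::Storage::S3.download_files(s3, files.first(10), '/tmp/logs/')
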
@@ -9,7 +9,7 @@
  # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the Apache License Version 2.0 for the specific language governing permissions and limitations there under.

- # Authors:: Alex Dean (mailto:support@snowplowanalytics.com), Michael Tibben
+ # Authors:: Alex Dean (mailto:support@snowplowanalytics.com), Michael Tibben
  # Copyright:: Copyright (c) 2012 SnowPlow Analytics Ltd
  # License:: Apache License Version 2.0

@@ -67,22 +67,88 @@ module Sluice
  # +access_key_id+:: AWS access key ID
  # +secret_access_key+:: AWS secret access key
  def new_fog_s3_from(region, access_key_id, secret_access_key)
- Fog::Storage.new({
+ fog = Fog::Storage.new({
  :provider => 'AWS',
  :region => region,
  :aws_access_key_id => access_key_id,
  :aws_secret_access_key => secret_access_key
  })
+ fog.sync_clock
+ fog
  end
  module_function :new_fog_s3_from

+ # Return an array of all Fog::Storage::AWS::File's
+ #
+ # Parameters:
+ # +s3+:: A Fog::Storage s3 connection
+ # +location+:: The location to return files from
+ #
+ # Returns array of Fog::Storage::AWS::File's
+ def list_files(s3, location)
+ files_and_dirs = s3.directories.get(location.bucket, prefix: location.dir).files
+
+ files = [] # Can't use a .select because of Ruby deep copy issues (array of non-POROs)
+ files_and_dirs.each { |f|
+ if is_file?(f.key)
+ files << f.dup
+ end
+ }
+ files
+ end
+ module_function :list_files
+
+ # Whether the given path is a directory or not
+ #
+ # Parameters:
+ # +path+:: S3 path in String form
+ #
+ # Returns boolean
+ def is_folder?(path)
+ (path.end_with?('_$folder$') || # EMR-created
+ path.end_with?('/'))
+ end
+ module_function :is_folder?
+
+ # Whether the given path is a file or not
+ #
+ # Parameters:
+ # +path+:: S3 path in String form
+ #
+ # Returns boolean
+ def is_file?(path)
+ !is_folder?(path)
+ end
+ module_function :is_file?
+
+ # Returns the basename for the given path
+ #
+ # Parameters:
+ # +path+:: S3 path in String form
+ #
+ # Returns the basename, or nil if the
+ # path is to a folder
+ def get_basename(path)
+ if is_folder?(path)
+ nil
+ else
+ match = path.match('([^/]+)$')
+ if match
+ match[1]
+ else
+ nil
+ end
+ end
+ end
+ module_function :get_basename
+
  # Determine if a bucket is empty
  #
  # Parameters:
  # +s3+:: A Fog::Storage s3 connection
  # +location+:: The location to check
  def is_empty?(s3, location)
- s3.directories.get(location.bucket, :prefix => location.dir).files().length <= 1
+ list_files(s3, location).length <= 1
  end
  module_function :is_empty?
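
A minimal sketch of the new path helpers added above; the Sluice::Storage::S3 module path is assumed from the gem's usual layout and the keys are illustrative.

    require 'sluice'

    s3util = Sluice::Storage::S3
    s3util.is_folder?('events/2013-08-14/')      # => true  (trailing slash)
    s3util.is_folder?('events/2013_$folder$')    # => true  (EMR-created marker)
    s3util.is_file?('events/part-00000.gz')      # => true
    s3util.get_basename('events/part-00000.gz')  # => "part-00000.gz"
    s3util.get_basename('events/2013-08-14/')    # => nil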
88
154
 
@@ -91,13 +157,13 @@ module Sluice
  #
  # Parameters:
  # +s3+:: A Fog::Storage s3 connection
- # +from_location+:: S3Location to delete files from
+ # +from_files_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, or S3Location to download files from
  # +to_directory+:: Local directory to copy files to
  # +match_regex+:: a regex string to match the files to delete
- def download_files(s3, from_location, to_directory, match_regex='.+')
+ def download_files(s3, from_files_or_loc, to_directory, match_regex='.+')

- puts " downloading files from #{from_location} to #{to_directory}"
- process_files(:download, s3, from_location, match_regex, to_directory)
+ puts " downloading #{describe_from(from_files_or_loc)} to #{to_directory}"
+ process_files(:download, s3, from_files_or_loc, match_regex, to_directory)
  end
  module_function :download_files

@@ -105,12 +171,12 @@ module Sluice
  #
  # Parameters:
  # +s3+:: A Fog::Storage s3 connection
- # +from_location+:: S3Location to delete files from
+ # +from_files_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, or S3Location to delete files from
  # +match_regex+:: a regex string to match the files to delete
- def delete_files(s3, from_location, match_regex='.+')
+ def delete_files(s3, from_files_or_loc, match_regex='.+')

- puts " deleting files from #{from_location}"
- process_files(:delete, s3, from_location, match_regex)
+ puts " deleting #{describe_from(from_files_or_loc)}"
+ process_files(:delete, s3, from_files_or_loc, match_regex)
  end
  module_function :delete_files

@@ -118,15 +184,15 @@ module Sluice
  #
  # Parameters:
  # +s3+:: A Fog::Storage s3 connection
- # +from_location+:: S3Location to copy files from
+ # +from_files_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, or S3Location to copy files from
  # +to_location+:: S3Location to copy files to
  # +match_regex+:: a regex string to match the files to copy
  # +alter_filename_lambda+:: lambda to alter the written filename
  # +flatten+:: strips off any sub-folders below the from_location
- def copy_files(s3, from_location, to_location, match_regex='.+', alter_filename_lambda=false, flatten=false)
+ def copy_files(s3, from_files_or_loc, to_location, match_regex='.+', alter_filename_lambda=false, flatten=false)

- puts " copying files from #{from_location} to #{to_location}"
- process_files(:copy, s3, from_location, match_regex, to_location, alter_filename_lambda, flatten)
+ puts " copying #{describe_from(from_files_or_loc)} to #{to_location}"
+ process_files(:copy, s3, from_files_or_loc, match_regex, to_location, alter_filename_lambda, flatten)
  end
  module_function :copy_files

@@ -134,15 +200,15 @@ module Sluice
  #
  # Parameters:
  # +s3+:: A Fog::Storage s3 connection
- # +from_location+:: S3Location to move files from
+ # +from_files_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, or S3Location to move files from
  # +to_location+:: S3Location to move files to
  # +match_regex+:: a regex string to match the files to move
  # +alter_filename_lambda+:: lambda to alter the written filename
  # +flatten+:: strips off any sub-folders below the from_location
- def move_files(s3, from_location, to_location, match_regex='.+', alter_filename_lambda=false, flatten=false)
+ def move_files(s3, from_files_or_loc, to_location, match_regex='.+', alter_filename_lambda=false, flatten=false)

- puts " moving files from #{from_location} to #{to_location}"
- process_files(:move, s3, from_location, match_regex, to_location, alter_filename_lambda, flatten)
+ puts " moving #{describe_from(from_files_or_loc)} to #{to_location}"
+ process_files(:move, s3, from_files_or_loc, match_regex, to_location, alter_filename_lambda, flatten)
  end
  module_function :move_files

@@ -150,13 +216,13 @@ module Sluice
  #
  # Parameters:
  # +s3+:: A Fog::Storage s3 connection
- # +from_directory+:: Local directory to upload files from
+ # +from_files_or_dir+:: Local array of files or local directory to upload files from
  # +to_location+:: S3Location to upload files to
  # +match_glob+:: a filesystem glob to match the files to upload
- def upload_files(s3, from_directory, to_location, match_glob='*')
+ def upload_files(s3, from_files_or_dir, to_location, match_glob='*')

- puts " uploading files from #{from_directory} to #{to_location}"
- process_files(:upload, s3, from_directory, match_glob, to_location)
+ puts " uploading #{describe_from(from_files_or_dir)} to #{to_location}"
+ process_files(:upload, s3, from_files_or_dir, match_glob, to_location)
  end
  module_function :upload_files
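
A hedged sketch of the two input shapes upload_files now accepts, per the signature above; the connection setup, bucket and paths are illustrative, and the Location class name is an assumption based on the gem's usual layout.

    require 'sluice'

    s3 = Sluice::Storage::S3.new_fog_s3_from(
      'us-east-1', ENV['AWS_ACCESS_KEY_ID'], ENV['AWS_SECRET_ACCESS_KEY'])
    out_location = Sluice::Storage::S3::Location.new('s3://my-bucket/enriched/')

    # Directory plus glob, as in 0.0.7
    Sluice::Storage::S3.upload_files(s3, '/tmp/enriched/', out_location, '*.gz')

    # Explicit Array of local filepaths, new in 0.0.8
    Sluice::Storage::S3.upload_files(s3, Dir.glob('/tmp/enriched/*.gz'), out_location)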
 
@@ -188,7 +254,7 @@ module Sluice
  #
  # Parameters:
  # +s3+:: A Fog::Storage s3 connection
- # +from_file:: A Fog::File to download
+ # +from_file:: A Fog::Storage::AWS::File to download
  # +to_file:: A local file path
  def download_file(s3, from_file, to_file)

@@ -204,6 +270,22 @@ module Sluice

  private

+ # Provides string describing from_files_or_dir_or_loc
+ # for logging purposes.
+ #
+ # Parameters:
+ # +from_files_or_dir_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, local directory or S3Location to process files from
+ #
+ # Returns a log-friendly string
+ def describe_from(from_files_or_dir_or_loc)
+ if from_files_or_dir_or_loc.is_a?(Array)
+ "#{from_files_or_dir_or_loc.length} file(s)"
+ else
+ "files from #{from_files_or_dir_or_loc}"
+ end
+ end
+ module_function :describe_from
+
  # Concurrent file operations between S3 locations. Supports:
  # - Download
  # - Upload
@@ -214,12 +296,12 @@ module Sluice
  # Parameters:
  # +operation+:: Operation to perform. :download, :upload, :copy, :delete, :move supported
  # +s3+:: A Fog::Storage s3 connection
- # +from_loc_or_dir+:: S3Location to process files from
+ # +from_files_or_dir_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, local directory or S3Location to process files from
  # +match_regex_or_glob+:: a regex or glob string to match the files to process
  # +to_loc_or_dir+:: S3Location or local directory to process files to
  # +alter_filename_lambda+:: lambda to alter the written filename
  # +flatten+:: strips off any sub-folders below the from_loc_or_dir
- def process_files(operation, s3, from_loc_or_dir, match_regex_or_glob='.+', to_loc_or_dir=nil, alter_filename_lambda=false, flatten=false)
+ def process_files(operation, s3, from_files_or_dir_or_loc, match_regex_or_glob='.+', to_loc_or_dir=nil, alter_filename_lambda=false, flatten=false)

  # Validate that the file operation makes sense
  case operation
@@ -238,12 +320,21 @@ module Sluice
  raise StorageOperationError "File operation %s is unsupported. Try :download, :upload, :copy, :delete or :move" % operation
  end

- # If we are uploading, then we can glob the files before we thread
- if operation == :upload
- files_to_process = Dir.glob(File.join(from_loc_or_dir, match_regex_or_glob))
+ # If we have an array of files, no additional globbing required
+ if from_files_or_dir_or_loc.is_a?(Array)
+ files_to_process = from_files_or_dir_or_loc # Could be filepaths or Fog::Storage::AWS::File's
+ globbed = true
+ # Otherwise if it's an upload, we can glob now
+ elsif operation == :upload
+ files_to_process = Dir.glob(File.join(from_files_or_dir_or_loc, match_regex_or_glob))
+ globbed = true
+ # Otherwise we'll do threaded globbing later...
  else
  files_to_process = []
+ from_loc = from_files_or_dir_or_loc # Alias
+ globbed = false
  end
+
  threads = []
  mutex = Mutex.new
  complete = false
@@ -261,20 +352,37 @@ module Sluice
  loop do
  file = false
  filepath = false
+ from_bucket = false
+ from_path = false
  match = false

  # Critical section:
  # only allow one thread to modify the array at any time
  mutex.synchronize do

- if operation == :upload
-
+ # No need to do further globbing
+ if globbed
  if files_to_process.size == 0
  complete = true
  next
  end

- filepath = files_to_process.pop
+ file = files_to_process.pop
+ # Support raw filenames and also Fog::Storage::AWS::File's
+ if (file.is_a?(Fog::Storage::AWS::File))
+ from_bucket = file.directory.key # Bucket
+ from_path = Sluice::Storage.trail_slash(File.dirname(file.key))
+ filepath = file.key
+ else
+ from_bucket = nil # Not used
+ if from_files_or_dir_or_loc.is_a?(Array)
+ from_path = Sluice::Storage.trail_slash(File.dirname(file))
+ else
+ from_path = from_files_or_dir_or_loc # The root dir
+ end
+ filepath = file
+ end
+
  match = true # Match is implicit in the glob
  else

282
390
  if files_to_process.size == 0
283
391
  # S3 batches 1000 files per request.
284
392
  # We load up our array with the files to move
285
- files_to_process = s3.directories.get(from_loc_or_dir.bucket, :prefix => from_loc_or_dir.dir).files.all(marker_opts)
286
- # If we don't have any files after the s3 request, we're complete
393
+ files_to_process = s3.directories.get(from_loc.bucket, :prefix => from_loc.dir).files.all(marker_opts)
394
+ # If we don't have any files after the S3 request, we're complete
287
395
  if files_to_process.size == 0
288
396
  complete = true
289
397
  next
@@ -297,6 +405,8 @@ module Sluice
297
405
  end
298
406
 
299
407
  file = files_to_process.pop
408
+ from_bucket = from_loc.bucket
409
+ from_path = from_loc.dir_as_path
300
410
  filepath = file.key
301
411
 
302
412
  match = if match_regex_or_glob.is_a? NegativeRegex
@@ -308,21 +418,15 @@ module Sluice
308
418
  end
309
419
  end
310
420
 
311
- # If we don't have a match, then we must be complete
312
421
  break unless match
422
+ break if is_folder?(filepath)
313
423
 
314
- # Ignore any EMR-created _$folder$ entries
315
- break if filepath.end_with?('_$folder$')
316
-
317
- # Match the filename, ignoring directories
318
- file_match = filepath.match('([^/]+)$')
319
- break unless file_match
320
-
321
- # Rename
424
+ # Name file
425
+ basename = get_basename(filepath)
322
426
  if alter_filename_lambda.class == Proc
323
- filename = alter_filename_lambda.call(file_match[1])
427
+ filename = alter_filename_lambda.call(basename)
324
428
  else
325
- filename = file_match[1]
429
+ filename = basename
326
430
  end
327
431
 
328
432
  # What are we doing? Let's determine source and target
@@ -330,22 +434,22 @@ module Sluice
330
434
  case operation
331
435
  when :upload
332
436
  source = "#{filepath}"
333
- target = name_file(filepath, filename, from_loc_or_dir, to_loc_or_dir.dir_as_path, flatten)
437
+ target = name_file(filepath, filename, from_path, to_loc_or_dir.dir_as_path, flatten)
334
438
  puts " UPLOAD #{source} +-> #{to_loc_or_dir.bucket}/#{target}"
335
439
  when :download
336
- source = "#{from_loc_or_dir.bucket}/#{filepath}"
337
- target = name_file(filepath, filename, from_loc_or_dir.dir_as_path, to_loc_or_dir, flatten)
440
+ source = "#{from_bucket}/#{filepath}"
441
+ target = name_file(filepath, filename, from_path, to_loc_or_dir, flatten)
338
442
  puts " DOWNLOAD #{source} +-> #{target}"
339
443
  when :move
340
- source = "#{from_loc_or_dir.bucket}/#{filepath}"
341
- target = name_file(filepath, filename, from_loc_or_dir.dir_as_path, to_loc_or_dir.dir_as_path, flatten)
444
+ source = "#{from_bucket}/#{filepath}"
445
+ target = name_file(filepath, filename, from_path, to_loc_or_dir.dir_as_path, flatten)
342
446
  puts " MOVE #{source} -> #{to_loc_or_dir.bucket}/#{target}"
343
447
  when :copy
344
- source = "#{from_loc_or_dir.bucket}/#{filepath}"
345
- target = name_file(filepath, filename, from_loc_or_dir.dir_as_path, to_loc_or_dir.dir_as_path, flatten)
448
+ source = "#{from_bucket}/#{filepath}"
449
+ target = name_file(filepath, filename, from_path, to_loc_or_dir.dir_as_path, flatten)
346
450
  puts " COPY #{source} +-> #{to_loc_or_dir.bucket}/#{target}"
347
451
  when :delete
348
- source = "#{from_loc_or_dir.bucket}/#{filepath}"
452
+ source = "#{from_bucket}/#{filepath}"
349
453
  # No target
350
454
  puts " DELETE x #{source}"
351
455
  end
data/lib/sluice.rb CHANGED
@@ -19,5 +19,5 @@ require 'sluice/storage/s3'
19
19
 
20
20
  module Sluice
21
21
  NAME = "sluice"
22
- VERSION = "0.0.7"
22
+ VERSION = "0.0.8"
23
23
  end
data/sluice.gemspec CHANGED
@@ -34,5 +34,5 @@ Gem::Specification.new do |gem|
34
34
  gem.require_paths = ["lib"]
35
35
 
36
36
  # Dependencies
37
- gem.add_dependency 'fog', '~> 1.6.0'
37
+ gem.add_dependency 'fog', '~> 1.14.0'
38
38
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sluice
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.7
4
+ version: 0.0.8
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2013-07-11 00:00:00.000000000 Z
13
+ date: 2013-08-14 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: fog
@@ -19,7 +19,7 @@ dependencies:
19
19
  requirements:
20
20
  - - ~>
21
21
  - !ruby/object:Gem::Version
22
- version: 1.6.0
22
+ version: 1.14.0
23
23
  type: :runtime
24
24
  prerelease: false
25
25
  version_requirements: !ruby/object:Gem::Requirement
@@ -27,7 +27,7 @@ dependencies:
27
27
  requirements:
28
28
  - - ~>
29
29
  - !ruby/object:Gem::Version
30
- version: 1.6.0
30
+ version: 1.14.0
31
31
  description: A Ruby gem to help you build ETL processes involving Amazon S3. Uses
32
32
  Fog
33
33
  email:
@@ -67,7 +67,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
67
67
  version: '0'
68
68
  requirements: []
69
69
  rubyforge_project:
70
- rubygems_version: 1.8.25
70
+ rubygems_version: 1.8.24
71
71
  signing_key:
72
72
  specification_version: 3
73
73
  summary: Ruby toolkit for cloud-friendly ETL