sluice 0.0.7 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,11 @@
1
+ Version 0.0.8 (2013-08-14)
2
+ --------------------------
3
+ Added the ability to process a list of files rather than an S3 location
4
+
5
+ Version 0.0.7 (2013-07-11)
6
+ --------------------------
7
+ Added in upload capability
8
+
1
9
  Version 0.0.6 (2012-12-31)
2
10
  --------------------------
3
11
  Fixed is_empty? (was actually is_not_empty?)
@@ -9,7 +9,7 @@
9
9
  # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
10
  # See the Apache License Version 2.0 for the specific language governing permissions and limitations there under.
11
11
 
12
- # Authors:: Alex Dean (mailto:support@snowplowanalytics.com), Michael Tibben
12
+ # Authors:: Alex Dean (mailto:support@snowplowanalytics.com), Michael Tibben
13
13
  # Copyright:: Copyright (c) 2012 SnowPlow Analytics Ltd
14
14
  # License:: Apache License Version 2.0
15
15
 
@@ -67,22 +67,88 @@ module Sluice
67
67
  # +access_key_id+:: AWS access key ID
68
68
  # +secret_access_key+:: AWS secret access key
69
69
  def new_fog_s3_from(region, access_key_id, secret_access_key)
70
- Fog::Storage.new({
70
+ fog = Fog::Storage.new({
71
71
  :provider => 'AWS',
72
72
  :region => region,
73
73
  :aws_access_key_id => access_key_id,
74
74
  :aws_secret_access_key => secret_access_key
75
75
  })
76
+ fog.sync_clock
77
+ fog
76
78
  end
77
79
  module_function :new_fog_s3_from
78
80
 
81
+ # Return an array of all Fog::Storage::AWS::File objects (folder keys excluded) at the given location
82
+ #
83
+ # Parameters:
84
+ # +s3+:: A Fog::Storage s3 connection
85
+ # +location+:: The location to return files from
86
+ #
87
+ # Returns an array of Fog::Storage::AWS::File objects
88
+ def list_files(s3, location)
89
+ files_and_dirs = s3.directories.get(location.bucket, prefix: location.dir).files
90
+
91
+ files = [] # Can't use a .select because of Ruby deep copy issues (array of non-POROs)
92
+ files_and_dirs.each { |f|
93
+ if is_file?(f.key)
94
+ files << f.dup
95
+ end
96
+ }
97
+ files
98
+ end
99
+ module_function :list_files
100
+
101
+ # Whether the given path is a directory or not
102
+ #
103
+ # Parameters:
104
+ # +path+:: S3 path in String form
105
+ #
106
+ # Returns boolean
107
+ def is_folder?(path)
108
+ (path.end_with?('_$folder$') || # EMR-created
109
+ path.end_with?('/'))
110
+ end
111
+ module_function :is_folder?
112
+
113
+ # Whether the given path is a file or not
114
+ #
115
+ # Parameters:
116
+ # +path+:: S3 path in String form
117
+ #
118
+ # Returns boolean
119
+ def is_file?(path)
120
+ !is_folder?(path)
121
+ end
122
+ module_function :is_file?
123
+
124
+ # Returns the basename for the given path
125
+ #
126
+ # Parameters:
127
+ # +path+:: S3 path in String form
128
+ #
129
+ # Returns the basename, or nil if the
130
+ # path is to a folder
131
+ def get_basename(path)
132
+ if is_folder?(path)
133
+ nil
134
+ else
135
+ match = path.match('([^/]+)$')
136
+ if match
137
+ match[1]
138
+ else
139
+ nil
140
+ end
141
+ end
142
+ end
143
+ module_function :get_basename
144
+
79
145
  # Determine if a bucket is empty
80
146
  #
81
147
  # Parameters:
82
148
  # +s3+:: A Fog::Storage s3 connection
83
149
  # +location+:: The location to check
84
150
  def is_empty?(s3, location)
85
- s3.directories.get(location.bucket, :prefix => location.dir).files().length <= 1
151
+ list_files(s3, location).length <= 1
86
152
  end
87
153
  module_function :is_empty?
88
154
 
@@ -91,13 +157,13 @@ module Sluice
91
157
  #
92
158
  # Parameters:
93
159
  # +s3+:: A Fog::Storage s3 connection
94
- # +from_location+:: S3Location to delete files from
160
+ # +from_files_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, or S3Location to download files from
95
161
  # +to_directory+:: Local directory to copy files to
96
162
  # +match_regex+:: a regex string to match the files to download
97
- def download_files(s3, from_location, to_directory, match_regex='.+')
163
+ def download_files(s3, from_files_or_loc, to_directory, match_regex='.+')
98
164
 
99
- puts " downloading files from #{from_location} to #{to_directory}"
100
- process_files(:download, s3, from_location, match_regex, to_directory)
165
+ puts " downloading #{describe_from(from_files_or_loc)} to #{to_directory}"
166
+ process_files(:download, s3, from_files_or_loc, match_regex, to_directory)
101
167
  end
102
168
  module_function :download_files
103
169
 
@@ -105,12 +171,12 @@ module Sluice
105
171
  #
106
172
  # Parameters:
107
173
  # +s3+:: A Fog::Storage s3 connection
108
- # +from_location+:: S3Location to delete files from
174
+ # +from_files_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, or S3Location to delete files from
109
175
  # +match_regex+:: a regex string to match the files to delete
110
- def delete_files(s3, from_location, match_regex='.+')
176
+ def delete_files(s3, from_files_or_loc, match_regex='.+')
111
177
 
112
- puts " deleting files from #{from_location}"
113
- process_files(:delete, s3, from_location, match_regex)
178
+ puts " deleting #{describe_from(from_files_or_loc)}"
179
+ process_files(:delete, s3, from_files_or_loc, match_regex)
114
180
  end
115
181
  module_function :delete_files
116
182
 
@@ -118,15 +184,15 @@ module Sluice
118
184
  #
119
185
  # Parameters:
120
186
  # +s3+:: A Fog::Storage s3 connection
121
- # +from_location+:: S3Location to copy files from
187
+ # +from_files_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, or S3Location to copy files from
122
188
  # +to_location+:: S3Location to copy files to
123
189
  # +match_regex+:: a regex string to match the files to copy
124
190
  # +alter_filename_lambda+:: lambda to alter the written filename
125
191
  # +flatten+:: strips off any sub-folders below the source location
126
- def copy_files(s3, from_location, to_location, match_regex='.+', alter_filename_lambda=false, flatten=false)
192
+ def copy_files(s3, from_files_or_loc, to_location, match_regex='.+', alter_filename_lambda=false, flatten=false)
127
193
 
128
- puts " copying files from #{from_location} to #{to_location}"
129
- process_files(:copy, s3, from_location, match_regex, to_location, alter_filename_lambda, flatten)
194
+ puts " copying #{describe_from(from_files_or_loc)} to #{to_location}"
195
+ process_files(:copy, s3, from_files_or_loc, match_regex, to_location, alter_filename_lambda, flatten)
130
196
  end
131
197
  module_function :copy_files
132
198
 
@@ -134,15 +200,15 @@ module Sluice
134
200
  #
135
201
  # Parameters:
136
202
  # +s3+:: A Fog::Storage s3 connection
137
- # +from_location+:: S3Location to move files from
203
+ # +from_files_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, or S3Location to move files from
138
204
  # +to_location+:: S3Location to move files to
139
205
  # +match_regex+:: a regex string to match the files to move
140
206
  # +alter_filename_lambda+:: lambda to alter the written filename
141
207
  # +flatten+:: strips off any sub-folders below the source location
142
- def move_files(s3, from_location, to_location, match_regex='.+', alter_filename_lambda=false, flatten=false)
208
+ def move_files(s3, from_files_or_loc, to_location, match_regex='.+', alter_filename_lambda=false, flatten=false)
143
209
 
144
- puts " moving files from #{from_location} to #{to_location}"
145
- process_files(:move, s3, from_location, match_regex, to_location, alter_filename_lambda, flatten)
210
+ puts " moving #{describe_from(from_files_or_loc)} to #{to_location}"
211
+ process_files(:move, s3, from_files_or_loc, match_regex, to_location, alter_filename_lambda, flatten)
146
212
  end
147
213
  module_function :move_files
148
214
 
@@ -150,13 +216,13 @@ module Sluice
150
216
  #
151
217
  # Parameters:
152
218
  # +s3+:: A Fog::Storage s3 connection
153
- # +from_directory+:: Local directory to upload files from
219
+ # +from_files_or_dir+:: Local array of files or local directory to upload files from
154
220
  # +to_location+:: S3Location to upload files to
155
221
  # +match_glob+:: a filesystem glob to match the files to upload
156
- def upload_files(s3, from_directory, to_location, match_glob='*')
222
+ def upload_files(s3, from_files_or_dir, to_location, match_glob='*')
157
223
 
158
- puts " uploading files from #{from_directory} to #{to_location}"
159
- process_files(:upload, s3, from_directory, match_glob, to_location)
224
+ puts " uploading #{describe_from(from_files_or_dir)} to #{to_location}"
225
+ process_files(:upload, s3, from_files_or_dir, match_glob, to_location)
160
226
  end
161
227
  module_function :upload_files
162
228
 
@@ -188,7 +254,7 @@ module Sluice
188
254
  #
189
255
  # Parameters:
190
256
  # +s3+:: A Fog::Storage s3 connection
191
- # +from_file:: A Fog::File to download
257
+ # +from_file+:: A Fog::Storage::AWS::File to download
192
258
  # +to_file+:: A local file path
193
259
  def download_file(s3, from_file, to_file)
194
260
 
@@ -204,6 +270,22 @@ module Sluice
204
270
 
205
271
  private
206
272
 
273
+ # Provides string describing from_files_or_dir_or_loc
274
+ # for logging purposes.
275
+ #
276
+ # Parameters:
277
+ # +from_files_or_dir_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, local directory or S3Location to process files from
278
+ #
279
+ # Returns a log-friendly string
280
+ def describe_from(from_files_or_dir_or_loc)
281
+ if from_files_or_dir_or_loc.is_a?(Array)
282
+ "#{from_files_or_dir_or_loc.length} file(s)"
283
+ else
284
+ "files from #{from_files_or_dir_or_loc}"
285
+ end
286
+ end
287
+ module_function :describe_from
288
+
207
289
  # Concurrent file operations between S3 locations. Supports:
208
290
  # - Download
209
291
  # - Upload
@@ -214,12 +296,12 @@ module Sluice
214
296
  # Parameters:
215
297
  # +operation+:: Operation to perform. :download, :upload, :copy, :delete, :move supported
216
298
  # +s3+:: A Fog::Storage s3 connection
217
- # +from_loc_or_dir+:: S3Location to process files from
299
+ # +from_files_or_dir_or_loc+:: Array of filepaths or Fog::Storage::AWS::File objects, local directory or S3Location to process files from
218
300
  # +match_regex_or_glob+:: a regex or glob string to match the files to process
219
301
  # +to_loc_or_dir+:: S3Location or local directory to process files to
220
302
  # +alter_filename_lambda+:: lambda to alter the written filename
221
303
  # +flatten+:: strips off any sub-folders below the from_loc_or_dir
222
- def process_files(operation, s3, from_loc_or_dir, match_regex_or_glob='.+', to_loc_or_dir=nil, alter_filename_lambda=false, flatten=false)
304
+ def process_files(operation, s3, from_files_or_dir_or_loc, match_regex_or_glob='.+', to_loc_or_dir=nil, alter_filename_lambda=false, flatten=false)
223
305
 
224
306
  # Validate that the file operation makes sense
225
307
  case operation
@@ -238,12 +320,21 @@ module Sluice
238
320
  raise StorageOperationError "File operation %s is unsupported. Try :download, :upload, :copy, :delete or :move" % operation
239
321
  end
240
322
 
241
- # If we are uploading, then we can glob the files before we thread
242
- if operation == :upload
243
- files_to_process = Dir.glob(File.join(from_loc_or_dir, match_regex_or_glob))
323
+ # If we have an array of files, no additional globbing required
324
+ if from_files_or_dir_or_loc.is_a?(Array)
325
+ files_to_process = from_files_or_dir_or_loc # Could be filepaths or Fog::Storage::AWS::File's
326
+ globbed = true
327
+ # Otherwise if it's an upload, we can glob now
328
+ elsif operation == :upload
329
+ files_to_process = Dir.glob(File.join(from_files_or_dir_or_loc, match_regex_or_glob))
330
+ globbed = true
331
+ # Otherwise we'll do threaded globbing later...
244
332
  else
245
333
  files_to_process = []
334
+ from_loc = from_files_or_dir_or_loc # Alias
335
+ globbed = false
246
336
  end
337
+
247
338
  threads = []
248
339
  mutex = Mutex.new
249
340
  complete = false
@@ -261,20 +352,37 @@ module Sluice
261
352
  loop do
262
353
  file = false
263
354
  filepath = false
355
+ from_bucket = false
356
+ from_path = false
264
357
  match = false
265
358
 
266
359
  # Critical section:
267
360
  # only allow one thread to modify the array at any time
268
361
  mutex.synchronize do
269
362
 
270
- if operation == :upload
271
-
363
+ # No need to do further globbing
364
+ if globbed
272
365
  if files_to_process.size == 0
273
366
  complete = true
274
367
  next
275
368
  end
276
369
 
277
- filepath = files_to_process.pop
370
+ file = files_to_process.pop
371
+ # Support raw filenames and also Fog::Storage::AWS::File's
372
+ if (file.is_a?(Fog::Storage::AWS::File))
373
+ from_bucket = file.directory.key # Bucket
374
+ from_path = Sluice::Storage.trail_slash(File.dirname(file.key))
375
+ filepath = file.key
376
+ else
377
+ from_bucket = nil # Not used
378
+ if from_files_or_dir_or_loc.is_a?(Array)
379
+ from_path = Sluice::Storage.trail_slash(File.dirname(file))
380
+ else
381
+ from_path = from_files_or_dir_or_loc # The root dir
382
+ end
383
+ filepath = file
384
+ end
385
+
278
386
  match = true # Match is implicit in the glob
279
387
  else
280
388
 
@@ -282,8 +390,8 @@ module Sluice
282
390
  if files_to_process.size == 0
283
391
  # S3 batches 1000 files per request.
284
392
  # We load up our array with the files to move
285
- files_to_process = s3.directories.get(from_loc_or_dir.bucket, :prefix => from_loc_or_dir.dir).files.all(marker_opts)
286
- # If we don't have any files after the s3 request, we're complete
393
+ files_to_process = s3.directories.get(from_loc.bucket, :prefix => from_loc.dir).files.all(marker_opts)
394
+ # If we don't have any files after the S3 request, we're complete
287
395
  if files_to_process.size == 0
288
396
  complete = true
289
397
  next
@@ -297,6 +405,8 @@ module Sluice
297
405
  end
298
406
 
299
407
  file = files_to_process.pop
408
+ from_bucket = from_loc.bucket
409
+ from_path = from_loc.dir_as_path
300
410
  filepath = file.key
301
411
 
302
412
  match = if match_regex_or_glob.is_a? NegativeRegex
@@ -308,21 +418,15 @@ module Sluice
308
418
  end
309
419
  end
310
420
 
311
- # If we don't have a match, then we must be complete
312
421
  break unless match
422
+ break if is_folder?(filepath)
313
423
 
314
- # Ignore any EMR-created _$folder$ entries
315
- break if filepath.end_with?('_$folder$')
316
-
317
- # Match the filename, ignoring directories
318
- file_match = filepath.match('([^/]+)$')
319
- break unless file_match
320
-
321
- # Rename
424
+ # Name file
425
+ basename = get_basename(filepath)
322
426
  if alter_filename_lambda.class == Proc
323
- filename = alter_filename_lambda.call(file_match[1])
427
+ filename = alter_filename_lambda.call(basename)
324
428
  else
325
- filename = file_match[1]
429
+ filename = basename
326
430
  end
327
431
 
328
432
  # What are we doing? Let's determine source and target
@@ -330,22 +434,22 @@ module Sluice
330
434
  case operation
331
435
  when :upload
332
436
  source = "#{filepath}"
333
- target = name_file(filepath, filename, from_loc_or_dir, to_loc_or_dir.dir_as_path, flatten)
437
+ target = name_file(filepath, filename, from_path, to_loc_or_dir.dir_as_path, flatten)
334
438
  puts " UPLOAD #{source} +-> #{to_loc_or_dir.bucket}/#{target}"
335
439
  when :download
336
- source = "#{from_loc_or_dir.bucket}/#{filepath}"
337
- target = name_file(filepath, filename, from_loc_or_dir.dir_as_path, to_loc_or_dir, flatten)
440
+ source = "#{from_bucket}/#{filepath}"
441
+ target = name_file(filepath, filename, from_path, to_loc_or_dir, flatten)
338
442
  puts " DOWNLOAD #{source} +-> #{target}"
339
443
  when :move
340
- source = "#{from_loc_or_dir.bucket}/#{filepath}"
341
- target = name_file(filepath, filename, from_loc_or_dir.dir_as_path, to_loc_or_dir.dir_as_path, flatten)
444
+ source = "#{from_bucket}/#{filepath}"
445
+ target = name_file(filepath, filename, from_path, to_loc_or_dir.dir_as_path, flatten)
342
446
  puts " MOVE #{source} -> #{to_loc_or_dir.bucket}/#{target}"
343
447
  when :copy
344
- source = "#{from_loc_or_dir.bucket}/#{filepath}"
345
- target = name_file(filepath, filename, from_loc_or_dir.dir_as_path, to_loc_or_dir.dir_as_path, flatten)
448
+ source = "#{from_bucket}/#{filepath}"
449
+ target = name_file(filepath, filename, from_path, to_loc_or_dir.dir_as_path, flatten)
346
450
  puts " COPY #{source} +-> #{to_loc_or_dir.bucket}/#{target}"
347
451
  when :delete
348
- source = "#{from_loc_or_dir.bucket}/#{filepath}"
452
+ source = "#{from_bucket}/#{filepath}"
349
453
  # No target
350
454
  puts " DELETE x #{source}"
351
455
  end
data/lib/sluice.rb CHANGED
@@ -19,5 +19,5 @@ require 'sluice/storage/s3'
19
19
 
20
20
  module Sluice
21
21
  NAME = "sluice"
22
- VERSION = "0.0.7"
22
+ VERSION = "0.0.8"
23
23
  end
data/sluice.gemspec CHANGED
@@ -34,5 +34,5 @@ Gem::Specification.new do |gem|
34
34
  gem.require_paths = ["lib"]
35
35
 
36
36
  # Dependencies
37
- gem.add_dependency 'fog', '~> 1.6.0'
37
+ gem.add_dependency 'fog', '~> 1.14.0'
38
38
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sluice
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.7
4
+ version: 0.0.8
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2013-07-11 00:00:00.000000000 Z
13
+ date: 2013-08-14 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: fog
@@ -19,7 +19,7 @@ dependencies:
19
19
  requirements:
20
20
  - - ~>
21
21
  - !ruby/object:Gem::Version
22
- version: 1.6.0
22
+ version: 1.14.0
23
23
  type: :runtime
24
24
  prerelease: false
25
25
  version_requirements: !ruby/object:Gem::Requirement
@@ -27,7 +27,7 @@ dependencies:
27
27
  requirements:
28
28
  - - ~>
29
29
  - !ruby/object:Gem::Version
30
- version: 1.6.0
30
+ version: 1.14.0
31
31
  description: A Ruby gem to help you build ETL processes involving Amazon S3. Uses
32
32
  Fog
33
33
  email:
@@ -67,7 +67,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
67
67
  version: '0'
68
68
  requirements: []
69
69
  rubyforge_project:
70
- rubygems_version: 1.8.25
70
+ rubygems_version: 1.8.24
71
71
  signing_key:
72
72
  specification_version: 3
73
73
  summary: Ruby toolkit for cloud-friendly ETL