sluice 0.0.6 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4) hide show
  1. data/README.md +2 -0
  2. data/lib/sluice.rb +1 -1
  3. data/lib/sluice/storage/s3.rb +117 -42
  4. metadata +33 -51
data/README.md CHANGED
@@ -2,6 +2,8 @@
2
2
 
3
3
  Sluice is a Ruby gem (built with [Bundler] [bundler]) to help you build cloud-friendly ETL (extract, transform, load) processes.
4
4
 
5
+ **Currently it does one thing: supports very robust, very parallel copy/delete/move of S3 files from one bucket to another.**
6
+
5
7
  Sluice has been extracted from a pair of Ruby ETL applications built by the [SnowPlow Analytics] [snowplow-analytics] team, specifically:
6
8
 
7
9
  1. [EmrEtlRunner] [emr-etl-runner], a Ruby application to run the SnowPlow ETL process on Elastic MapReduce
data/lib/sluice.rb CHANGED
@@ -19,5 +19,5 @@ require 'sluice/storage/s3'
19
19
 
20
20
  module Sluice
21
21
  NAME = "sluice"
22
- VERSION = "0.0.6"
22
+ VERSION = "0.0.7"
23
23
  end
@@ -135,7 +135,7 @@ module Sluice
135
135
  # Parameters:
136
136
  # +s3+:: A Fog::Storage s3 connection
137
137
  # +from_location+:: S3Location to move files from
138
- # +to+:: S3Location to move files to
138
+ # +to_location+:: S3Location to move files to
139
139
  # +match_regex+:: a regex string to match the files to move
140
140
  # +alter_filename_lambda+:: lambda to alter the written filename
141
141
  # +flatten+:: strips off any sub-folders below the from_location
@@ -146,7 +146,43 @@ module Sluice
146
146
  end
147
147
  module_function :move_files
148
148
 
149
- # Download a single file to the exact path specified.
149
+ # Uploads files to S3 locations concurrently
150
+ #
151
+ # Parameters:
152
+ # +s3+:: A Fog::Storage s3 connection
153
+ # +from_directory+:: Local directory to upload files from
154
+ # +to_location+:: S3Location to upload files to
155
+ # +match_glob+:: a filesystem glob to match the files to upload
156
+ def upload_files(s3, from_directory, to_location, match_glob='*')
157
+
158
+ puts " uploading files from #{from_directory} to #{to_location}"
159
+ process_files(:upload, s3, from_directory, match_glob, to_location)
160
+ end
161
+ module_function :upload_files
162
+
163
+ # Upload a single file to the exact location specified
164
+ # Has no intelligence around filenaming.
165
+ #
166
+ # Parameters:
167
+ # +s3+:: A Fog::Storage s3 connection
168
+ # +from_file+:: A local file path
169
+ # +to_bucket+:: The Fog::Directory to upload to
170
+ # +to_file+:: The file path to upload to
171
+ def upload_file(s3, from_file, to_bucket, to_file)
172
+
173
+ local_file = File.open(from_file)
174
+
175
+ dir = s3.directories.new(:key => to_bucket) # No request made
176
+ file = dir.files.create(
177
+ :key => to_file,
178
+ :body => local_file
179
+ )
180
+
181
+ local_file.close
182
+ end
183
+ module_function :upload_file
184
+
185
+ # Download a single file to the exact path specified
150
186
  # Has no intelligence around filenaming.
151
187
  # Makes sure to create the path as needed.
152
188
  #
@@ -169,23 +205,25 @@ module Sluice
169
205
  private
170
206
 
171
207
  # Concurrent file operations between S3 locations. Supports:
208
+ # - Download
209
+ # - Upload
172
210
  # - Copy
173
211
  # - Delete
174
212
  # - Move (= Copy + Delete)
175
213
  #
176
214
  # Parameters:
177
- # +operation+:: Operation to perform. :copy, :delete, :move supported
215
+ # +operation+:: Operation to perform. :download, :upload, :copy, :delete, :move supported
178
216
  # +s3+:: A Fog::Storage s3 connection
179
- # +from_location+:: S3Location to process files from
180
- # +match_regex+:: a regex string to match the files to process
217
+ # +from_loc_or_dir+:: S3Location to process files from
218
+ # +match_regex_or_glob+:: a regex or glob string to match the files to process
181
219
  # +to_loc_or_dir+:: S3Location or local directory to process files to
182
220
  # +alter_filename_lambda+:: lambda to alter the written filename
183
- # +flatten+:: strips off any sub-folders below the from_location
184
- def process_files(operation, s3, from_location, match_regex='.+', to_loc_or_dir=nil, alter_filename_lambda=false, flatten=false)
221
+ # +flatten+:: strips off any sub-folders below the from_loc_or_dir
222
+ def process_files(operation, s3, from_loc_or_dir, match_regex_or_glob='.+', to_loc_or_dir=nil, alter_filename_lambda=false, flatten=false)
185
223
 
186
224
  # Validate that the file operation makes sense
187
225
  case operation
188
- when :copy, :move, :download
226
+ when :copy, :move, :download, :upload
189
227
  if to_loc_or_dir.nil?
190
228
  raise StorageOperationError "File operation %s requires the to_loc_or_dir to be set" % operation
191
229
  end
@@ -197,10 +235,15 @@ module Sluice
197
235
  raise StorageOperationError "File operation %s does not support the alter_filename_lambda argument" % operation
198
236
  end
199
237
  else
200
- raise StorageOperationError "File operation %s is unsupported. Try :download, :copy, :delete or :move" % operation
238
+ raise StorageOperationError "File operation %s is unsupported. Try :download, :upload, :copy, :delete or :move" % operation
201
239
  end
202
240
 
203
- files_to_process = []
241
+ # If we are uploading, then we can glob the files before we thread
242
+ if operation == :upload
243
+ files_to_process = Dir.glob(File.join(from_loc_or_dir, match_regex_or_glob))
244
+ else
245
+ files_to_process = []
246
+ end
204
247
  threads = []
205
248
  mutex = Mutex.new
206
249
  complete = false
@@ -217,37 +260,51 @@ module Sluice
217
260
  threads << Thread.new do
218
261
  loop do
219
262
  file = false
263
+ filepath = false
220
264
  match = false
221
265
 
222
266
  # Critical section:
223
267
  # only allow one thread to modify the array at any time
224
268
  mutex.synchronize do
225
269
 
226
- while !complete && !match do
270
+ if operation == :upload
271
+
227
272
  if files_to_process.size == 0
228
- # S3 batches 1000 files per request.
229
- # We load up our array with the files to move
230
- files_to_process = s3.directories.get(from_location.bucket, :prefix => from_location.dir).files.all(marker_opts)
231
- # If we don't have any files after the s3 request, we're complete
273
+ complete = true
274
+ next
275
+ end
276
+
277
+ filepath = files_to_process.pop
278
+ match = true # Match is implicit in the glob
279
+ else
280
+
281
+ while !complete && !match do
232
282
  if files_to_process.size == 0
233
- complete = true
234
- next
235
- else
236
- marker_opts['marker'] = files_to_process.last.key
237
-
238
- # By reversing the array we can use pop and get FIFO behaviour
239
- # instead of the performance penalty incurred by unshift
240
- files_to_process = files_to_process.reverse
283
+ # S3 batches 1000 files per request.
284
+ # We load up our array with the files to move
285
+ files_to_process = s3.directories.get(from_loc_or_dir.bucket, :prefix => from_loc_or_dir.dir).files.all(marker_opts)
286
+ # If we don't have any files after the s3 request, we're complete
287
+ if files_to_process.size == 0
288
+ complete = true
289
+ next
290
+ else
291
+ marker_opts['marker'] = files_to_process.last.key
292
+
293
+ # By reversing the array we can use pop and get FIFO behaviour
294
+ # instead of the performance penalty incurred by unshift
295
+ files_to_process = files_to_process.reverse
296
+ end
241
297
  end
242
- end
243
298
 
244
- file = files_to_process.pop
299
+ file = files_to_process.pop
300
+ filepath = file.key
245
301
 
246
- match = if match_regex.is_a? NegativeRegex
247
- !file.key.match(match_regex.regex)
248
- else
249
- file.key.match(match_regex)
250
- end
302
+ match = if match_regex_or_glob.is_a? NegativeRegex
303
+ !filepath.match(match_regex_or_glob.regex)
304
+ else
305
+ filepath.match(match_regex_or_glob)
306
+ end
307
+ end
251
308
  end
252
309
  end
253
310
 
@@ -255,12 +312,13 @@ module Sluice
255
312
  break unless match
256
313
 
257
314
  # Ignore any EMR-created _$folder$ entries
258
- break if file.key.end_with?('_$folder$')
315
+ break if filepath.end_with?('_$folder$')
259
316
 
260
- # Match the filename, ignoring directory
261
- file_match = file.key.match('([^/]+)$')
317
+ # Match the filename, ignoring directories
318
+ file_match = filepath.match('([^/]+)$')
262
319
  break unless file_match
263
320
 
321
+ # Rename
264
322
  if alter_filename_lambda.class == Proc
265
323
  filename = alter_filename_lambda.call(file_match[1])
266
324
  else
@@ -269,30 +327,47 @@ module Sluice
269
327
 
270
328
  # What are we doing? Let's determine source and target
271
329
  # Note that target excludes bucket name where relevant
272
- source = "#{from_location.bucket}/#{file.key}"
273
330
  case operation
331
+ when :upload
332
+ source = "#{filepath}"
333
+ target = name_file(filepath, filename, from_loc_or_dir, to_loc_or_dir.dir_as_path, flatten)
334
+ puts " UPLOAD #{source} +-> #{to_loc_or_dir.bucket}/#{target}"
274
335
  when :download
275
- target = name_file(file.key, filename, from_location.dir_as_path, to_loc_or_dir, flatten)
336
+ source = "#{from_loc_or_dir.bucket}/#{filepath}"
337
+ target = name_file(filepath, filename, from_loc_or_dir.dir_as_path, to_loc_or_dir, flatten)
276
338
  puts " DOWNLOAD #{source} +-> #{target}"
277
339
  when :move
278
- target = name_file(file.key, filename, from_location.dir_as_path, to_loc_or_dir.dir_as_path, flatten)
340
+ source = "#{from_loc_or_dir.bucket}/#{filepath}"
341
+ target = name_file(filepath, filename, from_loc_or_dir.dir_as_path, to_loc_or_dir.dir_as_path, flatten)
279
342
  puts " MOVE #{source} -> #{to_loc_or_dir.bucket}/#{target}"
280
343
  when :copy
281
- target = name_file(file.key, filename, from_location.dir_as_path, to_loc_or_dir.dir_as_path, flatten)
344
+ source = "#{from_loc_or_dir.bucket}/#{filepath}"
345
+ target = name_file(filepath, filename, from_loc_or_dir.dir_as_path, to_loc_or_dir.dir_as_path, flatten)
282
346
  puts " COPY #{source} +-> #{to_loc_or_dir.bucket}/#{target}"
283
347
  when :delete
348
+ source = "#{from_loc_or_dir.bucket}/#{filepath}"
284
349
  # No target
285
350
  puts " DELETE x #{source}"
286
351
  end
287
352
 
288
- # Download is a stand-alone operation vs move/copy/delete
353
+ # Upload is a standalone operation vs move/copy/delete
354
+ if operation == :upload
355
+ retry_x(
356
+ Sluice::Storage::S3,
357
+ [:upload_file, s3, filepath, to_loc_or_dir.bucket, target],
358
+ RETRIES,
359
+ " +/> #{target}",
360
+ "Problem uploading #{filepath}. Retrying.")
361
+ end
362
+
363
+ # Download is a standalone operation vs move/copy/delete
289
364
  if operation == :download
290
365
  retry_x(
291
366
  Sluice::Storage::S3,
292
367
  [:download_file, s3, file, target],
293
368
  RETRIES,
294
369
  " +/> #{target}",
295
- "Problem downloading #{file.key}. Retrying.")
370
+ "Problem downloading #{filepath}. Retrying.")
296
371
  end
297
372
 
298
373
  # A move or copy starts with a copy file
@@ -302,7 +377,7 @@ module Sluice
302
377
  [:copy, to_loc_or_dir.bucket, target],
303
378
  RETRIES,
304
379
  " +-> #{to_loc_or_dir.bucket}/#{target}",
305
- "Problem copying #{file.key}. Retrying.")
380
+ "Problem copying #{filepath}. Retrying.")
306
381
  end
307
382
 
308
383
  # A move or delete ends with a delete
@@ -312,7 +387,7 @@ module Sluice
312
387
  [:destroy],
313
388
  RETRIES,
314
389
  " x #{source}",
315
- "Problem destroying #{file.key}. Retrying.")
390
+ "Problem destroying #{filepath}. Retrying.")
316
391
  end
317
392
  end
318
393
  end
@@ -390,7 +465,7 @@ module Sluice
390
465
  return shortened_filepath if add_path.nil?
391
466
 
392
467
  # Add the new filepath on to the start and return
393
- add_path + shortened_filepath
468
+ return add_path + shortened_filepath
394
469
  end
395
470
  module_function :name_file
396
471
 
metadata CHANGED
@@ -1,49 +1,41 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: sluice
3
- version: !ruby/object:Gem::Version
4
- hash: 19
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.7
5
5
  prerelease:
6
- segments:
7
- - 0
8
- - 0
9
- - 6
10
- version: 0.0.6
11
6
  platform: ruby
12
- authors:
7
+ authors:
13
8
  - Alex Dean
14
9
  - Michael Tibben
15
10
  autorequire:
16
11
  bindir: bin
17
12
  cert_chain: []
18
-
19
- date: 2012-12-31 00:00:00 Z
20
- dependencies:
21
- - !ruby/object:Gem::Dependency
13
+ date: 2013-07-11 00:00:00.000000000 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
22
16
  name: fog
23
- prerelease: false
24
- requirement: &id001 !ruby/object:Gem::Requirement
17
+ requirement: !ruby/object:Gem::Requirement
25
18
  none: false
26
- requirements:
19
+ requirements:
27
20
  - - ~>
28
- - !ruby/object:Gem::Version
29
- hash: 15
30
- segments:
31
- - 1
32
- - 6
33
- - 0
21
+ - !ruby/object:Gem::Version
34
22
  version: 1.6.0
35
23
  type: :runtime
36
- version_requirements: *id001
37
- description: A Ruby gem to help you build ETL processes involving Amazon S3. Uses Fog
38
- email:
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ none: false
27
+ requirements:
28
+ - - ~>
29
+ - !ruby/object:Gem::Version
30
+ version: 1.6.0
31
+ description: A Ruby gem to help you build ETL processes involving Amazon S3. Uses
32
+ Fog
33
+ email:
39
34
  - support@snowplowanalytics.com
40
35
  executables: []
41
-
42
36
  extensions: []
43
-
44
37
  extra_rdoc_files: []
45
-
46
- files:
38
+ files:
47
39
  - .gitignore
48
40
  - CHANGELOG
49
41
  - Gemfile
@@ -57,36 +49,26 @@ files:
57
49
  - sluice.gemspec
58
50
  homepage: http://snowplowanalytics.com
59
51
  licenses: []
60
-
61
52
  post_install_message:
62
53
  rdoc_options: []
63
-
64
- require_paths:
54
+ require_paths:
65
55
  - lib
66
- required_ruby_version: !ruby/object:Gem::Requirement
56
+ required_ruby_version: !ruby/object:Gem::Requirement
67
57
  none: false
68
- requirements:
69
- - - ">="
70
- - !ruby/object:Gem::Version
71
- hash: 3
72
- segments:
73
- - 0
74
- version: "0"
75
- required_rubygems_version: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ required_rubygems_version: !ruby/object:Gem::Requirement
76
63
  none: false
77
- requirements:
78
- - - ">="
79
- - !ruby/object:Gem::Version
80
- hash: 3
81
- segments:
82
- - 0
83
- version: "0"
64
+ requirements:
65
+ - - ! '>='
66
+ - !ruby/object:Gem::Version
67
+ version: '0'
84
68
  requirements: []
85
-
86
69
  rubyforge_project:
87
- rubygems_version: 1.8.24
70
+ rubygems_version: 1.8.25
88
71
  signing_key:
89
72
  specification_version: 3
90
73
  summary: Ruby toolkit for cloud-friendly ETL
91
74
  test_files: []
92
-