sluice 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4)
  1. data/README.md +2 -0
  2. data/lib/sluice.rb +1 -1
  3. data/lib/sluice/storage/s3.rb +117 -42
  4. metadata +33 -51
data/README.md CHANGED
@@ -2,6 +2,8 @@
2
2
 
3
3
  Sluice is a Ruby gem (built with [Bundler] [bundler]) to help you build cloud-friendly ETL (extract, transform, load) processes.
4
4
 
5
+ **Currently it does one thing: supports very robust, very parallel copy/delete/move of S3 files from one bucket to another.**
6
+
5
7
  Sluice has been extracted from a pair of Ruby ETL applications built by the [SnowPlow Analytics] [snowplow-analytics] team, specifically:
6
8
 
7
9
  1. [EmrEtlRunner] [emr-etl-runner], a Ruby application to run the SnowPlow ETL process on Elastic MapReduce
data/lib/sluice.rb CHANGED
@@ -19,5 +19,5 @@ require 'sluice/storage/s3'
19
19
 
20
20
  module Sluice
21
21
  NAME = "sluice"
22
- VERSION = "0.0.6"
22
+ VERSION = "0.0.7"
23
23
  end
@@ -135,7 +135,7 @@ module Sluice
135
135
  # Parameters:
136
136
  # +s3+:: A Fog::Storage s3 connection
137
137
  # +from_location+:: S3Location to move files from
138
- # +to+:: S3Location to move files to
138
+ # +to_location+:: S3Location to move files to
139
139
  # +match_regex+:: a regex string to match the files to move
140
140
  # +alter_filename_lambda+:: lambda to alter the written filename
141
141
  # +flatten+:: strips off any sub-folders below the from_location
@@ -146,7 +146,43 @@ module Sluice
146
146
  end
147
147
  module_function :move_files
148
148
 
149
- # Download a single file to the exact path specified.
149
+ # Uploads files to S3 locations concurrently
150
+ #
151
+ # Parameters:
152
+ # +s3+:: A Fog::Storage s3 connection
153
+ # +from_directory+:: Local directory to upload files from
154
+ # +to_location+:: S3Location to upload files to
155
+ # +match_glob+:: a filesystem glob to match the files to upload
156
+ def upload_files(s3, from_directory, to_location, match_glob='*')
157
+
158
+ puts " uploading files from #{from_directory} to #{to_location}"
159
+ process_files(:upload, s3, from_directory, match_glob, to_location)
160
+ end
161
+ module_function :upload_files
162
+
163
+ # Upload a single file to the exact location specified
164
+ # Has no intelligence around filenaming.
165
+ #
166
+ # Parameters:
167
+ # +s3+:: A Fog::Storage s3 connection
168
+ # +from_file+:: A local file path
169
+ # +to_bucket+:: The Fog::Directory to upload to
170
+ # +to_file+:: The file path to upload to
171
+ def upload_file(s3, from_file, to_bucket, to_file)
172
+
173
+ local_file = File.open(from_file)
174
+
175
+ dir = s3.directories.new(:key => to_bucket) # No request made
176
+ file = dir.files.create(
177
+ :key => to_file,
178
+ :body => local_file
179
+ )
180
+
181
+ local_file.close
182
+ end
183
+ module_function :upload_file
184
+
185
+ # Download a single file to the exact path specified
150
186
  # Has no intelligence around filenaming.
151
187
  # Makes sure to create the path as needed.
152
188
  #
@@ -169,23 +205,25 @@ module Sluice
169
205
  private
170
206
 
171
207
  # Concurrent file operations between S3 locations. Supports:
208
+ # - Download
209
+ # - Upload
172
210
  # - Copy
173
211
  # - Delete
174
212
  # - Move (= Copy + Delete)
175
213
  #
176
214
  # Parameters:
177
- # +operation+:: Operation to perform. :copy, :delete, :move supported
215
+ # +operation+:: Operation to perform. :download, :upload, :copy, :delete, :move supported
178
216
  # +s3+:: A Fog::Storage s3 connection
179
- # +from_location+:: S3Location to process files from
180
- # +match_regex+:: a regex string to match the files to process
217
+ # +from_loc_or_dir+:: S3Location to process files from
218
+ # +match_regex_or_glob+:: a regex or glob string to match the files to process
181
219
  # +to_loc_or_dir+:: S3Location or local directory to process files to
182
220
  # +alter_filename_lambda+:: lambda to alter the written filename
183
- # +flatten+:: strips off any sub-folders below the from_location
184
- def process_files(operation, s3, from_location, match_regex='.+', to_loc_or_dir=nil, alter_filename_lambda=false, flatten=false)
221
+ # +flatten+:: strips off any sub-folders below the from_loc_or_dir
222
+ def process_files(operation, s3, from_loc_or_dir, match_regex_or_glob='.+', to_loc_or_dir=nil, alter_filename_lambda=false, flatten=false)
185
223
 
186
224
  # Validate that the file operation makes sense
187
225
  case operation
188
- when :copy, :move, :download
226
+ when :copy, :move, :download, :upload
189
227
  if to_loc_or_dir.nil?
190
228
  raise StorageOperationError "File operation %s requires the to_loc_or_dir to be set" % operation
191
229
  end
@@ -197,10 +235,15 @@ module Sluice
197
235
  raise StorageOperationError "File operation %s does not support the alter_filename_lambda argument" % operation
198
236
  end
199
237
  else
200
- raise StorageOperationError "File operation %s is unsupported. Try :download, :copy, :delete or :move" % operation
238
+ raise StorageOperationError "File operation %s is unsupported. Try :download, :upload, :copy, :delete or :move" % operation
201
239
  end
202
240
 
203
- files_to_process = []
241
+ # If we are uploading, then we can glob the files before we thread
242
+ if operation == :upload
243
+ files_to_process = Dir.glob(File.join(from_loc_or_dir, match_regex_or_glob))
244
+ else
245
+ files_to_process = []
246
+ end
204
247
  threads = []
205
248
  mutex = Mutex.new
206
249
  complete = false
@@ -217,37 +260,51 @@ module Sluice
217
260
  threads << Thread.new do
218
261
  loop do
219
262
  file = false
263
+ filepath = false
220
264
  match = false
221
265
 
222
266
  # Critical section:
223
267
  # only allow one thread to modify the array at any time
224
268
  mutex.synchronize do
225
269
 
226
- while !complete && !match do
270
+ if operation == :upload
271
+
227
272
  if files_to_process.size == 0
228
- # S3 batches 1000 files per request.
229
- # We load up our array with the files to move
230
- files_to_process = s3.directories.get(from_location.bucket, :prefix => from_location.dir).files.all(marker_opts)
231
- # If we don't have any files after the s3 request, we're complete
273
+ complete = true
274
+ next
275
+ end
276
+
277
+ filepath = files_to_process.pop
278
+ match = true # Match is implicit in the glob
279
+ else
280
+
281
+ while !complete && !match do
232
282
  if files_to_process.size == 0
233
- complete = true
234
- next
235
- else
236
- marker_opts['marker'] = files_to_process.last.key
237
-
238
- # By reversing the array we can use pop and get FIFO behaviour
239
- # instead of the performance penalty incurred by unshift
240
- files_to_process = files_to_process.reverse
283
+ # S3 batches 1000 files per request.
284
+ # We load up our array with the files to move
285
+ files_to_process = s3.directories.get(from_loc_or_dir.bucket, :prefix => from_loc_or_dir.dir).files.all(marker_opts)
286
+ # If we don't have any files after the s3 request, we're complete
287
+ if files_to_process.size == 0
288
+ complete = true
289
+ next
290
+ else
291
+ marker_opts['marker'] = files_to_process.last.key
292
+
293
+ # By reversing the array we can use pop and get FIFO behaviour
294
+ # instead of the performance penalty incurred by unshift
295
+ files_to_process = files_to_process.reverse
296
+ end
241
297
  end
242
- end
243
298
 
244
- file = files_to_process.pop
299
+ file = files_to_process.pop
300
+ filepath = file.key
245
301
 
246
- match = if match_regex.is_a? NegativeRegex
247
- !file.key.match(match_regex.regex)
248
- else
249
- file.key.match(match_regex)
250
- end
302
+ match = if match_regex_or_glob.is_a? NegativeRegex
303
+ !filepath.match(match_regex_or_glob.regex)
304
+ else
305
+ filepath.match(match_regex_or_glob)
306
+ end
307
+ end
251
308
  end
252
309
  end
253
310
 
@@ -255,12 +312,13 @@ module Sluice
255
312
  break unless match
256
313
 
257
314
  # Ignore any EMR-created _$folder$ entries
258
- break if file.key.end_with?('_$folder$')
315
+ break if filepath.end_with?('_$folder$')
259
316
 
260
- # Match the filename, ignoring directory
261
- file_match = file.key.match('([^/]+)$')
317
+ # Match the filename, ignoring directories
318
+ file_match = filepath.match('([^/]+)$')
262
319
  break unless file_match
263
320
 
321
+ # Rename
264
322
  if alter_filename_lambda.class == Proc
265
323
  filename = alter_filename_lambda.call(file_match[1])
266
324
  else
@@ -269,30 +327,47 @@ module Sluice
269
327
 
270
328
  # What are we doing? Let's determine source and target
271
329
  # Note that target excludes bucket name where relevant
272
- source = "#{from_location.bucket}/#{file.key}"
273
330
  case operation
331
+ when :upload
332
+ source = "#{filepath}"
333
+ target = name_file(filepath, filename, from_loc_or_dir, to_loc_or_dir.dir_as_path, flatten)
334
+ puts " UPLOAD #{source} +-> #{to_loc_or_dir.bucket}/#{target}"
274
335
  when :download
275
- target = name_file(file.key, filename, from_location.dir_as_path, to_loc_or_dir, flatten)
336
+ source = "#{from_loc_or_dir.bucket}/#{filepath}"
337
+ target = name_file(filepath, filename, from_loc_or_dir.dir_as_path, to_loc_or_dir, flatten)
276
338
  puts " DOWNLOAD #{source} +-> #{target}"
277
339
  when :move
278
- target = name_file(file.key, filename, from_location.dir_as_path, to_loc_or_dir.dir_as_path, flatten)
340
+ source = "#{from_loc_or_dir.bucket}/#{filepath}"
341
+ target = name_file(filepath, filename, from_loc_or_dir.dir_as_path, to_loc_or_dir.dir_as_path, flatten)
279
342
  puts " MOVE #{source} -> #{to_loc_or_dir.bucket}/#{target}"
280
343
  when :copy
281
- target = name_file(file.key, filename, from_location.dir_as_path, to_loc_or_dir.dir_as_path, flatten)
344
+ source = "#{from_loc_or_dir.bucket}/#{filepath}"
345
+ target = name_file(filepath, filename, from_loc_or_dir.dir_as_path, to_loc_or_dir.dir_as_path, flatten)
282
346
  puts " COPY #{source} +-> #{to_loc_or_dir.bucket}/#{target}"
283
347
  when :delete
348
+ source = "#{from_loc_or_dir.bucket}/#{filepath}"
284
349
  # No target
285
350
  puts " DELETE x #{source}"
286
351
  end
287
352
 
288
- # Download is a stand-alone operation vs move/copy/delete
353
+ # Upload is a standalone operation vs move/copy/delete
354
+ if operation == :upload
355
+ retry_x(
356
+ Sluice::Storage::S3,
357
+ [:upload_file, s3, filepath, to_loc_or_dir.bucket, target],
358
+ RETRIES,
359
+ " +/> #{target}",
360
+ "Problem uploading #{filepath}. Retrying.")
361
+ end
362
+
363
+ # Download is a standalone operation vs move/copy/delete
289
364
  if operation == :download
290
365
  retry_x(
291
366
  Sluice::Storage::S3,
292
367
  [:download_file, s3, file, target],
293
368
  RETRIES,
294
369
  " +/> #{target}",
295
- "Problem downloading #{file.key}. Retrying.")
370
+ "Problem downloading #{filepath}. Retrying.")
296
371
  end
297
372
 
298
373
  # A move or copy starts with a copy file
@@ -302,7 +377,7 @@ module Sluice
302
377
  [:copy, to_loc_or_dir.bucket, target],
303
378
  RETRIES,
304
379
  " +-> #{to_loc_or_dir.bucket}/#{target}",
305
- "Problem copying #{file.key}. Retrying.")
380
+ "Problem copying #{filepath}. Retrying.")
306
381
  end
307
382
 
308
383
  # A move or delete ends with a delete
@@ -312,7 +387,7 @@ module Sluice
312
387
  [:destroy],
313
388
  RETRIES,
314
389
  " x #{source}",
315
- "Problem destroying #{file.key}. Retrying.")
390
+ "Problem destroying #{filepath}. Retrying.")
316
391
  end
317
392
  end
318
393
  end
@@ -390,7 +465,7 @@ module Sluice
390
465
  return shortened_filepath if add_path.nil?
391
466
 
392
467
  # Add the new filepath on to the start and return
393
- add_path + shortened_filepath
468
+ return add_path + shortened_filepath
394
469
  end
395
470
  module_function :name_file
396
471
 
metadata CHANGED
@@ -1,49 +1,41 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: sluice
3
- version: !ruby/object:Gem::Version
4
- hash: 19
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.7
5
5
  prerelease:
6
- segments:
7
- - 0
8
- - 0
9
- - 6
10
- version: 0.0.6
11
6
  platform: ruby
12
- authors:
7
+ authors:
13
8
  - Alex Dean
14
9
  - Michael Tibben
15
10
  autorequire:
16
11
  bindir: bin
17
12
  cert_chain: []
18
-
19
- date: 2012-12-31 00:00:00 Z
20
- dependencies:
21
- - !ruby/object:Gem::Dependency
13
+ date: 2013-07-11 00:00:00.000000000 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
22
16
  name: fog
23
- prerelease: false
24
- requirement: &id001 !ruby/object:Gem::Requirement
17
+ requirement: !ruby/object:Gem::Requirement
25
18
  none: false
26
- requirements:
19
+ requirements:
27
20
  - - ~>
28
- - !ruby/object:Gem::Version
29
- hash: 15
30
- segments:
31
- - 1
32
- - 6
33
- - 0
21
+ - !ruby/object:Gem::Version
34
22
  version: 1.6.0
35
23
  type: :runtime
36
- version_requirements: *id001
37
- description: A Ruby gem to help you build ETL processes involving Amazon S3. Uses Fog
38
- email:
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ none: false
27
+ requirements:
28
+ - - ~>
29
+ - !ruby/object:Gem::Version
30
+ version: 1.6.0
31
+ description: A Ruby gem to help you build ETL processes involving Amazon S3. Uses
32
+ Fog
33
+ email:
39
34
  - support@snowplowanalytics.com
40
35
  executables: []
41
-
42
36
  extensions: []
43
-
44
37
  extra_rdoc_files: []
45
-
46
- files:
38
+ files:
47
39
  - .gitignore
48
40
  - CHANGELOG
49
41
  - Gemfile
@@ -57,36 +49,26 @@ files:
57
49
  - sluice.gemspec
58
50
  homepage: http://snowplowanalytics.com
59
51
  licenses: []
60
-
61
52
  post_install_message:
62
53
  rdoc_options: []
63
-
64
- require_paths:
54
+ require_paths:
65
55
  - lib
66
- required_ruby_version: !ruby/object:Gem::Requirement
56
+ required_ruby_version: !ruby/object:Gem::Requirement
67
57
  none: false
68
- requirements:
69
- - - ">="
70
- - !ruby/object:Gem::Version
71
- hash: 3
72
- segments:
73
- - 0
74
- version: "0"
75
- required_rubygems_version: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ required_rubygems_version: !ruby/object:Gem::Requirement
76
63
  none: false
77
- requirements:
78
- - - ">="
79
- - !ruby/object:Gem::Version
80
- hash: 3
81
- segments:
82
- - 0
83
- version: "0"
64
+ requirements:
65
+ - - ! '>='
66
+ - !ruby/object:Gem::Version
67
+ version: '0'
84
68
  requirements: []
85
-
86
69
  rubyforge_project:
87
- rubygems_version: 1.8.24
70
+ rubygems_version: 1.8.25
88
71
  signing_key:
89
72
  specification_version: 3
90
73
  summary: Ruby toolkit for cloud-friendly ETL
91
74
  test_files: []
92
-