sluice 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG ADDED
@@ -0,0 +1,14 @@
1
+ Version 0.0.3 (2012-11-07)
2
+ --------------------------
3
+ Added parallel file download
4
+ Some simplification of existing code
5
+ Fixed a couple of bugs in the file move code (around S3 file keys versus local filepaths)
6
+
7
+ Version 0.0.2 (2012-11-06)
8
+ --------------------------
9
+ Added trail_slash
10
+ Bug fixing in regexify and files_up_to
11
+
12
+ Version 0.0.1
13
+ -------------
14
+ Initial release of Sluice
data/README.md CHANGED
@@ -13,7 +13,7 @@ Sluice has been extracted from a pair of Ruby ETL applications built by the [Sno
13
13
 
14
14
  Or in your Gemfile:
15
15
 
16
- gem 'sluice', '~> 0.0.1'
16
+ gem 'sluice', '~> 0.0.3'
17
17
 
18
18
  ## Usage
19
19
 
@@ -61,4 +61,4 @@ limitations under the License.
61
61
  [emr-etl-runner]: https://github.com/snowplow/snowplow/tree/master/3-etl/emr-etl-runner
62
62
  [storage-loader]: https://github.com/snowplow/snowplow/tree/master/4-storage/storage-loader
63
63
 
64
- [license]: http://www.apache.org/licenses/LICENSE-2.0
64
+ [license]: http://www.apache.org/licenses/LICENSE-2.0
data/lib/sluice.rb CHANGED
@@ -19,5 +19,5 @@ require 'sluice/storage/s3'
19
19
 
20
20
  module Sluice
21
21
  NAME = "sluice"
22
- VERSION = "0.0.2"
22
+ VERSION = "0.0.3"
23
23
  end
@@ -86,11 +86,26 @@ module Sluice
86
86
  end
87
87
  module_function :is_empty?
88
88
 
89
+ # Download files from an S3 location to
90
+ # local storage, concurrently
91
+ #
92
+ # Parameters:
93
+ # +s3+:: A Fog::Storage s3 connection
94
+ # +from_location+:: S3Location to download files from
95
+ # +to_directory+:: Local directory to copy files to
96
+ # +match_regex+:: a regex string to match the files to download
97
def download_files(s3, from_location, to_directory, match_regex='.+')
  # Announce the transfer, then delegate to the shared concurrent
  # worker with the :download operation and the local target directory.
  puts(" downloading files from #{from_location} to #{to_directory}")
  process_files(:download, s3, from_location, match_regex, to_directory)
end
102
+ module_function :download_files
103
+
89
104
  # Delete files from S3 locations concurrently
90
105
  #
91
106
  # Parameters:
92
107
  # +s3+:: A Fog::Storage s3 connection
93
- # +from+:: S3Location to delete files from
108
+ # +from_location+:: S3Location to delete files from
94
109
  # +match_regex+:: a regex string to match the files to delete
95
110
  def delete_files(s3, from_location, match_regex='.+')
96
111
 
@@ -103,8 +118,8 @@ module Sluice
103
118
  #
104
119
  # Parameters:
105
120
  # +s3+:: A Fog::Storage s3 connection
106
- # +from+:: S3Location to copy files from
107
- # +to+:: S3Location to copy files to
121
+ # +from_location+:: S3Location to copy files from
122
+ # +to_location+:: S3Location to copy files to
108
123
  # +match_regex+:: a regex string to match the files to copy
109
124
  # +alter_filename_lambda+:: lambda to alter the written filename
110
125
  def copy_files(s3, from_location, to_location, match_regex='.+', alter_filename_lambda=false)
@@ -118,7 +133,7 @@ module Sluice
118
133
  #
119
134
  # Parameters:
120
135
  # +s3+:: A Fog::Storage s3 connection
121
- # +from+:: S3Location to move files from
136
+ # +from_location+:: S3Location to move files from
122
137
  # +to+:: S3Location to move files to
123
138
  # +match_regex+:: a regex string to match the files to move
124
139
  # +alter_filename_lambda+:: lambda to alter the written filename
@@ -129,6 +144,26 @@ module Sluice
129
144
  end
130
145
  module_function :move_files
131
146
 
147
# Download a single file to the exact path specified.
# Has no intelligence around filenaming.
# Creates the destination's parent directories as needed.
#
# Parameters:
# +s3+:: A Fog::Storage s3 connection (not referenced by this method)
# +from_file+:: A Fog::File to download; its +body+ is written to disk
# +to_file+:: A local file path
#
# Returns nil.
def download_file(s3, from_file, to_file)
  FileUtils.mkdir_p(File.dirname(to_file))

  # TODO: deal with bug where Fog hangs indefinitely if network connection dies during download

  # The block form guarantees the handle is closed even if the write
  # raises, and binary mode writes the object's bytes untouched on
  # platforms that would otherwise translate newlines.
  File.open(to_file, 'wb') do |local_file|
    local_file.write(from_file.body)
  end

  nil
end
165
+ module_function :download_file
166
+
132
167
  private
133
168
 
134
169
  # Concurrent file operations between S3 locations. Supports:
@@ -139,27 +174,27 @@ module Sluice
139
174
  # Parameters:
140
175
  # +operation+:: Operation to perform. :download, :copy, :delete and :move are supported
141
176
  # +s3+:: A Fog::Storage s3 connection
142
- # +from+:: S3Location to process files from
177
+ # +from_location+:: S3Location to process files from
143
178
  # +match_regex+:: a regex string to match the files to process
144
- # +to+:: S3Location to process files to
179
+ # +to_loc_or_dir+:: S3Location or local directory to process files to
145
180
  # +alter_filename_lambda+:: lambda to alter the written filename
146
- def process_files(operation, s3, from_location, match_regex='.+', to_location=nil, alter_filename_lambda=false)
181
+ def process_files(operation, s3, from_location, match_regex='.+', to_loc_or_dir=nil, alter_filename_lambda=false)
147
182
 
148
183
  # Validate that the file operation makes sense
149
184
  case operation
150
- when :copy, :move
151
- if to_location.nil?
152
- raise StorageOperationError "File operation %s requires a to_location to be set" % operation
185
+ when :copy, :move, :download
186
+ if to_loc_or_dir.nil?
187
+ raise StorageOperationError "File operation %s requires the to_loc_or_dir to be set" % operation
153
188
  end
154
189
  when :delete
155
- unless to_location.nil?
156
- raise StorageOperationError "File operation %s does not support the to_location argument" % operation
190
+ unless to_loc_or_dir.nil?
191
+ raise StorageOperationError "File operation %s does not support the to_loc_or_dir argument" % operation
157
192
  end
158
193
  if alter_filename_lambda.class == Proc
159
194
  raise StorageOperationError "File operation %s does not support the alter_filename_lambda argument" % operation
160
195
  end
161
196
  else
162
- raise StorageOperationError "File operation %s is unsupported. Try :copy, :delete or :move" % operation
197
+ raise StorageOperationError "File operation %s is unsupported. Try :download, :copy, :delete or :move" % operation
163
198
  end
164
199
 
165
200
  files_to_process = []
@@ -171,7 +206,7 @@ module Sluice
171
206
  # If an exception is thrown in a thread that isn't handled, die quickly
172
207
  Thread.abort_on_exception = true
173
208
 
174
- # Create ruby threads to concurrently execute s3 operations
209
+ # Create Ruby threads to concurrently execute s3 operations
175
210
  for i in (0...CONCURRENCY)
176
211
 
177
212
  # Each thread pops a file off the files_to_process array, and moves it.
@@ -187,10 +222,10 @@ module Sluice
187
222
 
188
223
  while !complete && !match do
189
224
  if files_to_process.size == 0
190
- # s3 batches 1000 files per request
191
- # we load up our array with the files to move
225
+ # S3 batches 1000 files per request.
226
+ # We load up our array with the files to move
192
227
  files_to_process = s3.directories.get(from_location.bucket, :prefix => from_location.dir).files.all(marker_opts)
193
- # if we don't have any files after the s3 request, we're complete
228
+ # If we don't have any files after the s3 request, we're complete
194
229
  if files_to_process.size == 0
195
230
  complete = true
196
231
  next
@@ -204,6 +239,7 @@ module Sluice
204
239
  end
205
240
 
206
241
  file = files_to_process.pop
242
+
207
243
  match = if match_regex.is_a? NegativeRegex
208
244
  !file.key.match(match_regex.regex)
209
245
  else
@@ -213,12 +249,13 @@ module Sluice
213
249
  end
214
250
 
215
251
  # If we don't have a match, then we must be complete
216
- break unless match # exit the thread
252
+ break unless match
253
+
254
+ # Ignore any EMR-created _$folder$ entries
255
+ break if file.key.end_with?('_$folder$')
217
256
 
218
257
  # Match the filename, ignoring directory
219
258
  file_match = file.key.match('([^/]+)$')
220
-
221
- # Silently skip any sub-directories in the list
222
259
  break unless file_match
223
260
 
224
261
  if alter_filename_lambda.class == Proc
@@ -227,44 +264,46 @@ module Sluice
227
264
  filename = file_match[1]
228
265
  end
229
266
 
230
- # What are we doing?
267
+ # What are we doing? Let's determine source and target
268
+ # Note that target excludes bucket name where relevant
269
+ source = "#{from_location.bucket}/#{file.key}"
231
270
  case operation
271
+ when :download
272
+ target = name_file(file.key, filename, from_location.dir_as_path, to_loc_or_dir)
273
+ puts " DOWNLOAD #{source} +-> #{target}"
232
274
  when :move
233
- puts " MOVE #{from_location.bucket}/#{file.key} -> #{to_location.bucket}/#{to_location.dir_as_path}#{filename}"
275
+ target = name_file(file.key, filename, from_location.dir_as_path, to_loc_or_dir.dir_as_path)
276
+ puts " MOVE #{source} -> #{to_loc_or_dir.bucket}/#{target}"
234
277
  when :copy
235
- puts " COPY #{from_location.bucket}/#{file.key} +-> #{to_location.bucket}/#{to_location.dir_as_path}#{filename}"
278
+ target = name_file(file.key, filename, from_location.dir_as_path, to_loc_or_dir.dir_as_path)
279
+ puts " COPY #{source} +-> #{to_loc_or_dir.bucket}/#{target}"
236
280
  when :delete
237
- puts " DELETE x #{from_location.bucket}/#{file.key}"
281
+ # No target
282
+ puts " DELETE x #{source}"
283
+ end
284
+
285
+ # Download is a stand-alone operation vs move/copy/delete
286
+ if operation == :download
287
+ retry_x(
288
+ download_file(s3, file, target), RETRIES,
289
+ " +/> #{target}",
290
+ "Problem downloading #{file.key}. Retrying.")
238
291
  end
239
292
 
240
293
  # A move or copy starts with a copy file
241
294
  if [:move, :copy].include? operation
242
- i = 0
243
- begin
244
- file.copy(to_location.bucket, to_location.dir_as_path + filename)
245
- puts " +-> #{to_location.bucket}/#{to_location.dir_as_path}#{filename}"
246
- rescue
247
- raise unless i < RETRIES
248
- puts "Problem copying #{file.key}. Retrying.", $!, $@
249
- sleep(RETRY_WAIT) # give us a bit of time before retrying
250
- i += 1
251
- retry
252
- end
295
+ retry_x(
296
+ file.copy(to_loc_or_dir.bucket, target), RETRIES,
297
+ " +-> #{to_loc_or_dir.bucket}/#{target}",
298
+ "Problem copying #{file.key}. Retrying.")
253
299
  end
254
300
 
255
301
  # A move or delete ends with a delete
256
302
  if [:move, :delete].include? operation
257
- i = 0
258
- begin
259
- file.destroy()
260
- puts " x #{from_location.bucket}/#{file.key}"
261
- rescue
262
- raise unless i < RETRIES
263
- puts "Problem destroying #{file.key}. Retrying.", $!, $@
264
- sleep(RETRY_WAIT) # Give us a bit of time before retrying
265
- i += 1
266
- retry
267
- end
303
+ retry_x(
304
+ file.destroy(), RETRIES,
305
+ " x #{source}",
306
+ "Problem destroying #{file.key}. Retrying.")
268
307
  end
269
308
  end
270
309
  end
@@ -276,6 +315,70 @@ module Sluice
276
315
  end
277
316
  module_function :process_files
278
317
 
318
# A helper function to run an operation, retrying it
# up to retries more times if it raises.
#
# Parameters:
# +function+:: Operation to run. Pass something callable (a lambda,
#              Proc or Method) so it is invoked inside the retry loop.
#              A non-callable value is accepted for backward
#              compatibility, but it has already been evaluated by the
#              caller, so nothing can be re-run or rescued here.
# +retries+:: Number of retries to attempt after the first failure
# +attempt_msg+:: Message to puts after a successful attempt
# +failure_msg+:: Message to puts on each failure that will be retried
#
# Re-raises the last error once the retries are exhausted. Returns nil.
def retry_x(function, retries, attempt_msg, failure_msg)
  failures = 0
  begin
    # Invoke the operation here, not at the call site, so that an
    # exception lands in the rescue below and `retry` re-executes it.
    function.call if function.respond_to?(:call)
    puts attempt_msg
  rescue
    raise unless failures < retries
    puts failure_msg
    sleep(RETRY_WAIT) # Give us a bit of time before retrying
    failures += 1
    retry
  end
end
339
+ module_function :retry_x
340
+
341
# A helper function to prepare destination
# filenames and paths. This is a bit weird
# - it needs to exist because of differences
# in the way that Amazon S3, Fog and Unix
# treat filepaths versus keys.
#
# Parameters:
# +filepath+:: Path to file (including old filename)
# +new_filename+:: Replace the filename in the path with this
# +remove_path+:: If this is set, strip this from the front of the path
# +add_path+:: If this is set, add this to the front of the path.
#              Only applied when +remove_path+ is also set.
#
# Raises StorageOperationError if +remove_path+ is set but is not a
# prefix of both the original path and the renamed path.
#
# TODO: this really needs unit tests
def name_file(filepath, new_filename, remove_path=nil, add_path=nil)

  # First, replace the filename in filepath with new one.
  # File.join avoids a doubled slash when the directory is the root.
  dirname = File.dirname(filepath)
  new_filepath = (dirname == '.') ? new_filename : File.join(dirname, new_filename)

  # Nothing more to do
  return new_filepath if remove_path.nil?

  # If we have a 'remove_path', it must be found at
  # the start of the path.
  # If it's not, you're probably using name_file()
  # wrong.
  unless filepath.start_with?(remove_path)
    raise StorageOperationError, "name_file failed. Filepath '#{filepath}' does not start with '#{remove_path}'"
  end

  # The prefix is stripped from the *renamed* path, so it has to be a
  # prefix of that too. Otherwise (remove_path reaching into the
  # filename) the slice below would silently return a mangled path.
  unless new_filepath.start_with?(remove_path)
    raise StorageOperationError, "name_file failed. New filepath '#{new_filepath}' does not start with '#{remove_path}'"
  end

  # Okay, let's remove the leading path
  shortened_filepath = new_filepath[remove_path.length..-1]

  # Nothing more to do
  return shortened_filepath if add_path.nil?

  # Add the new path on to the start and return
  add_path + shortened_filepath
end
380
+ module_function :name_file
381
+
279
382
  end
280
383
  end
281
384
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sluice
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
4
+ hash: 25
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 2
10
- version: 0.0.2
9
+ - 3
10
+ version: 0.0.3
11
11
  platform: ruby
12
12
  authors:
13
13
  - Alex Dean
@@ -16,7 +16,7 @@ autorequire:
16
16
  bindir: bin
17
17
  cert_chain: []
18
18
 
19
- date: 2012-11-06 00:00:00 Z
19
+ date: 2012-11-09 00:00:00 Z
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
22
22
  name: fog
@@ -45,6 +45,7 @@ extra_rdoc_files: []
45
45
 
46
46
  files:
47
47
  - .gitignore
48
+ - CHANGELOG
48
49
  - Gemfile
49
50
  - LICENSE-2.0.txt
50
51
  - README.md