sluice 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG ADDED
@@ -0,0 +1,14 @@
1
+ Version 0.0.3 (2012-11-07)
2
+ --------------------------
3
+ Added parallel file download
4
+ Some simplification of existing code
5
+ Fixed a couple of bugs in the file move code (around S3 file keys versus local filepaths)
6
+
7
+ Version 0.0.2 (2012-11-06)
8
+ --------------------------
9
+ Added trail_slash
10
+ Bug fixing in regexify and files_up_to
11
+
12
+ Version 0.0.1
13
+ -------------
14
+ Initial release of Sluice
data/README.md CHANGED
@@ -13,7 +13,7 @@ Sluice has been extracted from a pair of Ruby ETL applications built by the [Sno
13
13
 
14
14
  Or in your Gemfile:
15
15
 
16
- gem 'sluice', '~> 0.0.1'
16
+ gem 'sluice', '~> 0.0.3'
17
17
 
18
18
  ## Usage
19
19
 
@@ -61,4 +61,4 @@ limitations under the License.
61
61
  [emr-etl-runner]: https://github.com/snowplow/snowplow/tree/master/3-etl/emr-etl-runner
62
62
  [storage-loader]: https://github.com/snowplow/snowplow/tree/master/4-storage/storage-loader
63
63
 
64
- [license]: http://www.apache.org/licenses/LICENSE-2.0
64
+ [license]: http://www.apache.org/licenses/LICENSE-2.0
data/lib/sluice.rb CHANGED
@@ -19,5 +19,5 @@ require 'sluice/storage/s3'
19
19
 
20
20
  module Sluice
21
21
  NAME = "sluice"
22
- VERSION = "0.0.2"
22
+ VERSION = "0.0.3"
23
23
  end
@@ -86,11 +86,26 @@ module Sluice
86
86
  end
87
87
  module_function :is_empty?
88
88
 
89
+ # Download files from an S3 location to
90
+ # local storage, concurrently
91
+ #
92
+ # Parameters:
93
+ # +s3+:: A Fog::Storage s3 connection
94
+ # +from_location+:: S3Location to download files from
95
+ # +to_directory+:: Local directory to download files to
96
+ # +match_regex+:: a regex string to match the files to download
97
+ def download_files(s3, from_location, to_directory, match_regex='.+')
98
+
99
+ puts " downloading files from #{from_location} to #{to_directory}"
100
+ process_files(:download, s3, from_location, match_regex, to_directory)
101
+ end
102
+ module_function :download_files
103
+
89
104
  # Delete files from S3 locations concurrently
90
105
  #
91
106
  # Parameters:
92
107
  # +s3+:: A Fog::Storage s3 connection
93
- # +from+:: S3Location to delete files from
108
+ # +from_location+:: S3Location to delete files from
94
109
  # +match_regex+:: a regex string to match the files to delete
95
110
  def delete_files(s3, from_location, match_regex='.+')
96
111
 
@@ -103,8 +118,8 @@ module Sluice
103
118
  #
104
119
  # Parameters:
105
120
  # +s3+:: A Fog::Storage s3 connection
106
- # +from+:: S3Location to copy files from
107
- # +to+:: S3Location to copy files to
121
+ # +from_location+:: S3Location to copy files from
122
+ # +to_location+:: S3Location to copy files to
108
123
  # +match_regex+:: a regex string to match the files to copy
109
124
  # +alter_filename_lambda+:: lambda to alter the written filename
110
125
  def copy_files(s3, from_location, to_location, match_regex='.+', alter_filename_lambda=false)
@@ -118,7 +133,7 @@ module Sluice
118
133
  #
119
134
  # Parameters:
120
135
  # +s3+:: A Fog::Storage s3 connection
121
- # +from+:: S3Location to move files from
136
+ # +from_location+:: S3Location to move files from
122
137
  # +to+:: S3Location to move files to
123
138
  # +match_regex+:: a regex string to match the files to move
124
139
  # +alter_filename_lambda+:: lambda to alter the written filename
@@ -129,6 +144,26 @@ module Sluice
129
144
  end
130
145
  module_function :move_files
131
146
 
147
# Download a single file to the exact path specified.
# Has no intelligence around filenaming.
# Makes sure to create the path as needed.
#
# Parameters:
# +s3+:: A Fog::Storage s3 connection (not used by this method)
# +from_file+:: A Fog::File to download; only its +body+ is read
# +to_file+:: A local file path
#
# Returns nil.
def download_file(s3, from_file, to_file)
  # Create any missing parent directories of the target path
  FileUtils.mkdir_p(File.dirname(to_file))

  # TODO: deal with bug where Fog hangs indefinitely if network connection dies during download

  # Binary mode keeps the body byte-for-byte (no newline or encoding
  # translation); the block form closes the handle even if reading the
  # body or writing it raises.
  File.open(to_file, "wb") do |local_file|
    local_file.write(from_file.body)
  end

  nil
end
165
+ module_function :download_file
166
+
132
167
  private
133
168
 
134
169
  # Concurrent file operations between S3 locations. Supports:
@@ -139,27 +174,27 @@ module Sluice
139
174
  # Parameters:
140
175
  # +operation+:: Operation to perform. :download, :copy, :delete, :move supported
141
176
  # +s3+:: A Fog::Storage s3 connection
142
- # +from+:: S3Location to process files from
177
+ # +from_location+:: S3Location to process files from
143
178
  # +match_regex+:: a regex string to match the files to process
144
- # +to+:: S3Location to process files to
179
+ # +to_loc_or_dir+:: S3Location or local directory to process files to
145
180
  # +alter_filename_lambda+:: lambda to alter the written filename
146
- def process_files(operation, s3, from_location, match_regex='.+', to_location=nil, alter_filename_lambda=false)
181
+ def process_files(operation, s3, from_location, match_regex='.+', to_loc_or_dir=nil, alter_filename_lambda=false)
147
182
 
148
183
  # Validate that the file operation makes sense
149
184
  case operation
150
- when :copy, :move
151
- if to_location.nil?
152
- raise StorageOperationError "File operation %s requires a to_location to be set" % operation
185
+ when :copy, :move, :download
186
+ if to_loc_or_dir.nil?
187
+ raise StorageOperationError "File operation %s requires the to_loc_or_dir to be set" % operation
153
188
  end
154
189
  when :delete
155
- unless to_location.nil?
156
- raise StorageOperationError "File operation %s does not support the to_location argument" % operation
190
+ unless to_loc_or_dir.nil?
191
+ raise StorageOperationError "File operation %s does not support the to_loc_or_dir argument" % operation
157
192
  end
158
193
  if alter_filename_lambda.class == Proc
159
194
  raise StorageOperationError "File operation %s does not support the alter_filename_lambda argument" % operation
160
195
  end
161
196
  else
162
- raise StorageOperationError "File operation %s is unsupported. Try :copy, :delete or :move" % operation
197
+ raise StorageOperationError "File operation %s is unsupported. Try :download, :copy, :delete or :move" % operation
163
198
  end
164
199
 
165
200
  files_to_process = []
@@ -171,7 +206,7 @@ module Sluice
171
206
  # If an exception is thrown in a thread that isn't handled, die quickly
172
207
  Thread.abort_on_exception = true
173
208
 
174
- # Create ruby threads to concurrently execute s3 operations
209
+ # Create Ruby threads to concurrently execute s3 operations
175
210
  for i in (0...CONCURRENCY)
176
211
 
177
212
  # Each thread pops a file off the files_to_process array, and processes it.
@@ -187,10 +222,10 @@ module Sluice
187
222
 
188
223
  while !complete && !match do
189
224
  if files_to_process.size == 0
190
- # s3 batches 1000 files per request
191
- # we load up our array with the files to move
225
+ # S3 batches 1000 files per request.
226
+ # We load up our array with the files to move
192
227
  files_to_process = s3.directories.get(from_location.bucket, :prefix => from_location.dir).files.all(marker_opts)
193
- # if we don't have any files after the s3 request, we're complete
228
+ # If we don't have any files after the s3 request, we're complete
194
229
  if files_to_process.size == 0
195
230
  complete = true
196
231
  next
@@ -204,6 +239,7 @@ module Sluice
204
239
  end
205
240
 
206
241
  file = files_to_process.pop
242
+
207
243
  match = if match_regex.is_a? NegativeRegex
208
244
  !file.key.match(match_regex.regex)
209
245
  else
@@ -213,12 +249,13 @@ module Sluice
213
249
  end
214
250
 
215
251
  # If we don't have a match, then we must be complete
216
- break unless match # exit the thread
252
+ break unless match
253
+
254
+ # Ignore any EMR-created _$folder$ entries
255
+ break if file.key.end_with?('_$folder$')
217
256
 
218
257
  # Match the filename, ignoring directory
219
258
  file_match = file.key.match('([^/]+)$')
220
-
221
- # Silently skip any sub-directories in the list
222
259
  break unless file_match
223
260
 
224
261
  if alter_filename_lambda.class == Proc
@@ -227,44 +264,46 @@ module Sluice
227
264
  filename = file_match[1]
228
265
  end
229
266
 
230
- # What are we doing?
267
+ # What are we doing? Let's determine source and target
268
+ # Note that target excludes bucket name where relevant
269
+ source = "#{from_location.bucket}/#{file.key}"
231
270
  case operation
271
+ when :download
272
+ target = name_file(file.key, filename, from_location.dir_as_path, to_loc_or_dir)
273
+ puts " DOWNLOAD #{source} +-> #{target}"
232
274
  when :move
233
- puts " MOVE #{from_location.bucket}/#{file.key} -> #{to_location.bucket}/#{to_location.dir_as_path}#{filename}"
275
+ target = name_file(file.key, filename, from_location.dir_as_path, to_loc_or_dir.dir_as_path)
276
+ puts " MOVE #{source} -> #{to_loc_or_dir.bucket}/#{target}"
234
277
  when :copy
235
- puts " COPY #{from_location.bucket}/#{file.key} +-> #{to_location.bucket}/#{to_location.dir_as_path}#{filename}"
278
+ target = name_file(file.key, filename, from_location.dir_as_path, to_loc_or_dir.dir_as_path)
279
+ puts " COPY #{source} +-> #{to_loc_or_dir.bucket}/#{target}"
236
280
  when :delete
237
- puts " DELETE x #{from_location.bucket}/#{file.key}"
281
+ # No target
282
+ puts " DELETE x #{source}"
283
+ end
284
+
285
+ # Download is a stand-alone operation vs move/copy/delete
286
+ if operation == :download
287
+ retry_x(
288
+ download_file(s3, file, target), RETRIES,
289
+ " +/> #{target}",
290
+ "Problem downloading #{file.key}. Retrying.")
238
291
  end
239
292
 
240
293
  # A move or copy starts with a copy file
241
294
  if [:move, :copy].include? operation
242
- i = 0
243
- begin
244
- file.copy(to_location.bucket, to_location.dir_as_path + filename)
245
- puts " +-> #{to_location.bucket}/#{to_location.dir_as_path}#{filename}"
246
- rescue
247
- raise unless i < RETRIES
248
- puts "Problem copying #{file.key}. Retrying.", $!, $@
249
- sleep(RETRY_WAIT) # give us a bit of time before retrying
250
- i += 1
251
- retry
252
- end
295
+ retry_x(
296
+ file.copy(to_loc_or_dir.bucket, target), RETRIES,
297
+ " +-> #{to_loc_or_dir.bucket}/#{target}",
298
+ "Problem copying #{file.key}. Retrying.")
253
299
  end
254
300
 
255
301
  # A move or delete ends with a delete
256
302
  if [:move, :delete].include? operation
257
- i = 0
258
- begin
259
- file.destroy()
260
- puts " x #{from_location.bucket}/#{file.key}"
261
- rescue
262
- raise unless i < RETRIES
263
- puts "Problem destroying #{file.key}. Retrying.", $!, $@
264
- sleep(RETRY_WAIT) # Give us a bit of time before retrying
265
- i += 1
266
- retry
267
- end
303
+ retry_x(
304
+ file.destroy(), RETRIES,
305
+ " x #{source}",
306
+ "Problem destroying #{file.key}. Retrying.")
268
307
  end
269
308
  end
270
309
  end
@@ -276,6 +315,70 @@ module Sluice
276
315
  end
277
316
  module_function :process_files
278
317
 
318
# A helper function to attempt an operation, retrying it
# up to +retries+ more times when it raises.
#
# The operation must be supplied as something that can be re-run:
# either a callable (lambda/proc/method object) in +function+, or a
# block. Passing the *result* of a call instead, e.g.
# retry_x(file.destroy(), ...), evaluates it before retry_x starts,
# so a failure there can be neither rescued nor retried here; such
# a non-callable value is accepted for backward compatibility and
# simply results in +attempt_msg+ being printed.
#
# Parameters:
# +function+:: Callable to run (ignored when a block is given)
# +retries+:: Number of retries to attempt after the first failure
# +attempt_msg+:: Message to puts after each successful attempt
# +failure_msg+:: Message to puts on each failure that will be retried
#
# Raises whatever the operation raised once the retries are used up.
def retry_x(function, retries, attempt_msg, failure_msg, &block)
  # Prefer the block; fall back to +function+ only if it can be invoked
  operation = block || (function.respond_to?(:call) ? function : nil)

  attempts = 0
  begin
    operation.call if operation
    puts attempt_msg
  rescue
    # Out of retries: re-raise the exception currently being handled
    raise unless attempts < retries
    puts failure_msg
    sleep(RETRY_WAIT) # Give us a bit of time before retrying
    attempts += 1
    retry
  end
end
339
+ module_function :retry_x
340
+
341
# A helper function to prepare destination
# filenames and paths. This is a bit weird
# - it needs to exist because of differences
# in the way that Amazon S3, Fog and Unix
# treat filepaths versus keys.
#
# Parameters:
# +filepath+:: Path to file (including old filename)
# +new_filename+:: Replace the filename in the path with this
# +remove_path+:: If this is set, strip this from the front of the path
# +add_path+:: If this is set, add this to the front of the path
#              (only applied when +remove_path+ is also set)
#
# Returns the rewritten path as a String.
#
# Raises StorageOperationError if +remove_path+ is set but is not a
# prefix of both the original and the renamed path.
#
# TODO: this really needs unit tests
def name_file(filepath, new_filename, remove_path=nil, add_path=nil)

  # First, replace the filename in filepath with new one.
  # File.dirname returns '.' when filepath has no directory part.
  dirname = File.dirname(filepath)
  new_filepath = (dirname == '.') ? new_filename : "#{dirname}/#{new_filename}"

  # Nothing more to do
  return new_filepath if remove_path.nil?

  # If we have a 'remove_path', it must be found at
  # the start of the path.
  # If it's not, you're probably using name_file()
  # wrong.
  unless filepath.start_with?(remove_path)
    raise StorageOperationError, "name_file failed. Filepath '#{filepath}' does not start with '#{remove_path}'"
  end

  # The prefix is sliced off the *renamed* path, so it has to be a
  # prefix of that too. This fails when remove_path reaches into the
  # old filename; slicing would then yield garbage or nil.
  unless new_filepath.start_with?(remove_path)
    raise StorageOperationError, "name_file failed. Renamed filepath '#{new_filepath}' does not start with '#{remove_path}'"
  end

  # Okay, let's remove the leading remove_path
  shortened_filepath = new_filepath[remove_path.length..-1]

  # Nothing more to do
  return shortened_filepath if add_path.nil?

  # Add the new path on to the start and return
  add_path + shortened_filepath
end
380
+ module_function :name_file
381
+
279
382
  end
280
383
  end
281
384
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sluice
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
4
+ hash: 25
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 2
10
- version: 0.0.2
9
+ - 3
10
+ version: 0.0.3
11
11
  platform: ruby
12
12
  authors:
13
13
  - Alex Dean
@@ -16,7 +16,7 @@ autorequire:
16
16
  bindir: bin
17
17
  cert_chain: []
18
18
 
19
- date: 2012-11-06 00:00:00 Z
19
+ date: 2012-11-09 00:00:00 Z
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
22
22
  name: fog
@@ -45,6 +45,7 @@ extra_rdoc_files: []
45
45
 
46
46
  files:
47
47
  - .gitignore
48
+ - CHANGELOG
48
49
  - Gemfile
49
50
  - LICENSE-2.0.txt
50
51
  - README.md