s3-object-processor 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.rspec +1 -0
- data/Gemfile +13 -0
- data/Gemfile.lock +69 -0
- data/LICENSE.txt +20 -0
- data/README.md +105 -0
- data/Rakefile +53 -0
- data/VERSION +1 -0
- data/lib/s3-object-processor.rb +360 -0
- data/lib/s3-object-processor/cli.rb +137 -0
- data/s3-object-processor.gemspec +67 -0
- data/spec/s3-object-processor_spec.rb +7 -0
- data/spec/spec_helper.rb +12 -0
- metadata +160 -0
data/.document
ADDED
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/Gemfile
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# Fetch gems over TLS; a plain-HTTP source can be tampered with in transit.
source "https://rubygems.org"

# Runtime dependencies.
gem "cli", "~> 1.3"
gem "right_aws", "~> 3.0"

# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
group :development do
  gem "rspec", "~> 2.8.0"
  gem "rdoc", "~> 3.12"
  gem "bundler", "~> 1.0"
  gem "jeweler", "~> 1.8.7"
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
addressable (2.3.5)
|
5
|
+
builder (3.2.2)
|
6
|
+
cli (1.3.1)
|
7
|
+
diff-lcs (1.1.3)
|
8
|
+
faraday (0.8.8)
|
9
|
+
multipart-post (~> 1.2.0)
|
10
|
+
git (1.2.6)
|
11
|
+
github_api (0.10.1)
|
12
|
+
addressable
|
13
|
+
faraday (~> 0.8.1)
|
14
|
+
hashie (>= 1.2)
|
15
|
+
multi_json (~> 1.4)
|
16
|
+
nokogiri (~> 1.5.2)
|
17
|
+
oauth2
|
18
|
+
hashie (2.0.5)
|
19
|
+
highline (1.6.20)
|
20
|
+
httpauth (0.2.0)
|
21
|
+
jeweler (1.8.8)
|
22
|
+
builder
|
23
|
+
bundler (~> 1.0)
|
24
|
+
git (>= 1.2.5)
|
25
|
+
github_api (= 0.10.1)
|
26
|
+
highline (>= 1.6.15)
|
27
|
+
nokogiri (= 1.5.10)
|
28
|
+
rake
|
29
|
+
rdoc
|
30
|
+
json (1.8.1)
|
31
|
+
jwt (0.1.8)
|
32
|
+
multi_json (>= 1.5)
|
33
|
+
multi_json (1.8.2)
|
34
|
+
multi_xml (0.5.5)
|
35
|
+
multipart-post (1.2.0)
|
36
|
+
nokogiri (1.5.10)
|
37
|
+
oauth2 (0.9.2)
|
38
|
+
faraday (~> 0.8)
|
39
|
+
httpauth (~> 0.2)
|
40
|
+
jwt (~> 0.1.4)
|
41
|
+
multi_json (~> 1.0)
|
42
|
+
multi_xml (~> 0.5)
|
43
|
+
rack (~> 1.2)
|
44
|
+
rack (1.5.2)
|
45
|
+
rake (10.1.0)
|
46
|
+
rdoc (3.12.2)
|
47
|
+
json (~> 1.4)
|
48
|
+
right_aws (3.1.0)
|
49
|
+
right_http_connection (>= 1.2.5)
|
50
|
+
right_http_connection (1.4.0)
|
51
|
+
rspec (2.8.0)
|
52
|
+
rspec-core (~> 2.8.0)
|
53
|
+
rspec-expectations (~> 2.8.0)
|
54
|
+
rspec-mocks (~> 2.8.0)
|
55
|
+
rspec-core (2.8.0)
|
56
|
+
rspec-expectations (2.8.0)
|
57
|
+
diff-lcs (~> 1.1.2)
|
58
|
+
rspec-mocks (2.8.0)
|
59
|
+
|
60
|
+
PLATFORMS
|
61
|
+
ruby
|
62
|
+
|
63
|
+
DEPENDENCIES
|
64
|
+
bundler (~> 1.0)
|
65
|
+
cli (~> 1.3)
|
66
|
+
jeweler (~> 1.8.7)
|
67
|
+
rdoc (~> 3.12)
|
68
|
+
right_aws (~> 3.0)
|
69
|
+
rspec (~> 2.8.0)
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2013 Jakub Pastuszek
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,105 @@
|
|
1
|
+
# s3-object-processor
|
2
|
+
|
3
|
+
This library helps in developing CLI programs that process all objects stored in a given S3 bucket (or only the objects whose keys are listed in a file).
|
4
|
+
|
5
|
+
It uses a multi-threaded worker model to process objects in parallel.
|
6
|
+
|
7
|
+
# Example usage
|
8
|
+
|
9
|
+
This example program can be used to call one of two HTTP endpoints for each object that matches a regexp and some other criteria.
|
10
|
+
In addition, the total size of the handled objects is counted, and the time spent fetching S3 object data and the time spent on API calls are measured; all of these are reported at the end of the run.
|
11
|
+
|
12
|
+
```ruby
|
13
|
+
require 's3-object-processor/cli'
|
14
|
+
require 'httpclient'
|
15
|
+
|
16
|
+
S3ObjectProcessor::CLI.new do
  # Extra command line options specific to this program.
  cli do
    option :endpoint,
      description: 'API endpoint URI for JPEG uploads',
      default: '/iss/v2/pictures'
    option :endpoint_as_is,
      description: 'API endpoint URI for as-is uploads',
      default: '/iss/v2/images'
    switch :as_is,
      description: 'upload images without conversion to JPEG'
    option :httpimagestore,
      description: 'URL to HTTP Image Store',
      default: 'http://localhost:3000'
  end

  cli_process do |settings|
  end

  # Custom reports: each one accumulates the values reported under its name.
  report :input_object_size, 0 do
    report "total input object size [KiB]", "%d" do |value|
      (value.to_f / 1024).round
    end
  end
  report :s3_body_get_time, 0.0 do
    report "total S3 get body time [s]", "%.3f"
  end
  report :httpimagestore_time, 0.0 do
    report "total ISS request time [s]", "%.3f"
  end

  processor do |bucket, key, settings, log, reporter|
    # Expected key layout: [dir/]<16 hex digits>[/name].<3-4 char extension>
    # \h matches exactly one hex digit; the former class [0-f] was an ASCII
    # range that also accepted punctuation and every uppercase letter.
    unless key.to_s =~ %r{(^|.*?/)(\h{16})(|/.*)\.(.{3,4})$}
      log.warn "skipping bad format: #{key}"
      reporter.report :skipped_key, key
      next
    end

    dir = $1
    hash = $2
    name = $3
    extension = $4

    # Derived variants are skipped; only original images are uploaded.
    if name =~ /-(search|original|search_thumb|brochure|brochure_thumb|admin|admin_thumb|treatment_thumb|staff_member_thumb|consultation|clinic_google_map_thumb)$/
      log.debug "skipping not original: #{key}"
      reporter.report :skipped_key, key
      next
    end

    log.debug "processing original dir: '#{dir}' hash: '#{hash}' name: '#{name}' extension: '#{extension}'"

    data = nil
    reporter.time :s3_body_get_time do
      data = key.data
    end
    fail "no data for key; key not found?!" unless data
    reporter.report :input_object_size, data.length

    if settings.noop
      reporter.report :noop_key, key
      next
    end

    reporter.time :httpimagestore_time do
      if settings.as_is
        response = HTTPClient.put(settings.httpimagestore + settings.endpoint_as_is + "/#{hash}.#{extension}", data)
      else
        response = HTTPClient.put(settings.httpimagestore + settings.endpoint + "/#{hash}.jpg", data)
      end
      fail "bad HTTP Image Store response: #{response.status}: #{response.body}" if response.status != 200
    end
    reporter.report :handled_key, key
  end
end
|
89
|
+
```
|
90
|
+
|
91
|
+
## Contributing to s3-object-processor
|
92
|
+
|
93
|
+
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
|
94
|
+
* Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it.
|
95
|
+
* Fork the project.
|
96
|
+
* Start a feature/bugfix branch.
|
97
|
+
* Commit and push until you are happy with your contribution.
|
98
|
+
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
99
|
+
* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
|
100
|
+
|
101
|
+
## Copyright
|
102
|
+
|
103
|
+
Copyright (c) 2013 Jakub Pastuszek. See LICENSE.txt for
|
104
|
+
further details.
|
105
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'rubygems'
require 'bundler'

# Activate the gems listed in the Gemfile; when that fails, print the reason
# and exit with Bundler's own status code.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "s3-object-processor"
  gem.homepage = "http://github.com/jpastuszek/s3-object-processor"
  gem.license = "MIT"
  gem.summary = "S3 key-by-key processor builder"
  gem.description = "DSL tools for building programs that can process S3 object key-by-key using threaded worker pool"
  gem.email = "jpastuszek@gmail.com"
  gem.authors = ["Jakub Pastuszek"]
  # dependencies defined in Gemfile

  # Keep IDE project files out of the gem. The dot is escaped so that only
  # paths really starting with ".idea" are excluded (an unescaped dot would
  # match any first character).
  gem.files.select{|f| f =~ /^\.idea/}.each do |file|
    gem.files.exclude file
  end
end
Jeweler::RubygemsDotOrgTasks.new

require 'rspec/core'
require 'rspec/core/rake_task'
RSpec::Core::RakeTask.new(:spec) do |spec|
  spec.pattern = FileList['spec/**/*_spec.rb']
end

RSpec::Core::RakeTask.new(:rcov) do |spec|
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

task :default => :spec

require 'rdoc/task'
Rake::RDocTask.new do |rdoc|
  version = File.exist?('VERSION') ? File.read('VERSION') : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "s3-object-processor #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
@@ -0,0 +1,360 @@
|
|
1
|
+
require 'thread'
|
2
|
+
Thread.abort_on_exception = true
|
3
|
+
|
4
|
+
# Base class for the pipeline stages: runs a block of work on its own thread
# and invokes the registered callbacks once that work has ended.
class Runnable
  # Registers a callback to be called on the worker thread after the work
  # block ends (normally, with an error, or on Interrupt).
  # Returns self so that calls can be chained.
  def on_finish(&callback)
    (@on_finish ||= []) << callback
    self
  end

  # Starts a new thread that yields to the given block.
  # Interrupt raised by the block is swallowed; the on_finish callbacks run
  # in every case. Returns self.
  def run
    @thread = Thread.new do
      begin
        yield
      rescue Interrupt
        # ignored
      ensure
        run_finish_callbacks
      end
    end
    self
  end

  # Waits for the thread started by #run; returns nil when #run was never called.
  def join
    @thread.join if @thread
  end

  private

  # Calls every registered on_finish callback. Each call is rescued on its own
  # so that one failing callback cannot prevent the remaining ones from running.
  def run_finish_callbacks
    (@on_finish || []).each do |callback|
      begin
        callback.call
      rescue
        # ignored - finish callbacks are best effort
      end
    end
  end
end
|
31
|
+
|
32
|
+
# Runnable that pages through the keys of a bucket and pushes each key onto
# the shared key queue.
class Lister < Runnable
  # bucket     - object answering #keys with a list of key objects
  # key_queue  - queue the listed keys are pushed to
  # fetch_size - value sent as 'max-keys' with every listing request
  # max_keys   - optional cap on the total number of keys queued
  def initialize(bucket, key_queue, fetch_size, max_keys = nil)
    @bucket = bucket
    @key_queue = key_queue
    @fetch_size = fetch_size
    @max_keys = max_keys
  end

  # Sets the callback invoked with every fetched chunk of keys. Returns self.
  def on_keys_chunk(&callback)
    @on_keys_chunk = callback
    self
  end

  # Starts listing (optionally limited to keys with the given prefix) on a
  # background thread.
  def run(prefix = nil)
    super() { enqueue_listing(prefix) }
  end

  private

  # Fetches chunk after chunk until an empty chunk comes back or the
  # max_keys limit is reached; the name of the last key of a chunk is the
  # marker for the next request.
  def enqueue_listing(prefix)
    queued = 0
    last_seen = ''
    loop do
      page = @bucket.keys(prefix: prefix, 'max-keys' => @fetch_size, marker: last_seen)
      return if page.empty?
      @on_keys_chunk.call(page) if @on_keys_chunk
      page.each do |key|
        return if @max_keys && queued >= @max_keys
        @key_queue << key
        queued += 1
      end
      last_seen = page.last.name
    end
  end
end
|
65
|
+
|
66
|
+
# Runnable that feeds the key queue from an explicit list of key names
# instead of listing the bucket.
class ListLister < Runnable
  # bucket    - object answering #key(name)
  # key_queue - queue the key objects are pushed to
  # max_keys  - optional cap on the total number of keys queued
  def initialize(bucket, key_queue, max_keys = nil)
    @bucket = bucket
    @key_queue = key_queue
    @max_keys = max_keys
  end

  # Sets the callback invoked once with the whole list. Returns self.
  def on_keys_chunk(&callback)
    @on_keys_chunk = callback
    self
  end

  # Queues @bucket.key(name) for every name in list on a background thread.
  def run(list)
    super() do
      total_keys = 0
      @on_keys_chunk.call(list) if @on_keys_chunk
      list.each do |key|
        # Check the limit before pushing (as Lister does); checking after the
        # push let one key more than max_keys through.
        break if @max_keys and total_keys >= @max_keys
        @key_queue << @bucket.key(key)
        total_keys += 1
      end
    end
  end
end
|
90
|
+
|
91
|
+
# Runnable that takes keys off the queue and hands each one to the
# processing block until the :end marker is popped.
class Worker < Runnable
  # no          - worker number
  # key_queue   - queue to pop keys from
  # process_key - block called with every popped key
  def initialize(no, key_queue, &process_key)
    @no = no
    @key_queue = key_queue
    @process_key = process_key
  end

  # Sets the callback invoked with (key, error) when processing a key raises.
  # Returns self.
  def on_error(&callback)
    @on_error = callback
    self
  end

  # Starts the processing loop on a background thread.
  def run
    super do
      while (key = @key_queue.pop) != :end
        begin
          @process_key.call(key)
        rescue => failure
          # a failing key must not stop the worker
          @on_error.call(key, failure) if @on_error
        end
      end
    end
  end
end
|
115
|
+
|
116
|
+
# Runnable that receives [key, value] report events through a sized queue and
# passes them, on its own thread, to the sink registered with #each.
class Reporter < Runnable
  # A single accumulated custom report: a running value plus the rules for
  # updating and printing it.
  class Report
    # Mini DSL evaluated from the block given to Report.new.
    class DSL
      attr_reader :update_callback, :description, :value_pattern, :value_processor

      def initialize(&setup)
        # defaults: add the new value to the total, print the total unchanged
        @update_callback = lambda { |total, increment| total + increment }
        @value_processor = lambda { |raw| raw }
        instance_eval(&setup)
      end

      # Replaces the rule combining the current total with a reported value.
      def update(&callback)
        @update_callback = callback
      end

      # Sets the label and format pattern; the optional block transforms the
      # total before it is formatted.
      def report(description, value_pattern, &value_processor)
        @description = description
        @value_pattern = value_pattern
        @value_processor = value_processor if value_processor
      end
    end

    def initialize(init_value, &setup)
      @dsl = DSL.new(&setup)
      @value = init_value
    end

    # Folds a reported value into the running total.
    def update(value)
      @value = @dsl.update_callback.call(@value, value)
    end

    # The running total, processed and formatted with the configured pattern.
    def value
      pattern = @dsl.value_pattern
      pattern % [@dsl.value_processor.call(@value)]
    end

    def description
      @dsl.description
    end

    # Short form used in the periodic progress line.
    def to_s
      label = description
      "#{label}: #{value.rjust(6)}"
    end

    # Aligned form used in the final summary.
    def final
      label = "#{description}: "
      label.ljust(40) + value
    end
  end

  # queue_size - capacity of the report queue
  # callback   - block called with this reporter at the start of its thread
  def initialize(queue_size, &callback)
    @report_queue = SizedQueue.new(queue_size)
    @processor = callback

    # flush thread waiting on queue
    on_finish { @report_queue.max = 999999 }
  end

  # Starts the reporting thread: runs the setup block, then dispatches queued
  # events to the sink until :end is popped.
  def run
    super do
      @processor.call(self) if @processor
      while (event = @report_queue.pop) != :end
        @sink.call(*event) if @sink
      end
    end
  end

  # Registers the sink called with (key, value) for every report event.
  def each(&callback)
    @sink = callback
  end

  # Queues a report event.
  def report(key, value)
    @report_queue.push([key, value])
  end

  # Runs the block and reports the seconds it took under key.
  def time(key)
    started_at = Time.now.to_f
    yield
    elapsed = Time.now.to_f - started_at
    report key, elapsed
  end

  # Queues the :end marker and waits for the reporting thread.
  def join
    @report_queue.push(:end)
    super
  end
end
|
202
|
+
|
203
|
+
# Wires a key lister, a pool of workers and a reporter together to process
# the keys of one S3 bucket in parallel.
class BucketProcessor
  # key_id, key_secret - credentials passed to RightAws::S3.new
  # bucket             - name of the bucket to process
  # options            - Hash with optional settings:
  #   :no_https                      - use http on port 80 instead of https on 443
  #   :log                           - logger (default: Logger on STDERR)
  #   :workers                       - number of worker threads (default: 10)
  #   :lister_fetch_size             - keys per listing request (default: 200)
  #   :lister_backlog                - key queue capacity (default: 1000)
  #   :max_keys                      - cap on the number of keys queued
  #   :reporter_backlog              - report queue capacity (default: 1000)
  #   :reporter_summary_interval     - log progress every N processed keys (default: 100)
  #   :reporter_average_contribution - weight of the newest rate sample in the
  #                                    logged op/s average (default: 0.10)
  #   :reports                       - Hash of report name => Reporter::Report
  #   :key_list                      - list of key names to process instead of
  #                                    listing the bucket
  #
  # The given block is called as (bucket, key, reporter) for every key.
  def initialize(key_id, key_secret, bucket, options = {}, &callback)
    protocol = options[:no_https] ? 'http' : 'https'
    port = options[:no_https] ? 80 : 443
    @log = options[:log] || Logger.new(STDERR)
    workers = options[:workers] || 10
    lister_fetch_size = options[:lister_fetch_size] || 200
    lister_backlog = options[:lister_backlog] || 1000
    max_keys = options[:max_keys]
    reporter_backlog = options[:reporter_backlog] || 1000
    reporter_summary_interval = options[:reporter_summary_interval] || 100
    reporter_average_contribution = options[:reporter_average_contribution] || 0.10
    # Used as a Hash below (each_value, lookup by report name), so the
    # default has to be a Hash; the former Array default broke both.
    custom_reports = options[:reports] || {}
    @key_list = options[:key_list]

    s3 = RightAws::S3.new(key_id, key_secret, multi_thread: true, logger: @log, protocol: protocol, port: port)
    bucket = s3.bucket(bucket)

    @key_queue = SizedQueue.new(lister_backlog)

    @reporter = Reporter.new(reporter_backlog) do |reports|
      total_listed_keys = 0
      total_processed_keys = 0
      total_succeeded_keys = 0
      total_failed_keys = 0
      total_handled_keys = 0
      total_skipped_keys = 0
      total_nooped_keys = 0

      processed_avg = 0.0
      last_time = nil
      last_total = 0

      reports.each do |key, value|
        case key
        when :new_keys_count
          total_listed_keys += value
        when :processed_key
          total_processed_keys += 1
          if total_processed_keys % reporter_summary_interval == 0
            if last_time
              # exponentially weighted average of the processing rate
              contribution = reporter_average_contribution
              current_rate = (total_processed_keys - last_total).to_f / (Time.now.to_f - last_time)
              processed_avg = processed_avg * (1.0 - contribution) + current_rate * contribution
            end
            last_time = Time.now.to_f
            last_total = total_processed_keys

            log_line = "-[%s]- processed %6d: failed: %6d (%6.2f %%) handled: %6d skipped: %6d (%6.2f %%)" % [
              value.to_s[0...2].ljust(2),
              total_processed_keys,
              total_failed_keys,
              total_failed_keys.to_f / total_processed_keys * 100,
              total_handled_keys,
              total_skipped_keys,
              total_skipped_keys.to_f / total_processed_keys * 100
            ]
            log_line << custom_reports.each_value.map{|v| ' ' + v.to_s}.join
            log_line << " [backlog: %4d] @ %.1f op/s" % [
              @key_queue.size,
              processed_avg
            ]

            @log.info log_line
          end
        when :succeeded_key
          total_succeeded_keys += 1
        when :failed_key
          failed_key, error = *value
          @log.error "Key processing failed: #{failed_key}: #{error.class.name}, #{error.message}"
          total_failed_keys += 1
        when :handled_key
          total_handled_keys += 1
        when :skipped_key
          total_skipped_keys += 1
        when :noop_key
          total_nooped_keys += 1
        else
          # Anything else is a custom report event. An event without a
          # matching report is logged instead of raising NoMethodError on nil
          # inside the reporter thread.
          if (custom_report = custom_reports[key])
            custom_report.update(value)
          else
            @log.warn "ignoring report event for undefined report: #{key.inspect}"
          end
        end
      end

      reports.on_finish do
        @log.info("total listed keys: #{total_listed_keys}")
        @log.info("total processed keys: #{total_processed_keys}")
        @log.info("total succeeded keys: #{total_succeeded_keys}")
        @log.info("total failed keys: #{total_failed_keys}")
        @log.info("total handled keys: #{total_handled_keys}")
        @log.info("total skipped keys: #{total_skipped_keys}")
        @log.info("total nooped keys: #{total_nooped_keys}")
        custom_reports.each_value do |report|
          @log.info report.final
        end
      end
    end

    # create lister
    @lister =
      if @key_list
        @log.info "processing #{@key_list.length} keys from list file"
        ListLister.new(bucket, @key_queue, max_keys)
      else
        Lister.new(bucket, @key_queue, lister_fetch_size, max_keys)
      end

    @lister.on_keys_chunk do |keys_chunk|
      @log.debug "Got #{keys_chunk.length} new keys"
      @reporter.report(:new_keys_count, keys_chunk.length)
    end

    @lister.on_finish do
      @log.debug "Done listing keys"
      # notify all workers that no more messages will be posted
      workers.times{ @key_queue << :end }
    end

    # create workers
    @log.info "Launching #{workers} workers"
    @workers = (1..workers).to_a.map do |worker_no|
      worker = Worker.new(worker_no, @key_queue) do |key|
        @log.debug "Worker[#{worker_no}]: Processing key #{key}"
        yield bucket, key, @reporter
        @reporter.report :processed_key, key
        @reporter.report :succeeded_key, key
      end

      worker.on_error do |key, error|
        @reporter.report :processed_key, key
        @reporter.report :failed_key, [key, error]
      end

      worker.on_finish do
        @log.debug "Worker #{worker_no} done"
      end

      worker
    end
  end

  # Starts the reporter, the lister and all workers, then waits for them to
  # finish. prefix is handed to the bucket lister; it is not used when a key
  # list was given.
  def run(prefix = nil)
    @reporter.run
    if @key_list
      @lister.run(@key_list)
    else
      @lister.run(prefix)
    end
    @workers.each(&:run)

    # wait for all to finish
    @workers.each(&:join)
    @log.info "All workers done"

    @lister.join
    @reporter.join
  rescue Interrupt
    @log.warn 'Interrupted'
    # flush thread waiting on queues
    @key_queue.max = 999999
    @reporter.join
  end
end
|
@@ -0,0 +1,137 @@
|
|
1
|
+
require 's3-object-processor'
|
2
|
+
require 'cli'
|
3
|
+
require 'logger'
|
4
|
+
require 'right_aws'
|
5
|
+
require 'time'
|
6
|
+
|
7
|
+
module S3ObjectProcessor
  # DSL entry point: evaluates the configuration block, parses the command
  # line and runs a BucketProcessor that calls the configured processor block
  # for every key.
  class CLI
    # config - block evaluated on this instance; it may call #cli,
    #          #cli_process, #report and #processor.
    def initialize(&config)
      @reports = {}
      instance_eval(&config)

      # locals, because instance variables are not visible inside the
      # ::CLI.new block below
      cli_setup = @cli_setup
      cli_process_setup = @cli_process_setup

      settings = ::CLI.new do
        description 'Set header of S3 object'

        option :key_id,
          short: :i,
          description: 'AWS access key ID',
          # label now names the variable that is actually read below
          default_label: 'AWS_ACCESS_KEY_ID environment variable',
          default: ENV['AWS_ACCESS_KEY_ID'],
          required: true
        option :key_secret,
          short: :s,
          description: 'AWS access key secret',
          default_label: 'AWS_SECRET_ACCESS_KEY environment variable',
          default: ENV['AWS_SECRET_ACCESS_KEY'],
          required: true
        switch :no_https,
          description: 'use plain HTTP S3 connections'

        option :bucket,
          short: :b,
          description: 'bucket to process',
          required: true
        option :prefix,
          short: :p,
          description: 'process only objects of key starting with given prefix'

        option :lister_fetch_size,
          description: 'fetch no more than that number of keys per request',
          cast: Integer,
          default: 200
        option :lister_backlog,
          description: 'maximum length of to be processed key queue',
          cast: Integer,
          default: 1000
        option :key_list,
          description: 'file with keys to process (one per line)',
          default_label: 'process all keys in S3 bucket',
          cast: Pathname

        option :reporter_backlog,
          description: 'maximum length of to be processed report queue',
          cast: Integer,
          default: 1000
        option :reporter_summary_interval,
          description: 'print summary every some number of processed objects',
          cast: Integer,
          default: 100
        option :reporter_average_contribution,
          description: 'how much does last average calculation contribute in the printed value - less => more stable',
          cast: Float,
          default: 0.10

        option :workers,
          short: :t,
          description: 'number of processing threads to start',
          cast: Integer,
          default: 10

        switch :noop,
          short: :n,
          description: 'do not change any object; just say what would be done'

        switch :debug,
          short: :d,
          description: 'log at DEBUG level'

        option :max_keys,
          description: 'stop after processing this amount of keys',
          cast: Integer

        instance_eval(&cli_setup) if cli_setup
      end.parse! do |parsed|
        # instance_exec hands the parsed settings to the cli_process block;
        # instance_eval would pass the receiver as the block argument instead.
        instance_exec(parsed, &cli_process_setup) if cli_process_setup
      end

      log = Logger.new(STDERR)
      log.level = settings.debug ? Logger::DEBUG : Logger::INFO

      log.debug(settings.inspect)

      # dump the backtrace of every thread on SIGQUIT
      trap 'QUIT' do
        Thread.list.each do |thread|
          STDERR.puts "Thread-#{thread.object_id.to_s(36)}"
          STDERR.puts thread.backtrace.join("\n    \\_ ")
        end
      end

      bucket_processor = BucketProcessor.new(settings.key_id, settings.key_secret, settings.bucket,
        no_https: settings.no_https,
        log: log,
        workers: settings.workers,
        max_keys: settings.max_keys,
        lister_fetch_size: settings.lister_fetch_size,
        lister_backlog: settings.lister_backlog,
        key_list: settings.key_list && settings.key_list.readlines.map(&:strip),
        reporter_backlog: settings.reporter_backlog,
        reporter_summary_interval: settings.reporter_summary_interval,
        reporter_average_contribution: settings.reporter_average_contribution,
        reports: @reports
      ) do |bucket, key, reporter|
        @processor.call(bucket, key, settings, log, reporter)
      end

      bucket_processor.run(settings.prefix)
    end

    # Stores a block evaluated inside the ::CLI definition to add options.
    def cli(&setup)
      @cli_setup = setup
    end

    # Stores a block run while the command line is parsed; it receives the
    # parsed settings.
    def cli_process(&setup)
      @cli_process_setup = setup
    end

    # Defines a custom report updated by events reported under name.
    def report(name, init_value, &setup)
      @reports[name] = Reporter::Report.new(init_value, &setup)
    end

    # Stores the block called as (bucket, key, settings, log, reporter) for
    # every key.
    def processor(&callback)
      @processor = callback
    end
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
  s.name = "s3-object-processor"
  s.version = "0.1.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Jakub Pastuszek"]
  s.date = "2013-10-29"
  s.description = "DSL tools for building programs that can process S3 object key-by-key using threaded worker pool"
  s.email = "jpastuszek@gmail.com"
  s.extra_rdoc_files = [
    "LICENSE.txt",
    "README.md"
  ]
  s.files = [
    ".document",
    ".rspec",
    "Gemfile",
    "Gemfile.lock",
    "LICENSE.txt",
    "README.md",
    "Rakefile",
    "VERSION",
    "lib/s3-object-processor.rb",
    "lib/s3-object-processor/cli.rb",
    "s3-object-processor.gemspec",
    "spec/s3-object-processor_spec.rb",
    "spec/spec_helper.rb"
  ]
  s.homepage = "http://github.com/jpastuszek/s3-object-processor"
  s.licenses = ["MIT"]
  s.require_paths = ["lib"]
  s.rubygems_version = "1.8.25"
  s.summary = "S3 key-by-kye processor builder"

  # Dependency tables: [gem name, version requirement]
  runtime_gems = [
    ["cli", "~> 1.3"],
    ["right_aws", "~> 3.0"]
  ]
  development_gems = [
    ["rspec", "~> 2.8.0"],
    ["rdoc", "~> 3.12"],
    ["bundler", "~> 1.0"],
    ["jeweler", "~> 1.8.7"]
  ]

  versioned_spec = s.respond_to?(:specification_version)
  s.specification_version = 3 if versioned_spec

  if versioned_spec && Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0')
    # RubyGems that tells runtime and development dependencies apart
    runtime_gems.each { |gem_name, requirement| s.add_runtime_dependency(gem_name, [requirement]) }
    development_gems.each { |gem_name, requirement| s.add_development_dependency(gem_name, [requirement]) }
  else
    # older RubyGems: everything is a plain dependency
    (runtime_gems + development_gems).each { |gem_name, requirement| s.add_dependency(gem_name, [requirement]) }
  end
end
|
67
|
+
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# Make lib/ and spec/ loadable without installing the gem.
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
$LOAD_PATH.unshift(File.dirname(__FILE__))
require 'rspec'
# The library file is lib/s3-object-processor.rb (dashes); the underscore
# spelling does not exist and raised LoadError.
require 's3-object-processor'

# Requires supporting files with custom matchers and macros, etc,
# in ./support/ and its subdirectories.
Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}

RSpec.configure do |config|

end
|
metadata
ADDED
@@ -0,0 +1,160 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: s3-object-processor
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Jakub Pastuszek
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-10-29 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: cli
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '1.3'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '1.3'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: right_aws
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ~>
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '3.0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ~>
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '3.0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: rspec
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ~>
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 2.8.0
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 2.8.0
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: rdoc
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ~>
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '3.12'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ~>
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '3.12'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: bundler
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ~>
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '1.0'
|
86
|
+
type: :development
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ~>
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '1.0'
|
94
|
+
- !ruby/object:Gem::Dependency
|
95
|
+
name: jeweler
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ~>
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: 1.8.7
|
102
|
+
type: :development
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ~>
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: 1.8.7
|
110
|
+
description: DSL tools for building programs that can process S3 object key-by-key
|
111
|
+
using threaded worker pool
|
112
|
+
email: jpastuszek@gmail.com
|
113
|
+
executables: []
|
114
|
+
extensions: []
|
115
|
+
extra_rdoc_files:
|
116
|
+
- LICENSE.txt
|
117
|
+
- README.md
|
118
|
+
files:
|
119
|
+
- .document
|
120
|
+
- .rspec
|
121
|
+
- Gemfile
|
122
|
+
- Gemfile.lock
|
123
|
+
- LICENSE.txt
|
124
|
+
- README.md
|
125
|
+
- Rakefile
|
126
|
+
- VERSION
|
127
|
+
- lib/s3-object-processor.rb
|
128
|
+
- lib/s3-object-processor/cli.rb
|
129
|
+
- s3-object-processor.gemspec
|
130
|
+
- spec/s3-object-processor_spec.rb
|
131
|
+
- spec/spec_helper.rb
|
132
|
+
homepage: http://github.com/jpastuszek/s3-object-processor
|
133
|
+
licenses:
|
134
|
+
- MIT
|
135
|
+
post_install_message:
|
136
|
+
rdoc_options: []
|
137
|
+
require_paths:
|
138
|
+
- lib
|
139
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
140
|
+
none: false
|
141
|
+
requirements:
|
142
|
+
- - ! '>='
|
143
|
+
- !ruby/object:Gem::Version
|
144
|
+
version: '0'
|
145
|
+
segments:
|
146
|
+
- 0
|
147
|
+
hash: 3595802382691813695
|
148
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
149
|
+
none: false
|
150
|
+
requirements:
|
151
|
+
- - ! '>='
|
152
|
+
- !ruby/object:Gem::Version
|
153
|
+
version: '0'
|
154
|
+
requirements: []
|
155
|
+
rubyforge_project:
|
156
|
+
rubygems_version: 1.8.25
|
157
|
+
signing_key:
|
158
|
+
specification_version: 3
|
159
|
+
summary: S3 key-by-kye processor builder
|
160
|
+
test_files: []
|