skyrunner 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +1 -0
- data/bin/skyrunner +2 -3
- data/lib/generators/sky_runner/install/templates/skyrunner.rb +2 -6
- data/lib/skyrunner/job.rb +83 -9
- data/lib/skyrunner/version.rb +1 -1
- data/lib/skyrunner.rb +103 -60
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3b4147a678d048c85f857479136563287a39576e
|
4
|
+
data.tar.gz: b1ccac1353150bc7948332135aefd8c90bbefb5b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c4c80e760302f36ccdafc30b7c76b9f86838728ee479f578fd71f8e401285845fcb07a7c4003787f2d0d843959f6a9edf7dcddf7d6decec56aaff462aeee2ed8
|
7
|
+
data.tar.gz: a56cfeae042ce3387ffaa1cb757f06d26639f65b2b9ed4789aefe04bdc02442b65367943ca2625161e4826bac8a5bfac010e38c062bf981aa2bdbb13aa68562d
|
data/Gemfile
CHANGED
data/bin/skyrunner
CHANGED
@@ -38,8 +38,7 @@ end
|
|
38
38
|
|
39
39
|
SkyRunner.dynamo_db_table_name = opts[:dynamo_db_table_name]
|
40
40
|
SkyRunner.sqs_queue_name = opts[:sqs_queue_name]
|
41
|
-
SkyRunner.
|
42
|
-
SkyRunner.num_threads = opts[:num_threads].to_i
|
41
|
+
SkyRunner.consumer_threads = opts[:num_threads].to_i
|
43
42
|
|
44
43
|
COMMANDS = ["init", "purge", "consume", "test"]
|
45
44
|
|
@@ -57,6 +56,6 @@ when "consume"
|
|
57
56
|
when "test"
|
58
57
|
$: << "."
|
59
58
|
require "#{File.dirname(__FILE__)}/../jobs/example_job"
|
60
|
-
ExampleJobModule::ExampleJob.new.execute!(number_of_tasks:
|
59
|
+
ExampleJobModule::ExampleJob.new.execute!(number_of_tasks: 1000)
|
61
60
|
SkyRunner.consume!
|
62
61
|
end
|
@@ -5,17 +5,13 @@ SkyRunner.setup do |config|
|
|
5
5
|
config.dynamo_db_table_name = "skyrunner_jobs_#{Rails.env}"
|
6
6
|
config.sqs_queue_name = "skyrunner_tasks_#{Rails.env}"
|
7
7
|
|
8
|
-
# Set the number of tasks for a consumer to pull and run from SQS at a time. (Max 10, default 10)
|
9
|
-
#
|
10
|
-
# config.consumer_batch_size = 10
|
11
|
-
|
12
8
|
# Set the visibility timeout of queue items. If the consumer batch size (above) is set to 10,
|
13
9
|
# this should provide sufficient time for a consumer to process 10 tasks, for example. (default 90)
|
14
10
|
#
|
15
11
|
# config.visibility_timeout = 90
|
16
12
|
|
17
|
-
# Set the number of concurrent threads
|
13
|
+
# Set the number of concurrent consumer threads when running the consumer.
|
18
14
|
# (If greater than one, you obviously need to make sure your tasks are thread-safe.)
|
19
15
|
#
|
20
|
-
# config.
|
16
|
+
# config.consumer_threads = 10
|
21
17
|
end
|
data/lib/skyrunner/job.rb
CHANGED
@@ -34,13 +34,17 @@ module SkyRunner::Job
|
|
34
34
|
table = SkyRunner.dynamo_db_table
|
35
35
|
queue = SkyRunner.sqs_queue
|
36
36
|
|
37
|
-
record =
|
37
|
+
record = nil
|
38
|
+
|
39
|
+
SkyRunner::retry_dynamo_db do
|
40
|
+
record = table.items.put(id: job_id, task_id: job_id, class: self.class.name, args: args.to_json, total_tasks: 1, completed_tasks: 0, done: 0, failed: 0)
|
41
|
+
end
|
38
42
|
|
39
43
|
pending_args = []
|
40
44
|
|
41
45
|
flush = lambda do
|
42
46
|
messages = pending_args.map do |task_args|
|
43
|
-
{ job_id: job_id, task_id: SecureRandom.hex, task_args: task_args }.to_json
|
47
|
+
{ job_id: job_id, task_id: SecureRandom.hex, task_args: task_args, job_class: self.class.name }.to_json
|
44
48
|
end
|
45
49
|
|
46
50
|
dropped_message_count = 0
|
@@ -57,7 +61,9 @@ module SkyRunner::Job
|
|
57
61
|
end
|
58
62
|
end
|
59
63
|
|
60
|
-
|
64
|
+
SkyRunner::retry_dynamo_db do
|
65
|
+
record.attributes.add({ total_tasks: messages.size - dropped_message_count })
|
66
|
+
end
|
61
67
|
end
|
62
68
|
|
63
69
|
self.run(args) do |*task_args|
|
@@ -94,7 +100,7 @@ module SkyRunner::Job
|
|
94
100
|
private
|
95
101
|
|
96
102
|
def dynamo_db_record
|
97
|
-
SkyRunner.dynamo_db_table.items[self.skyrunner_job_id]
|
103
|
+
SkyRunner.dynamo_db_table.items[self.skyrunner_job_id, self.skyrunner_job_id]
|
98
104
|
end
|
99
105
|
|
100
106
|
def handle_task_failed!
|
@@ -102,7 +108,10 @@ module SkyRunner::Job
|
|
102
108
|
|
103
109
|
begin
|
104
110
|
record = dynamo_db_record
|
105
|
-
|
111
|
+
|
112
|
+
SkyRunner::retry_dynamo_db do
|
113
|
+
record.attributes.add({ failed: 1 })
|
114
|
+
end
|
106
115
|
|
107
116
|
(self.class.job_event_methods[:failed] || []).each do |method|
|
108
117
|
if self.method(method).arity == 0 && self.method(method).parameters.size == 0
|
@@ -111,6 +120,8 @@ module SkyRunner::Job
|
|
111
120
|
self.send(method, JSON.parse(record.attributes["args"]).symbolize_keys)
|
112
121
|
end
|
113
122
|
end
|
123
|
+
|
124
|
+
delete_task_records! rescue nil
|
114
125
|
rescue Exception => e
|
115
126
|
end
|
116
127
|
end
|
@@ -119,15 +130,20 @@ module SkyRunner::Job
|
|
119
130
|
return false unless self.skyrunner_job_id
|
120
131
|
|
121
132
|
record = dynamo_db_record
|
133
|
+
new_attributes = nil
|
122
134
|
|
123
|
-
|
135
|
+
SkyRunner::retry_dynamo_db do
|
136
|
+
new_attributes = record.attributes.add({ completed_tasks: 1 }, return: :all_new)
|
137
|
+
end
|
124
138
|
|
125
139
|
if new_attributes["total_tasks"] == new_attributes["completed_tasks"]
|
126
140
|
begin
|
127
141
|
if_condition = { completed_tasks: new_attributes["total_tasks"], done: 0 }
|
128
142
|
|
129
|
-
|
130
|
-
|
143
|
+
SkyRunner::retry_dynamo_db do
|
144
|
+
record.attributes.update(if: if_condition) do |u|
|
145
|
+
u.add(done: 1)
|
146
|
+
end
|
131
147
|
end
|
132
148
|
|
133
149
|
(self.class.job_event_methods[:completed] || []).each do |method|
|
@@ -137,11 +153,69 @@ module SkyRunner::Job
|
|
137
153
|
self.send(method, JSON.parse(record.attributes["args"]).symbolize_keys)
|
138
154
|
end
|
139
155
|
end
|
156
|
+
|
157
|
+
delete_task_records! rescue nil
|
140
158
|
rescue AWS::DynamoDB::Errors::ConditionalCheckFailedException => e
|
141
|
-
# This is OK, we had a double finisher.
|
159
|
+
# This is OK, we had a double finisher so lets block them.
|
142
160
|
end
|
143
161
|
end
|
144
162
|
|
145
163
|
true
|
146
164
|
end
|
165
|
+
|
166
|
+
def delete_task_records!
|
167
|
+
delete_batch_queue = Queue.new
|
168
|
+
mutex = Mutex.new
|
169
|
+
delete_items_queued = false
|
170
|
+
threads = []
|
171
|
+
|
172
|
+
1.upto([1, (SkyRunner.consumer_threads / 4.0).floor].max) do
|
173
|
+
threads << Thread.new do
|
174
|
+
|
175
|
+
db_table = SkyRunner.dynamo_db_table
|
176
|
+
|
177
|
+
loop do
|
178
|
+
should_break = false
|
179
|
+
|
180
|
+
mutex.synchronize do
|
181
|
+
should_break = (SkyRunner::stop_consuming? || delete_items_queued) && delete_batch_queue.empty?
|
182
|
+
end
|
183
|
+
|
184
|
+
break if should_break
|
185
|
+
|
186
|
+
if delete_batch_queue.size > 0
|
187
|
+
batch = delete_batch_queue.pop
|
188
|
+
|
189
|
+
if batch
|
190
|
+
SkyRunner::retry_dynamo_db do
|
191
|
+
db_table.batch_delete(batch)
|
192
|
+
end
|
193
|
+
end
|
194
|
+
else
|
195
|
+
sleep 1
|
196
|
+
end
|
197
|
+
end
|
198
|
+
end
|
199
|
+
end
|
200
|
+
|
201
|
+
items_to_delete = []
|
202
|
+
table = SkyRunner.dynamo_db_table
|
203
|
+
|
204
|
+
table.items.query(hash_value: "#{self.skyrunner_job_id}-tasks", select: [:id, :task_id]) do |task_item|
|
205
|
+
items_to_delete << [task_item.attributes["id"], task_item.attributes["task_id"]]
|
206
|
+
|
207
|
+
if items_to_delete.size >= 25
|
208
|
+
delete_batch_queue << items_to_delete
|
209
|
+
items_to_delete = []
|
210
|
+
end
|
211
|
+
end
|
212
|
+
|
213
|
+
delete_batch_queue << items_to_delete unless items_to_delete.empty?
|
214
|
+
|
215
|
+
mutex.synchronize do
|
216
|
+
delete_items_queued = true
|
217
|
+
end
|
218
|
+
|
219
|
+
threads.each(&:join)
|
220
|
+
end
|
147
221
|
end
|
data/lib/skyrunner/version.rb
CHANGED
data/lib/skyrunner.rb
CHANGED
@@ -5,6 +5,7 @@ require "active_support/core_ext"
|
|
5
5
|
require "log4r"
|
6
6
|
require "json"
|
7
7
|
require "set"
|
8
|
+
require "retries"
|
8
9
|
|
9
10
|
module SkyRunner
|
10
11
|
require "skyrunner/engine" if defined?(Rails)
|
@@ -33,7 +34,8 @@ module SkyRunner
|
|
33
34
|
table = dynamo_db.tables.create(table_name,
|
34
35
|
SkyRunner.dynamo_db_read_capacity,
|
35
36
|
SkyRunner.dynamo_db_write_capacity,
|
36
|
-
hash_key: {
|
37
|
+
hash_key: { id: :string },
|
38
|
+
range_key: { task_id: :string })
|
37
39
|
|
38
40
|
sleep 1 while table.status == :creating
|
39
41
|
end
|
@@ -61,92 +63,124 @@ module SkyRunner
|
|
61
63
|
end
|
62
64
|
|
63
65
|
def self.consume!(&block)
|
64
|
-
|
65
|
-
table
|
66
|
-
raise "Queue #{SkyRunner::sqs_queue_name} not found. Try running 'skyrunner init'" unless queue
|
67
|
-
raise "DynamoDB table #{SkyRunner::dynamo_db_table_name} not found. Try running 'skyrunner init'" unless table && table.exists?
|
66
|
+
raise "Queue #{SkyRunner::sqs_queue_name} not found. Try running 'skyrunner init'" unless sqs_queue
|
67
|
+
raise "DynamoDB table #{SkyRunner::dynamo_db_table_name} not found. Try running 'skyrunner init'" unless dynamo_db_table && dynamo_db_table.exists?
|
68
68
|
|
69
69
|
local_queue = Queue.new
|
70
|
-
error_queue = Queue.new
|
71
70
|
|
72
71
|
threads = []
|
73
72
|
|
74
|
-
1.upto(SkyRunner::num_threads) do
|
73
|
+
1.upto(SkyRunner::consumer_threads) do
|
75
74
|
threads << Thread.new do
|
75
|
+
table = SkyRunner::dynamo_db_table
|
76
|
+
|
76
77
|
loop do
|
77
|
-
|
78
|
+
begin
|
79
|
+
if local_queue.empty?
|
80
|
+
break if SkyRunner::stop_consuming?
|
81
|
+
|
82
|
+
sleep 1
|
83
|
+
next
|
84
|
+
end
|
78
85
|
|
79
|
-
|
86
|
+
klass, job_id, task_id, task_args, message = local_queue.pop
|
80
87
|
|
81
|
-
|
88
|
+
if klass
|
89
|
+
begin
|
90
|
+
# Avoid running the same task twice, enter record and raise error if exists already.
|
82
91
|
|
83
|
-
|
84
|
-
|
92
|
+
SkyRunner::retry_dynamo_db do
|
93
|
+
table.items.put({ id: "#{job_id}-tasks", task_id: task_id }, unless_exists: ["id", "task_id"])
|
94
|
+
end
|
85
95
|
|
86
|
-
|
87
|
-
job.skyrunner_job_id = job_id
|
96
|
+
SkyRunner::log :info, "Run Task: #{task_args} Job: #{job_id} Message: #{message.id}"
|
88
97
|
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
98
|
+
job = klass.new
|
99
|
+
job.skyrunner_job_id = job_id
|
100
|
+
|
101
|
+
begin
|
102
|
+
job.consume!(task_args)
|
103
|
+
message.delete
|
104
|
+
rescue Exception => e
|
105
|
+
message.delete rescue nil
|
106
|
+
block.call(e) if block_given?
|
107
|
+
SkyRunner::log :error, "Task Failed: #{task_args} Job: #{job_id} #{e.message} #{e.backtrace.join("\n")}"
|
108
|
+
end
|
109
|
+
rescue AWS::DynamoDB::Errors::ConditionalCheckFailedException => e
|
110
|
+
message.delete rescue nil
|
111
|
+
end
|
96
112
|
end
|
113
|
+
rescue Exception => e
|
114
|
+
puts e.message
|
115
|
+
puts e.backtrace.join("\n")
|
116
|
+
raise e
|
97
117
|
end
|
98
118
|
end
|
99
119
|
end
|
100
120
|
end
|
101
121
|
|
102
|
-
|
122
|
+
1.upto((SkyRunner::consumer_threads.to_f / SQS_MAX_BATCH_SIZE).ceil + 1) do
|
123
|
+
threads << Thread.new do
|
124
|
+
begin
|
125
|
+
loop do
|
126
|
+
table = SkyRunner::dynamo_db_table
|
127
|
+
queue = sqs_queue
|
103
128
|
|
104
|
-
|
105
|
-
if error_queue.size > 0
|
106
|
-
SkyRunner::stop_consuming!
|
129
|
+
break if SkyRunner::stop_consuming?
|
107
130
|
|
108
|
-
|
109
|
-
error = error_queue.pop
|
110
|
-
yield error if block_given?
|
111
|
-
end
|
112
|
-
end
|
131
|
+
sleep 1 while local_queue.size >= SkyRunner.consumer_threads
|
113
132
|
|
114
|
-
|
133
|
+
received_messages = []
|
115
134
|
|
116
|
-
|
135
|
+
queue.receive_messages(limit: SQS_MAX_BATCH_SIZE, wait_time_seconds: 5) do |message|
|
136
|
+
received_messages << [message, JSON.parse(message.body)]
|
137
|
+
end
|
117
138
|
|
118
|
-
|
139
|
+
next unless received_messages.size > 0
|
119
140
|
|
120
|
-
|
141
|
+
job_ids = received_messages.map { |m| [m[1]["job_id"], m[1]["job_id"]] }.uniq
|
121
142
|
|
122
|
-
|
123
|
-
received_messages << [message, JSON.parse(message.body)]
|
124
|
-
end
|
143
|
+
job_records = {}
|
125
144
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
145
|
+
SkyRunner::retry_dynamo_db do
|
146
|
+
# Read DynamoDB records into job and task lookup tables.
|
147
|
+
table.batch_get(["id", "task_id", "failed"], job_ids.uniq, consistent_read: true) do |record|
|
148
|
+
job_records[record["id"]] = record
|
149
|
+
end
|
150
|
+
end
|
151
|
+
|
152
|
+
received_messages.each do |received_message|
|
153
|
+
message, message_data = received_message
|
154
|
+
job_id = message_data["job_id"]
|
155
|
+
task_id = message_data["task_id"]
|
156
|
+
|
157
|
+
job_record = job_records[job_id]
|
158
|
+
|
159
|
+
if job_record && job_record["failed"] == 0
|
160
|
+
begin
|
161
|
+
klass = Kernel.const_get(message_data["job_class"])
|
162
|
+
task_args = message_data["task_args"]
|
163
|
+
local_queue.push([klass, job_id, task_id, task_args, message])
|
164
|
+
rescue NameError => e
|
165
|
+
block.call(e) if block_given?
|
166
|
+
message.delete rescue nil
|
167
|
+
log :error, "Task Failed: No such class #{message_data["job_class"]} #{e.message}"
|
168
|
+
end
|
169
|
+
else
|
170
|
+
message.delete rescue nil
|
171
|
+
end
|
142
172
|
end
|
143
|
-
else
|
144
|
-
message.delete
|
145
173
|
end
|
174
|
+
rescue Exception => e
|
175
|
+
puts e.message
|
176
|
+
puts e.backtrace.join("\n")
|
177
|
+
raise e
|
146
178
|
end
|
147
179
|
end
|
148
180
|
end
|
149
181
|
|
182
|
+
log :info, "Consumer started."
|
183
|
+
|
150
184
|
threads.each(&:join)
|
151
185
|
|
152
186
|
true
|
@@ -188,14 +222,11 @@ module SkyRunner
|
|
188
222
|
mattr_accessor :sqs_message_retention_period
|
189
223
|
@@sqs_message_retention_period = 345600
|
190
224
|
|
191
|
-
mattr_accessor :consumer_batch_size
|
192
|
-
@@consumer_batch_size = 10
|
193
|
-
|
194
225
|
mattr_accessor :logger
|
195
226
|
@@logger = Log4r::Logger.new("skyrunner")
|
196
227
|
|
197
|
-
mattr_accessor :num_threads
|
198
|
-
@@
|
228
|
+
mattr_accessor :consumer_threads
|
229
|
+
@@consumer_threads = 10
|
199
230
|
|
200
231
|
mattr_accessor :stop_consuming_flag
|
201
232
|
|
@@ -217,6 +248,18 @@ module SkyRunner
|
|
217
248
|
end
|
218
249
|
end
|
219
250
|
|
251
|
+
def self.retry_dynamo_db(&block)
|
252
|
+
handler = Proc.new do |exception, num, delay|
|
253
|
+
if exception
|
254
|
+
SkyRunner.log :warn, "Having to retry DynamoDB requests. #{exception.message}"
|
255
|
+
end
|
256
|
+
end
|
257
|
+
|
258
|
+
with_retries(handler: handler, max_tries: 100, rescue: AWS::DynamoDB::Errors::ProvisionedThroughputExceededException, base_sleep_seconds: 2, max_sleep_seconds: 60) do
|
259
|
+
block.call
|
260
|
+
end
|
261
|
+
end
|
262
|
+
|
220
263
|
private
|
221
264
|
|
222
265
|
def self.dynamo_db
|