kraps 0.6.0 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +3 -0
- data/CHANGELOG.md +8 -0
- data/Gemfile.lock +2 -5
- data/README.md +17 -10
- data/docker-compose.yml +1 -1
- data/lib/kraps/job.rb +12 -7
- data/lib/kraps/redis_queue.rb +151 -0
- data/lib/kraps/runner.rb +48 -79
- data/lib/kraps/step.rb +1 -1
- data/lib/kraps/version.rb +1 -1
- data/lib/kraps/worker.rb +43 -32
- data/lib/kraps.rb +15 -5
- metadata +3 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 19635ced3e745d44313ed3bc416ef73eb555134c591725f4dab7b38208e21393
|
4
|
+
data.tar.gz: bb7f679c7e2cd053744d1c7d857629de1912a305880d0538bdab418de3861ba1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 91273ba54ea33c6d5cb1b4f335ad8039c35601953fdf1e6b9b2ac3117ceb25d81e9be569e5c8b70deda22b53a4c72f04d60ffb4b251badf5c5a64d13d399f36c
|
7
|
+
data.tar.gz: 4e563257fcba0c9f457b363da4f43d000ee239a5179e2f965071bb3df27e362cdf7bc9950a1624520cb862acfb0fde89e96b0a4987d79a93524230c4b84619cd
|
data/.rubocop.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,13 @@
|
|
1
1
|
# CHANGELOG
|
2
2
|
|
3
|
+
## v0.7.0
|
4
|
+
|
5
|
+
* Added a `jobs` option to the actions to limit the concurrency
|
6
|
+
when e.g. accessing external data stores and to avoid overloading
|
7
|
+
them
|
8
|
+
* Added a queue using redis for the jobs to avoid starving workers
|
9
|
+
* Removed `distributed_job` dependency
|
10
|
+
|
3
11
|
## v0.6.0
|
4
12
|
|
5
13
|
* Added `map_partitions`
|
data/Gemfile.lock
CHANGED
@@ -1,9 +1,8 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
kraps (0.
|
4
|
+
kraps (0.7.0)
|
5
5
|
attachie
|
6
|
-
distributed_job
|
7
6
|
map-reduce-ruby (>= 3.0.0)
|
8
7
|
redis
|
9
8
|
ruby-progressbar
|
@@ -41,8 +40,6 @@ GEM
|
|
41
40
|
concurrent-ruby (1.1.10)
|
42
41
|
connection_pool (2.3.0)
|
43
42
|
diff-lcs (1.5.0)
|
44
|
-
distributed_job (3.1.0)
|
45
|
-
redis (>= 4.1.0)
|
46
43
|
i18n (1.12.0)
|
47
44
|
concurrent-ruby (~> 1.0)
|
48
45
|
jmespath (1.6.1)
|
@@ -62,7 +59,7 @@ GEM
|
|
62
59
|
rake (13.0.6)
|
63
60
|
redis (5.0.5)
|
64
61
|
redis-client (>= 0.9.0)
|
65
|
-
redis-client (0.11.
|
62
|
+
redis-client (0.11.2)
|
66
63
|
connection_pool
|
67
64
|
regexp_parser (2.6.0)
|
68
65
|
rexml (3.2.5)
|
data/README.md
CHANGED
@@ -30,7 +30,7 @@ Kraps.configure(
|
|
30
30
|
driver: Kraps::Drivers::S3Driver.new(s3_client: Aws::S3::Client.new("..."), bucket: "some-bucket", prefix: "temp/kraps/"),
|
31
31
|
redis: Redis.new,
|
32
32
|
namespace: "my-application", # An optional namespace to be used for redis keys, default: nil
|
33
|
-
job_ttl:
|
33
|
+
job_ttl: 7.days, # Job information in redis will automatically be removed after this amount of time, default: 4 days
|
34
34
|
show_progress: true, # Whether or not to show the progress in the terminal when executing jobs, default: true
|
35
35
|
enqueuer: ->(worker, json) { worker.perform_async(json) } # Allows customizing the enqueueing of worker jobs
|
36
36
|
)
|
@@ -220,7 +220,7 @@ items are used as keys and the values are set to `nil`.
|
|
220
220
|
* `map`: Maps the key value pairs to other key value pairs
|
221
221
|
|
222
222
|
```ruby
|
223
|
-
job.map(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker) do |key, value, collector|
|
223
|
+
job.map(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker, jobs: 8) do |key, value, collector|
|
224
224
|
collector.call("changed #{key}", "changed #{value}")
|
225
225
|
end
|
226
226
|
```
|
@@ -229,13 +229,21 @@ The block gets each key-value pair passed and the `collector` block can be
|
|
229
229
|
called as often as necessary. This is also the reason why `map` cannot simply
|
230
230
|
return the new key-value pair, but the `collector` must be used instead.
|
231
231
|
|
232
|
+
The `jobs` argument can be useful when you need to access an external data
|
233
|
+
source, like a relational database, and you want to limit the number of workers
|
234
|
+
accessing the store concurrently to avoid overloading it. If you don't specify
|
235
|
+
it, it will be identical to the number of partitions. It is recommended to only
|
236
|
+
use it for steps where you need to throttle the concurrency, because it will of
|
237
|
+
course slow down the processing. The `jobs` argument only applies to the
|
238
|
+
current step. The following steps don't inherit the argument, but reset it.
|
239
|
+
|
232
240
|
* `map_partitions`: Maps the key value pairs to other key value pairs, but the
|
233
241
|
block receives all data of each partition as an enumerable and sorted by key.
|
234
242
|
Please be aware that you should not call `to_a` or similar on the enumerable.
|
235
243
|
Prefer `map` over `map_partitions` when possible.
|
236
244
|
|
237
245
|
```ruby
|
238
|
-
job.map_partitions(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker) do |pairs, collector|
|
246
|
+
job.map_partitions(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker, jobs: 8) do |pairs, collector|
|
239
247
|
pairs.each do |key, value|
|
240
248
|
collector.call("changed #{key}", "changed #{value}")
|
241
249
|
end
|
@@ -245,7 +253,7 @@ end
|
|
245
253
|
* `reduce`: Reduces the values of pairs having the same key
|
246
254
|
|
247
255
|
```ruby
|
248
|
-
job.reduce(worker: MyKrapsWorker) do |key, value1, value2|
|
256
|
+
job.reduce(worker: MyKrapsWorker, jobs: 8) do |key, value1, value2|
|
249
257
|
value1 + value2
|
250
258
|
end
|
251
259
|
```
|
@@ -265,7 +273,7 @@ most of the time, this is not necessary and the key can simply be ignored.
|
|
265
273
|
passed job result are completely omitted.
|
266
274
|
|
267
275
|
```ruby
|
268
|
-
job.combine(other_job, worker: MyKrapsWorker) do |key, value1, value2|
|
276
|
+
job.combine(other_job, worker: MyKrapsWorker, jobs: 8) do |key, value1, value2|
|
269
277
|
(value1 || {}).merge(value2 || {})
|
270
278
|
end
|
271
279
|
```
|
@@ -279,7 +287,7 @@ since Kraps detects the dependency on its own.
|
|
279
287
|
* `repartition`: Used to change the partitioning
|
280
288
|
|
281
289
|
```ruby
|
282
|
-
job.repartition(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker)
|
290
|
+
job.repartition(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker, jobs: 8)
|
283
291
|
```
|
284
292
|
|
285
293
|
Repartitions all data into the specified number of partitions and using the
|
@@ -290,7 +298,7 @@ specified partitioner.
|
|
290
298
|
`to_a` or similar on the enumerable.
|
291
299
|
|
292
300
|
```ruby
|
293
|
-
job.each_partition do |partition, pairs|
|
301
|
+
job.each_partition(jobs: 8) do |partition, pairs|
|
294
302
|
pairs.each do |key, value|
|
295
303
|
# ...
|
296
304
|
end
|
@@ -379,7 +387,8 @@ jobs only once.
|
|
379
387
|
Kraps ships with an in-memory fake driver for storage, which you can use for
|
380
388
|
testing purposes instead of the s3 driver:
|
381
389
|
|
382
|
-
```ruby
|
390
|
+
```ruby
|
391
|
+
Kraps.configure(
|
383
392
|
driver: Kraps::Drivers::FakeDriver.new(bucket: "kraps"),
|
384
393
|
# ...
|
385
394
|
) ```
|
@@ -425,8 +434,6 @@ The API of the driver is:
|
|
425
434
|
Kraps is built on top of
|
426
435
|
[map-reduce-ruby](https://github.com/mrkamel/map-reduce-ruby) for the
|
427
436
|
map/reduce framework,
|
428
|
-
[distributed_job](https://github.com/mrkamel/distributed_job)
|
429
|
-
to keep track of the job/step status,
|
430
437
|
[attachie](https://github.com/mrkamel/attachie) to interact with the storage
|
431
438
|
layer (s3),
|
432
439
|
[ruby-progressbar](https://github.com/jfelchner/ruby-progressbar) to
|
data/docker-compose.yml
CHANGED
data/lib/kraps/job.rb
CHANGED
@@ -27,7 +27,7 @@ module Kraps
|
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
30
|
-
def map(partitions: nil, partitioner: nil, worker: @worker, before: nil, &block)
|
30
|
+
def map(partitions: nil, partitioner: nil, jobs: nil, worker: @worker, before: nil, &block)
|
31
31
|
fresh.tap do |job|
|
32
32
|
job.instance_eval do
|
33
33
|
@partitions = partitions if partitions
|
@@ -35,6 +35,7 @@ module Kraps
|
|
35
35
|
|
36
36
|
@steps << Step.new(
|
37
37
|
action: Actions::MAP,
|
38
|
+
jobs: [jobs, @partitions].compact.min,
|
38
39
|
partitions: @partitions,
|
39
40
|
partitioner: @partitioner,
|
40
41
|
worker: worker,
|
@@ -45,7 +46,7 @@ module Kraps
|
|
45
46
|
end
|
46
47
|
end
|
47
48
|
|
48
|
-
def map_partitions(partitions: nil, partitioner: nil, worker: @worker, before: nil, &block)
|
49
|
+
def map_partitions(partitions: nil, partitioner: nil, jobs: nil, worker: @worker, before: nil, &block)
|
49
50
|
fresh.tap do |job|
|
50
51
|
job.instance_eval do
|
51
52
|
@partitions = partitions if partitions
|
@@ -53,6 +54,7 @@ module Kraps
|
|
53
54
|
|
54
55
|
@steps << Step.new(
|
55
56
|
action: Actions::MAP_PARTITIONS,
|
57
|
+
jobs: [jobs, @partitions].compact.min,
|
56
58
|
partitions: @partitions,
|
57
59
|
partitioner: @partitioner,
|
58
60
|
worker: worker,
|
@@ -63,11 +65,12 @@ module Kraps
|
|
63
65
|
end
|
64
66
|
end
|
65
67
|
|
66
|
-
def reduce(worker: @worker, before: nil, &block)
|
68
|
+
def reduce(jobs: nil, worker: @worker, before: nil, &block)
|
67
69
|
fresh.tap do |job|
|
68
70
|
job.instance_eval do
|
69
71
|
@steps << Step.new(
|
70
72
|
action: Actions::REDUCE,
|
73
|
+
jobs: [jobs, @partitions].compact.min,
|
71
74
|
partitions: @partitions,
|
72
75
|
partitioner: @partitioner,
|
73
76
|
worker: worker,
|
@@ -78,11 +81,12 @@ module Kraps
|
|
78
81
|
end
|
79
82
|
end
|
80
83
|
|
81
|
-
def combine(other_job, worker: @worker, before: nil, &block)
|
84
|
+
def combine(other_job, jobs: nil, worker: @worker, before: nil, &block)
|
82
85
|
fresh.tap do |job|
|
83
86
|
job.instance_eval do
|
84
87
|
@steps << Step.new(
|
85
88
|
action: Actions::COMBINE,
|
89
|
+
jobs: [jobs, @partitions].compact.min,
|
86
90
|
partitions: @partitions,
|
87
91
|
partitioner: @partitioner,
|
88
92
|
worker: worker,
|
@@ -95,11 +99,12 @@ module Kraps
|
|
95
99
|
end
|
96
100
|
end
|
97
101
|
|
98
|
-
def each_partition(worker: @worker, before: nil, &block)
|
102
|
+
def each_partition(jobs: nil, worker: @worker, before: nil, &block)
|
99
103
|
fresh.tap do |job|
|
100
104
|
job.instance_eval do
|
101
105
|
@steps << Step.new(
|
102
106
|
action: Actions::EACH_PARTITION,
|
107
|
+
jobs: [jobs, @partitions].compact.min,
|
103
108
|
partitions: @partitions,
|
104
109
|
partitioner: @partitioner,
|
105
110
|
worker: worker,
|
@@ -110,8 +115,8 @@ module Kraps
|
|
110
115
|
end
|
111
116
|
end
|
112
117
|
|
113
|
-
def repartition(partitions:, partitioner: nil, worker: @worker, before: nil)
|
114
|
-
map(partitions: partitions, partitioner: partitioner, worker: worker, before: before) do |key, value, collector|
|
118
|
+
def repartition(partitions:, jobs: nil, partitioner: nil, worker: @worker, before: nil)
|
119
|
+
map(jobs: jobs, partitions: partitions, partitioner: partitioner, worker: worker, before: before) do |key, value, collector|
|
115
120
|
collector.call(key, value)
|
116
121
|
end
|
117
122
|
end
|
@@ -0,0 +1,151 @@
|
|
1
|
+
module Kraps
|
2
|
+
class RedisQueue
|
3
|
+
VISIBILITY_TIMEOUT = 60
|
4
|
+
|
5
|
+
attr_reader :token
|
6
|
+
|
7
|
+
def initialize(redis:, token:, namespace:, ttl:)
|
8
|
+
@redis = redis
|
9
|
+
@token = token
|
10
|
+
@namespace = namespace
|
11
|
+
@ttl = ttl
|
12
|
+
end
|
13
|
+
|
14
|
+
def size
|
15
|
+
@size_script ||= <<~SCRIPT
|
16
|
+
local queue_key, pending_key, status_key, ttl, job = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4]), ARGV[5]
|
17
|
+
|
18
|
+
redis.call('expire', queue_key, ttl)
|
19
|
+
redis.call('expire', pending_key, ttl)
|
20
|
+
redis.call('expire', status_key, ttl)
|
21
|
+
|
22
|
+
return redis.call('llen', queue_key) + redis.call('zcard', pending_key)
|
23
|
+
SCRIPT
|
24
|
+
|
25
|
+
@redis.eval(@size_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl])
|
26
|
+
end
|
27
|
+
|
28
|
+
def enqueue(payload)
|
29
|
+
@enqueue_script ||= <<~SCRIPT
|
30
|
+
local queue_key, pending_key, status_key, ttl, job = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4]), ARGV[5]
|
31
|
+
|
32
|
+
redis.call('rpush', queue_key, job)
|
33
|
+
|
34
|
+
redis.call('expire', queue_key, ttl)
|
35
|
+
redis.call('expire', pending_key, ttl)
|
36
|
+
redis.call('expire', status_key, ttl)
|
37
|
+
SCRIPT
|
38
|
+
|
39
|
+
@redis.eval(@enqueue_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl, JSON.generate(payload)])
|
40
|
+
end
|
41
|
+
|
42
|
+
def dequeue
|
43
|
+
@dequeue_script ||= <<~SCRIPT
|
44
|
+
local queue_key, pending_key, status_key, ttl, visibility_timeout = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4]), tonumber(ARGV[5])
|
45
|
+
|
46
|
+
local zitem = redis.call('zrange', pending_key, 0, 0, 'WITHSCORES')
|
47
|
+
local job = zitem[1]
|
48
|
+
|
49
|
+
if not zitem[2] or tonumber(zitem[2]) > tonumber(redis.call('time')[1]) then
|
50
|
+
job = redis.call('lpop', queue_key)
|
51
|
+
end
|
52
|
+
|
53
|
+
redis.call('expire', queue_key, ttl)
|
54
|
+
redis.call('expire', pending_key, ttl)
|
55
|
+
redis.call('expire', status_key, ttl)
|
56
|
+
|
57
|
+
if not job then return nil end
|
58
|
+
|
59
|
+
redis.call('zadd', pending_key, tonumber(redis.call('time')[1]) + visibility_timeout, job)
|
60
|
+
redis.call('expire', pending_key, ttl)
|
61
|
+
|
62
|
+
return job
|
63
|
+
SCRIPT
|
64
|
+
|
65
|
+
job = @redis.eval(@dequeue_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl, VISIBILITY_TIMEOUT])
|
66
|
+
|
67
|
+
unless job
|
68
|
+
yield(nil)
|
69
|
+
return
|
70
|
+
end
|
71
|
+
|
72
|
+
keep_alive(job) do
|
73
|
+
yield(JSON.parse(job)) if job
|
74
|
+
end
|
75
|
+
|
76
|
+
@remove_script ||= <<~SCRIPT
|
77
|
+
local queue_key, pending_key, status_key, ttl, job = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4]), ARGV[5]
|
78
|
+
|
79
|
+
redis.call('zrem', pending_key, job)
|
80
|
+
|
81
|
+
redis.call('expire', queue_key, ttl)
|
82
|
+
redis.call('expire', pending_key, ttl)
|
83
|
+
redis.call('expire', status_key, ttl)
|
84
|
+
SCRIPT
|
85
|
+
|
86
|
+
@redis.eval(@remove_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl, job])
|
87
|
+
end
|
88
|
+
|
89
|
+
def stop
|
90
|
+
@stop_script ||= <<~SCRIPT
|
91
|
+
local queue_key, pending_key, status_key, ttl = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4])
|
92
|
+
|
93
|
+
redis.call('hset', status_key, 'stopped', 1)
|
94
|
+
|
95
|
+
redis.call('expire', queue_key, ttl)
|
96
|
+
redis.call('expire', pending_key, ttl)
|
97
|
+
redis.call('expire', status_key, ttl)
|
98
|
+
SCRIPT
|
99
|
+
|
100
|
+
@redis.eval(@stop_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl])
|
101
|
+
end
|
102
|
+
|
103
|
+
def stopped?
|
104
|
+
@stopped_script ||= <<~SCRIPT
|
105
|
+
local queue_key, pending_key, status_key, ttl = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4])
|
106
|
+
|
107
|
+
redis.call('expire', queue_key, ttl)
|
108
|
+
redis.call('expire', pending_key, ttl)
|
109
|
+
redis.call('expire', status_key, ttl)
|
110
|
+
|
111
|
+
return redis.call('hget', status_key, 'stopped')
|
112
|
+
SCRIPT
|
113
|
+
|
114
|
+
@redis.eval(@stopped_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl]).to_i == 1
|
115
|
+
end
|
116
|
+
|
117
|
+
private
|
118
|
+
|
119
|
+
def keep_alive(job)
|
120
|
+
@keep_alive_script ||= <<~SCRIPT
|
121
|
+
local queue_key, pending_key, status_key, ttl, job, visibility_timeout = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4]), ARGV[5], tonumber(ARGV[6])
|
122
|
+
|
123
|
+
redis.call('zadd', pending_key, tonumber(redis.call('time')[1]) + visibility_timeout, job)
|
124
|
+
|
125
|
+
redis.call('expire', queue_key, ttl)
|
126
|
+
redis.call('expire', pending_key, ttl)
|
127
|
+
redis.call('expire', status_key, ttl)
|
128
|
+
SCRIPT
|
129
|
+
|
130
|
+
interval = Interval.new(5) do
|
131
|
+
@redis.eval(@keep_alive_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl, job, VISIBILITY_TIMEOUT])
|
132
|
+
end
|
133
|
+
|
134
|
+
yield
|
135
|
+
ensure
|
136
|
+
interval&.stop
|
137
|
+
end
|
138
|
+
|
139
|
+
def redis_queue_key
|
140
|
+
[@namespace, "kraps", "queue", @token].compact.join(":")
|
141
|
+
end
|
142
|
+
|
143
|
+
def redis_pending_key
|
144
|
+
[@namespace, "kraps", "pending", @token].compact.join(":")
|
145
|
+
end
|
146
|
+
|
147
|
+
def redis_status_key
|
148
|
+
[@namespace, "kraps", "status", @token].compact.join(":")
|
149
|
+
end
|
150
|
+
end
|
151
|
+
end
|
data/lib/kraps/runner.rb
CHANGED
@@ -45,48 +45,35 @@ module Kraps
|
|
45
45
|
|
46
46
|
def perform_parallelize
|
47
47
|
enum = Enumerator.new do |yielder|
|
48
|
-
collector = proc { |item| yielder << item }
|
48
|
+
collector = proc { |item| yielder << { item: item } }
|
49
49
|
|
50
50
|
@step.block.call(collector)
|
51
51
|
end
|
52
52
|
|
53
|
-
|
54
|
-
push_and_wait(distributed_job, enum) do |item, part|
|
55
|
-
enqueue(token: distributed_job.token, part: part, item: item)
|
56
|
-
end
|
53
|
+
token = push_and_wait(enum: enum)
|
57
54
|
|
58
|
-
|
59
|
-
end
|
55
|
+
Frame.new(token: token, partitions: @step.partitions)
|
60
56
|
end
|
61
57
|
|
62
58
|
def perform_map
|
63
|
-
|
64
|
-
|
65
|
-
enqueue(token: distributed_job.token, part: part, partition: partition)
|
66
|
-
end
|
59
|
+
enum = (0...@frame.partitions).map { |partition| { partition: partition } }
|
60
|
+
token = push_and_wait(job_count: @step.jobs, enum: enum)
|
67
61
|
|
68
|
-
|
69
|
-
end
|
62
|
+
Frame.new(token: token, partitions: @step.partitions)
|
70
63
|
end
|
71
64
|
|
72
65
|
def perform_map_partitions
|
73
|
-
|
74
|
-
|
75
|
-
enqueue(token: distributed_job.token, part: part, partition: partition)
|
76
|
-
end
|
66
|
+
enum = (0...@frame.partitions).map { |partition| { partition: partition } }
|
67
|
+
token = push_and_wait(job_count: @step.jobs, enum: enum)
|
77
68
|
|
78
|
-
|
79
|
-
end
|
69
|
+
Frame.new(token: token, partitions: @step.partitions)
|
80
70
|
end
|
81
71
|
|
82
72
|
def perform_reduce
|
83
|
-
|
84
|
-
|
85
|
-
enqueue(token: distributed_job.token, part: part, partition: partition)
|
86
|
-
end
|
73
|
+
enum = (0...@frame.partitions).map { |partition| { partition: partition } }
|
74
|
+
token = push_and_wait(job_count: @step.jobs, enum: enum)
|
87
75
|
|
88
|
-
|
89
|
-
end
|
76
|
+
Frame.new(token: token, partitions: @step.partitions)
|
90
77
|
end
|
91
78
|
|
92
79
|
def perform_combine
|
@@ -95,82 +82,64 @@ module Kraps
|
|
95
82
|
|
96
83
|
raise(IncompatibleFrame, "Incompatible number of partitions") if combine_step.partitions != @step.partitions
|
97
84
|
|
98
|
-
|
99
|
-
|
100
|
-
enqueue(token: distributed_job.token, part: part, partition: partition, combine_frame: combine_step.frame.to_h)
|
101
|
-
end
|
102
|
-
|
103
|
-
Frame.new(token: distributed_job.token, partitions: @step.partitions)
|
85
|
+
enum = (0...@frame.partitions).map do |partition|
|
86
|
+
{ partition: partition, combine_frame: combine_step.frame.to_h }
|
104
87
|
end
|
88
|
+
|
89
|
+
token = push_and_wait(job_count: @step.jobs, enum: enum)
|
90
|
+
|
91
|
+
Frame.new(token: token, partitions: @step.partitions)
|
105
92
|
end
|
106
93
|
|
107
94
|
def perform_each_partition
|
108
|
-
|
109
|
-
|
110
|
-
enqueue(token: distributed_job.token, part: part, partition: partition)
|
111
|
-
end
|
95
|
+
enum = (0...@frame.partitions).map { |partition| { partition: partition } }
|
96
|
+
push_and_wait(job_count: @step.jobs, enum: enum)
|
112
97
|
|
113
|
-
|
114
|
-
end
|
98
|
+
@frame
|
115
99
|
end
|
116
100
|
|
117
|
-
def
|
118
|
-
Kraps.
|
119
|
-
|
120
|
-
JSON.generate(
|
121
|
-
job_index: @job_index,
|
122
|
-
step_index: @step_index,
|
123
|
-
frame: @frame.to_h,
|
124
|
-
token: token,
|
125
|
-
part: part,
|
126
|
-
klass: @klass,
|
127
|
-
args: @args,
|
128
|
-
kwargs: @kwargs,
|
129
|
-
**rest
|
130
|
-
)
|
131
|
-
)
|
132
|
-
end
|
101
|
+
def push_and_wait(enum:, job_count: nil)
|
102
|
+
redis_queue = RedisQueue.new(redis: Kraps.redis, token: SecureRandom.hex, namespace: Kraps.namespace, ttl: Kraps.job_ttl)
|
103
|
+
progress_bar = build_progress_bar("#{@klass}: job #{@job_index + 1}/#{@jobs.size}, step #{@step_index + 1}/#{@job.steps.size}, token #{redis_queue.token}, %a, %c/%C (%p%) => #{@step.action}")
|
133
104
|
|
134
|
-
|
135
|
-
distributed_job = Kraps.distributed_job_client.build(token: SecureRandom.hex)
|
105
|
+
total = 0
|
136
106
|
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
end
|
107
|
+
interval = Interval.new(1) do
|
108
|
+
# The interval is used to continuously update the progress bar even
|
109
|
+
# when push_all is used and to avoid sessions being terminated due
|
110
|
+
# to inactivity etc
|
142
111
|
|
143
|
-
|
144
|
-
|
112
|
+
progress_bar.total = total
|
113
|
+
progress_bar.progress = [progress_bar.total - redis_queue.size, 0].max
|
114
|
+
end
|
145
115
|
|
146
|
-
|
147
|
-
total
|
116
|
+
enum.each_with_index do |item, part|
|
117
|
+
total += 1
|
148
118
|
|
149
|
-
|
150
|
-
|
151
|
-
end
|
119
|
+
redis_queue.enqueue(item.merge(part: part))
|
120
|
+
end
|
152
121
|
|
153
|
-
|
154
|
-
|
155
|
-
interval.fire(timeout: 1)
|
122
|
+
(job_count || total).times do
|
123
|
+
break if redis_queue.stopped?
|
156
124
|
|
157
|
-
|
158
|
-
end
|
159
|
-
ensure
|
160
|
-
interval&.stop
|
125
|
+
Kraps.enqueuer.call(@step.worker, JSON.generate(job_index: @job_index, step_index: @step_index, frame: @frame.to_h, token: redis_queue.token, klass: @klass, args: @args, kwargs: @kwargs))
|
161
126
|
end
|
162
127
|
|
163
128
|
loop do
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
break if distributed_job.finished? || distributed_job.stopped?
|
129
|
+
break if redis_queue.size.zero?
|
130
|
+
break if redis_queue.stopped?
|
168
131
|
|
169
132
|
sleep(1)
|
170
133
|
end
|
171
134
|
|
172
|
-
raise(JobStopped, "The job was stopped") if
|
135
|
+
raise(JobStopped, "The job was stopped") if redis_queue.stopped?
|
136
|
+
|
137
|
+
interval.fire(timeout: 1)
|
138
|
+
|
139
|
+
redis_queue.token
|
173
140
|
ensure
|
141
|
+
redis_queue&.stop
|
142
|
+
interval&.stop
|
174
143
|
progress_bar&.stop
|
175
144
|
end
|
176
145
|
|
data/lib/kraps/step.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
1
|
module Kraps
|
2
|
-
Step = Struct.new(:action, :partitioner, :partitions, :block, :worker, :before, :frame, :dependency, :options, keyword_init: true)
|
2
|
+
Step = Struct.new(:action, :partitioner, :partitions, :jobs, :block, :worker, :before, :frame, :dependency, :options, keyword_init: true)
|
3
3
|
end
|
data/lib/kraps/version.rb
CHANGED
data/lib/kraps/worker.rb
CHANGED
@@ -11,22 +11,22 @@ module Kraps
|
|
11
11
|
end
|
12
12
|
|
13
13
|
def call(retries: 3)
|
14
|
-
return if
|
14
|
+
return if redis_queue.stopped?
|
15
15
|
|
16
16
|
raise(InvalidAction, "Invalid action #{step.action}") unless Actions::ALL.include?(step.action)
|
17
17
|
|
18
|
-
|
19
|
-
|
18
|
+
dequeue do |payload|
|
19
|
+
with_retries(retries) do # TODO: allow to use queue based retries
|
20
|
+
step.before&.call
|
20
21
|
|
21
|
-
|
22
|
-
|
23
|
-
distributed_job.done(@args["part"])
|
22
|
+
send(:"perform_#{step.action}", payload)
|
23
|
+
end
|
24
24
|
end
|
25
25
|
end
|
26
26
|
|
27
27
|
private
|
28
28
|
|
29
|
-
def perform_parallelize
|
29
|
+
def perform_parallelize(payload)
|
30
30
|
implementation = Class.new do
|
31
31
|
def map(key)
|
32
32
|
yield(key, nil)
|
@@ -34,19 +34,19 @@ module Kraps
|
|
34
34
|
end
|
35
35
|
|
36
36
|
mapper = MapReduce::Mapper.new(implementation.new, partitioner: partitioner, memory_limit: @memory_limit)
|
37
|
-
mapper.map(
|
37
|
+
mapper.map(payload["item"])
|
38
38
|
|
39
39
|
mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
|
40
40
|
Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
|
41
41
|
File.open(path) do |stream|
|
42
|
-
Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{
|
42
|
+
Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{payload["part"]}.json"), stream)
|
43
43
|
end
|
44
44
|
end
|
45
45
|
end
|
46
46
|
end
|
47
47
|
|
48
|
-
def perform_map
|
49
|
-
temp_paths = download_all(token: @args["frame"]["token"], partition:
|
48
|
+
def perform_map(payload)
|
49
|
+
temp_paths = download_all(token: @args["frame"]["token"], partition: payload["partition"])
|
50
50
|
|
51
51
|
current_step = step
|
52
52
|
|
@@ -78,7 +78,7 @@ module Kraps
|
|
78
78
|
mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
|
79
79
|
Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
|
80
80
|
File.open(path) do |stream|
|
81
|
-
Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{
|
81
|
+
Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{payload["partition"]}.json"), stream)
|
82
82
|
end
|
83
83
|
end
|
84
84
|
end
|
@@ -86,11 +86,11 @@ module Kraps
|
|
86
86
|
temp_paths&.delete
|
87
87
|
end
|
88
88
|
|
89
|
-
def perform_map_partitions
|
90
|
-
temp_paths = download_all(token: @args["frame"]["token"], partition:
|
89
|
+
def perform_map_partitions(payload)
|
90
|
+
temp_paths = download_all(token: @args["frame"]["token"], partition: payload["partition"])
|
91
91
|
|
92
92
|
current_step = step
|
93
|
-
current_partition =
|
93
|
+
current_partition = payload["partition"]
|
94
94
|
|
95
95
|
implementation = Object.new
|
96
96
|
implementation.define_singleton_method(:map) do |enum, &block|
|
@@ -111,7 +111,7 @@ module Kraps
|
|
111
111
|
mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
|
112
112
|
Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
|
113
113
|
File.open(path) do |stream|
|
114
|
-
Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{
|
114
|
+
Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{payload["partition"]}.json"), stream)
|
115
115
|
end
|
116
116
|
end
|
117
117
|
end
|
@@ -119,7 +119,7 @@ module Kraps
|
|
119
119
|
temp_paths&.delete
|
120
120
|
end
|
121
121
|
|
122
|
-
def perform_reduce
|
122
|
+
def perform_reduce(payload)
|
123
123
|
current_step = step
|
124
124
|
|
125
125
|
implementation = Object.new
|
@@ -129,7 +129,7 @@ module Kraps
|
|
129
129
|
|
130
130
|
reducer = MapReduce::Reducer.new(implementation)
|
131
131
|
|
132
|
-
Parallelizer.each(Kraps.driver.list(prefix: Kraps.driver.with_prefix("#{@args["frame"]["token"]}/#{
|
132
|
+
Parallelizer.each(Kraps.driver.list(prefix: Kraps.driver.with_prefix("#{@args["frame"]["token"]}/#{payload["partition"]}/")), @concurrency) do |file|
|
133
133
|
Kraps.driver.download(file, reducer.add_chunk)
|
134
134
|
end
|
135
135
|
|
@@ -139,14 +139,14 @@ module Kraps
|
|
139
139
|
tempfile.puts(JSON.generate([key, value]))
|
140
140
|
end
|
141
141
|
|
142
|
-
Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{
|
142
|
+
Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{payload["partition"]}/chunk.#{payload["partition"]}.json"), tempfile.tap(&:rewind))
|
143
143
|
ensure
|
144
144
|
tempfile&.close(true)
|
145
145
|
end
|
146
146
|
|
147
|
-
def perform_combine
|
148
|
-
temp_paths1 = download_all(token: @args["frame"]["token"], partition:
|
149
|
-
temp_paths2 = download_all(token:
|
147
|
+
def perform_combine(payload)
|
148
|
+
temp_paths1 = download_all(token: @args["frame"]["token"], partition: payload["partition"])
|
149
|
+
temp_paths2 = download_all(token: payload["combine_frame"]["token"], partition: payload["partition"])
|
150
150
|
|
151
151
|
enum1 = k_way_merge(temp_paths1.each.to_a, chunk_limit: @chunk_limit)
|
152
152
|
enum2 = k_way_merge(temp_paths2.each.to_a, chunk_limit: @chunk_limit)
|
@@ -167,7 +167,7 @@ module Kraps
|
|
167
167
|
mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
|
168
168
|
Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
|
169
169
|
File.open(path) do |stream|
|
170
|
-
Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{
|
170
|
+
Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{payload["partition"]}.json"), stream)
|
171
171
|
end
|
172
172
|
end
|
173
173
|
end
|
@@ -213,10 +213,10 @@ module Kraps
|
|
213
213
|
end
|
214
214
|
end
|
215
215
|
|
216
|
-
def perform_each_partition
|
216
|
+
def perform_each_partition(payload)
|
217
217
|
temp_paths = TempPaths.new
|
218
218
|
|
219
|
-
files = Kraps.driver.list(prefix: Kraps.driver.with_prefix("#{@args["frame"]["token"]}/#{
|
219
|
+
files = Kraps.driver.list(prefix: Kraps.driver.with_prefix("#{@args["frame"]["token"]}/#{payload["partition"]}/")).sort
|
220
220
|
|
221
221
|
temp_paths_index = files.each_with_object({}) do |file, hash|
|
222
222
|
hash[file] = temp_paths.add
|
@@ -226,7 +226,7 @@ module Kraps
|
|
226
226
|
Kraps.driver.download(file, temp_paths_index[file].path)
|
227
227
|
end
|
228
228
|
|
229
|
-
step.block.call(
|
229
|
+
step.block.call(payload["partition"], k_way_merge(temp_paths.each.to_a, chunk_limit: @chunk_limit))
|
230
230
|
ensure
|
231
231
|
temp_paths&.delete
|
232
232
|
end
|
@@ -237,11 +237,11 @@ module Kraps
|
|
237
237
|
begin
|
238
238
|
yield
|
239
239
|
rescue Kraps::Error
|
240
|
-
|
240
|
+
redis_queue.stop
|
241
241
|
raise
|
242
242
|
rescue StandardError => e
|
243
243
|
if retries >= num_retries
|
244
|
-
|
244
|
+
redis_queue.stop
|
245
245
|
raise
|
246
246
|
end
|
247
247
|
|
@@ -254,6 +254,21 @@ module Kraps
|
|
254
254
|
end
|
255
255
|
end
|
256
256
|
|
257
|
+
def dequeue
|
258
|
+
loop do
|
259
|
+
break if redis_queue.stopped?
|
260
|
+
break if redis_queue.size.zero?
|
261
|
+
|
262
|
+
redis_queue.dequeue do |payload|
|
263
|
+
payload ? yield(payload) : sleep(1)
|
264
|
+
end
|
265
|
+
end
|
266
|
+
end
|
267
|
+
|
268
|
+
def redis_queue
|
269
|
+
@redis_queue ||= RedisQueue.new(redis: Kraps.redis, token: @args["token"], namespace: Kraps.namespace, ttl: Kraps.job_ttl)
|
270
|
+
end
|
271
|
+
|
257
272
|
def download_all(token:, partition:)
|
258
273
|
temp_paths = TempPaths.new
|
259
274
|
|
@@ -301,9 +316,5 @@ module Kraps
|
|
301
316
|
def partitioner
|
302
317
|
@partitioner ||= proc { |key| step.partitioner.call(key, step.partitions) }
|
303
318
|
end
|
304
|
-
|
305
|
-
def distributed_job
|
306
|
-
@distributed_job ||= Kraps.distributed_job_client.build(token: @args["token"])
|
307
|
-
end
|
308
319
|
end
|
309
320
|
end
|
data/lib/kraps.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
require "distributed_job"
|
2
1
|
require "ruby-progressbar"
|
3
2
|
require "ruby-progressbar/outputs/null"
|
4
3
|
require "map_reduce"
|
@@ -9,6 +8,7 @@ require_relative "kraps/drivers"
|
|
9
8
|
require_relative "kraps/actions"
|
10
9
|
require_relative "kraps/parallelizer"
|
11
10
|
require_relative "kraps/hash_partitioner"
|
11
|
+
require_relative "kraps/redis_queue"
|
12
12
|
require_relative "kraps/temp_path"
|
13
13
|
require_relative "kraps/temp_paths"
|
14
14
|
require_relative "kraps/timeout_queue"
|
@@ -27,9 +27,11 @@ module Kraps
|
|
27
27
|
class JobStopped < Error; end
|
28
28
|
class IncompatibleFrame < Error; end
|
29
29
|
|
30
|
-
def self.configure(driver:, redis: Redis.new, namespace: nil, job_ttl: 24 * 60 * 60, show_progress: true, enqueuer: ->(worker, json) { worker.perform_async(json) })
|
30
|
+
def self.configure(driver:, redis: Redis.new, namespace: nil, job_ttl: 4 * 24 * 60 * 60, show_progress: true, enqueuer: ->(worker, json) { worker.perform_async(json) })
|
31
31
|
@driver = driver
|
32
|
-
@
|
32
|
+
@redis = redis
|
33
|
+
@namespace = namespace
|
34
|
+
@job_ttl = job_ttl.to_i
|
33
35
|
@show_progress = show_progress
|
34
36
|
@enqueuer = enqueuer
|
35
37
|
end
|
@@ -38,8 +40,16 @@ module Kraps
|
|
38
40
|
@driver
|
39
41
|
end
|
40
42
|
|
41
|
-
def self.
|
42
|
-
@
|
43
|
+
def self.redis
|
44
|
+
@redis
|
45
|
+
end
|
46
|
+
|
47
|
+
def self.namespace
|
48
|
+
@namespace
|
49
|
+
end
|
50
|
+
|
51
|
+
def self.job_ttl
|
52
|
+
@job_ttl
|
43
53
|
end
|
44
54
|
|
45
55
|
def self.show_progress?
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kraps
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Benjamin Vetter
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-12-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: attachie
|
@@ -24,20 +24,6 @@ dependencies:
|
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: distributed_job
|
29
|
-
requirement: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - ">="
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '0'
|
34
|
-
type: :runtime
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
37
|
-
requirements:
|
38
|
-
- - ">="
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
version: '0'
|
41
27
|
- !ruby/object:Gem::Dependency
|
42
28
|
name: map-reduce-ruby
|
43
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -149,6 +135,7 @@ files:
|
|
149
135
|
- lib/kraps/job.rb
|
150
136
|
- lib/kraps/job_resolver.rb
|
151
137
|
- lib/kraps/parallelizer.rb
|
138
|
+
- lib/kraps/redis_queue.rb
|
152
139
|
- lib/kraps/runner.rb
|
153
140
|
- lib/kraps/step.rb
|
154
141
|
- lib/kraps/temp_path.rb
|