kraps 0.6.0 → 0.7.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 02ba582478178273300e77d5ddd18a8568fd682b0c53a444f1c5e7b756f9fd9a
-   data.tar.gz: 9e1698a67c252512a2f277ca1a6e079e063f8f1e6c13500245aabd969cca122d
+   metadata.gz: 19635ced3e745d44313ed3bc416ef73eb555134c591725f4dab7b38208e21393
+   data.tar.gz: bb7f679c7e2cd053744d1c7d857629de1912a305880d0538bdab418de3861ba1
  SHA512:
-   metadata.gz: 2d1b3bd10d1048c64804ddf86069c0757247b581edea0f38330189a10d35ed096970d008533a167ddaf97c16479425e3baa2bb56f5a8888255ecfd12b911a168
-   data.tar.gz: 045263d6aa920cef97a162fcbfd41f239087c459a8c86bd2112d3ce4524c48c6e65ceaac8993f8ea7fb9089a8877db2863e39644888c8bd884df0fa95241277d
+   metadata.gz: 91273ba54ea33c6d5cb1b4f335ad8039c35601953fdf1e6b9b2ac3117ceb25d81e9be569e5c8b70deda22b53a4c72f04d60ffb4b251badf5c5a64d13d399f36c
+   data.tar.gz: 4e563257fcba0c9f457b363da4f43d000ee239a5179e2f965071bb3df27e362cdf7bc9950a1624520cb862acfb0fde89e96b0a4987d79a93524230c4b84619cd
data/.rubocop.yml CHANGED
@@ -80,3 +80,6 @@ Style/WordArray:
 
  Style/RedundantEach:
    Enabled: false
+
+ Lint/NonLocalExitFromIterator:
+   Enabled: false
data/CHANGELOG.md CHANGED
@@ -1,5 +1,13 @@
  # CHANGELOG
 
+ ## v0.7.0
+
+ * Added a `jobs` option to the actions to limit the concurrency
+   when e.g. accessing external data stores and to avoid overloading
+   them
+ * Added a queue using redis for the jobs to avoid starving workers
+ * Removed `distributed_job` dependency
+
  ## v0.6.0
 
  * Added `map_partitions`
data/Gemfile.lock CHANGED
@@ -1,9 +1,8 @@
  PATH
    remote: .
    specs:
-     kraps (0.6.0)
+     kraps (0.7.0)
        attachie
-       distributed_job
        map-reduce-ruby (>= 3.0.0)
        redis
        ruby-progressbar
@@ -41,8 +40,6 @@ GEM
    concurrent-ruby (1.1.10)
    connection_pool (2.3.0)
    diff-lcs (1.5.0)
-   distributed_job (3.1.0)
-     redis (>= 4.1.0)
    i18n (1.12.0)
      concurrent-ruby (~> 1.0)
    jmespath (1.6.1)
@@ -62,7 +59,7 @@ GEM
    rake (13.0.6)
    redis (5.0.5)
      redis-client (>= 0.9.0)
-   redis-client (0.11.1)
+   redis-client (0.11.2)
      connection_pool
    regexp_parser (2.6.0)
    rexml (3.2.5)
data/README.md CHANGED
@@ -30,7 +30,7 @@ Kraps.configure(
    driver: Kraps::Drivers::S3Driver.new(s3_client: Aws::S3::Client.new("..."), bucket: "some-bucket", prefix: "temp/kraps/"),
    redis: Redis.new,
    namespace: "my-application", # An optional namespace to be used for redis keys, default: nil
-   job_ttl: 24.hours, # Job information in redis will automatically be removed after this amount of time, default: 24 hours
+   job_ttl: 7.days, # Job information in redis will automatically be removed after this amount of time, default: 4 days
    show_progress: true # Whether or not to show the progress in the terminal when executing jobs, default: true
    enqueuer: ->(worker, json) { worker.perform_async(json) } # Allows customizing the enqueueing of worker jobs
  )
@@ -220,7 +220,7 @@ items are used as keys and the values are set to `nil`.
  * `map`: Maps the key value pairs to other key value pairs
 
  ```ruby
- job.map(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker) do |key, value, collector|
+ job.map(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker, jobs: 8) do |key, value, collector|
    collector.call("changed #{key}", "changed #{value}")
  end
  ```
@@ -229,13 +229,21 @@ The block gets each key-value pair passed and the `collector` block can be
  called as often as necessary. This is also the reason why `map` cannot simply
  return the new key-value pair; the `collector` must be used instead.
 
+ The `jobs` argument can be useful when you need to access an external data
+ source, such as a relational database, and want to limit the number of
+ workers accessing the store concurrently to avoid overloading it. If you
+ don't specify it, it defaults to the number of partitions. It is recommended
+ to use it only for steps that need the throttling, because it will of course
+ slow down processing. The `jobs` argument only applies to the current step;
+ the following steps don't inherit the argument, but reset it.
+
  * `map_partitions`: Maps the key value pairs to other key value pairs, but the
    block receives all data of each partition as an enumerable, sorted by key.
    Please be aware that you should not call `to_a` or similar on the enumerable.
    Prefer `map` over `map_partitions` when possible.
 
  ```ruby
- job.map_partitions(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker) do |pairs, collector|
+ job.map_partitions(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker, jobs: 8) do |pairs, collector|
    pairs.each do |key, value|
      collector.call("changed #{key}", "changed #{value}")
    end
@@ -245,7 +253,7 @@ end
  * `reduce`: Reduces the values of pairs having the same key
 
  ```ruby
- job.reduce(worker: MyKrapsWorker) do |key, value1, value2|
+ job.reduce(worker: MyKrapsWorker, jobs: 8) do |key, value1, value2|
    value1 + value2
  end
  ```
@@ -265,7 +273,7 @@ most of the time, this is not necessary and the key can simply be ignored.
  passed job result are completely omitted.
 
  ```ruby
- job.combine(other_job, worker: MyKrapsWorker) do |key, value1, value2|
+ job.combine(other_job, worker: MyKrapsWorker, jobs: 8) do |key, value1, value2|
    (value1 || {}).merge(value2 || {})
  end
  ```
@@ -279,7 +287,7 @@ since Kraps detects the dependency on its own.
  * `repartition`: Used to change the partitioning
 
  ```ruby
- job.repartition(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker)
+ job.repartition(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker, jobs: 8)
  ```
 
  Repartitions all data into the specified number of partitions, using the
@@ -290,7 +298,7 @@ specified partitioner.
  `to_a` or similar on the enumerable.
 
  ```ruby
- job.each_partition do |partition, pairs|
+ job.each_partition(jobs: 8) do |partition, pairs|
    pairs.each do |key, value|
      # ...
    end
@@ -379,7 +387,8 @@ jobs only once.
  Kraps ships with an in-memory fake driver for storage, which you can use for
  testing purposes instead of the s3 driver:
 
- ```ruby Kraps.configure(
+ ```ruby
+ Kraps.configure(
    driver: Kraps::Drivers::FakeDriver.new(bucket: "kraps"),
    # ...
  ) ```
@@ -425,8 +434,6 @@ The API of the driver is:
  Kraps is built on top of
  [map-reduce-ruby](https://github.com/mrkamel/map-reduce-ruby) for the
  map/reduce framework,
- [distributed_job](https://github.com/mrkamel/distributed_job)
- to keep track of the job/step status,
  [attachie](https://github.com/mrkamel/attachie) to interact with the storage
  layer (s3),
  [ruby-progressbar](https://github.com/jfelchner/ruby-progressbar) to
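Editor's note: a minimal sketch of the throttling use case the new README paragraph describes, reusing `partitioner` and `MyKrapsWorker` from the README examples; `Database.lookup` is a hypothetical stand-in for any external store you want to protect:

```ruby
# A sketch, not from the gem: `Database.lookup` is illustrative only.
job = job.map(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker, jobs: 8) do |key, value, collector|
  # At most 8 workers execute this step at a time, even though the data
  # is split into 128 partitions.
  collector.call(key, Database.lookup(key))
end

# `jobs` applies to this step only; the next step runs at full
# concurrency again unless you pass `jobs` there as well.
job = job.reduce(worker: MyKrapsWorker) { |_key, value1, value2| value1 + value2 }
```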
data/docker-compose.yml CHANGED
@@ -1,6 +1,6 @@
  version: '2'
  services:
-   elasticsearch:
+   redis:
      image: redis
      ports:
        - 6379:6379
data/lib/kraps/job.rb CHANGED
@@ -27,7 +27,7 @@ module Kraps
        end
      end
 
-    def map(partitions: nil, partitioner: nil, worker: @worker, before: nil, &block)
+    def map(partitions: nil, partitioner: nil, jobs: nil, worker: @worker, before: nil, &block)
       fresh.tap do |job|
         job.instance_eval do
           @partitions = partitions if partitions
@@ -35,6 +35,7 @@ module Kraps
 
           @steps << Step.new(
             action: Actions::MAP,
+            jobs: [jobs, @partitions].compact.min,
             partitions: @partitions,
             partitioner: @partitioner,
             worker: worker,
@@ -45,7 +46,7 @@ module Kraps
       end
     end
 
-    def map_partitions(partitions: nil, partitioner: nil, worker: @worker, before: nil, &block)
+    def map_partitions(partitions: nil, partitioner: nil, jobs: nil, worker: @worker, before: nil, &block)
       fresh.tap do |job|
         job.instance_eval do
           @partitions = partitions if partitions
@@ -53,6 +54,7 @@ module Kraps
 
           @steps << Step.new(
             action: Actions::MAP_PARTITIONS,
+            jobs: [jobs, @partitions].compact.min,
             partitions: @partitions,
             partitioner: @partitioner,
             worker: worker,
@@ -63,11 +65,12 @@ module Kraps
       end
     end
 
-    def reduce(worker: @worker, before: nil, &block)
+    def reduce(jobs: nil, worker: @worker, before: nil, &block)
       fresh.tap do |job|
         job.instance_eval do
           @steps << Step.new(
             action: Actions::REDUCE,
+            jobs: [jobs, @partitions].compact.min,
             partitions: @partitions,
             partitioner: @partitioner,
             worker: worker,
@@ -78,11 +81,12 @@ module Kraps
       end
     end
 
-    def combine(other_job, worker: @worker, before: nil, &block)
+    def combine(other_job, jobs: nil, worker: @worker, before: nil, &block)
       fresh.tap do |job|
         job.instance_eval do
           @steps << Step.new(
             action: Actions::COMBINE,
+            jobs: [jobs, @partitions].compact.min,
             partitions: @partitions,
             partitioner: @partitioner,
             worker: worker,
@@ -95,11 +99,12 @@ module Kraps
       end
     end
 
-    def each_partition(worker: @worker, before: nil, &block)
+    def each_partition(jobs: nil, worker: @worker, before: nil, &block)
       fresh.tap do |job|
         job.instance_eval do
           @steps << Step.new(
             action: Actions::EACH_PARTITION,
+            jobs: [jobs, @partitions].compact.min,
             partitions: @partitions,
             partitioner: @partitioner,
             worker: worker,
@@ -110,8 +115,8 @@ module Kraps
       end
     end
 
-    def repartition(partitions:, partitioner: nil, worker: @worker, before: nil)
-      map(partitions: partitions, partitioner: partitioner, worker: worker, before: before) do |key, value, collector|
+    def repartition(partitions:, jobs: nil, partitioner: nil, worker: @worker, before: nil)
+      map(jobs: jobs, partitions: partitions, partitioner: partitioner, worker: worker, before: before) do |key, value, collector|
         collector.call(key, value)
       end
     end
data/lib/kraps/redis_queue.rb ADDED
@@ -0,0 +1,151 @@
+ module Kraps
+   class RedisQueue
+     VISIBILITY_TIMEOUT = 60
+
+     attr_reader :token
+
+     def initialize(redis:, token:, namespace:, ttl:)
+       @redis = redis
+       @token = token
+       @namespace = namespace
+       @ttl = ttl
+     end
+
+     def size
+       @size_script ||= <<~SCRIPT
+         local queue_key, pending_key, status_key, ttl, job = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4]), ARGV[5]
+
+         redis.call('expire', queue_key, ttl)
+         redis.call('expire', pending_key, ttl)
+         redis.call('expire', status_key, ttl)
+
+         return redis.call('llen', queue_key) + redis.call('zcard', pending_key)
+       SCRIPT
+
+       @redis.eval(@size_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl])
+     end
+
+     def enqueue(payload)
+       @enqueue_script ||= <<~SCRIPT
+         local queue_key, pending_key, status_key, ttl, job = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4]), ARGV[5]
+
+         redis.call('rpush', queue_key, job)
+
+         redis.call('expire', queue_key, ttl)
+         redis.call('expire', pending_key, ttl)
+         redis.call('expire', status_key, ttl)
+       SCRIPT
+
+       @redis.eval(@enqueue_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl, JSON.generate(payload)])
+     end
+
+     def dequeue
+       @dequeue_script ||= <<~SCRIPT
+         local queue_key, pending_key, status_key, ttl, visibility_timeout = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4]), tonumber(ARGV[5])
+
+         local zitem = redis.call('zrange', pending_key, 0, 0, 'WITHSCORES')
+         local job = zitem[1]
+
+         if not zitem[2] or tonumber(zitem[2]) > tonumber(redis.call('time')[1]) then
+           job = redis.call('lpop', queue_key)
+         end
+
+         redis.call('expire', queue_key, ttl)
+         redis.call('expire', pending_key, ttl)
+         redis.call('expire', status_key, ttl)
+
+         if not job then return nil end
+
+         redis.call('zadd', pending_key, tonumber(redis.call('time')[1]) + visibility_timeout, job)
+         redis.call('expire', pending_key, ttl)
+
+         return job
+       SCRIPT
+
+       job = @redis.eval(@dequeue_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl, VISIBILITY_TIMEOUT])
+
+       unless job
+         yield(nil)
+         return
+       end
+
+       keep_alive(job) do
+         yield(JSON.parse(job)) if job
+       end
+
+       @remove_script ||= <<~SCRIPT
+         local queue_key, pending_key, status_key, ttl, job = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4]), ARGV[5]
+
+         redis.call('zrem', pending_key, job)
+
+         redis.call('expire', queue_key, ttl)
+         redis.call('expire', pending_key, ttl)
+         redis.call('expire', status_key, ttl)
+       SCRIPT
+
+       @redis.eval(@remove_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl, job])
+     end
+
+     def stop
+       @stop_script ||= <<~SCRIPT
+         local queue_key, pending_key, status_key, ttl = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4])
+
+         redis.call('hset', status_key, 'stopped', 1)
+
+         redis.call('expire', queue_key, ttl)
+         redis.call('expire', pending_key, ttl)
+         redis.call('expire', status_key, ttl)
+       SCRIPT
+
+       @redis.eval(@stop_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl])
+     end
+
+     def stopped?
+       @stopped_script ||= <<~SCRIPT
+         local queue_key, pending_key, status_key, ttl = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4])
+
+         redis.call('expire', queue_key, ttl)
+         redis.call('expire', pending_key, ttl)
+         redis.call('expire', status_key, ttl)
+
+         return redis.call('hget', status_key, 'stopped')
+       SCRIPT
+
+       @redis.eval(@stopped_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl]).to_i == 1
+     end
+
+     private
+
+     def keep_alive(job)
+       @keep_alive_script ||= <<~SCRIPT
+         local queue_key, pending_key, status_key, ttl, job, visibility_timeout = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4]), ARGV[5], tonumber(ARGV[6])
+
+         redis.call('zadd', pending_key, tonumber(redis.call('time')[1]) + visibility_timeout, job)
+
+         redis.call('expire', queue_key, ttl)
+         redis.call('expire', pending_key, ttl)
+         redis.call('expire', status_key, ttl)
+       SCRIPT
+
+       interval = Interval.new(5) do
+         @redis.eval(@keep_alive_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl, job, VISIBILITY_TIMEOUT])
+       end
+
+       yield
+     ensure
+       interval&.stop
+     end
+
+     def redis_queue_key
+       [@namespace, "kraps", "queue", @token].compact.join(":")
+     end
+
+     def redis_pending_key
+       [@namespace, "kraps", "pending", @token].compact.join(":")
+     end
+
+     def redis_status_key
+       [@namespace, "kraps", "status", @token].compact.join(":")
+     end
+   end
+ end
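Editor's note: a short sketch of how this queue is driven, pieced together from the methods above; `process` is a placeholder, and the token/ttl values are illustrative:

```ruby
queue = Kraps::RedisQueue.new(redis: Redis.new, token: SecureRandom.hex, namespace: nil, ttl: 86_400)

3.times { |partition| queue.enqueue(partition: partition) }

# Drain the queue the way a worker would: dequeue yields the parsed
# payload, keeps the job's visibility deadline fresh while the block
# runs, and removes the job from the pending set afterwards. A job whose
# worker dies re-surfaces after VISIBILITY_TIMEOUT (60 seconds).
until queue.size.zero? || queue.stopped?
  queue.dequeue do |payload|
    payload ? process(payload) : sleep(1) # `process` is a placeholder
  end
end
```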
data/lib/kraps/runner.rb CHANGED
@@ -45,48 +45,35 @@ module Kraps
 
     def perform_parallelize
       enum = Enumerator.new do |yielder|
-        collector = proc { |item| yielder << item }
+        collector = proc { |item| yielder << { item: item } }
 
         @step.block.call(collector)
       end
 
-      with_distributed_job do |distributed_job|
-        push_and_wait(distributed_job, enum) do |item, part|
-          enqueue(token: distributed_job.token, part: part, item: item)
-        end
+      token = push_and_wait(enum: enum)
 
-        Frame.new(token: distributed_job.token, partitions: @step.partitions)
-      end
+      Frame.new(token: token, partitions: @step.partitions)
     end
 
     def perform_map
-      with_distributed_job do |distributed_job|
-        push_and_wait(distributed_job, 0...@frame.partitions) do |partition, part|
-          enqueue(token: distributed_job.token, part: part, partition: partition)
-        end
+      enum = (0...@frame.partitions).map { |partition| { partition: partition } }
+      token = push_and_wait(job_count: @step.jobs, enum: enum)
 
-        Frame.new(token: distributed_job.token, partitions: @step.partitions)
-      end
+      Frame.new(token: token, partitions: @step.partitions)
     end
 
     def perform_map_partitions
-      with_distributed_job do |distributed_job|
-        push_and_wait(distributed_job, 0...@frame.partitions) do |partition, part|
-          enqueue(token: distributed_job.token, part: part, partition: partition)
-        end
+      enum = (0...@frame.partitions).map { |partition| { partition: partition } }
+      token = push_and_wait(job_count: @step.jobs, enum: enum)
 
-        Frame.new(token: distributed_job.token, partitions: @step.partitions)
-      end
+      Frame.new(token: token, partitions: @step.partitions)
     end
 
     def perform_reduce
-      with_distributed_job do |distributed_job|
-        push_and_wait(distributed_job, 0...@frame.partitions) do |partition, part|
-          enqueue(token: distributed_job.token, part: part, partition: partition)
-        end
+      enum = (0...@frame.partitions).map { |partition| { partition: partition } }
+      token = push_and_wait(job_count: @step.jobs, enum: enum)
 
-        Frame.new(token: distributed_job.token, partitions: @step.partitions)
-      end
+      Frame.new(token: token, partitions: @step.partitions)
     end
 
     def perform_combine
@@ -95,82 +82,64 @@ module Kraps
 
       raise(IncompatibleFrame, "Incompatible number of partitions") if combine_step.partitions != @step.partitions
 
-      with_distributed_job do |distributed_job|
-        push_and_wait(distributed_job, 0...@frame.partitions) do |partition, part|
-          enqueue(token: distributed_job.token, part: part, partition: partition, combine_frame: combine_step.frame.to_h)
-        end
-
-        Frame.new(token: distributed_job.token, partitions: @step.partitions)
+      enum = (0...@frame.partitions).map do |partition|
+        { partition: partition, combine_frame: combine_step.frame.to_h }
       end
+
+      token = push_and_wait(job_count: @step.jobs, enum: enum)
+
+      Frame.new(token: token, partitions: @step.partitions)
     end
 
     def perform_each_partition
-      with_distributed_job do |distributed_job|
-        push_and_wait(distributed_job, 0...@frame.partitions) do |partition, part|
-          enqueue(token: distributed_job.token, part: part, partition: partition)
-        end
+      enum = (0...@frame.partitions).map { |partition| { partition: partition } }
+      push_and_wait(job_count: @step.jobs, enum: enum)
 
-        @frame
-      end
+      @frame
     end
 
-    def enqueue(token:, part:, **rest)
-      Kraps.enqueuer.call(
-        @step.worker,
-        JSON.generate(
-          job_index: @job_index,
-          step_index: @step_index,
-          frame: @frame.to_h,
-          token: token,
-          part: part,
-          klass: @klass,
-          args: @args,
-          kwargs: @kwargs,
-          **rest
-        )
-      )
-    end
-
-    def with_distributed_job
-      distributed_job = Kraps.distributed_job_client.build(token: SecureRandom.hex)
-
-      yield(distributed_job)
-    rescue Interrupt
-      distributed_job&.stop
-      raise
-    end
-
-    def push_and_wait(distributed_job, enum)
-      progress_bar = build_progress_bar("#{@klass}: job #{@job_index + 1}/#{@jobs.size}, step #{@step_index + 1}/#{@job.steps.size}, token #{distributed_job.token}, %a, %c/%C (%p%) => #{@step.action}")
-
-      begin
-        total = 0
-
-        interval = Interval.new(1) do
-          progress_bar.total = total
-        end
-
-        distributed_job.push_each(enum) do |item, part|
-          total += 1
-          interval.fire(timeout: 1)
-
-          yield(item, part)
-        end
-      ensure
-        interval&.stop
-      end
-
-      loop do
-        progress_bar.total = distributed_job.total
-        progress_bar.progress = progress_bar.total - distributed_job.count
-
-        break if distributed_job.finished? || distributed_job.stopped?
-
-        sleep(1)
-      end
-
-      raise(JobStopped, "The job was stopped") if distributed_job.stopped?
-    ensure
-      progress_bar&.stop
-    end
+    def push_and_wait(enum:, job_count: nil)
+      redis_queue = RedisQueue.new(redis: Kraps.redis, token: SecureRandom.hex, namespace: Kraps.namespace, ttl: Kraps.job_ttl)
+      progress_bar = build_progress_bar("#{@klass}: job #{@job_index + 1}/#{@jobs.size}, step #{@step_index + 1}/#{@job.steps.size}, token #{redis_queue.token}, %a, %c/%C (%p%) => #{@step.action}")
+
+      total = 0
+
+      interval = Interval.new(1) do
+        # The interval is used to continuously update the progress bar even
+        # when push_all is used and to avoid sessions being terminated due
+        # to inactivity etc
+
+        progress_bar.total = total
+        progress_bar.progress = [progress_bar.total - redis_queue.size, 0].max
+      end
+
+      enum.each_with_index do |item, part|
+        total += 1
+
+        redis_queue.enqueue(item.merge(part: part))
+      end
+
+      (job_count || total).times do
+        break if redis_queue.stopped?
+
+        Kraps.enqueuer.call(@step.worker, JSON.generate(job_index: @job_index, step_index: @step_index, frame: @frame.to_h, token: redis_queue.token, klass: @klass, args: @args, kwargs: @kwargs))
+      end
+
+      loop do
+        break if redis_queue.size.zero?
+        break if redis_queue.stopped?
+
+        sleep(1)
+      end
+
+      raise(JobStopped, "The job was stopped") if redis_queue.stopped?
+
+      interval.fire(timeout: 1)
+
+      redis_queue.token
+    ensure
+      redis_queue&.stop
+      interval&.stop
+      progress_bar&.stop
+    end
 
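Editor's note: the structural change in the runner is that worker jobs are decoupled from work items. All parts go into the redis queue, but only `job_count` worker jobs are enqueued, each of which drains the queue until it is empty. A condensed sketch of that pattern, with `enqueue_worker` as an illustrative stand-in for the `Kraps.enqueuer` call above:

```ruby
items.each_with_index { |item, part| redis_queue.enqueue(item.merge(part: part)) }

# With jobs: 8 and 128 partitions, only 8 workers are enqueued; each one
# loops over the queue, so no worker is starved by a fixed assignment.
(job_count || items.size).times { enqueue_worker(redis_queue.token) }

sleep(1) until redis_queue.size.zero? || redis_queue.stopped?
```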
data/lib/kraps/step.rb CHANGED
@@ -1,3 +1,3 @@
  module Kraps
-   Step = Struct.new(:action, :partitioner, :partitions, :block, :worker, :before, :frame, :dependency, :options, keyword_init: true)
+   Step = Struct.new(:action, :partitioner, :partitions, :jobs, :block, :worker, :before, :frame, :dependency, :options, keyword_init: true)
  end
data/lib/kraps/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Kraps
-   VERSION = "0.6.0"
+   VERSION = "0.7.0"
  end
data/lib/kraps/worker.rb CHANGED
@@ -11,22 +11,22 @@ module Kraps
     end
 
     def call(retries: 3)
-      return if distributed_job.stopped?
+      return if redis_queue.stopped?
 
       raise(InvalidAction, "Invalid action #{step.action}") unless Actions::ALL.include?(step.action)
 
-      with_retries(retries) do # TODO: allow to use queue based retries
-        step.before&.call
+      dequeue do |payload|
+        with_retries(retries) do # TODO: allow to use queue based retries
+          step.before&.call
 
-        send(:"perform_#{step.action}")
-
-        distributed_job.done(@args["part"])
+          send(:"perform_#{step.action}", payload)
+        end
       end
     end
 
     private
 
-    def perform_parallelize
+    def perform_parallelize(payload)
       implementation = Class.new do
         def map(key)
           yield(key, nil)
@@ -34,19 +34,19 @@ module Kraps
         end
       end
 
       mapper = MapReduce::Mapper.new(implementation.new, partitioner: partitioner, memory_limit: @memory_limit)
-      mapper.map(@args["item"])
+      mapper.map(payload["item"])
 
       mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
         Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
           File.open(path) do |stream|
-            Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{@args["part"]}.json"), stream)
+            Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{payload["part"]}.json"), stream)
           end
         end
       end
     end
 
-    def perform_map
-      temp_paths = download_all(token: @args["frame"]["token"], partition: @args["partition"])
+    def perform_map(payload)
+      temp_paths = download_all(token: @args["frame"]["token"], partition: payload["partition"])
 
       current_step = step
 
@@ -78,7 +78,7 @@ module Kraps
       mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
         Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
           File.open(path) do |stream|
-            Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{@args["part"]}.json"), stream)
+            Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{payload["partition"]}.json"), stream)
           end
         end
       end
@@ -86,11 +86,11 @@ module Kraps
       temp_paths&.delete
     end
 
-    def perform_map_partitions
-      temp_paths = download_all(token: @args["frame"]["token"], partition: @args["partition"])
+    def perform_map_partitions(payload)
+      temp_paths = download_all(token: @args["frame"]["token"], partition: payload["partition"])
 
       current_step = step
-      current_partition = @args["partition"]
+      current_partition = payload["partition"]
 
       implementation = Object.new
       implementation.define_singleton_method(:map) do |enum, &block|
@@ -111,7 +111,7 @@ module Kraps
       mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
         Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
           File.open(path) do |stream|
-            Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{@args["part"]}.json"), stream)
+            Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{payload["partition"]}.json"), stream)
           end
         end
       end
@@ -119,7 +119,7 @@ module Kraps
       temp_paths&.delete
     end
 
-    def perform_reduce
+    def perform_reduce(payload)
       current_step = step
 
       implementation = Object.new
@@ -129,7 +129,7 @@ module Kraps
 
       reducer = MapReduce::Reducer.new(implementation)
 
-      Parallelizer.each(Kraps.driver.list(prefix: Kraps.driver.with_prefix("#{@args["frame"]["token"]}/#{@args["partition"]}/")), @concurrency) do |file|
+      Parallelizer.each(Kraps.driver.list(prefix: Kraps.driver.with_prefix("#{@args["frame"]["token"]}/#{payload["partition"]}/")), @concurrency) do |file|
         Kraps.driver.download(file, reducer.add_chunk)
       end
 
@@ -139,14 +139,14 @@ module Kraps
         tempfile.puts(JSON.generate([key, value]))
       end
 
-      Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{@args["partition"]}/chunk.#{@args["part"]}.json"), tempfile.tap(&:rewind))
+      Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{payload["partition"]}/chunk.#{payload["partition"]}.json"), tempfile.tap(&:rewind))
     ensure
       tempfile&.close(true)
     end
 
-    def perform_combine
-      temp_paths1 = download_all(token: @args["frame"]["token"], partition: @args["partition"])
-      temp_paths2 = download_all(token: @args["combine_frame"]["token"], partition: @args["partition"])
+    def perform_combine(payload)
+      temp_paths1 = download_all(token: @args["frame"]["token"], partition: payload["partition"])
+      temp_paths2 = download_all(token: payload["combine_frame"]["token"], partition: payload["partition"])
 
       enum1 = k_way_merge(temp_paths1.each.to_a, chunk_limit: @chunk_limit)
       enum2 = k_way_merge(temp_paths2.each.to_a, chunk_limit: @chunk_limit)
@@ -167,7 +167,7 @@ module Kraps
       mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
         Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
           File.open(path) do |stream|
-            Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{@args["part"]}.json"), stream)
+            Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{payload["partition"]}.json"), stream)
           end
         end
       end
@@ -213,10 +213,10 @@ module Kraps
         end
       end
 
-    def perform_each_partition
+    def perform_each_partition(payload)
       temp_paths = TempPaths.new
 
-      files = Kraps.driver.list(prefix: Kraps.driver.with_prefix("#{@args["frame"]["token"]}/#{@args["partition"]}/")).sort
+      files = Kraps.driver.list(prefix: Kraps.driver.with_prefix("#{@args["frame"]["token"]}/#{payload["partition"]}/")).sort
 
       temp_paths_index = files.each_with_object({}) do |file, hash|
         hash[file] = temp_paths.add
@@ -226,7 +226,7 @@ module Kraps
         Kraps.driver.download(file, temp_paths_index[file].path)
       end
 
-      step.block.call(@args["partition"], k_way_merge(temp_paths.each.to_a, chunk_limit: @chunk_limit))
+      step.block.call(payload["partition"], k_way_merge(temp_paths.each.to_a, chunk_limit: @chunk_limit))
     ensure
       temp_paths&.delete
     end
@@ -237,11 +237,11 @@ module Kraps
       begin
         yield
       rescue Kraps::Error
-        distributed_job.stop
+        redis_queue.stop
         raise
       rescue StandardError => e
         if retries >= num_retries
-          distributed_job.stop
+          redis_queue.stop
           raise
         end
 
@@ -254,6 +254,21 @@ module Kraps
       end
     end
 
+    def dequeue
+      loop do
+        break if redis_queue.stopped?
+        break if redis_queue.size.zero?
+
+        redis_queue.dequeue do |payload|
+          payload ? yield(payload) : sleep(1)
+        end
+      end
+    end
+
+    def redis_queue
+      @redis_queue ||= RedisQueue.new(redis: Kraps.redis, token: @args["token"], namespace: Kraps.namespace, ttl: Kraps.job_ttl)
+    end
+
     def download_all(token:, partition:)
       temp_paths = TempPaths.new
 
@@ -301,9 +316,5 @@ module Kraps
     def partitioner
       @partitioner ||= proc { |key| step.partitioner.call(key, step.partitions) }
     end
-
-    def distributed_job
-      @distributed_job ||= Kraps.distributed_job_client.build(token: @args["token"])
-    end
   end
 end
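Editor's note: the worker side of the same change, reduced to its skeleton. Each worker now processes many parts rather than exactly one; `perform` stands in for the `perform_#{step.action}` dispatch above:

```ruby
loop do
  break if redis_queue.stopped?   # another worker hit a fatal error
  break if redis_queue.size.zero? # nothing queued and nothing pending

  redis_queue.dequeue do |payload|
    # A nil payload means the remaining jobs are pending on other
    # workers; back off instead of spinning.
    payload ? perform(payload) : sleep(1)
  end
end
```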
data/lib/kraps.rb CHANGED
@@ -1,4 +1,3 @@
- require "distributed_job"
  require "ruby-progressbar"
  require "ruby-progressbar/outputs/null"
  require "map_reduce"
@@ -9,6 +8,7 @@ require_relative "kraps/drivers"
  require_relative "kraps/actions"
  require_relative "kraps/parallelizer"
  require_relative "kraps/hash_partitioner"
+ require_relative "kraps/redis_queue"
  require_relative "kraps/temp_path"
  require_relative "kraps/temp_paths"
  require_relative "kraps/timeout_queue"
@@ -27,9 +27,11 @@ module Kraps
   class JobStopped < Error; end
   class IncompatibleFrame < Error; end
 
-  def self.configure(driver:, redis: Redis.new, namespace: nil, job_ttl: 24 * 60 * 60, show_progress: true, enqueuer: ->(worker, json) { worker.perform_async(json) })
+  def self.configure(driver:, redis: Redis.new, namespace: nil, job_ttl: 4 * 24 * 60 * 60, show_progress: true, enqueuer: ->(worker, json) { worker.perform_async(json) })
     @driver = driver
-    @distributed_job_client = DistributedJob::Client.new(redis: redis, namespace: namespace, default_ttl: job_ttl)
+    @redis = redis
+    @namespace = namespace
+    @job_ttl = job_ttl.to_i
     @show_progress = show_progress
     @enqueuer = enqueuer
   end
@@ -38,8 +40,16 @@ module Kraps
     @driver
   end
 
-  def self.distributed_job_client
-    @distributed_job_client
+  def self.redis
+    @redis
+  end
+
+  def self.namespace
+    @namespace
+  end
+
+  def self.job_ttl
+    @job_ttl
   end
 
   def self.show_progress?
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: kraps
  version: !ruby/object:Gem::Version
-   version: 0.6.0
+   version: 0.7.0
  platform: ruby
  authors:
  - Benjamin Vetter
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2022-11-16 00:00:00.000000000 Z
+ date: 2022-12-02 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: attachie
@@ -24,20 +24,6 @@ dependencies:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
- - !ruby/object:Gem::Dependency
-   name: distributed_job
-   requirement: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
-   type: :runtime
-   prerelease: false
-   version_requirements: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
  - !ruby/object:Gem::Dependency
    name: map-reduce-ruby
    requirement: !ruby/object:Gem::Requirement
@@ -149,6 +135,7 @@ files:
  - lib/kraps/job.rb
  - lib/kraps/job_resolver.rb
  - lib/kraps/parallelizer.rb
+ - lib/kraps/redis_queue.rb
  - lib/kraps/runner.rb
  - lib/kraps/step.rb
  - lib/kraps/temp_path.rb