kraps 0.6.0 → 0.8.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 02ba582478178273300e77d5ddd18a8568fd682b0c53a444f1c5e7b756f9fd9a
- data.tar.gz: 9e1698a67c252512a2f277ca1a6e079e063f8f1e6c13500245aabd969cca122d
+ metadata.gz: d261c779e82209152e26decbc6c5a6c5c5ddb0fb40803884383617635727d3b2
+ data.tar.gz: 1b9c6fa8db7a7811cbac5a7a5db518e1f3ee75df583521b64417341e830425f4
  SHA512:
- metadata.gz: 2d1b3bd10d1048c64804ddf86069c0757247b581edea0f38330189a10d35ed096970d008533a167ddaf97c16479425e3baa2bb56f5a8888255ecfd12b911a168
- data.tar.gz: 045263d6aa920cef97a162fcbfd41f239087c459a8c86bd2112d3ce4524c48c6e65ceaac8993f8ea7fb9089a8877db2863e39644888c8bd884df0fa95241277d
+ metadata.gz: dcb05139042149be087b1a2c7f14a31cd5e28dedb1517aca83299f63b90046e4d05e0ab19dfaeede329e784880623abda19675252cdeaad04f8ccd87249afde5
+ data.tar.gz: 10fd07c322c659ae21a682832eba30416c830f9d2146af685d69168ad5137045ef4268c0a43cee4e879bb875edf900ca740bbe4cbfe8b91b34ad3df40763bce0
data/.rubocop.yml CHANGED
@@ -80,3 +80,6 @@ Style/WordArray:
 
  Style/RedundantEach:
  Enabled: false
+
+ Lint/NonLocalExitFromIterator:
+ Enabled: false
data/CHANGELOG.md CHANGED
@@ -1,5 +1,19 @@
  # CHANGELOG
 
+ ## v0.8.0
+
+ * Use number of partitions of previous step for `jobs` option by default
+ * Changed `combine` to receive a `collector`
+ * Added mandatory `concurrency` argument to `load`
+
+ ## v0.7.0
+
+ * Added a `jobs` option to the actions to limit the concurrency
+ when e.g. accessing external data stores and to avoid overloading
+ them
+ * Added a queue using redis for the jobs to avoid starving workers
+ * Removed `distributed_job` dependency
+
  ## v0.6.0
 
  * Added `map_partitions`
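For orientation, here is what those changes look like at the call sites, as a hedged sketch based on the README updates below (editor's illustration; `MyKrapsWorker` and `other_job` are assumed to exist):

```ruby
# v0.7.0: `jobs` caps how many workers process a step concurrently;
# v0.8.0: it defaults to the number of partitions of the previous step.
job = job.map(partitions: 128, jobs: 8) do |key, value, collector|
  collector.call(key, value)
end

# v0.8.0: `combine` now receives a collector instead of using the block's return value.
job = job.combine(other_job, worker: MyKrapsWorker) do |key, value1, value2, collector|
  collector.call(key, (value1 || {}).merge(value2 || {}))
end

# v0.8.0: `load` now requires `concurrency`, the number of download threads.
job = job.load(prefix: "path/to/dump", partitions: 32, concurrency: 8, partitioner: Kraps::HashPartitioner.new, worker: MyKrapsWorker)
```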
data/Gemfile.lock CHANGED
@@ -1,9 +1,8 @@
  PATH
  remote: .
  specs:
- kraps (0.6.0)
+ kraps (0.7.0)
  attachie
- distributed_job
  map-reduce-ruby (>= 3.0.0)
  redis
  ruby-progressbar
@@ -41,8 +40,6 @@ GEM
  concurrent-ruby (1.1.10)
  connection_pool (2.3.0)
  diff-lcs (1.5.0)
- distributed_job (3.1.0)
- redis (>= 4.1.0)
  i18n (1.12.0)
  concurrent-ruby (~> 1.0)
  jmespath (1.6.1)
@@ -62,7 +59,7 @@ GEM
  rake (13.0.6)
  redis (5.0.5)
  redis-client (>= 0.9.0)
- redis-client (0.11.1)
+ redis-client (0.11.2)
  connection_pool
  regexp_parser (2.6.0)
  rexml (3.2.5)
data/README.md CHANGED
@@ -30,7 +30,7 @@ Kraps.configure(
  driver: Kraps::Drivers::S3Driver.new(s3_client: Aws::S3::Client.new("..."), bucket: "some-bucket", prefix: "temp/kraps/"),
  redis: Redis.new,
  namespace: "my-application", # An optional namespace to be used for redis keys, default: nil
- job_ttl: 24.hours, # Job information in redis will automatically be removed after this amount of time, default: 24 hours
+ job_ttl: 7.days, # Job information in redis will automatically be removed after this amount of time, default: 4 days
  show_progress: true # Whether or not to show the progress in the terminal when executing jobs, default: true
  enqueuer: ->(worker, json) { worker.perform_async(json) } # Allows customizing the enqueueing of worker jobs
)
@@ -220,7 +220,7 @@ items are used as keys and the values are set to `nil`.
  * `map`: Maps the key value pairs to other key value pairs
 
  ```ruby
- job.map(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker) do |key, value, collector|
+ job.map(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker, jobs: 8) do |key, value, collector|
  collector.call("changed #{key}", "changed #{value}")
  end
  ```
@@ -229,13 +229,22 @@ The block gets each key-value pair passed and the `collector` block can be
  called as often as necessary. This is also the reason why `map` cannot simply
  return the new key-value pair, but the `collector` must be used instead.
 
+ The `jobs` argument can be useful when you need to access an external data
+ source, like a relational database, and you want to limit the number of workers
+ accessing the store concurrently to avoid overloading it. If you don't specify
+ it, it will be identical to the number of partitions of the previous step. It
+ is recommended to only use it for steps where you need to throttle the
+ concurrency, because it will of course slow down the processing. The `jobs`
+ argument only applies to the current step. The following steps don't inherit
+ the argument, but reset it.
+
  * `map_partitions`: Maps the key value pairs to other key value pairs, but the
  block receives all data of each partition as an enumerable and sorted by key.
  Please be aware that you should not call `to_a` or similar on the enumerable.
  Prefer `map` over `map_partitions` when possible.
 
  ```ruby
- job.map_partitions(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker) do |pairs, collector|
+ job.map_partitions(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker, jobs: 8) do |pairs, collector|
  pairs.each do |key, value|
  collector.call("changed #{key}", "changed #{value}")
  end
@@ -245,7 +254,7 @@ end
  * `reduce`: Reduces the values of pairs having the same key
 
  ```ruby
- job.reduce(worker: MyKrapsWorker) do |key, value1, value2|
+ job.reduce(worker: MyKrapsWorker, jobs: 8) do |key, value1, value2|
  value1 + value2
  end
  ```
@@ -265,8 +274,8 @@ most of the time, this is not necessary and the key can simply be ignored.
  passed job result are completely omitted.
 
  ```ruby
- job.combine(other_job, worker: MyKrapsWorker) do |key, value1, value2|
- (value1 || {}).merge(value2 || {})
+ job.combine(other_job, worker: MyKrapsWorker, jobs: 8) do |key, value1, value2, collector|
+ collector.call(key, (value1 || {}).merge(value2 || {}))
  end
  ```
 
@@ -279,7 +288,7 @@ since Kraps detects the dependency on its own.
  * `repartition`: Used to change the partitioning
 
  ```ruby
- job.repartition(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker)
+ job.repartition(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker, jobs: 8)
  ```
 
  Repartitions all data into the specified number of partitions using the
@@ -290,7 +299,7 @@ specified partitioner.
  `to_a` or similar on the enumerable.
 
  ```ruby
- job.each_partition do |partition, pairs|
+ job.each_partition(jobs: 8) do |partition, pairs|
  pairs.each do |key, value|
  # ...
  end
@@ -308,10 +317,12 @@ It creates a folder for every partition and stores one or more chunks in there.
  * `load`: Loads the previously dumped data
 
  ```ruby
- job.load(prefix: "path/to/dump", partitions: 32, partitioner: Kraps::HashPartitioner.new, worker: MyKrapsWorker)
+ job.load(prefix: "path/to/dump", partitions: 32, concurrency: 8, partitioner: Kraps::HashPartitioner.new, worker: MyKrapsWorker)
  ```
 
- The number of partitions and the partitioner must be specified.
+ The number of partitions, the partitioner and the concurrency must be specified.
+ The concurrency specifies the number of threads used for downloading chunks in
+ parallel.
 
  Please note that every API method accepts a `before` callable:
 
@@ -379,7 +390,8 @@ jobs only once.
  Kraps ships with an in-memory fake driver for storage, which you can use for
  testing purposes instead of the s3 driver:
 
- ```ruby Kraps.configure(
+ ```ruby
+ Kraps.configure(
  driver: Kraps::Drivers::FakeDriver.new(bucket: "kraps"),
  # ...
  ) ```
@@ -425,8 +437,6 @@ The API of the driver is:
  Kraps is built on top of
  [map-reduce-ruby](https://github.com/mrkamel/map-reduce-ruby) for the
  map/reduce framework,
- [distributed_job](https://github.com/mrkamel/distributed_job)
- to keep track of the job/step status,
  [attachie](https://github.com/mrkamel/attachie) to interact with the storage
  layer (s3),
  [ruby-progressbar](https://github.com/jfelchner/ruby-progressbar) to
data/docker-compose.yml CHANGED
@@ -1,6 +1,6 @@
  version: '2'
  services:
- elasticsearch:
+ redis:
  image: redis
  ports:
  - 6379:6379
data/lib/kraps/downloader.rb ADDED
@@ -0,0 +1,19 @@
+ module Kraps
+ class Downloader
+ def self.download_all(prefix:, concurrency:)
+ temp_paths = TempPaths.new
+
+ files = Kraps.driver.list(prefix: prefix).sort
+
+ temp_paths_index = files.each_with_object({}) do |file, hash|
+ hash[file] = temp_paths.add
+ end
+
+ Parallelizer.each(files, concurrency) do |file|
+ Kraps.driver.download(file, temp_paths_index[file].path)
+ end
+
+ temp_paths
+ end
+ end
+ end
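The new `Downloader` extracts the download-and-tempfile pattern that was previously duplicated in `Job#load` and `Worker#download_all`. A minimal usage sketch (editor's illustration; the prefix is hypothetical):

```ruby
# Fetches every chunk below the prefix in parallel into temp files.
temp_paths = Kraps::Downloader.download_all(prefix: "some-token/0/", concurrency: 8)

temp_paths.each do |temp_path|
  File.open(temp_path.path) do |stream|
    stream.each_line { |line| p JSON.parse(line) }
  end
end

temp_paths.delete # remove the temp files when done
```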
data/lib/kraps/job.rb CHANGED
@@ -27,14 +27,17 @@ module Kraps
  end
  end
 
- def map(partitions: nil, partitioner: nil, worker: @worker, before: nil, &block)
+ def map(partitions: nil, partitioner: nil, jobs: nil, worker: @worker, before: nil, &block)
  fresh.tap do |job|
  job.instance_eval do
+ jobs = [jobs, @partitions].compact.min
+
  @partitions = partitions if partitions
  @partitioner = partitioner if partitioner
 
  @steps << Step.new(
  action: Actions::MAP,
+ jobs: jobs,
  partitions: @partitions,
  partitioner: @partitioner,
  worker: worker,
@@ -45,14 +48,17 @@ module Kraps
  end
  end
 
- def map_partitions(partitions: nil, partitioner: nil, worker: @worker, before: nil, &block)
+ def map_partitions(partitions: nil, partitioner: nil, jobs: nil, worker: @worker, before: nil, &block)
  fresh.tap do |job|
  job.instance_eval do
+ jobs = [jobs, @partitions].compact.min
+
  @partitions = partitions if partitions
  @partitioner = partitioner if partitioner
 
  @steps << Step.new(
  action: Actions::MAP_PARTITIONS,
+ jobs: jobs,
  partitions: @partitions,
  partitioner: @partitioner,
  worker: worker,
@@ -63,11 +69,12 @@ module Kraps
  end
  end
 
- def reduce(worker: @worker, before: nil, &block)
+ def reduce(jobs: nil, worker: @worker, before: nil, &block)
  fresh.tap do |job|
  job.instance_eval do
  @steps << Step.new(
  action: Actions::REDUCE,
+ jobs: [jobs, @partitions].compact.min,
  partitions: @partitions,
  partitioner: @partitioner,
  worker: worker,
@@ -78,11 +85,12 @@ module Kraps
  end
  end
 
- def combine(other_job, worker: @worker, before: nil, &block)
+ def combine(other_job, jobs: nil, worker: @worker, before: nil, &block)
  fresh.tap do |job|
  job.instance_eval do
  @steps << Step.new(
  action: Actions::COMBINE,
+ jobs: [jobs, @partitions].compact.min,
  partitions: @partitions,
  partitioner: @partitioner,
  worker: worker,
@@ -95,11 +103,12 @@ module Kraps
  end
  end
 
- def each_partition(worker: @worker, before: nil, &block)
+ def each_partition(jobs: nil, worker: @worker, before: nil, &block)
  fresh.tap do |job|
  job.instance_eval do
  @steps << Step.new(
  action: Actions::EACH_PARTITION,
+ jobs: [jobs, @partitions].compact.min,
  partitions: @partitions,
  partitioner: @partitioner,
  worker: worker,
@@ -110,8 +119,8 @@ module Kraps
  end
  end
 
- def repartition(partitions:, partitioner: nil, worker: @worker, before: nil)
- map(partitions: partitions, partitioner: partitioner, worker: worker, before: before) do |key, value, collector|
+ def repartition(partitions:, jobs: nil, partitioner: nil, worker: @worker, before: nil)
+ map(jobs: jobs, partitions: partitions, partitioner: partitioner, worker: worker, before: before) do |key, value, collector|
  collector.call(key, value)
  end
  end
@@ -130,7 +139,7 @@ module Kraps
  end
  end
 
- def load(prefix:, partitions:, partitioner:, worker: @worker)
+ def load(prefix:, partitions:, partitioner:, concurrency:, worker: @worker)
  job = parallelize(partitions: partitions, partitioner: proc { |key, _| key }, worker: worker) do |collector|
  (0...partitions).each do |partition|
  collector.call(partition)
@@ -138,20 +147,19 @@ module Kraps
  end
 
  job.map_partitions(partitioner: partitioner, worker: worker) do |partition, _, collector|
- tempfile = Tempfile.new
+ temp_paths = Downloader.download_all(prefix: File.join(prefix, partition.to_s, "/"), concurrency: concurrency)
 
- path = File.join(prefix, partition.to_s, "chunk.json")
- next unless Kraps.driver.exists?(path)
+ temp_paths.each do |temp_path|
+ File.open(temp_path.path) do |stream|
+ stream.each_line do |line|
+ key, value = JSON.parse(line)
 
- Kraps.driver.download(path, tempfile.path)
-
- tempfile.each_line do |line|
- key, value = JSON.parse(line)
-
- collector.call(key, value)
+ collector.call(key, value)
+ end
+ end
  end
  ensure
- tempfile&.close(true)
+ temp_paths&.delete
  end
  end
 
data/lib/kraps/redis_queue.rb ADDED
@@ -0,0 +1,151 @@
+ module Kraps
+ class RedisQueue
+ VISIBILITY_TIMEOUT = 60
+
+ attr_reader :token
+
+ def initialize(redis:, token:, namespace:, ttl:)
+ @redis = redis
+ @token = token
+ @namespace = namespace
+ @ttl = ttl
+ end
+
+ def size
+ @size_script ||= <<~SCRIPT
+ local queue_key, pending_key, status_key, ttl, job = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4]), ARGV[5]
+
+ redis.call('expire', queue_key, ttl)
+ redis.call('expire', pending_key, ttl)
+ redis.call('expire', status_key, ttl)
+
+ return redis.call('llen', queue_key) + redis.call('zcard', pending_key)
+ SCRIPT
+
+ @redis.eval(@size_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl])
+ end
+
+ def enqueue(payload)
+ @enqueue_script ||= <<~SCRIPT
+ local queue_key, pending_key, status_key, ttl, job = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4]), ARGV[5]
+
+ redis.call('rpush', queue_key, job)
+
+ redis.call('expire', queue_key, ttl)
+ redis.call('expire', pending_key, ttl)
+ redis.call('expire', status_key, ttl)
+ SCRIPT
+
+ @redis.eval(@enqueue_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl, JSON.generate(payload)])
+ end
+
+ def dequeue
+ @dequeue_script ||= <<~SCRIPT
+ local queue_key, pending_key, status_key, ttl, visibility_timeout = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4]), tonumber(ARGV[5])
+
+ local zitem = redis.call('zrange', pending_key, 0, 0, 'WITHSCORES')
+ local job = zitem[1]
+
+ if not zitem[2] or tonumber(zitem[2]) > tonumber(redis.call('time')[1]) then
+ job = redis.call('lpop', queue_key)
+ end
+
+ redis.call('expire', queue_key, ttl)
+ redis.call('expire', pending_key, ttl)
+ redis.call('expire', status_key, ttl)
+
+ if not job then return nil end
+
+ redis.call('zadd', pending_key, tonumber(redis.call('time')[1]) + visibility_timeout, job)
+ redis.call('expire', pending_key, ttl)
+
+ return job
+ SCRIPT
+
+ job = @redis.eval(@dequeue_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl, VISIBILITY_TIMEOUT])
+
+ unless job
+ yield(nil)
+ return
+ end
+
+ keep_alive(job) do
+ yield(JSON.parse(job)) if job
+ end
+
+ @remove_script ||= <<~SCRIPT
+ local queue_key, pending_key, status_key, ttl, job = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4]), ARGV[5]
+
+ redis.call('zrem', pending_key, job)
+
+ redis.call('expire', queue_key, ttl)
+ redis.call('expire', pending_key, ttl)
+ redis.call('expire', status_key, ttl)
+ SCRIPT
+
+ @redis.eval(@remove_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl, job])
+ end
+
+ def stop
+ @stop_script ||= <<~SCRIPT
+ local queue_key, pending_key, status_key, ttl = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4])
+
+ redis.call('hset', status_key, 'stopped', 1)
+
+ redis.call('expire', queue_key, ttl)
+ redis.call('expire', pending_key, ttl)
+ redis.call('expire', status_key, ttl)
+ SCRIPT
+
+ @redis.eval(@stop_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl])
+ end
+
+ def stopped?
+ @stopped_script ||= <<~SCRIPT
+ local queue_key, pending_key, status_key, ttl = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4])
+
+ redis.call('expire', queue_key, ttl)
+ redis.call('expire', pending_key, ttl)
+ redis.call('expire', status_key, ttl)
+
+ return redis.call('hget', status_key, 'stopped')
+ SCRIPT
+
+ @redis.eval(@stopped_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl]).to_i == 1
+ end
+
+ private
+
+ def keep_alive(job)
+ @keep_alive_script ||= <<~SCRIPT
+ local queue_key, pending_key, status_key, ttl, job, visibility_timeout = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4]), ARGV[5], tonumber(ARGV[6])
+
+ redis.call('zadd', pending_key, tonumber(redis.call('time')[1]) + visibility_timeout, job)
+
+ redis.call('expire', queue_key, ttl)
+ redis.call('expire', pending_key, ttl)
+ redis.call('expire', status_key, ttl)
+ SCRIPT
+
+ interval = Interval.new(5) do
+ @redis.eval(@keep_alive_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl, job, VISIBILITY_TIMEOUT])
+ end
+
+ yield
+ ensure
+ interval&.stop
+ end
+
+ def redis_queue_key
+ [@namespace, "kraps", "queue", @token].compact.join(":")
+ end
+
+ def redis_pending_key
+ [@namespace, "kraps", "pending", @token].compact.join(":")
+ end
+
+ def redis_status_key
+ [@namespace, "kraps", "status", @token].compact.join(":")
+ end
+ end
+ end
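In short, `RedisQueue` is a small reliable queue built on redis lists and sorted sets: `enqueue` pushes a JSON payload onto a list, `dequeue` moves a job into a `pending` sorted set scored with a visibility timeout and, via `keep_alive`, re-scores it every 5 seconds while the block runs. Only successful completion removes the job, so jobs of crashed workers become visible again after `VISIBILITY_TIMEOUT` seconds. A usage sketch (editor's illustration; the payload and `process` are hypothetical):

```ruby
queue = Kraps::RedisQueue.new(redis: Redis.new, token: SecureRandom.hex, namespace: nil, ttl: 86_400)

queue.enqueue(part: 0, partition: 4) # runner side

queue.dequeue do |payload|              # worker side
  payload ? process(payload) : sleep(1) # payload is nil when the queue is drained
end
```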
data/lib/kraps/runner.rb CHANGED
@@ -45,48 +45,35 @@ module Kraps
 
  def perform_parallelize
  enum = Enumerator.new do |yielder|
- collector = proc { |item| yielder << item }
+ collector = proc { |item| yielder << { item: item } }
 
  @step.block.call(collector)
  end
 
- with_distributed_job do |distributed_job|
- push_and_wait(distributed_job, enum) do |item, part|
- enqueue(token: distributed_job.token, part: part, item: item)
- end
+ token = push_and_wait(enum: enum)
 
- Frame.new(token: distributed_job.token, partitions: @step.partitions)
- end
+ Frame.new(token: token, partitions: @step.partitions)
  end
 
  def perform_map
- with_distributed_job do |distributed_job|
- push_and_wait(distributed_job, 0...@frame.partitions) do |partition, part|
- enqueue(token: distributed_job.token, part: part, partition: partition)
- end
+ enum = (0...@frame.partitions).map { |partition| { partition: partition } }
+ token = push_and_wait(job_count: @step.jobs, enum: enum)
 
- Frame.new(token: distributed_job.token, partitions: @step.partitions)
- end
+ Frame.new(token: token, partitions: @step.partitions)
  end
 
  def perform_map_partitions
- with_distributed_job do |distributed_job|
- push_and_wait(distributed_job, 0...@frame.partitions) do |partition, part|
- enqueue(token: distributed_job.token, part: part, partition: partition)
- end
+ enum = (0...@frame.partitions).map { |partition| { partition: partition } }
+ token = push_and_wait(job_count: @step.jobs, enum: enum)
 
- Frame.new(token: distributed_job.token, partitions: @step.partitions)
- end
+ Frame.new(token: token, partitions: @step.partitions)
  end
 
  def perform_reduce
- with_distributed_job do |distributed_job|
- push_and_wait(distributed_job, 0...@frame.partitions) do |partition, part|
- enqueue(token: distributed_job.token, part: part, partition: partition)
- end
+ enum = (0...@frame.partitions).map { |partition| { partition: partition } }
+ token = push_and_wait(job_count: @step.jobs, enum: enum)
 
- Frame.new(token: distributed_job.token, partitions: @step.partitions)
- end
+ Frame.new(token: token, partitions: @step.partitions)
  end
 
  def perform_combine
@@ -95,82 +82,64 @@ module Kraps
 
  raise(IncompatibleFrame, "Incompatible number of partitions") if combine_step.partitions != @step.partitions
 
- with_distributed_job do |distributed_job|
- push_and_wait(distributed_job, 0...@frame.partitions) do |partition, part|
- enqueue(token: distributed_job.token, part: part, partition: partition, combine_frame: combine_step.frame.to_h)
- end
-
- Frame.new(token: distributed_job.token, partitions: @step.partitions)
+ enum = (0...@frame.partitions).map do |partition|
+ { partition: partition, combine_frame: combine_step.frame.to_h }
  end
+
+ token = push_and_wait(job_count: @step.jobs, enum: enum)
+
+ Frame.new(token: token, partitions: @step.partitions)
  end
 
  def perform_each_partition
- with_distributed_job do |distributed_job|
- push_and_wait(distributed_job, 0...@frame.partitions) do |partition, part|
- enqueue(token: distributed_job.token, part: part, partition: partition)
- end
+ enum = (0...@frame.partitions).map { |partition| { partition: partition } }
+ push_and_wait(job_count: @step.jobs, enum: enum)
 
- @frame
- end
+ @frame
  end
 
- def enqueue(token:, part:, **rest)
- Kraps.enqueuer.call(
- @step.worker,
- JSON.generate(
- job_index: @job_index,
- step_index: @step_index,
- frame: @frame.to_h,
- token: token,
- part: part,
- klass: @klass,
- args: @args,
- kwargs: @kwargs,
- **rest
- )
- )
- end
+ def push_and_wait(enum:, job_count: nil)
+ redis_queue = RedisQueue.new(redis: Kraps.redis, token: SecureRandom.hex, namespace: Kraps.namespace, ttl: Kraps.job_ttl)
+ progress_bar = build_progress_bar("#{@klass}: job #{@job_index + 1}/#{@jobs.size}, step #{@step_index + 1}/#{@job.steps.size}, #{@step.jobs || "?"} jobs, token #{redis_queue.token}, %a, %c/%C (%p%) => #{@step.action}")
 
- def with_distributed_job
- distributed_job = Kraps.distributed_job_client.build(token: SecureRandom.hex)
+ total = 0
 
- yield(distributed_job)
- rescue Interrupt
- distributed_job&.stop
- raise
- end
+ interval = Interval.new(1) do
+ # The interval is used to continuously update the progress bar even
+ # when push_all is used and to avoid sessions being terminated due
+ # to inactivity etc.
 
- def push_and_wait(distributed_job, enum)
- progress_bar = build_progress_bar("#{@klass}: job #{@job_index + 1}/#{@jobs.size}, step #{@step_index + 1}/#{@job.steps.size}, token #{distributed_job.token}, %a, %c/%C (%p%) => #{@step.action}")
+ progress_bar.total = total
+ progress_bar.progress = [progress_bar.total - redis_queue.size, 0].max
+ end
 
- begin
- total = 0
+ enum.each_with_index do |item, part|
+ total += 1
 
- interval = Interval.new(1) do
- progress_bar.total = total
- end
+ redis_queue.enqueue(item.merge(part: part))
+ end
 
- distributed_job.push_each(enum) do |item, part|
- total += 1
- interval.fire(timeout: 1)
+ (job_count || total).times do
+ break if redis_queue.stopped?
 
- yield(item, part)
- end
- ensure
- interval&.stop
+ Kraps.enqueuer.call(@step.worker, JSON.generate(job_index: @job_index, step_index: @step_index, frame: @frame.to_h, token: redis_queue.token, klass: @klass, args: @args, kwargs: @kwargs))
  end
 
  loop do
- progress_bar.total = distributed_job.total
- progress_bar.progress = progress_bar.total - distributed_job.count
-
- break if distributed_job.finished? || distributed_job.stopped?
+ break if redis_queue.size.zero?
+ break if redis_queue.stopped?
 
  sleep(1)
  end
 
- raise(JobStopped, "The job was stopped") if distributed_job.stopped?
+ raise(JobStopped, "The job was stopped") if redis_queue.stopped?
+
+ interval.fire(timeout: 1)
+
+ redis_queue.token
  ensure
+ redis_queue&.stop
+ interval&.stop
  progress_bar&.stop
  end
 
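Taken together, the runner now drives each step through the redis queue instead of `distributed_job`: it enqueues one queue item per partition, fires at most `step.jobs` worker jobs, and polls until the queue drains. A condensed sketch of that control flow (editor's paraphrase of the diff above, not the gem's verbatim code; `enqueue_worker` is hypothetical):

```ruby
def push_and_wait_sketch(items, job_count: nil)
  queue = Kraps::RedisQueue.new(redis: Kraps.redis, token: SecureRandom.hex, namespace: Kraps.namespace, ttl: Kraps.job_ttl)

  items.each_with_index { |item, part| queue.enqueue(item.merge(part: part)) }

  # Fire at most job_count workers; each worker keeps dequeuing until the
  # queue is empty, so a small job_count throttles the step without
  # leaving queue items unprocessed.
  (job_count || items.size).times { enqueue_worker(queue.token) }

  sleep(1) until queue.size.zero? || queue.stopped?
  queue.token
end
```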
data/lib/kraps/step.rb CHANGED
@@ -1,3 +1,3 @@
  module Kraps
- Step = Struct.new(:action, :partitioner, :partitions, :block, :worker, :before, :frame, :dependency, :options, keyword_init: true)
+ Step = Struct.new(:action, :partitioner, :partitions, :jobs, :block, :worker, :before, :frame, :dependency, :options, keyword_init: true)
  end
data/lib/kraps/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Kraps
- VERSION = "0.6.0"
+ VERSION = "0.8.0"
  end
data/lib/kraps/worker.rb CHANGED
@@ -11,22 +11,22 @@ module Kraps
  end
 
  def call(retries: 3)
- return if distributed_job.stopped?
+ return if redis_queue.stopped?
 
  raise(InvalidAction, "Invalid action #{step.action}") unless Actions::ALL.include?(step.action)
 
- with_retries(retries) do # TODO: allow to use queue based retries
- step.before&.call
+ dequeue do |payload|
+ with_retries(retries) do # TODO: allow to use queue based retries
+ step.before&.call
 
- send(:"perform_#{step.action}")
-
- distributed_job.done(@args["part"])
+ send(:"perform_#{step.action}", payload)
+ end
  end
  end
 
  private
 
- def perform_parallelize
+ def perform_parallelize(payload)
  implementation = Class.new do
  def map(key)
  yield(key, nil)
@@ -34,19 +34,19 @@ module Kraps
  end
 
  mapper = MapReduce::Mapper.new(implementation.new, partitioner: partitioner, memory_limit: @memory_limit)
- mapper.map(@args["item"])
+ mapper.map(payload["item"])
 
  mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
  Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
  File.open(path) do |stream|
- Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{@args["part"]}.json"), stream)
+ Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{payload["part"]}.json"), stream)
  end
  end
  end
  end
 
- def perform_map
- temp_paths = download_all(token: @args["frame"]["token"], partition: @args["partition"])
+ def perform_map(payload)
+ temp_paths = download_all(token: @args["frame"]["token"], partition: payload["partition"])
 
  current_step = step
 
@@ -78,7 +78,7 @@ module Kraps
  mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
  Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
  File.open(path) do |stream|
- Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{@args["part"]}.json"), stream)
+ Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{payload["partition"]}.json"), stream)
  end
  end
  end
@@ -86,11 +86,11 @@ module Kraps
  temp_paths&.delete
  end
 
- def perform_map_partitions
- temp_paths = download_all(token: @args["frame"]["token"], partition: @args["partition"])
+ def perform_map_partitions(payload)
+ temp_paths = download_all(token: @args["frame"]["token"], partition: payload["partition"])
 
  current_step = step
- current_partition = @args["partition"]
+ current_partition = payload["partition"]
 
  implementation = Object.new
  implementation.define_singleton_method(:map) do |enum, &block|
@@ -111,7 +111,7 @@ module Kraps
  mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
  Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
  File.open(path) do |stream|
- Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{@args["part"]}.json"), stream)
+ Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{payload["partition"]}.json"), stream)
  end
  end
  end
@@ -119,7 +119,7 @@ module Kraps
  temp_paths&.delete
  end
 
- def perform_reduce
+ def perform_reduce(payload)
  current_step = step
 
  implementation = Object.new
@@ -129,7 +129,7 @@ module Kraps
 
  reducer = MapReduce::Reducer.new(implementation)
 
- Parallelizer.each(Kraps.driver.list(prefix: Kraps.driver.with_prefix("#{@args["frame"]["token"]}/#{@args["partition"]}/")), @concurrency) do |file|
+ Parallelizer.each(Kraps.driver.list(prefix: Kraps.driver.with_prefix("#{@args["frame"]["token"]}/#{payload["partition"]}/")), @concurrency) do |file|
  Kraps.driver.download(file, reducer.add_chunk)
  end
 
@@ -139,14 +139,14 @@ module Kraps
  tempfile.puts(JSON.generate([key, value]))
  end
 
- Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{@args["partition"]}/chunk.#{@args["part"]}.json"), tempfile.tap(&:rewind))
+ Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{payload["partition"]}/chunk.#{payload["partition"]}.json"), tempfile.tap(&:rewind))
  ensure
  tempfile&.close(true)
  end
 
- def perform_combine
- temp_paths1 = download_all(token: @args["frame"]["token"], partition: @args["partition"])
- temp_paths2 = download_all(token: @args["combine_frame"]["token"], partition: @args["partition"])
+ def perform_combine(payload)
+ temp_paths1 = download_all(token: @args["frame"]["token"], partition: payload["partition"])
+ temp_paths2 = download_all(token: payload["combine_frame"]["token"], partition: payload["partition"])
 
  enum1 = k_way_merge(temp_paths1.each.to_a, chunk_limit: @chunk_limit)
  enum2 = k_way_merge(temp_paths2.each.to_a, chunk_limit: @chunk_limit)
@@ -157,7 +157,7 @@ module Kraps
  implementation = Object.new
  implementation.define_singleton_method(:map) do |&block|
  combine_method.call(enum1, enum2) do |key, value1, value2|
- block.call(key, current_step.block.call(key, value1, value2))
+ current_step.block.call(key, value1, value2, block)
  end
  end
 
@@ -167,7 +167,7 @@ module Kraps
  mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
  Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
  File.open(path) do |stream|
- Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{@args["part"]}.json"), stream)
+ Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{payload["partition"]}.json"), stream)
  end
  end
  end
@@ -213,10 +213,10 @@ module Kraps
  end
  end
 
- def perform_each_partition
+ def perform_each_partition(payload)
  temp_paths = TempPaths.new
 
- files = Kraps.driver.list(prefix: Kraps.driver.with_prefix("#{@args["frame"]["token"]}/#{@args["partition"]}/")).sort
+ files = Kraps.driver.list(prefix: Kraps.driver.with_prefix("#{@args["frame"]["token"]}/#{payload["partition"]}/")).sort
 
  temp_paths_index = files.each_with_object({}) do |file, hash|
  hash[file] = temp_paths.add
@@ -226,7 +226,7 @@ module Kraps
  Kraps.driver.download(file, temp_paths_index[file].path)
  end
 
- step.block.call(@args["partition"], k_way_merge(temp_paths.each.to_a, chunk_limit: @chunk_limit))
+ step.block.call(payload["partition"], k_way_merge(temp_paths.each.to_a, chunk_limit: @chunk_limit))
  ensure
  temp_paths&.delete
  end
@@ -237,11 +237,11 @@ module Kraps
  begin
  yield
  rescue Kraps::Error
- distributed_job.stop
+ redis_queue.stop
  raise
  rescue StandardError => e
  if retries >= num_retries
- distributed_job.stop
+ redis_queue.stop
  raise
  end
 
@@ -254,20 +254,23 @@ module Kraps
  end
  end
 
- def download_all(token:, partition:)
- temp_paths = TempPaths.new
-
- files = Kraps.driver.list(prefix: Kraps.driver.with_prefix("#{token}/#{partition}/")).sort
+ def dequeue
+ loop do
+ break if redis_queue.stopped?
+ break if redis_queue.size.zero?
 
- temp_paths_index = files.each_with_object({}) do |file, hash|
- hash[file] = temp_paths.add
+ redis_queue.dequeue do |payload|
+ payload ? yield(payload) : sleep(1)
+ end
  end
+ end
 
- Parallelizer.each(files, @concurrency) do |file|
- Kraps.driver.download(file, temp_paths_index[file].path)
- end
+ def redis_queue
+ @redis_queue ||= RedisQueue.new(redis: Kraps.redis, token: @args["token"], namespace: Kraps.namespace, ttl: Kraps.job_ttl)
+ end
 
- temp_paths
+ def download_all(token:, partition:)
+ Downloader.download_all(prefix: Kraps.driver.with_prefix("#{token}/#{partition}/"), concurrency: @concurrency)
  end
 
  def jobs
@@ -301,9 +304,5 @@ module Kraps
  def partitioner
  @partitioner ||= proc { |key| step.partitioner.call(key, step.partitions) }
  end
-
- def distributed_job
- @distributed_job ||= Kraps.distributed_job_client.build(token: @args["token"])
- end
  end
  end
data/lib/kraps.rb CHANGED
@@ -1,4 +1,3 @@
- require "distributed_job"
  require "ruby-progressbar"
  require "ruby-progressbar/outputs/null"
  require "map_reduce"
@@ -9,6 +8,7 @@ require_relative "kraps/drivers"
  require_relative "kraps/actions"
  require_relative "kraps/parallelizer"
  require_relative "kraps/hash_partitioner"
+ require_relative "kraps/redis_queue"
  require_relative "kraps/temp_path"
  require_relative "kraps/temp_paths"
  require_relative "kraps/timeout_queue"
@@ -19,6 +19,7 @@ require_relative "kraps/runner"
  require_relative "kraps/step"
  require_relative "kraps/frame"
  require_relative "kraps/worker"
+ require_relative "kraps/downloader"
 
  module Kraps
  class Error < StandardError; end
@@ -27,9 +28,11 @@ module Kraps
  class JobStopped < Error; end
  class IncompatibleFrame < Error; end
 
- def self.configure(driver:, redis: Redis.new, namespace: nil, job_ttl: 24 * 60 * 60, show_progress: true, enqueuer: ->(worker, json) { worker.perform_async(json) })
+ def self.configure(driver:, redis: Redis.new, namespace: nil, job_ttl: 4 * 24 * 60 * 60, show_progress: true, enqueuer: ->(worker, json) { worker.perform_async(json) })
  @driver = driver
- @distributed_job_client = DistributedJob::Client.new(redis: redis, namespace: namespace, default_ttl: job_ttl)
+ @redis = redis
+ @namespace = namespace
+ @job_ttl = job_ttl.to_i
  @show_progress = show_progress
  @enqueuer = enqueuer
  end
@@ -38,8 +41,16 @@ module Kraps
  @driver
  end
 
- def self.distributed_job_client
- @distributed_job_client
+ def self.redis
+ @redis
+ end
+
+ def self.namespace
+ @namespace
+ end
+
+ def self.job_ttl
+ @job_ttl
  end
 
  def self.show_progress?
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: kraps
  version: !ruby/object:Gem::Version
- version: 0.6.0
+ version: 0.8.0
  platform: ruby
  authors:
  - Benjamin Vetter
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2022-11-16 00:00:00.000000000 Z
+ date: 2023-02-13 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: attachie
@@ -24,20 +24,6 @@ dependencies:
  - - ">="
  - !ruby/object:Gem::Version
  version: '0'
- - !ruby/object:Gem::Dependency
- name: distributed_job
- requirement: !ruby/object:Gem::Requirement
- requirements:
- - - ">="
- - !ruby/object:Gem::Version
- version: '0'
- type: :runtime
- prerelease: false
- version_requirements: !ruby/object:Gem::Requirement
- requirements:
- - - ">="
- - !ruby/object:Gem::Version
- version: '0'
  - !ruby/object:Gem::Dependency
  name: map-reduce-ruby
  requirement: !ruby/object:Gem::Requirement
@@ -142,6 +128,7 @@ files:
  - docker-compose.yml
  - lib/kraps.rb
  - lib/kraps/actions.rb
+ - lib/kraps/downloader.rb
  - lib/kraps/drivers.rb
  - lib/kraps/frame.rb
  - lib/kraps/hash_partitioner.rb
@@ -149,6 +136,7 @@ files:
  - lib/kraps/job.rb
  - lib/kraps/job_resolver.rb
  - lib/kraps/parallelizer.rb
+ - lib/kraps/redis_queue.rb
  - lib/kraps/runner.rb
  - lib/kraps/step.rb
  - lib/kraps/temp_path.rb