kraps 0.5.0 → 0.7.0
- checksums.yaml +4 -4
- data/.rubocop.yml +3 -0
- data/CHANGELOG.md +14 -0
- data/Gemfile.lock +2 -5
- data/README.md +126 -25
- data/docker-compose.yml +1 -1
- data/lib/kraps/actions.rb +2 -0
- data/lib/kraps/drivers.rb +24 -0
- data/lib/kraps/job.rb +84 -5
- data/lib/kraps/job_resolver.rb +13 -0
- data/lib/kraps/redis_queue.rb +151 -0
- data/lib/kraps/runner.rb +61 -67
- data/lib/kraps/step.rb +1 -1
- data/lib/kraps/temp_path.rb +1 -27
- data/lib/kraps/temp_paths.rb +2 -2
- data/lib/kraps/version.rb +1 -1
- data/lib/kraps/worker.rb +165 -51
- data/lib/kraps.rb +22 -9
- metadata +4 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 19635ced3e745d44313ed3bc416ef73eb555134c591725f4dab7b38208e21393
+  data.tar.gz: bb7f679c7e2cd053744d1c7d857629de1912a305880d0538bdab418de3861ba1
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 91273ba54ea33c6d5cb1b4f335ad8039c35601953fdf1e6b9b2ac3117ceb25d81e9be569e5c8b70deda22b53a4c72f04d60ffb4b251badf5c5a64d13d399f36c
+  data.tar.gz: 4e563257fcba0c9f457b363da4f43d000ee239a5179e2f965071bb3df27e362cdf7bc9950a1624520cb862acfb0fde89e96b0a4987d79a93524230c4b84619cd
data/.rubocop.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,19 @@
 # CHANGELOG
 
+## v0.7.0
+
+* Added a `jobs` option to the actions to limit the concurrency
+  when e.g. accessing external data stores and to avoid overloading
+  them
+* Added a queue using redis for the jobs to avoid starving workers
+* Removed `distributed_job` dependency
+
+## v0.6.0
+
+* Added `map_partitions`
+* Added `combine`
+* Added `dump` and `load`
+
 ## v0.5.0
 
 * Added a `before` option to specify a callable to run before
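To make the v0.7.0 `jobs` option concrete, here is a minimal sketch of a throttled step (the worker class and the external lookup are hypothetical; only the `jobs:` keyword comes from this release):

```ruby
# At most 8 of the 128 partition jobs run at the same time, so the
# hypothetical external data store is not hammered by all workers at once.
job = job.map(partitions: 128, jobs: 8, worker: MyKrapsWorker) do |key, _value, collector|
  record = ExternalStore.fetch(key) # hypothetical external data source
  collector.call(key, record)
end
```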
data/Gemfile.lock
CHANGED
@@ -1,9 +1,8 @@
 PATH
   remote: .
   specs:
-    kraps (0.5.0)
+    kraps (0.7.0)
       attachie
-      distributed_job
       map-reduce-ruby (>= 3.0.0)
       redis
       ruby-progressbar
@@ -41,8 +40,6 @@ GEM
     concurrent-ruby (1.1.10)
     connection_pool (2.3.0)
     diff-lcs (1.5.0)
-    distributed_job (3.1.0)
-      redis (>= 4.1.0)
     i18n (1.12.0)
       concurrent-ruby (~> 1.0)
     jmespath (1.6.1)
@@ -62,7 +59,7 @@ GEM
     rake (13.0.6)
     redis (5.0.5)
       redis-client (>= 0.9.0)
-    redis-client (0.11.
+    redis-client (0.11.2)
       connection_pool
     regexp_parser (2.6.0)
     rexml (3.2.5)
data/README.md
CHANGED
@@ -3,11 +3,12 @@
 **Easily process big data in ruby**
 
 Kraps allows to process and perform calculations on very large datasets in
-parallel using a map/reduce framework
-you already have. You just need some
-layer with temporary lifecycle policy
-job framework (like sidekiq,
-progress. Most things you most
+parallel using a map/reduce framework similar to [spark](https://spark.apache.org/),
+but runs on a background job framework you already have. You just need some
+space on your filesystem, S3 as a storage layer with temporary lifecycle policy
+enabled, the already mentioned background job framework (like sidekiq,
+shoryuken, etc) and redis to keep track of the progress. Most things you most
+likely already have in place anyways.
 
 ## Installation
 
@@ -29,7 +30,7 @@ Kraps.configure(
   driver: Kraps::Drivers::S3Driver.new(s3_client: Aws::S3::Client.new("..."), bucket: "some-bucket", prefix: "temp/kraps/"),
   redis: Redis.new,
   namespace: "my-application", # An optional namespace to be used for redis keys, default: nil
-  job_ttl:
+  job_ttl: 7.days, # Job information in redis will automatically be removed after this amount of time, default: 4 days
   show_progress: true # Whether or not to show the progress in the terminal when executing jobs, default: true
   enqueuer: ->(worker, json) { worker.perform_async(json) } # Allows to customize the enqueueing of worker jobs
 )
@@ -115,13 +116,13 @@ be able to give 300-400 megabytes to Kraps then, but now divide this by 10 and
 specify a `memory_limit` of around `30.megabytes`, better less. The
 `memory_limit` affects how much chunks will be written to disk depending on the
 data size you are processing and how big these chunks are. The smaller the
-value, the more chunks
-the chunks.
+value, the more chunks. The more chunks, the more runs Kraps need to merge
+the chunks. The `chunk_limit` ensures that only the specified amount of chunks
+are processed in a single run. A run basically means: it takes up to
+`chunk_limit` chunks, reduces them and pushes the result as a new chunk to the
+list of chunks to process. Thus, if your number of file descriptors is
+unlimited, you want to set it to a higher number to avoid the overhead of
+multiple runs. `concurrency` tells Kraps how much threads to use to
 concurrently upload/download files from the storage layer. Finally, `retries`
 specifies how often Kraps should retry the job step in case of errors. Kraps
 will sleep for 5 seconds between those retries. Please note that it's not yet
@@ -130,7 +131,6 @@ Kraps. Please note, however, that `parallelize` is not covered by `retries`
 yet, as the block passed to `parallelize` is executed by the runner, not the
 workers.
 
-
 Now, executing your job is super easy:
 
 ```ruby
@@ -182,11 +182,11 @@ https://github.com/mrkamel/map-reduce-ruby/#limitations-for-keys
 ## Storage
 
 Kraps stores temporary results of steps in a storage layer. Currently, only S3
-is supported besides a in
+is supported besides a in-memory driver used for testing purposes. Please be
 aware that Kraps does not clean up any files from the storage layer, as it
-would be a safe thing to do in case of errors anyways. Instead, Kraps
-lifecycle features of modern object storage systems. Therefore, it is
-to
+would not be a safe thing to do in case of errors anyways. Instead, Kraps
+relies on lifecycle features of modern object storage systems. Therefore, it is
+required to configure a lifecycle policy to delete any files after e.g. 7 days
 either for a whole bucket or for a certain prefix like e.g. `temp/` and tell
 Kraps about the prefix to use (e.g. `temp/kraps/`).
 
@@ -220,7 +220,7 @@ items are used as keys and the values are set to `nil`.
 * `map`: Maps the key value pairs to other key value pairs
 
 ```ruby
-job.map(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker) do |key, value, collector|
+job.map(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker, jobs: 8) do |key, value, collector|
   collector.call("changed #{key}", "changed #{value}")
 end
 ```
@@ -229,10 +229,31 @@ The block gets each key-value pair passed and the `collector` block can be
 called as often as neccessary. This is also the reason why `map` can not simply
 return the new key-value pair, but the `collector` must be used instead.
 
+The `jobs` argument can be useful when you need to access an external data
+source, like a relational database and you want to limit the number of workers
+accessing the store concurrently to avoid overloading it. If you don't specify
+it, it will be identical to the number of partitions. It is recommended to only
+use it for steps where you need to throttle the concurrency, because it will of
+course slow down the processing. The `jobs` argument only applies to the
+current step. The following steps don't inherit the argument, but reset it.
+
+* `map_partitions`: Maps the key value pairs to other key value pairs, but the
+  block receives all data of each partition as an enumerable and sorted by key.
+  Please be aware that you should not call `to_a` or similar on the enumerable.
+  Prefer `map` over `map_partitions` when possible.
+
+```ruby
+job.map_partitions(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker, jobs: 8) do |pairs, collector|
+  pairs.each do |key, value|
+    collector.call("changed #{key}", "changed #{value}")
+  end
+end
+```
+
 * `reduce`: Reduces the values of pairs having the same key
 
 ```ruby
-job.reduce(worker: MyKrapsWorker) do |key, value1, value2|
+job.reduce(worker: MyKrapsWorker, jobs: 8) do |key, value1, value2|
   value1 + value2
 end
 ```
@@ -245,26 +266,61 @@ The `key` itself is also passed to the block for the case that you need to
 customize the reduce calculation according to the value of the key. However,
 most of the time, this is not neccessary and the key can simply be ignored.
 
+* `combine`: Combines the results of 2 jobs by combining every key available
+  in the current job result with the corresponding key from the passed job
+  result. When the passed job result does not have the corresponding key,
+  `nil` will be passed to the block. Keys which are only available in the
+  passed job result are completely omitted.
+
+```ruby
+job.combine(other_job, worker: MyKrapsWorker, jobs: 8) do |key, value1, value2|
+  (value1 || {}).merge(value2 || {})
+end
+```
+
+Please note that the keys, partitioners and the number of partitions must match
+for the jobs to be combined. Further note that the results of `other_job` must
+be reduced, meaning that every key must be unique. Finally, `other_job` must
+not neccessarily be listed in the array of jobs returned by the `call` method,
+since Kraps detects the dependency on its own.
+
 * `repartition`: Used to change the partitioning
 
 ```ruby
-job.repartition(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker)
+job.repartition(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker, jobs: 8)
 ```
 
 Repartitions all data into the specified number of partitions and using the
 specified partitioner.
 
 * `each_partition`: Passes the partition number and all data of each partition
-as
+  as an enumerable and sorted by key. Please be aware that you should not call
+  `to_a` or similar on the enumerable.
 
 ```ruby
-job.each_partition do |partition, pairs|
+job.each_partition(jobs: 8) do |partition, pairs|
   pairs.each do |key, value|
     # ...
   end
 end
 ```
 
+* `dump`: Store all current data per partition under the specified prefix
+
+```ruby
+job.dump(prefix: "path/to/dump", worker: MyKrapsWorker)
+```
+
+It creates a folder for every partition and stores one or more chunks in there.
+
+* `load`: Loads the previously dumped data
+
+```ruby
+job.load(prefix: "path/to/dump", partitions: 32, partitioner: Kraps::HashPartitioner.new, worker: MyKrapsWorker)
+```
+
+The number of partitions and the partitioner must be specified.
+
 Please note that every API method accepts a `before` callable:
 
 ```ruby
@@ -326,13 +382,58 @@ When you execute the job, Kraps will execute the jobs one after another and as
 the jobs build up on each other, Kraps will execute the steps shared by both
 jobs only once.
 
+## Testing
+
+Kraps ships with an in-memory fake driver for storage, which you can use for
+testing purposes instead of the s3 driver:
+
+```ruby
+Kraps.configure(
+  driver: Kraps::Drivers::FakeDriver.new(bucket: "kraps"),
+  # ...
+)
+```
+
+This is of course much faster than using s3 or some s3 compatible service.
+Moreover, when testing large Kraps jobs you maybe want to test intermediate
+steps. You can use `#dump` for this purpose and test that the data dumped is
+correct.
+
+```ruby
+job = job.dump(prefix: "path/to/dump")
+```
+
+and in your tests do
+
+```ruby
+Kraps.driver.value("path/to/dump/0/chunk.json") # => data of partition 0
+Kraps.driver.value("path/to/dump/1/chunk.json") # => data of partition 1
+# ...
+```
+
+The data is stored in lines, each line is a json encoded array of key and
+value.
+
+```ruby
+data = Kraps.driver.value("path/to/dump/0/chunk.json").lines.map do |line|
+  JSON.parse(line) # => [key, value]
+end
+```
+
+The API of the driver is:
+
+* `store(name, data_or_io, options = {})`: Stores `data_or_io` as `name`
+* `list(prefix: nil)`: Lists all objects or all objects matching the `prefix`
+* `value(name)`: Returns the object content of `name`
+* `download(name, path)`: Downloads the object `name` to `path` in your
+  filesystem
+* `exists?(name)`: Returns `true`/`false`
+* `flush`: Removes all objects from the fake storage
+
 ## Dependencies
 
 Kraps is built on top of
 [map-reduce-ruby](https://github.com/mrkamel/map-reduce-ruby) for the
 map/reduce framework,
-[distributed_job](https://github.com/mrkamel/distributed_job)
-to keep track of the job/step status,
 [attachie](https://github.com/mrkamel/attachie) to interact with the storage
 layer (s3),
 [ruby-progressbar](https://github.com/jfelchner/ruby-progressbar) to
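Pulling the new pieces together, a sketch of how `map_partitions`, `combine` and `dump` might appear in one job class (class, worker and data names are made up; the API calls follow the README sections above):

```ruby
class CombinedStatsJob
  def call
    visits = Kraps::Job.new(worker: MyKrapsWorker)
    visits = visits.parallelize(partitions: 32) { |collector| collector.call("page-1") }
    visits = visits.map { |page, _, collector| collector.call(page, 1) }
    visits = visits.reduce { |_page, count1, count2| count1 + count2 }

    orders = Kraps::Job.new(worker: MyKrapsWorker)
    orders = orders.parallelize(partitions: 32) { |collector| collector.call("page-1") }
    orders = orders.map { |page, _, collector| collector.call(page, 1) }
    orders = orders.reduce { |_page, count1, count2| count1 + count2 }

    # Pair each page's visit count with its order count; pages missing from
    # `orders` get nil. Dump the result so tests can inspect the chunks.
    combined = visits.combine(orders) { |_page, v, o| [v, o || 0] }

    [combined.dump(prefix: "stats/conversion")]
  end
end
```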
data/docker-compose.yml
CHANGED
data/lib/kraps/actions.rb
CHANGED
data/lib/kraps/drivers.rb
CHANGED
@@ -8,6 +8,26 @@ module Kraps
       def with_prefix(path)
         File.join(*[@prefix, path].compact)
       end
+
+      def list(prefix: nil)
+        driver.list(bucket, prefix: prefix)
+      end
+
+      def value(name)
+        driver.value(name, bucket)
+      end
+
+      def download(name, path)
+        driver.download(name, bucket, path)
+      end
+
+      def exists?(name)
+        driver.exists?(name, bucket)
+      end
+
+      def store(name, data_or_io, options = {})
+        driver.store(name, data_or_io, bucket, options)
+      end
     end
 
     class S3Driver
@@ -32,6 +52,10 @@ module Kraps
         @bucket = bucket
         @prefix = prefix
       end
+
+      def flush
+        driver.flush
+      end
     end
   end
 end
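The new driver methods are thin delegations that close over the configured bucket, so calling code no longer reaches through `Kraps.driver.driver` and `Kraps.driver.bucket`. A before/after sketch (the call shapes mirror the worker diff below):

```ruby
# Before 0.7.0: callers had to address the underlying attachie driver and
# pass the bucket explicitly on every call.
Kraps.driver.driver.store(name, stream, Kraps.driver.bucket)

# Since 0.7.0: the bucket is implied by the configured driver.
Kraps.driver.store(name, stream)
Kraps.driver.list(prefix: Kraps.driver.with_prefix("token/0/")).sort
Kraps.driver.exists?(name)
```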
data/lib/kraps/job.rb
CHANGED
@@ -27,7 +27,7 @@ module Kraps
       end
     end
 
-    def map(partitions: nil, partitioner: nil, worker: @worker, before: nil, &block)
+    def map(partitions: nil, partitioner: nil, jobs: nil, worker: @worker, before: nil, &block)
       fresh.tap do |job|
         job.instance_eval do
           @partitions = partitions if partitions
@@ -35,6 +35,7 @@ module Kraps
 
           @steps << Step.new(
             action: Actions::MAP,
+            jobs: [jobs, @partitions].compact.min,
             partitions: @partitions,
             partitioner: @partitioner,
             worker: worker,
@@ -45,11 +46,31 @@ module Kraps
       end
     end
 
-    def reduce(worker: @worker, before: nil, &block)
+    def map_partitions(partitions: nil, partitioner: nil, jobs: nil, worker: @worker, before: nil, &block)
+      fresh.tap do |job|
+        job.instance_eval do
+          @partitions = partitions if partitions
+          @partitioner = partitioner if partitioner
+
+          @steps << Step.new(
+            action: Actions::MAP_PARTITIONS,
+            jobs: [jobs, @partitions].compact.min,
+            partitions: @partitions,
+            partitioner: @partitioner,
+            worker: worker,
+            before: before,
+            block: block
+          )
+        end
+      end
+    end
+
+    def reduce(jobs: nil, worker: @worker, before: nil, &block)
       fresh.tap do |job|
         job.instance_eval do
           @steps << Step.new(
             action: Actions::REDUCE,
+            jobs: [jobs, @partitions].compact.min,
             partitions: @partitions,
             partitioner: @partitioner,
             worker: worker,
@@ -60,11 +81,30 @@ module Kraps
       end
     end
 
-    def each_partition(worker: @worker, before: nil, &block)
+    def combine(other_job, jobs: nil, worker: @worker, before: nil, &block)
+      fresh.tap do |job|
+        job.instance_eval do
+          @steps << Step.new(
+            action: Actions::COMBINE,
+            jobs: [jobs, @partitions].compact.min,
+            partitions: @partitions,
+            partitioner: @partitioner,
+            worker: worker,
+            before: before,
+            block: block,
+            dependency: other_job,
+            options: { combine_step_index: other_job.steps.size - 1 }
+          )
+        end
+      end
+    end
+
+    def each_partition(jobs: nil, worker: @worker, before: nil, &block)
       fresh.tap do |job|
         job.instance_eval do
           @steps << Step.new(
             action: Actions::EACH_PARTITION,
+            jobs: [jobs, @partitions].compact.min,
             partitions: @partitions,
             partitioner: @partitioner,
             worker: worker,
@@ -75,12 +115,51 @@ module Kraps
       end
     end
 
-    def repartition(partitions:, partitioner: nil, worker: @worker, before: nil)
-      map(partitions: partitions, partitioner: partitioner, worker: worker, before: before) do |key, value, collector|
+    def repartition(partitions:, jobs: nil, partitioner: nil, worker: @worker, before: nil)
+      map(jobs: jobs, partitions: partitions, partitioner: partitioner, worker: worker, before: before) do |key, value, collector|
         collector.call(key, value)
       end
     end
 
+    def dump(prefix:, worker: @worker)
+      each_partition(worker: worker) do |partition, pairs|
+        tempfile = Tempfile.new
+
+        pairs.each do |pair|
+          tempfile.puts(JSON.generate(pair))
+        end
+
+        Kraps.driver.store(File.join(prefix, partition.to_s, "chunk.json"), tempfile.tap(&:rewind))
+      ensure
+        tempfile&.close(true)
+      end
+    end
+
+    def load(prefix:, partitions:, partitioner:, worker: @worker)
+      job = parallelize(partitions: partitions, partitioner: proc { |key, _| key }, worker: worker) do |collector|
+        (0...partitions).each do |partition|
+          collector.call(partition)
+        end
+      end
+
+      job.map_partitions(partitioner: partitioner, worker: worker) do |partition, _, collector|
+        tempfile = Tempfile.new
+
+        path = File.join(prefix, partition.to_s, "chunk.json")
+        next unless Kraps.driver.exists?(path)
+
+        Kraps.driver.download(path, tempfile.path)
+
+        tempfile.each_line do |line|
+          key, value = JSON.parse(line)
+
+          collector.call(key, value)
+        end
+      ensure
+        tempfile&.close(true)
+      end
+    end
+
     def fresh
       dup.tap do |job|
         job.instance_variable_set(:@steps, @steps.dup)
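Every action above clamps its job count with `[jobs, @partitions].compact.min`, which is worth spelling out (values are illustrative):

```ruby
[8, 128].compact.min    # => 8    jobs: 8 on 128 partitions -> 8 concurrent worker jobs
[nil, 128].compact.min  # => 128  jobs omitted -> one worker job per partition
[256, 128].compact.min  # => 128  jobs can never exceed the number of partitions
```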
data/lib/kraps/job_resolver.rb
ADDED
@@ -0,0 +1,13 @@
+module Kraps
+  class JobResolver
+    def call(jobs)
+      resolve_dependencies(Array(jobs)).uniq
+    end
+
+    private
+
+    def resolve_dependencies(jobs)
+      jobs.map { |job| [resolve_dependencies(job.steps.map(&:dependency).compact), job] }.flatten
+    end
+  end
+end
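`JobResolver` is what lets `combine` work without the caller listing the other job: dependencies are resolved depth-first, so they always run before the jobs that need them. A minimal illustration (assuming `job_b.combine(job_a)` created the dependency):

```ruby
# job_b has a COMBINE step whose `dependency` points at job_a, so resolving
# [job_b] yields the dependency first; #uniq removes repeated jobs.
Kraps::JobResolver.new.call([job_b]) # => [job_a, job_b]
```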
data/lib/kraps/redis_queue.rb
ADDED
@@ -0,0 +1,151 @@
+module Kraps
+  class RedisQueue
+    VISIBILITY_TIMEOUT = 60
+
+    attr_reader :token
+
+    def initialize(redis:, token:, namespace:, ttl:)
+      @redis = redis
+      @token = token
+      @namespace = namespace
+      @ttl = ttl
+    end
+
+    def size
+      @size_script ||= <<~SCRIPT
+        local queue_key, pending_key, status_key, ttl, job = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4]), ARGV[5]
+
+        redis.call('expire', queue_key, ttl)
+        redis.call('expire', pending_key, ttl)
+        redis.call('expire', status_key, ttl)
+
+        return redis.call('llen', queue_key) + redis.call('zcard', pending_key)
+      SCRIPT
+
+      @redis.eval(@size_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl])
+    end
+
+    def enqueue(payload)
+      @enqueue_script ||= <<~SCRIPT
+        local queue_key, pending_key, status_key, ttl, job = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4]), ARGV[5]
+
+        redis.call('rpush', queue_key, job)
+
+        redis.call('expire', queue_key, ttl)
+        redis.call('expire', pending_key, ttl)
+        redis.call('expire', status_key, ttl)
+      SCRIPT
+
+      @redis.eval(@enqueue_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl, JSON.generate(payload)])
+    end
+
+    def dequeue
+      @dequeue_script ||= <<~SCRIPT
+        local queue_key, pending_key, status_key, ttl, visibility_timeout = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4]), tonumber(ARGV[5])
+
+        local zitem = redis.call('zrange', pending_key, 0, 0, 'WITHSCORES')
+        local job = zitem[1]
+
+        if not zitem[2] or tonumber(zitem[2]) > tonumber(redis.call('time')[1]) then
+          job = redis.call('lpop', queue_key)
+        end
+
+        redis.call('expire', queue_key, ttl)
+        redis.call('expire', pending_key, ttl)
+        redis.call('expire', status_key, ttl)
+
+        if not job then return nil end
+
+        redis.call('zadd', pending_key, tonumber(redis.call('time')[1]) + visibility_timeout, job)
+        redis.call('expire', pending_key, ttl)
+
+        return job
+      SCRIPT
+
+      job = @redis.eval(@dequeue_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl, VISIBILITY_TIMEOUT])
+
+      unless job
+        yield(nil)
+        return
+      end
+
+      keep_alive(job) do
+        yield(JSON.parse(job)) if job
+      end
+
+      @remove_script ||= <<~SCRIPT
+        local queue_key, pending_key, status_key, ttl, job = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4]), ARGV[5]
+
+        redis.call('zrem', pending_key, job)
+
+        redis.call('expire', queue_key, ttl)
+        redis.call('expire', pending_key, ttl)
+        redis.call('expire', status_key, ttl)
+      SCRIPT
+
+      @redis.eval(@remove_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl, job])
+    end
+
+    def stop
+      @stop_script ||= <<~SCRIPT
+        local queue_key, pending_key, status_key, ttl = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4])
+
+        redis.call('hset', status_key, 'stopped', 1)
+
+        redis.call('expire', queue_key, ttl)
+        redis.call('expire', pending_key, ttl)
+        redis.call('expire', status_key, ttl)
+      SCRIPT
+
+      @redis.eval(@stop_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl])
+    end
+
+    def stopped?
+      @stopped_script ||= <<~SCRIPT
+        local queue_key, pending_key, status_key, ttl = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4])
+
+        redis.call('expire', queue_key, ttl)
+        redis.call('expire', pending_key, ttl)
+        redis.call('expire', status_key, ttl)
+
+        return redis.call('hget', status_key, 'stopped')
+      SCRIPT
+
+      @redis.eval(@stopped_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl]).to_i == 1
+    end
+
+    private
+
+    def keep_alive(job)
+      @keep_alive_script ||= <<~SCRIPT
+        local queue_key, pending_key, status_key, ttl, job, visibility_timeout = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4]), ARGV[5], tonumber(ARGV[6])
+
+        redis.call('zadd', pending_key, tonumber(redis.call('time')[1]) + visibility_timeout, job)
+
+        redis.call('expire', queue_key, ttl)
+        redis.call('expire', pending_key, ttl)
+        redis.call('expire', status_key, ttl)
+      SCRIPT
+
+      interval = Interval.new(5) do
+        @redis.eval(@keep_alive_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl, job, VISIBILITY_TIMEOUT])
+      end
+
+      yield
+    ensure
+      interval&.stop
+    end
+
+    def redis_queue_key
+      [@namespace, "kraps", "queue", @token].compact.join(":")
+    end
+
+    def redis_pending_key
+      [@namespace, "kraps", "pending", @token].compact.join(":")
+    end
+
+    def redis_status_key
+      [@namespace, "kraps", "status", @token].compact.join(":")
+    end
+  end
+end
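The queue keeps two structures per token: a redis list of waiting payloads and a sorted set of in-flight payloads scored by their visibility deadline. A rough usage sketch (connection and token are examples):

```ruby
queue = Kraps::RedisQueue.new(redis: Redis.new, token: "some-token", namespace: nil, ttl: 4 * 24 * 60 * 60)

queue.enqueue(partition: 0)
queue.size # => 1 (waiting plus in-flight entries)

queue.dequeue do |payload|
  # payload => { "partition" => 0 }. While this block runs, keep_alive
  # re-scores the entry every 5 seconds so other workers skip it; if the
  # process dies instead, the entry becomes visible again after
  # VISIBILITY_TIMEOUT (60 seconds) and gets re-delivered.
end
```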
data/lib/kraps/runner.rb
CHANGED
@@ -5,7 +5,7 @@ module Kraps
     end
 
     def call(*args, **kwargs)
+      JobResolver.new.call(@klass.new.call(*args, **kwargs)).tap do |jobs|
         jobs.each_with_index do |job, job_index|
           job.steps.each_with_index.inject(nil) do |frame, (_, step_index)|
             StepRunner.new(
@@ -45,107 +45,101 @@ module Kraps
 
     def perform_parallelize
       enum = Enumerator.new do |yielder|
-        collector = proc { |item| yielder << item }
+        collector = proc { |item| yielder << { item: item } }
 
         @step.block.call(collector)
       end
 
-      push_and_wait(distributed_job, enum) do |item, part|
-        enqueue(token: distributed_job.token, part: part, item: item)
-      end
+      token = push_and_wait(enum: enum)
 
-      end
+      Frame.new(token: token, partitions: @step.partitions)
     end
 
     def perform_map
-        enqueue(token: distributed_job.token, part: part, partition: partition)
-      end
+      enum = (0...@frame.partitions).map { |partition| { partition: partition } }
+      token = push_and_wait(job_count: @step.jobs, enum: enum)
+
+      Frame.new(token: token, partitions: @step.partitions)
+    end
+
+    def perform_map_partitions
+      enum = (0...@frame.partitions).map { |partition| { partition: partition } }
+      token = push_and_wait(job_count: @step.jobs, enum: enum)
+
+      Frame.new(token: token, partitions: @step.partitions)
     end
 
     def perform_reduce
-        enqueue(token: distributed_job.token, part: part, partition: partition)
-      end
+      enum = (0...@frame.partitions).map { |partition| { partition: partition } }
+      token = push_and_wait(job_count: @step.jobs, enum: enum)
+
+      Frame.new(token: token, partitions: @step.partitions)
     end
 
-    def
-      JSON.generate(
-        job_index: @job_index,
-        step_index: @step_index,
-        frame: @frame.to_h,
-        token: token,
-        part: part,
-        klass: @klass,
-        args: @args,
-        kwargs: @kwargs,
-        **rest
-      )
-    )
+    def perform_combine
+      combine_job = @step.dependency
+      combine_step = combine_job.steps[@step.options[:combine_step_index]]
+
+      raise(IncompatibleFrame, "Incompatible number of partitions") if combine_step.partitions != @step.partitions
+
+      enum = (0...@frame.partitions).map do |partition|
+        { partition: partition, combine_frame: combine_step.frame.to_h }
+      end
+
+      token = push_and_wait(job_count: @step.jobs, enum: enum)
+
+      Frame.new(token: token, partitions: @step.partitions)
     end
 
-    def
-    rescue Interrupt
-      distributed_job&.stop
-      raise
+    def perform_each_partition
+      enum = (0...@frame.partitions).map { |partition| { partition: partition } }
+      push_and_wait(job_count: @step.jobs, enum: enum)
+
+      @frame
     end
 
-    def push_and_wait(
-      total = 0
+    def push_and_wait(enum:, job_count: nil)
+      redis_queue = RedisQueue.new(redis: Kraps.redis, token: SecureRandom.hex, namespace: Kraps.namespace, ttl: Kraps.job_ttl)
+      progress_bar = build_progress_bar("#{@klass}: job #{@job_index + 1}/#{@jobs.size}, step #{@step_index + 1}/#{@job.steps.size}, token #{redis_queue.token}, %a, %c/%C (%p%) => #{@step.action}")
+
+      total = 0
+
+      interval = Interval.new(1) do
+        # The interval is used to continously update the progress bar even
+        # when push_all is used and to avoid sessions being terminated due
+        # to inactivity etc
+
+        progress_bar.total = total
+        progress_bar.progress = [progress_bar.total - redis_queue.size, 0].max
+      end
+
+      enum.each_with_index do |item, part|
+        total += 1
+
+        redis_queue.enqueue(item.merge(part: part))
+      end
+
+      (job_count || total).times do
+        break if redis_queue.stopped?
+
+        Kraps.enqueuer.call(@step.worker, JSON.generate(job_index: @job_index, step_index: @step_index, frame: @frame.to_h, token: redis_queue.token, klass: @klass, args: @args, kwargs: @kwargs))
+      end
+
+      loop do
+        break if redis_queue.size.zero?
+        break if redis_queue.stopped?
 
         sleep(1)
       end
 
-      raise(JobStopped, "The job was stopped") if
+      raise(JobStopped, "The job was stopped") if redis_queue.stopped?
+
+      interval.fire(timeout: 1)
+
+      redis_queue.token
     ensure
+      redis_queue&.stop
+      interval&.stop
       progress_bar&.stop
     end
 
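On the other side of this queue sits the background job framework. The runner now enqueues only `job_count` (or one-per-item) framework jobs, each of which pulls work items from the redis queue, so one slow framework job no longer starves a step. A sketch of a matching Sidekiq worker (class name and limits are examples; the `Kraps::Worker` constructor arguments are taken from the worker diff below):

```ruby
class MyKrapsWorker
  include Sidekiq::Worker

  def perform(json)
    # Pulls queue items for the step encoded in `json` until the redis
    # queue is drained or the job is stopped.
    Kraps::Worker.new(json, memory_limit: 128 * 1024 * 1024, chunk_limit: 64, concurrency: 8).call(retries: 3)
  end
end
```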
data/lib/kraps/step.rb
CHANGED
data/lib/kraps/temp_path.rb
CHANGED
@@ -1,29 +1,3 @@
 module Kraps
-  class TempPath
-    attr_reader :path
-
-    def initialize(prefix: nil, suffix: nil)
-      @path = File.join(Dir.tmpdir, [prefix, SecureRandom.hex[0, 16], Process.pid, suffix].compact.join("."))
-
-      File.open(@path, File::CREAT | File::EXCL) {}
-
-      ObjectSpace.define_finalizer(self, self.class.finalize(@path))
-
-      return unless block_given?
-
-      begin
-        yield
-      ensure
-        unlink
-      end
-    end
-
-    def unlink
-      FileUtils.rm_f(@path)
-    end
-
-    def self.finalize(path)
-      proc { FileUtils.rm_f(path) }
-    end
-  end
+  TempPath = MapReduce::TempPath
 end
data/lib/kraps/temp_paths.rb
CHANGED
data/lib/kraps/version.rb
CHANGED
data/lib/kraps/worker.rb
CHANGED
@@ -1,29 +1,32 @@
 module Kraps
   class Worker
-    def initialize(json, memory_limit:, chunk_limit:, concurrency:)
+    include MapReduce::Mergeable
+
+    def initialize(json, memory_limit:, chunk_limit:, concurrency:, logger: Logger.new("/dev/null"))
       @args = JSON.parse(json)
       @memory_limit = memory_limit
       @chunk_limit = chunk_limit
       @concurrency = concurrency
+      @logger = logger
     end
 
     def call(retries: 3)
-      return if
+      return if redis_queue.stopped?
 
       raise(InvalidAction, "Invalid action #{step.action}") unless Actions::ALL.include?(step.action)
 
-      send(:"perform_#{step.action}")
+      dequeue do |payload|
+        with_retries(retries) do # TODO: allow to use queue based retries
+          step.before&.call
 
+          send(:"perform_#{step.action}", payload)
+        end
       end
     end
 
     private
 
-    def perform_parallelize
+    def perform_parallelize(payload)
       implementation = Class.new do
         def map(key)
           yield(key, nil)
@@ -31,29 +34,19 @@ module Kraps
       end
 
       mapper = MapReduce::Mapper.new(implementation.new, partitioner: partitioner, memory_limit: @memory_limit)
-      mapper.map(
+      mapper.map(payload["item"])
 
       mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
        Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
          File.open(path) do |stream|
-            Kraps.driver.
+            Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{payload["part"]}.json"), stream)
          end
        end
      end
    end
 
-    def perform_map
-      temp_paths =
-
-      files = Kraps.driver.driver.list(Kraps.driver.bucket, prefix: Kraps.driver.with_prefix("#{@args["frame"]["token"]}/#{@args["partition"]}/")).sort
-
-      temp_paths_index = files.each_with_object({}) do |file, hash|
-        hash[file] = temp_paths.add
-      end
-
-      Parallelizer.each(files, @concurrency) do |file|
-        Kraps.driver.driver.download(file, Kraps.driver.bucket, temp_paths_index[file].path)
-      end
+    def perform_map(payload)
+      temp_paths = download_all(token: @args["frame"]["token"], partition: payload["partition"])
 
       current_step = step
 
@@ -85,17 +78,48 @@ module Kraps
       mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
         Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
           File.open(path) do |stream|
-            Kraps.driver.driver.
-              Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{@args["part"]}.json"), stream, Kraps.driver.bucket
-            )
+            Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{payload["partition"]}.json"), stream)
           end
         end
       end
     ensure
-      temp_paths&.
+      temp_paths&.delete
     end
 
-    def
+    def perform_map_partitions(payload)
+      temp_paths = download_all(token: @args["frame"]["token"], partition: payload["partition"])
+
+      current_step = step
+      current_partition = payload["partition"]
+
+      implementation = Object.new
+      implementation.define_singleton_method(:map) do |enum, &block|
+        current_step.block.call(current_partition, enum, block)
+      end
+
+      subsequent_step = next_step
+
+      if subsequent_step&.action == Actions::REDUCE
+        implementation.define_singleton_method(:reduce) do |key, value1, value2|
+          subsequent_step.block.call(key, value1, value2)
+        end
+      end
+
+      mapper = MapReduce::Mapper.new(implementation, partitioner: partitioner, memory_limit: @memory_limit)
+      mapper.map(k_way_merge(temp_paths.each.to_a, chunk_limit: @chunk_limit))
+
+      mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
+        Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
+          File.open(path) do |stream|
+            Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{payload["partition"]}.json"), stream)
+          end
+        end
+      end
+    ensure
+      temp_paths&.delete
+    end
+
+    def perform_reduce(payload)
       current_step = step
 
       implementation = Object.new
@@ -105,8 +129,8 @@ module Kraps
 
       reducer = MapReduce::Reducer.new(implementation)
 
-      Parallelizer.each(Kraps.driver.
-        Kraps.driver.
+      Parallelizer.each(Kraps.driver.list(prefix: Kraps.driver.with_prefix("#{@args["frame"]["token"]}/#{payload["partition"]}/")), @concurrency) do |file|
+        Kraps.driver.download(file, reducer.add_chunk)
       end
 
       tempfile = Tempfile.new
@@ -115,35 +139,96 @@ module Kraps
         tempfile.puts(JSON.generate([key, value]))
       end
 
-      Kraps.driver.
+      Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{payload["partition"]}/chunk.#{payload["partition"]}.json"), tempfile.tap(&:rewind))
     ensure
       tempfile&.close(true)
     end
 
-    def
+    def perform_combine(payload)
+      temp_paths1 = download_all(token: @args["frame"]["token"], partition: payload["partition"])
+      temp_paths2 = download_all(token: payload["combine_frame"]["token"], partition: payload["partition"])
+
+      enum1 = k_way_merge(temp_paths1.each.to_a, chunk_limit: @chunk_limit)
+      enum2 = k_way_merge(temp_paths2.each.to_a, chunk_limit: @chunk_limit)
+
+      combine_method = method(:combine)
+      current_step = step
+
+      implementation = Object.new
+      implementation.define_singleton_method(:map) do |&block|
+        combine_method.call(enum1, enum2) do |key, value1, value2|
+          block.call(key, current_step.block.call(key, value1, value2))
+        end
      end
 
+      mapper = MapReduce::Mapper.new(implementation, partitioner: partitioner, memory_limit: @memory_limit)
+      mapper.map
+
+      mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
+        Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
+          File.open(path) do |stream|
+            Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{payload["partition"]}.json"), stream)
+          end
+        end
      end
+    ensure
+      temp_paths1&.delete
+      temp_paths2&.delete
+    end
 
+    def combine(enum1, enum2)
+      current1 = begin; enum1.next; rescue StopIteration; nil; end
+      current2 = begin; enum2.next; rescue StopIteration; nil; end
+
+      loop do
+        return if current1.nil? && current2.nil?
+        return if current1.nil?
+
+        if current2.nil?
+          yield(current1[0], current1[1], nil)
+
+          current1 = begin; enum1.next; rescue StopIteration; nil; end
+        elsif current1[0] == current2[0]
+          loop do
+            yield(current1[0], current1[1], current2[1])
+
+            current1 = begin; enum1.next; rescue StopIteration; nil; end
+
+            break if current1.nil?
+            break if current1[0] != current2[0]
+          end
+
+          current2 = begin; enum2.next; rescue StopIteration; nil; end
+        else
+          res = current1[0] <=> current2[0]
+
+          if res < 0
+            yield(current1[0], current1[1], nil)
+
+            current1 = begin; enum1.next; rescue StopIteration; nil; end
+          else
+            current2 = begin; enum2.next; rescue StopIteration; nil; end
          end
        end
      end
+    end
+
+    def perform_each_partition(payload)
+      temp_paths = TempPaths.new
+
+      files = Kraps.driver.list(prefix: Kraps.driver.with_prefix("#{@args["frame"]["token"]}/#{payload["partition"]}/")).sort
+
+      temp_paths_index = files.each_with_object({}) do |file, hash|
+        hash[file] = temp_paths.add
+      end
 
+      Parallelizer.each(files, @concurrency) do |file|
+        Kraps.driver.download(file, temp_paths_index[file].path)
+      end
+
+      step.block.call(payload["partition"], k_way_merge(temp_paths.each.to_a, chunk_limit: @chunk_limit))
     ensure
-      temp_paths&.
+      temp_paths&.delete
     end
 
     def with_retries(num_retries)
@@ -152,14 +237,16 @@ module Kraps
       begin
         yield
       rescue Kraps::Error
-        distributed_job&.stop
+        redis_queue.stop
         raise
-      rescue StandardError
+      rescue StandardError => e
         if retries >= num_retries
-          distributed_job&.stop
+          redis_queue.stop
           raise
         end
 
+        @logger.error(e)
+
         sleep(5)
         retries += 1
 
@@ -167,8 +254,39 @@ module Kraps
       end
     end
 
+    def dequeue
+      loop do
+        break if redis_queue.stopped?
+        break if redis_queue.size.zero?
+
+        redis_queue.dequeue do |payload|
+          payload ? yield(payload) : sleep(1)
+        end
+      end
+    end
+
+    def redis_queue
+      @redis_queue ||= RedisQueue.new(redis: Kraps.redis, token: @args["token"], namespace: Kraps.namespace, ttl: Kraps.job_ttl)
+    end
+
+    def download_all(token:, partition:)
+      temp_paths = TempPaths.new
+
+      files = Kraps.driver.list(prefix: Kraps.driver.with_prefix("#{token}/#{partition}/")).sort
+
+      temp_paths_index = files.each_with_object({}) do |file, hash|
+        hash[file] = temp_paths.add
+      end
+
+      Parallelizer.each(files, @concurrency) do |file|
+        Kraps.driver.download(file, temp_paths_index[file].path)
+      end
+
+      temp_paths
+    end
+
     def jobs
-      @jobs ||=
+      @jobs ||= JobResolver.new.call(@args["klass"].constantize.new.call(*@args["args"], **@args["kwargs"].transform_keys(&:to_sym)))
     end
 
     def job
@@ -198,9 +316,5 @@ module Kraps
     def partitioner
       @partitioner ||= proc { |key| step.partitioner.call(key, step.partitions) }
     end
-
-    def distributed_job
-      @distributed_job ||= Kraps.distributed_job_client.build(token: @args["token"])
-    end
   end
 end
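The `combine` merge above assumes both enumerators are sorted by key, which holds because they come from `k_way_merge`. A plain-Ruby illustration of the pairing it produces (data made up):

```ruby
enum1 = [["a", 1], ["b", 2], ["c", 3]].each
enum2 = [["b", 20], ["d", 40]].each

# Walking both sorted streams as the method does yields:
#   ("a", 1, nil)  - key only in the first stream, second value is nil
#   ("b", 2, 20)   - key present in both streams
#   ("c", 3, nil)
# "d" is never yielded: keys that exist only in the second stream are
# skipped, matching the README's description of `combine`.
```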
data/lib/kraps.rb
CHANGED
@@ -1,32 +1,37 @@
+require "ruby-progressbar"
+require "ruby-progressbar/outputs/null"
+require "map_reduce"
+require "redis"
+
 require_relative "kraps/version"
 require_relative "kraps/drivers"
 require_relative "kraps/actions"
 require_relative "kraps/parallelizer"
 require_relative "kraps/hash_partitioner"
+require_relative "kraps/redis_queue"
 require_relative "kraps/temp_path"
 require_relative "kraps/temp_paths"
 require_relative "kraps/timeout_queue"
 require_relative "kraps/interval"
 require_relative "kraps/job"
+require_relative "kraps/job_resolver"
 require_relative "kraps/runner"
 require_relative "kraps/step"
 require_relative "kraps/frame"
 require_relative "kraps/worker"
-require "distributed_job"
-require "ruby-progressbar"
-require "ruby-progressbar/outputs/null"
-require "map_reduce"
-require "redis"
 
 module Kraps
   class Error < StandardError; end
   class InvalidAction < Error; end
   class InvalidStep < Error; end
   class JobStopped < Error; end
+  class IncompatibleFrame < Error; end
 
-  def self.configure(driver:, redis: Redis.new, namespace: nil, job_ttl: 24 * 60 * 60, show_progress: true, enqueuer: ->(worker, json) { worker.perform_async(json) })
+  def self.configure(driver:, redis: Redis.new, namespace: nil, job_ttl: 4 * 24 * 60 * 60, show_progress: true, enqueuer: ->(worker, json) { worker.perform_async(json) })
     @driver = driver
-    @
+    @redis = redis
+    @namespace = namespace
+    @job_ttl = job_ttl.to_i
     @show_progress = show_progress
     @enqueuer = enqueuer
   end
@@ -35,8 +40,16 @@ module Kraps
     @driver
   end
 
-  def self.distributed_job_client
-    @distributed_job_client
+  def self.redis
+    @redis
+  end
+
+  def self.namespace
+    @namespace
+  end
+
+  def self.job_ttl
+    @job_ttl
   end
 
   def self.show_progress?
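In configuration terms, dropping `distributed_job` means redis, namespace and TTL are now plain Kraps settings. A minimal sketch (values are examples; `job_ttl` now defaults to 4 days):

```ruby
Kraps.configure(
  driver: Kraps::Drivers::S3Driver.new(s3_client: Aws::S3::Client.new, bucket: "some-bucket", prefix: "temp/kraps/"),
  redis: Redis.new,
  namespace: "my-application",
  job_ttl: 7 * 24 * 60 * 60 # stored via to_i, so a plain number of seconds works
)
```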
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: kraps
 version: !ruby/object:Gem::Version
-  version: 0.5.0
+  version: 0.7.0
 platform: ruby
 authors:
 - Benjamin Vetter
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2022-
+date: 2022-12-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: attachie
@@ -24,20 +24,6 @@ dependencies:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
-- !ruby/object:Gem::Dependency
-  name: distributed_job
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-    - !ruby/object:Gem::Version
-      version: '0'
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-    - !ruby/object:Gem::Version
-      version: '0'
 - !ruby/object:Gem::Dependency
   name: map-reduce-ruby
   requirement: !ruby/object:Gem::Requirement
@@ -147,7 +133,9 @@ files:
 - lib/kraps/hash_partitioner.rb
 - lib/kraps/interval.rb
 - lib/kraps/job.rb
+- lib/kraps/job_resolver.rb
 - lib/kraps/parallelizer.rb
+- lib/kraps/redis_queue.rb
 - lib/kraps/runner.rb
 - lib/kraps/step.rb
 - lib/kraps/temp_path.rb