kraps 0.3.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/Gemfile.lock +3 -3
- data/README.md +42 -16
- data/lib/kraps/job.rb +38 -10
- data/lib/kraps/runner.rb +4 -4
- data/lib/kraps/step.rb +1 -1
- data/lib/kraps/version.rb +1 -1
- data/lib/kraps/worker.rb +19 -4
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 921ae08326c96216136418861b88af7f11bce519c924cd1813216165f7f02690
|
4
|
+
data.tar.gz: '0913d31d3caeea0be664bc714e9d0da58227f515c047be31359e96040bc0c141'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d8e43e5229fc310019801e62a2e278470a1eb37b50e4aca27b9c64edb6666115f0f25c7a7375790516e2726fcf10980cdac1523c54dde8d3527a39fd919a2a5a
|
7
|
+
data.tar.gz: 30b1a9edcdd4f7ff476bfa4c070aef31debd727500e27a08b59f1df2663362c60e3cc3a3c860455d568abd994bb56a216f7eedf8baea6cc06ca73b1d0bdf9a07
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,15 @@
|
|
1
1
|
# CHANGELOG
|
2
2
|
|
3
|
+
## v0.5.0
|
4
|
+
|
5
|
+
* Added a `before` option to specify a callable to run before
|
6
|
+
a step to e.g. populate caches upfront, etc.
|
7
|
+
|
8
|
+
## v0.4.0
|
9
|
+
|
10
|
+
* Pre-reduce in a map step when the subsequent step is a
|
11
|
+
reduce step
|
12
|
+
|
3
13
|
## v0.3.0
|
4
14
|
|
5
15
|
* Changed partitioners to receive the number of partitions
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
kraps (0.
|
4
|
+
kraps (0.5.0)
|
5
5
|
attachie
|
6
6
|
distributed_job
|
7
7
|
map-reduce-ruby (>= 3.0.0)
|
@@ -23,7 +23,7 @@ GEM
|
|
23
23
|
connection_pool
|
24
24
|
mime-types
|
25
25
|
aws-eventstream (1.2.0)
|
26
|
-
aws-partitions (1.
|
26
|
+
aws-partitions (1.657.0)
|
27
27
|
aws-sdk-core (3.166.0)
|
28
28
|
aws-eventstream (~> 1, >= 1.0.2)
|
29
29
|
aws-partitions (~> 1, >= 1.651.0)
|
@@ -62,7 +62,7 @@ GEM
|
|
62
62
|
rake (13.0.6)
|
63
63
|
redis (5.0.5)
|
64
64
|
redis-client (>= 0.9.0)
|
65
|
-
redis-client (0.11.
|
65
|
+
redis-client (0.11.1)
|
66
66
|
connection_pool
|
67
67
|
regexp_parser (2.6.0)
|
68
68
|
rexml (3.2.5)
|
data/README.md
CHANGED
@@ -95,28 +95,41 @@ class MyKrapsWorker
|
|
95
95
|
include Sidekiq::Worker
|
96
96
|
|
97
97
|
def perform(json)
|
98
|
-
Kraps::Worker.new(json, memory_limit:
|
98
|
+
Kraps::Worker.new(json, memory_limit: 16.megabytes, chunk_limit: 64, concurrency: 8).call(retries: 3)
|
99
99
|
end
|
100
100
|
end
|
101
101
|
```
|
102
102
|
|
103
103
|
The `json` argument is automatically enqueued by Kraps and contains everything
|
104
104
|
it needs to know about the job and step to execute. The `memory_limit` tells
|
105
|
-
Kraps how much memory it is allowed to allocate for temporary chunks
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
`
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
105
|
+
Kraps how much memory it is allowed to allocate for temporary chunks. More
|
106
|
+
concretely, it tells Kraps how big the file size of a temporary chunk can grow
|
107
|
+
in memory up until Kraps must write it to disk. However, Ruby of course
|
108
|
+
allocates much more memory for a chunk than the raw file size of the chunk. As
|
109
|
+
a rule of thumb, it allocates 10 times more memory. Still, choosing a value for
|
110
|
+
`memory_limit` depends on the memory size of your container/server, how many
|
111
|
+
worker threads your background queue spawns and how much memory your workers
|
112
|
+
need besides Kraps. Let's say your container/server has 2 gigabytes of
|
113
|
+
memory and your background framework spawns 5 threads. Theoretically, you might
|
114
|
+
be able to give 300-400 megabytes to Kraps then, but now divide this by 10 and
|
115
|
+
specify a `memory_limit` of around `30.megabytes`, better less. The
|
116
|
+
`memory_limit` affects how many chunks will be written to disk depending on the
|
117
|
+
data size you are processing and how big these chunks are. The smaller the
|
118
|
+
value, the more chunks, and the more chunks, the more runs Kraps needs to merge
|
119
|
+
the chunks. It can affect the performance. The `chunk_limit` ensures that only
|
120
|
+
the specified number of chunks is processed in a single run. A run basically
|
121
|
+
means: it takes up to `chunk_limit` chunks, reduces them and pushes the result
|
122
|
+
as a new chunk to the list of chunks to process. Thus, if your number of file
|
123
|
+
descriptors is unlimited, you want to set it to a higher number to avoid the
|
124
|
+
overhead of multiple runs. `concurrency` tells Kraps how many threads to use to
|
125
|
+
concurrently upload/download files from the storage layer. Finally, `retries`
|
126
|
+
specifies how often Kraps should retry the job step in case of errors. Kraps
|
127
|
+
will sleep for 5 seconds between those retries. Please note that it's not yet
|
128
|
+
possible to use the retry mechanism of your background job framework with
|
129
|
+
Kraps. Please note, however, that `parallelize` is not covered by `retries`
|
130
|
+
yet, as the block passed to `parallelize` is executed by the runner, not the
|
131
|
+
workers.
|
132
|
+
|
120
133
|
|
121
134
|
Now, executing your job is super easy:
|
122
135
|
|
@@ -252,6 +265,19 @@ job.each_partition do |partition, pairs|
|
|
252
265
|
end
|
253
266
|
```
|
254
267
|
|
268
|
+
Please note that every API method accepts a `before` callable:
|
269
|
+
|
270
|
+
```ruby
|
271
|
+
before_block = proc do
|
272
|
+
# runs once before the map action in every worker, which can be useful to
|
273
|
+
# e.g. populate caches etc.
|
274
|
+
end
|
275
|
+
|
276
|
+
job.map(before: before_block) do |key, value, collector|
|
277
|
+
# ...
|
278
|
+
end
|
279
|
+
```
|
280
|
+
|
255
281
|
## More Complex Jobs
|
256
282
|
|
257
283
|
Please note that a job class can return multiple jobs and jobs can build up on
|
data/lib/kraps/job.rb
CHANGED
@@ -9,46 +9,74 @@ module Kraps
|
|
9
9
|
@partitioner = HashPartitioner.new
|
10
10
|
end
|
11
11
|
|
12
|
-
def parallelize(partitions:, partitioner: HashPartitioner.new, worker: @worker, &block)
|
12
|
+
def parallelize(partitions:, partitioner: HashPartitioner.new, worker: @worker, before: nil, &block)
|
13
13
|
fresh.tap do |job|
|
14
14
|
job.instance_eval do
|
15
15
|
@partitions = partitions
|
16
16
|
@partitioner = partitioner
|
17
17
|
|
18
|
-
@steps << Step.new(
|
18
|
+
@steps << Step.new(
|
19
|
+
action: Actions::PARALLELIZE,
|
20
|
+
partitions: @partitions,
|
21
|
+
partitioner: @partitioner,
|
22
|
+
worker: worker,
|
23
|
+
before: before,
|
24
|
+
block: block
|
25
|
+
)
|
19
26
|
end
|
20
27
|
end
|
21
28
|
end
|
22
29
|
|
23
|
-
def map(partitions: nil, partitioner: nil, worker: @worker, &block)
|
30
|
+
def map(partitions: nil, partitioner: nil, worker: @worker, before: nil, &block)
|
24
31
|
fresh.tap do |job|
|
25
32
|
job.instance_eval do
|
26
33
|
@partitions = partitions if partitions
|
27
34
|
@partitioner = partitioner if partitioner
|
28
35
|
|
29
|
-
@steps << Step.new(
|
36
|
+
@steps << Step.new(
|
37
|
+
action: Actions::MAP,
|
38
|
+
partitions: @partitions,
|
39
|
+
partitioner: @partitioner,
|
40
|
+
worker: worker,
|
41
|
+
before: before,
|
42
|
+
block: block
|
43
|
+
)
|
30
44
|
end
|
31
45
|
end
|
32
46
|
end
|
33
47
|
|
34
|
-
def reduce(worker: @worker, &block)
|
48
|
+
def reduce(worker: @worker, before: nil, &block)
|
35
49
|
fresh.tap do |job|
|
36
50
|
job.instance_eval do
|
37
|
-
@steps << Step.new(
|
51
|
+
@steps << Step.new(
|
52
|
+
action: Actions::REDUCE,
|
53
|
+
partitions: @partitions,
|
54
|
+
partitioner: @partitioner,
|
55
|
+
worker: worker,
|
56
|
+
before: before,
|
57
|
+
block: block
|
58
|
+
)
|
38
59
|
end
|
39
60
|
end
|
40
61
|
end
|
41
62
|
|
42
|
-
def each_partition(worker: @worker, &block)
|
63
|
+
def each_partition(worker: @worker, before: nil, &block)
|
43
64
|
fresh.tap do |job|
|
44
65
|
job.instance_eval do
|
45
|
-
@steps << Step.new(
|
66
|
+
@steps << Step.new(
|
67
|
+
action: Actions::EACH_PARTITION,
|
68
|
+
partitions: @partitions,
|
69
|
+
partitioner: @partitioner,
|
70
|
+
worker: worker,
|
71
|
+
before: before,
|
72
|
+
block: block
|
73
|
+
)
|
46
74
|
end
|
47
75
|
end
|
48
76
|
end
|
49
77
|
|
50
|
-
def repartition(partitions:, partitioner: nil, worker: @worker)
|
51
|
-
map(partitions: partitions, partitioner: partitioner, worker: worker) do |key, value, collector|
|
78
|
+
def repartition(partitions:, partitioner: nil, worker: @worker, before: nil)
|
79
|
+
map(partitions: partitions, partitioner: partitioner, worker: worker, before: before) do |key, value, collector|
|
52
80
|
collector.call(key, value)
|
53
81
|
end
|
54
82
|
end
|
data/lib/kraps/runner.rb
CHANGED
@@ -55,7 +55,7 @@ module Kraps
|
|
55
55
|
enqueue(token: distributed_job.token, part: part, item: item)
|
56
56
|
end
|
57
57
|
|
58
|
-
Frame.new(token: distributed_job.token, partitions: @step.
|
58
|
+
Frame.new(token: distributed_job.token, partitions: @step.partitions)
|
59
59
|
end
|
60
60
|
end
|
61
61
|
|
@@ -65,7 +65,7 @@ module Kraps
|
|
65
65
|
enqueue(token: distributed_job.token, part: part, partition: partition)
|
66
66
|
end
|
67
67
|
|
68
|
-
Frame.new(token: distributed_job.token, partitions: @step.
|
68
|
+
Frame.new(token: distributed_job.token, partitions: @step.partitions)
|
69
69
|
end
|
70
70
|
end
|
71
71
|
|
@@ -75,7 +75,7 @@ module Kraps
|
|
75
75
|
enqueue(token: distributed_job.token, part: part, partition: partition)
|
76
76
|
end
|
77
77
|
|
78
|
-
Frame.new(token: distributed_job.token, partitions: @step.
|
78
|
+
Frame.new(token: distributed_job.token, partitions: @step.partitions)
|
79
79
|
end
|
80
80
|
end
|
81
81
|
|
@@ -91,7 +91,7 @@ module Kraps
|
|
91
91
|
|
92
92
|
def enqueue(token:, part:, **rest)
|
93
93
|
Kraps.enqueuer.call(
|
94
|
-
@step.
|
94
|
+
@step.worker,
|
95
95
|
JSON.generate(
|
96
96
|
job_index: @job_index,
|
97
97
|
step_index: @step_index,
|
data/lib/kraps/step.rb
CHANGED
data/lib/kraps/version.rb
CHANGED
data/lib/kraps/worker.rb
CHANGED
@@ -13,6 +13,8 @@ module Kraps
|
|
13
13
|
raise(InvalidAction, "Invalid action #{step.action}") unless Actions::ALL.include?(step.action)
|
14
14
|
|
15
15
|
with_retries(retries) do # TODO: allow to use queue based retries
|
16
|
+
step.before&.call
|
17
|
+
|
16
18
|
send(:"perform_#{step.action}")
|
17
19
|
|
18
20
|
distributed_job.done(@args["part"])
|
@@ -60,6 +62,14 @@ module Kraps
|
|
60
62
|
current_step.block.call(key, value, block)
|
61
63
|
end
|
62
64
|
|
65
|
+
subsequent_step = next_step
|
66
|
+
|
67
|
+
if subsequent_step&.action == Actions::REDUCE
|
68
|
+
implementation.define_singleton_method(:reduce) do |key, value1, value2|
|
69
|
+
subsequent_step.block.call(key, value1, value2)
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
63
73
|
mapper = MapReduce::Mapper.new(implementation, partitioner: partitioner, memory_limit: @memory_limit)
|
64
74
|
|
65
75
|
temp_paths.each do |temp_path|
|
@@ -143,15 +153,16 @@ module Kraps
|
|
143
153
|
yield
|
144
154
|
rescue Kraps::Error
|
145
155
|
distributed_job.stop
|
156
|
+
raise
|
146
157
|
rescue StandardError
|
147
|
-
sleep(5)
|
148
|
-
retries += 1
|
149
|
-
|
150
158
|
if retries >= num_retries
|
151
159
|
distributed_job.stop
|
152
160
|
raise
|
153
161
|
end
|
154
162
|
|
163
|
+
sleep(5)
|
164
|
+
retries += 1
|
165
|
+
|
155
166
|
retry
|
156
167
|
end
|
157
168
|
end
|
@@ -180,8 +191,12 @@ module Kraps
|
|
180
191
|
end
|
181
192
|
end
|
182
193
|
|
194
|
+
def next_step
|
195
|
+
@next_step ||= steps[@args["step_index"] + 1]
|
196
|
+
end
|
197
|
+
|
183
198
|
def partitioner
|
184
|
-
@partitioner ||= proc { |key| step.
|
199
|
+
@partitioner ||= proc { |key| step.partitioner.call(key, step.partitions) }
|
185
200
|
end
|
186
201
|
|
187
202
|
def distributed_job
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kraps
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Benjamin Vetter
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-11-
|
11
|
+
date: 2022-11-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: attachie
|