kraps 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/Gemfile.lock +3 -3
- data/README.md +29 -16
- data/lib/kraps/version.rb +1 -1
- data/lib/kraps/worker.rb +16 -3
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 15d08cf8952d4e5a083a6a4f9791fd16e9e2dbf67c1c71326f3af840c0c72eb8
|
4
|
+
data.tar.gz: c6542584846c54e5897b7b59ef40e8ec282ee27521b2d5ff39551eb02755882d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 21d1ef7a132edacf54e0b2df12b8d085af84ec1ed1cd019d258e43aba4cffbecdeada9b2b7f4baeefec4b59d115eb3e38400da94a3d7961ab19bbbb7dd2cf58c
|
7
|
+
data.tar.gz: fde066e9fdc5f9df7e95be43142cb04a7a1c5279decb277f1d815db508c87d2c04be46ea9559069c8a2c9539ee2eaa949a2fe2fdc3bf862937f9211cdfd8fbd5
|
data/CHANGELOG.md
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
kraps (0.
|
4
|
+
kraps (0.4.0)
|
5
5
|
attachie
|
6
6
|
distributed_job
|
7
7
|
map-reduce-ruby (>= 3.0.0)
|
@@ -23,7 +23,7 @@ GEM
|
|
23
23
|
connection_pool
|
24
24
|
mime-types
|
25
25
|
aws-eventstream (1.2.0)
|
26
|
-
aws-partitions (1.
|
26
|
+
aws-partitions (1.657.0)
|
27
27
|
aws-sdk-core (3.166.0)
|
28
28
|
aws-eventstream (~> 1, >= 1.0.2)
|
29
29
|
aws-partitions (~> 1, >= 1.651.0)
|
@@ -62,7 +62,7 @@ GEM
|
|
62
62
|
rake (13.0.6)
|
63
63
|
redis (5.0.5)
|
64
64
|
redis-client (>= 0.9.0)
|
65
|
-
redis-client (0.11.
|
65
|
+
redis-client (0.11.1)
|
66
66
|
connection_pool
|
67
67
|
regexp_parser (2.6.0)
|
68
68
|
rexml (3.2.5)
|
data/README.md
CHANGED
@@ -95,28 +95,41 @@ class MyKrapsWorker
|
|
95
95
|
include Sidekiq::Worker
|
96
96
|
|
97
97
|
def perform(json)
|
98
|
-
Kraps::Worker.new(json, memory_limit:
|
98
|
+
Kraps::Worker.new(json, memory_limit: 16.megabytes, chunk_limit: 64, concurrency: 8).call(retries: 3)
|
99
99
|
end
|
100
100
|
end
|
101
101
|
```
|
102
102
|
|
103
103
|
The `json` argument is automatically enqueued by Kraps and contains everything
|
104
104
|
it needs to know about the job and step to execute. The `memory_limit` tells
|
105
|
-
Kraps how much memory it is allowed to allocate for temporary chunks
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
`
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
105
|
+
Kraps how much memory it is allowed to allocate for temporary chunks. More
|
106
|
+
concretely, it tells Kraps how big the file size of a temporary chunk can grow
|
107
|
+
in memory up until Kraps must write it to disk. However, ruby of course
|
108
|
+
allocates much more memory for a chunk than the raw file size of the chunk. As
|
109
|
+
a rule of thumb, it allocates 10 times more memory. Still, choosing a value for
|
110
|
+
`memory_limit` depends on the memory size of your container/server, how many
|
111
|
+
worker threads your background queue spawns and how much memory your workers
|
112
|
+
need besides Kraps. Let's say your container/server has 2 gigabytes of
|
113
|
+
memory and your background framework spawns 5 threads. Theoretically, you might
|
114
|
+
be able to give 300-400 megabytes to Kraps then, but now divide this by 10 and
|
115
|
+
specify a `memory_limit` of around `30.megabytes`, better less. The
|
116
|
+
`memory_limit` affects how many chunks will be written to disk depending on the
|
117
|
+
data size you are processing and how big these chunks are. The smaller the
|
118
|
+
value, the more chunks, and the more chunks, the more runs Kraps needs to merge
|
119
|
+
the chunks. It can affect the performance. The `chunk_limit` ensures that only
|
120
|
+
the specified number of chunks is processed in a single run. A run basically
|
121
|
+
means: it takes up to `chunk_limit` chunks, reduces them and pushes the result
|
122
|
+
as a new chunk to the list of chunks to process. Thus, if your number of file
|
123
|
+
descriptors is unlimited, you want to set it to a higher number to avoid the
|
124
|
+
overhead of multiple runs. `concurrency` tells Kraps how many threads to use to
|
125
|
+
concurrently upload/download files from the storage layer. Finally, `retries`
|
126
|
+
specifies how often Kraps should retry the job step in case of errors. Kraps
|
127
|
+
will sleep for 5 seconds between those retries. Please note that it's not yet
|
128
|
+
possible to use the retry mechanism of your background job framework with
|
129
|
+
Kraps. Please note, however, that `parallelize` is not covered by `retries`
|
130
|
+
yet, as the block passed to `parallelize` is executed by the runner, not the
|
131
|
+
workers.
|
132
|
+
|
120
133
|
|
121
134
|
Now, executing your job is super easy:
|
122
135
|
|
data/lib/kraps/version.rb
CHANGED
data/lib/kraps/worker.rb
CHANGED
@@ -60,6 +60,14 @@ module Kraps
|
|
60
60
|
current_step.block.call(key, value, block)
|
61
61
|
end
|
62
62
|
|
63
|
+
subsequent_step = next_step
|
64
|
+
|
65
|
+
if subsequent_step&.action == Actions::REDUCE
|
66
|
+
implementation.define_singleton_method(:reduce) do |key, value1, value2|
|
67
|
+
subsequent_step.block.call(key, value1, value2)
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
63
71
|
mapper = MapReduce::Mapper.new(implementation, partitioner: partitioner, memory_limit: @memory_limit)
|
64
72
|
|
65
73
|
temp_paths.each do |temp_path|
|
@@ -143,15 +151,16 @@ module Kraps
|
|
143
151
|
yield
|
144
152
|
rescue Kraps::Error
|
145
153
|
distributed_job.stop
|
154
|
+
raise
|
146
155
|
rescue StandardError
|
147
|
-
sleep(5)
|
148
|
-
retries += 1
|
149
|
-
|
150
156
|
if retries >= num_retries
|
151
157
|
distributed_job.stop
|
152
158
|
raise
|
153
159
|
end
|
154
160
|
|
161
|
+
sleep(5)
|
162
|
+
retries += 1
|
163
|
+
|
155
164
|
retry
|
156
165
|
end
|
157
166
|
end
|
@@ -180,6 +189,10 @@ module Kraps
|
|
180
189
|
end
|
181
190
|
end
|
182
191
|
|
192
|
+
def next_step
|
193
|
+
@next_step ||= steps[@args["step_index"] + 1]
|
194
|
+
end
|
195
|
+
|
183
196
|
def partitioner
|
184
197
|
@partitioner ||= proc { |key| step.args[:partitioner].call(key, step.args[:partitions]) }
|
185
198
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kraps
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Benjamin Vetter
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-11-
|
11
|
+
date: 2022-11-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: attachie
|