kraps 0.7.0 → 0.8.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +11 -8
- data/lib/kraps/downloader.rb +19 -0
- data/lib/kraps/job.rb +16 -13
- data/lib/kraps/runner.rb +1 -1
- data/lib/kraps/version.rb +1 -1
- data/lib/kraps/worker.rb +2 -14
- data/lib/kraps.rb +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d261c779e82209152e26decbc6c5a6c5c5ddb0fb40803884383617635727d3b2
+  data.tar.gz: 1b9c6fa8db7a7811cbac5a7a5db518e1f3ee75df583521b64417341e830425f4
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: dcb05139042149be087b1a2c7f14a31cd5e28dedb1517aca83299f63b90046e4d05e0ab19dfaeede329e784880623abda19675252cdeaad04f8ccd87249afde5
+  data.tar.gz: 10fd07c322c659ae21a682832eba30416c830f9d2146af685d69168ad5137045ef4268c0a43cee4e879bb875edf900ca740bbe4cbfe8b91b34ad3df40763bce0
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,11 @@
 # CHANGELOG
 
+## v0.8.0
+
+* Use number of partitions of previous step for `jobs` option by default
+* Changed `combine` to receive a `collector`
+* Added mandatory `concurrency` argument to `load`
+
 ## v0.7.0
 
 * Added a `jobs` option to the actions to limit the concurrency
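For orientation, here is a minimal sketch of what the three 0.8.0 changes look like in application code. It is based on the README examples further down in this diff; `job`, `other_job` and `MyKrapsWorker` are placeholders for an existing Kraps pipeline and worker, not part of this release:

```ruby
# `jobs` now defaults to the number of partitions of the previous step,
# so it only needs to be passed where a step has to be throttled.
job = job.map(jobs: 8) do |key, value, collector|
  collector.call(key, value)
end

# `combine` blocks now receive a collector instead of returning the new value.
job = job.combine(other_job, worker: MyKrapsWorker) do |key, value1, value2, collector|
  collector.call(key, (value1 || {}).merge(value2 || {}))
end

# `load` now requires `concurrency`, the number of threads used to download chunks.
job = job.load(prefix: "path/to/dump", partitions: 32, concurrency: 8, partitioner: Kraps::HashPartitioner.new, worker: MyKrapsWorker)
```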
data/README.md
CHANGED
@@ -232,10 +232,11 @@ return the new key-value pair, but the `collector` must be used instead.
 The `jobs` argument can be useful when you need to access an external data
 source, like a relational database and you want to limit the number of workers
 accessing the store concurrently to avoid overloading it. If you don't specify
-it, it will be identical to the number of partitions
-use it for steps where you need to throttle the
-course slow down the processing. The `jobs`
-current step. The following steps don't inherit
+it, it will be identical to the number of partitions of the previous step. It
+is recommended to only use it for steps where you need to throttle the
+concurrency, because it will of course slow down the processing. The `jobs`
+argument only applies to the current step. The following steps don't inherit
+the argument, but reset it.
 
 * `map_partitions`: Maps the key value pairs to other key value pairs, but the
   block receives all data of each partition as an enumerable and sorted by key.
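To make the throttling case from the paragraph above concrete, a hedged sketch; the `Product` lookup stands in for whatever external datastore needs protecting and is not part of Kraps:

```ruby
# At most 4 jobs of this step run concurrently, so at most 4 workers hit the
# database at once; the next step falls back to the default again (the number
# of partitions of the previous step).
job = job.map(jobs: 4) do |key, value, collector|
  product = Product.find_by(id: value["product_id"]) # placeholder lookup
  collector.call(key, value.merge("product_name" => product&.name))
end
```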
@@ -273,8 +274,8 @@ most of the time, this is not neccessary and the key can simply be ignored.
 passed job result are completely omitted.
 
 ```ruby
-job.combine(other_job, worker: MyKrapsWorker, jobs: 8) do |key, value1, value2|
-  (value1 || {}).merge(value2 || {})
+job.combine(other_job, worker: MyKrapsWorker, jobs: 8) do |key, value1, value2, collector|
+  collector.call(key, (value1 || {}).merge(value2 || {}))
 end
 ```
 
@@ -316,10 +317,12 @@ It creates a folder for every partition and stores one or more chunks in there.
 * `load`: Loads the previously dumped data
 
 ```ruby
-job.load(prefix: "path/to/dump", partitions: 32, partitioner: Kraps::HashPartitioner.new, worker: MyKrapsWorker)
+job.load(prefix: "path/to/dump", partitions: 32, concurrency: 8, partitioner: Kraps::HashPartitioner.new, worker: MyKrapsWorker)
 ```
 
-The number of partitions
+The number of partitions, the partitioner and concurrency must be specified.
+The concurrency specifies the number of threads used for downloading chunks in
+parallel.
 
 Please note that every API method accepts a `before` callable:
 
data/lib/kraps/downloader.rb
ADDED
@@ -0,0 +1,19 @@
+module Kraps
+  class Downloader
+    def self.download_all(prefix:, concurrency:)
+      temp_paths = TempPaths.new
+
+      files = Kraps.driver.list(prefix: prefix).sort
+
+      temp_paths_index = files.each_with_object({}) do |file, hash|
+        hash[file] = temp_paths.add
+      end
+
+      Parallelizer.each(files, concurrency) do |file|
+        Kraps.driver.download(file, temp_paths_index[file].path)
+      end
+
+      temp_paths
+    end
+  end
+end
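The new class extracts the chunk-download logic that previously lived in `data/lib/kraps/worker.rb` (see its hunk below). A sketch of how callers consume the result, mirroring the new `load` implementation in `data/lib/kraps/job.rb`; the prefix and concurrency values are placeholders:

```ruby
require "json"

temp_paths = Kraps::Downloader.download_all(prefix: "path/to/dump/0/", concurrency: 8)

begin
  # Each temp path holds one downloaded chunk of JSON lines.
  temp_paths.each do |temp_path|
    File.open(temp_path.path) do |stream|
      stream.each_line do |line|
        key, value = JSON.parse(line)
        # process key and value here
      end
    end
  end
ensure
  temp_paths.delete # remove the downloaded files again
end
```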
data/lib/kraps/job.rb
CHANGED
@@ -30,12 +30,14 @@ module Kraps
     def map(partitions: nil, partitioner: nil, jobs: nil, worker: @worker, before: nil, &block)
       fresh.tap do |job|
         job.instance_eval do
+          jobs = [jobs, @partitions].compact.min
+
           @partitions = partitions if partitions
           @partitioner = partitioner if partitioner
 
           @steps << Step.new(
             action: Actions::MAP,
-            jobs:
+            jobs: jobs,
             partitions: @partitions,
             partitioner: @partitioner,
             worker: worker,
@@ -49,12 +51,14 @@ module Kraps
     def map_partitions(partitions: nil, partitioner: nil, jobs: nil, worker: @worker, before: nil, &block)
       fresh.tap do |job|
         job.instance_eval do
+          jobs = [jobs, @partitions].compact.min
+
           @partitions = partitions if partitions
           @partitioner = partitioner if partitioner
 
           @steps << Step.new(
             action: Actions::MAP_PARTITIONS,
-            jobs:
+            jobs: jobs,
             partitions: @partitions,
             partitioner: @partitioner,
             worker: worker,
@@ -135,7 +139,7 @@ module Kraps
       end
     end
 
-    def load(prefix:, partitions:, partitioner:, worker: @worker)
+    def load(prefix:, partitions:, partitioner:, concurrency:, worker: @worker)
       job = parallelize(partitions: partitions, partitioner: proc { |key, _| key }, worker: worker) do |collector|
         (0...partitions).each do |partition|
           collector.call(partition)
@@ -143,20 +147,19 @@ module Kraps
       end
 
       job.map_partitions(partitioner: partitioner, worker: worker) do |partition, _, collector|
-
-
-        path = File.join(prefix, partition.to_s, "chunk.json")
-        next unless Kraps.driver.exists?(path)
+        temp_paths = Downloader.download_all(prefix: File.join(prefix, partition.to_s, "/"), concurrency: concurrency)
 
-
+        temp_paths.each do |temp_path|
+          File.open(temp_path.path) do |stream|
+            stream.each_line do |line|
+              key, value = JSON.parse(line)
 
-
-
-
-        collector.call(key, value)
+              collector.call(key, value)
+            end
+          end
         end
       ensure
-
+        temp_paths&.delete
       end
     end
 
data/lib/kraps/runner.rb
CHANGED
@@ -100,7 +100,7 @@ module Kraps
 
     def push_and_wait(enum:, job_count: nil)
       redis_queue = RedisQueue.new(redis: Kraps.redis, token: SecureRandom.hex, namespace: Kraps.namespace, ttl: Kraps.job_ttl)
-      progress_bar = build_progress_bar("#{@klass}: job #{@job_index + 1}/#{@jobs.size}, step #{@step_index + 1}/#{@job.steps.size}, token #{redis_queue.token}, %a, %c/%C (%p%) => #{@step.action}")
+      progress_bar = build_progress_bar("#{@klass}: job #{@job_index + 1}/#{@jobs.size}, step #{@step_index + 1}/#{@job.steps.size}, #{@step.jobs || "?"} jobs, token #{redis_queue.token}, %a, %c/%C (%p%) => #{@step.action}")
 
       total = 0
 
data/lib/kraps/version.rb
CHANGED
data/lib/kraps/worker.rb
CHANGED
@@ -157,7 +157,7 @@ module Kraps
       implementation = Object.new
       implementation.define_singleton_method(:map) do |&block|
         combine_method.call(enum1, enum2) do |key, value1, value2|
-
+          current_step.block.call(key, value1, value2, block)
         end
       end
 
@@ -270,19 +270,7 @@ module Kraps
     end
 
     def download_all(token:, partition:)
-      temp_paths = TempPaths.new
-
-      files = Kraps.driver.list(prefix: Kraps.driver.with_prefix("#{token}/#{partition}/")).sort
-
-      temp_paths_index = files.each_with_object({}) do |file, hash|
-        hash[file] = temp_paths.add
-      end
-
-      Parallelizer.each(files, @concurrency) do |file|
-        Kraps.driver.download(file, temp_paths_index[file].path)
-      end
-
-      temp_paths
+      Downloader.download_all(prefix: Kraps.driver.with_prefix("#{token}/#{partition}/"), concurrency: @concurrency)
     end
 
     def jobs
data/lib/kraps.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: kraps
 version: !ruby/object:Gem::Version
-  version: 0.7.0
+  version: 0.8.0
 platform: ruby
 authors:
 - Benjamin Vetter
 autorequire:
 bindir: exe
 cert_chain: []
-date:
+date: 2023-02-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: attachie
@@ -128,6 +128,7 @@ files:
 - docker-compose.yml
 - lib/kraps.rb
 - lib/kraps/actions.rb
+- lib/kraps/downloader.rb
 - lib/kraps/drivers.rb
 - lib/kraps/frame.rb
 - lib/kraps/hash_partitioner.rb