kraps 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +11 -8
- data/lib/kraps/downloader.rb +19 -0
- data/lib/kraps/job.rb +16 -13
- data/lib/kraps/runner.rb +1 -1
- data/lib/kraps/version.rb +1 -1
- data/lib/kraps/worker.rb +2 -14
- data/lib/kraps.rb +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d261c779e82209152e26decbc6c5a6c5c5ddb0fb40803884383617635727d3b2
|
4
|
+
data.tar.gz: 1b9c6fa8db7a7811cbac5a7a5db518e1f3ee75df583521b64417341e830425f4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dcb05139042149be087b1a2c7f14a31cd5e28dedb1517aca83299f63b90046e4d05e0ab19dfaeede329e784880623abda19675252cdeaad04f8ccd87249afde5
|
7
|
+
data.tar.gz: 10fd07c322c659ae21a682832eba30416c830f9d2146af685d69168ad5137045ef4268c0a43cee4e879bb875edf900ca740bbe4cbfe8b91b34ad3df40763bce0
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,11 @@
|
|
1
1
|
# CHANGELOG
|
2
2
|
|
3
|
+
## v0.8.0
|
4
|
+
|
5
|
+
* Use number of partitions of previous step for `jobs` option by default
|
6
|
+
* Changed `combine` to receive a `collector`
|
7
|
+
* Added mandatory `concurrency` argument to `load`
|
8
|
+
|
3
9
|
## v0.7.0
|
4
10
|
|
5
11
|
* Added a `jobs` option to the actions to limit the concurrency
|
data/README.md
CHANGED
@@ -232,10 +232,11 @@ return the new key-value pair, but the `collector` must be used instead.
|
|
232
232
|
The `jobs` argument can be useful when you need to access an external data
|
233
233
|
source, like a relational database, and you want to limit the number of workers
|
234
234
|
accessing the store concurrently to avoid overloading it. If you don't specify
|
235
|
-
it, it will be identical to the number of partitions
|
236
|
-
use it for steps where you need to throttle the
|
237
|
-
course slow down the processing. The `jobs`
|
238
|
-
current step. The following steps don't inherit
|
235
|
+
it, it will be identical to the number of partitions of the previous step. It
|
236
|
+
is recommended to only use it for steps where you need to throttle the
|
237
|
+
concurrency, because it will of course slow down the processing. The `jobs`
|
238
|
+
argument only applies to the current step. The following steps don't inherit
|
239
|
+
the argument, but reset it.
|
239
240
|
|
240
241
|
* `map_partitions`: Maps the key value pairs to other key value pairs, but the
|
241
242
|
block receives all data of each partition as an enumerable and sorted by key.
|
@@ -273,8 +274,8 @@ most of the time, this is not necessary and the key can simply be ignored.
|
|
273
274
|
passed job result are completely omitted.
|
274
275
|
|
275
276
|
```ruby
|
276
|
-
job.combine(other_job, worker: MyKrapsWorker, jobs: 8) do |key, value1, value2|
|
277
|
-
(value1 || {}).merge(value2 || {})
|
277
|
+
job.combine(other_job, worker: MyKrapsWorker, jobs: 8) do |key, value1, value2, collector|
|
278
|
+
collector.call(key, (value1 || {}).merge(value2 || {}))
|
278
279
|
end
|
279
280
|
```
|
280
281
|
|
@@ -316,10 +317,12 @@ It creates a folder for every partition and stores one or more chunks in there.
|
|
316
317
|
* `load`: Loads the previously dumped data
|
317
318
|
|
318
319
|
```ruby
|
319
|
-
job.load(prefix: "path/to/dump", partitions: 32, partitioner: Kraps::HashPartitioner.new, worker: MyKrapsWorker)
|
320
|
+
job.load(prefix: "path/to/dump", partitions: 32, concurrency: 8, partitioner: Kraps::HashPartitioner.new, worker: MyKrapsWorker)
|
320
321
|
```
|
321
322
|
|
322
|
-
The number of partitions
|
323
|
+
The number of partitions, the partitioner and concurrency must be specified.
|
324
|
+
The concurrency specifies the number of threads used for downloading chunks in
|
325
|
+
parallel.
|
323
326
|
|
324
327
|
Please note that every API method accepts a `before` callable:
|
325
328
|
|
data/lib/kraps/downloader.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
module Kraps
|
2
|
+
class Downloader
|
3
|
+
def self.download_all(prefix:, concurrency:)
|
4
|
+
temp_paths = TempPaths.new
|
5
|
+
|
6
|
+
files = Kraps.driver.list(prefix: prefix).sort
|
7
|
+
|
8
|
+
temp_paths_index = files.each_with_object({}) do |file, hash|
|
9
|
+
hash[file] = temp_paths.add
|
10
|
+
end
|
11
|
+
|
12
|
+
Parallelizer.each(files, concurrency) do |file|
|
13
|
+
Kraps.driver.download(file, temp_paths_index[file].path)
|
14
|
+
end
|
15
|
+
|
16
|
+
temp_paths
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
data/lib/kraps/job.rb
CHANGED
@@ -30,12 +30,14 @@ module Kraps
|
|
30
30
|
def map(partitions: nil, partitioner: nil, jobs: nil, worker: @worker, before: nil, &block)
|
31
31
|
fresh.tap do |job|
|
32
32
|
job.instance_eval do
|
33
|
+
jobs = [jobs, @partitions].compact.min
|
34
|
+
|
33
35
|
@partitions = partitions if partitions
|
34
36
|
@partitioner = partitioner if partitioner
|
35
37
|
|
36
38
|
@steps << Step.new(
|
37
39
|
action: Actions::MAP,
|
38
|
-
jobs:
|
40
|
+
jobs: jobs,
|
39
41
|
partitions: @partitions,
|
40
42
|
partitioner: @partitioner,
|
41
43
|
worker: worker,
|
@@ -49,12 +51,14 @@ module Kraps
|
|
49
51
|
def map_partitions(partitions: nil, partitioner: nil, jobs: nil, worker: @worker, before: nil, &block)
|
50
52
|
fresh.tap do |job|
|
51
53
|
job.instance_eval do
|
54
|
+
jobs = [jobs, @partitions].compact.min
|
55
|
+
|
52
56
|
@partitions = partitions if partitions
|
53
57
|
@partitioner = partitioner if partitioner
|
54
58
|
|
55
59
|
@steps << Step.new(
|
56
60
|
action: Actions::MAP_PARTITIONS,
|
57
|
-
jobs:
|
61
|
+
jobs: jobs,
|
58
62
|
partitions: @partitions,
|
59
63
|
partitioner: @partitioner,
|
60
64
|
worker: worker,
|
@@ -135,7 +139,7 @@ module Kraps
|
|
135
139
|
end
|
136
140
|
end
|
137
141
|
|
138
|
-
def load(prefix:, partitions:, partitioner:, worker: @worker)
|
142
|
+
def load(prefix:, partitions:, partitioner:, concurrency:, worker: @worker)
|
139
143
|
job = parallelize(partitions: partitions, partitioner: proc { |key, _| key }, worker: worker) do |collector|
|
140
144
|
(0...partitions).each do |partition|
|
141
145
|
collector.call(partition)
|
@@ -143,20 +147,19 @@ module Kraps
|
|
143
147
|
end
|
144
148
|
|
145
149
|
job.map_partitions(partitioner: partitioner, worker: worker) do |partition, _, collector|
|
146
|
-
|
147
|
-
|
148
|
-
path = File.join(prefix, partition.to_s, "chunk.json")
|
149
|
-
next unless Kraps.driver.exists?(path)
|
150
|
+
temp_paths = Downloader.download_all(prefix: File.join(prefix, partition.to_s, "/"), concurrency: concurrency)
|
150
151
|
|
151
|
-
|
152
|
+
temp_paths.each do |temp_path|
|
153
|
+
File.open(temp_path.path) do |stream|
|
154
|
+
stream.each_line do |line|
|
155
|
+
key, value = JSON.parse(line)
|
152
156
|
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
collector.call(key, value)
|
157
|
+
collector.call(key, value)
|
158
|
+
end
|
159
|
+
end
|
157
160
|
end
|
158
161
|
ensure
|
159
|
-
|
162
|
+
temp_paths&.delete
|
160
163
|
end
|
161
164
|
end
|
162
165
|
|
data/lib/kraps/runner.rb
CHANGED
@@ -100,7 +100,7 @@ module Kraps
|
|
100
100
|
|
101
101
|
def push_and_wait(enum:, job_count: nil)
|
102
102
|
redis_queue = RedisQueue.new(redis: Kraps.redis, token: SecureRandom.hex, namespace: Kraps.namespace, ttl: Kraps.job_ttl)
|
103
|
-
progress_bar = build_progress_bar("#{@klass}: job #{@job_index + 1}/#{@jobs.size}, step #{@step_index + 1}/#{@job.steps.size}, token #{redis_queue.token}, %a, %c/%C (%p%) => #{@step.action}")
|
103
|
+
progress_bar = build_progress_bar("#{@klass}: job #{@job_index + 1}/#{@jobs.size}, step #{@step_index + 1}/#{@job.steps.size}, #{@step.jobs || "?"} jobs, token #{redis_queue.token}, %a, %c/%C (%p%) => #{@step.action}")
|
104
104
|
|
105
105
|
total = 0
|
106
106
|
|
data/lib/kraps/version.rb
CHANGED
data/lib/kraps/worker.rb
CHANGED
@@ -157,7 +157,7 @@ module Kraps
|
|
157
157
|
implementation = Object.new
|
158
158
|
implementation.define_singleton_method(:map) do |&block|
|
159
159
|
combine_method.call(enum1, enum2) do |key, value1, value2|
|
160
|
-
|
160
|
+
current_step.block.call(key, value1, value2, block)
|
161
161
|
end
|
162
162
|
end
|
163
163
|
|
@@ -270,19 +270,7 @@ module Kraps
|
|
270
270
|
end
|
271
271
|
|
272
272
|
def download_all(token:, partition:)
|
273
|
-
|
274
|
-
|
275
|
-
files = Kraps.driver.list(prefix: Kraps.driver.with_prefix("#{token}/#{partition}/")).sort
|
276
|
-
|
277
|
-
temp_paths_index = files.each_with_object({}) do |file, hash|
|
278
|
-
hash[file] = temp_paths.add
|
279
|
-
end
|
280
|
-
|
281
|
-
Parallelizer.each(files, @concurrency) do |file|
|
282
|
-
Kraps.driver.download(file, temp_paths_index[file].path)
|
283
|
-
end
|
284
|
-
|
285
|
-
temp_paths
|
273
|
+
Downloader.download_all(prefix: Kraps.driver.with_prefix("#{token}/#{partition}/"), concurrency: @concurrency)
|
286
274
|
end
|
287
275
|
|
288
276
|
def jobs
|
data/lib/kraps.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kraps
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Benjamin Vetter
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-02-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: attachie
|
@@ -128,6 +128,7 @@ files:
|
|
128
128
|
- docker-compose.yml
|
129
129
|
- lib/kraps.rb
|
130
130
|
- lib/kraps/actions.rb
|
131
|
+
- lib/kraps/downloader.rb
|
131
132
|
- lib/kraps/drivers.rb
|
132
133
|
- lib/kraps/frame.rb
|
133
134
|
- lib/kraps/hash_partitioner.rb
|