kraps 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 19635ced3e745d44313ed3bc416ef73eb555134c591725f4dab7b38208e21393
4
- data.tar.gz: bb7f679c7e2cd053744d1c7d857629de1912a305880d0538bdab418de3861ba1
3
+ metadata.gz: d261c779e82209152e26decbc6c5a6c5c5ddb0fb40803884383617635727d3b2
4
+ data.tar.gz: 1b9c6fa8db7a7811cbac5a7a5db518e1f3ee75df583521b64417341e830425f4
5
5
  SHA512:
6
- metadata.gz: 91273ba54ea33c6d5cb1b4f335ad8039c35601953fdf1e6b9b2ac3117ceb25d81e9be569e5c8b70deda22b53a4c72f04d60ffb4b251badf5c5a64d13d399f36c
7
- data.tar.gz: 4e563257fcba0c9f457b363da4f43d000ee239a5179e2f965071bb3df27e362cdf7bc9950a1624520cb862acfb0fde89e96b0a4987d79a93524230c4b84619cd
6
+ metadata.gz: dcb05139042149be087b1a2c7f14a31cd5e28dedb1517aca83299f63b90046e4d05e0ab19dfaeede329e784880623abda19675252cdeaad04f8ccd87249afde5
7
+ data.tar.gz: 10fd07c322c659ae21a682832eba30416c830f9d2146af685d69168ad5137045ef4268c0a43cee4e879bb875edf900ca740bbe4cbfe8b91b34ad3df40763bce0
data/CHANGELOG.md CHANGED
@@ -1,5 +1,11 @@
1
1
  # CHANGELOG
2
2
 
3
+ ## v0.8.0
4
+
5
+ * Use number of partitions of previous step for `jobs` option by default
6
+ * Changed `combine` to receive a `collector`
7
+ * Added mandatory `concurrency` argument to `load`
8
+
3
9
  ## v0.7.0
4
10
 
5
11
  * Added a `jobs` option to the actions to limit the concurrency
data/README.md CHANGED
@@ -232,10 +232,11 @@ return the new key-value pair, but the `collector` must be used instead.
232
232
  The `jobs` argument can be useful when you need to access an external data
233
233
  source, like a relational database and you want to limit the number of workers
234
234
  accessing the store concurrently to avoid overloading it. If you don't specify
235
- it, it will be identical to the number of partitions. It is recommended to only
236
- use it for steps where you need to throttle the concurrency, because it will of
237
- course slow down the processing. The `jobs` argument only applies to the
238
- current step. The following steps don't inherit the argument, but reset it.
235
+ it, it will be identical to the number of partitions of the previous step. It
236
+ is recommended to only use it for steps where you need to throttle the
237
+ concurrency, because it will of course slow down the processing. The `jobs`
238
+ argument only applies to the current step. The following steps don't inherit
239
+ the argument, but reset it.
239
240
 
240
241
  * `map_partitions`: Maps the key value pairs to other key value pairs, but the
241
242
  block receives all data of each partition as an enumerable and sorted by key.
@@ -273,8 +274,8 @@ most of the time, this is not neccessary and the key can simply be ignored.
273
274
  passed job result are completely omitted.
274
275
 
275
276
  ```ruby
276
- job.combine(other_job, worker: MyKrapsWorker, jobs: 8) do |key, value1, value2|
277
- (value1 || {}).merge(value2 || {})
277
+ job.combine(other_job, worker: MyKrapsWorker, jobs: 8) do |key, value1, value2, collector|
278
+ collector.call(key, (value1 || {}).merge(value2 || {}))
278
279
  end
279
280
  ```
280
281
 
@@ -316,10 +317,12 @@ It creates a folder for every partition and stores one or more chunks in there.
316
317
  * `load`: Loads the previously dumped data
317
318
 
318
319
  ```ruby
319
- job.load(prefix: "path/to/dump", partitions: 32, partitioner: Kraps::HashPartitioner.new, worker: MyKrapsWorker)
320
+ job.load(prefix: "path/to/dump", partitions: 32, concurrency: 8, partitioner: Kraps::HashPartitioner.new, worker: MyKrapsWorker)
320
321
  ```
321
322
 
322
- The number of partitions and the partitioner must be specified.
323
+ The number of partitions, the partitioner and concurrency must be specified.
324
+ The concurrency specifies the number of threads used for downloading chunks in
325
+ parallel.
323
326
 
324
327
  Please note that every API method accepts a `before` callable:
325
328
 
data/lib/kraps/downloader.rb ADDED
@@ -0,0 +1,19 @@
1
+ module Kraps
2
+ class Downloader
3
+ def self.download_all(prefix:, concurrency:)
4
+ temp_paths = TempPaths.new
5
+
6
+ files = Kraps.driver.list(prefix: prefix).sort
7
+
8
+ temp_paths_index = files.each_with_object({}) do |file, hash|
9
+ hash[file] = temp_paths.add
10
+ end
11
+
12
+ Parallelizer.each(files, concurrency) do |file|
13
+ Kraps.driver.download(file, temp_paths_index[file].path)
14
+ end
15
+
16
+ temp_paths
17
+ end
18
+ end
19
+ end
data/lib/kraps/job.rb CHANGED
@@ -30,12 +30,14 @@ module Kraps
30
30
  def map(partitions: nil, partitioner: nil, jobs: nil, worker: @worker, before: nil, &block)
31
31
  fresh.tap do |job|
32
32
  job.instance_eval do
33
+ jobs = [jobs, @partitions].compact.min
34
+
33
35
  @partitions = partitions if partitions
34
36
  @partitioner = partitioner if partitioner
35
37
 
36
38
  @steps << Step.new(
37
39
  action: Actions::MAP,
38
- jobs: [jobs, @partitions].compact.min,
40
+ jobs: jobs,
39
41
  partitions: @partitions,
40
42
  partitioner: @partitioner,
41
43
  worker: worker,
@@ -49,12 +51,14 @@ module Kraps
49
51
  def map_partitions(partitions: nil, partitioner: nil, jobs: nil, worker: @worker, before: nil, &block)
50
52
  fresh.tap do |job|
51
53
  job.instance_eval do
54
+ jobs = [jobs, @partitions].compact.min
55
+
52
56
  @partitions = partitions if partitions
53
57
  @partitioner = partitioner if partitioner
54
58
 
55
59
  @steps << Step.new(
56
60
  action: Actions::MAP_PARTITIONS,
57
- jobs: [jobs, @partitions].compact.min,
61
+ jobs: jobs,
58
62
  partitions: @partitions,
59
63
  partitioner: @partitioner,
60
64
  worker: worker,
@@ -135,7 +139,7 @@ module Kraps
135
139
  end
136
140
  end
137
141
 
138
- def load(prefix:, partitions:, partitioner:, worker: @worker)
142
+ def load(prefix:, partitions:, partitioner:, concurrency:, worker: @worker)
139
143
  job = parallelize(partitions: partitions, partitioner: proc { |key, _| key }, worker: worker) do |collector|
140
144
  (0...partitions).each do |partition|
141
145
  collector.call(partition)
@@ -143,20 +147,19 @@ module Kraps
143
147
  end
144
148
 
145
149
  job.map_partitions(partitioner: partitioner, worker: worker) do |partition, _, collector|
146
- tempfile = Tempfile.new
147
-
148
- path = File.join(prefix, partition.to_s, "chunk.json")
149
- next unless Kraps.driver.exists?(path)
150
+ temp_paths = Downloader.download_all(prefix: File.join(prefix, partition.to_s, "/"), concurrency: concurrency)
150
151
 
151
- Kraps.driver.download(path, tempfile.path)
152
+ temp_paths.each do |temp_path|
153
+ File.open(temp_path.path) do |stream|
154
+ stream.each_line do |line|
155
+ key, value = JSON.parse(line)
152
156
 
153
- tempfile.each_line do |line|
154
- key, value = JSON.parse(line)
155
-
156
- collector.call(key, value)
157
+ collector.call(key, value)
158
+ end
159
+ end
157
160
  end
158
161
  ensure
159
- tempfile&.close(true)
162
+ temp_paths&.delete
160
163
  end
161
164
  end
162
165
 
data/lib/kraps/runner.rb CHANGED
@@ -100,7 +100,7 @@ module Kraps
100
100
 
101
101
  def push_and_wait(enum:, job_count: nil)
102
102
  redis_queue = RedisQueue.new(redis: Kraps.redis, token: SecureRandom.hex, namespace: Kraps.namespace, ttl: Kraps.job_ttl)
103
- progress_bar = build_progress_bar("#{@klass}: job #{@job_index + 1}/#{@jobs.size}, step #{@step_index + 1}/#{@job.steps.size}, token #{redis_queue.token}, %a, %c/%C (%p%) => #{@step.action}")
103
+ progress_bar = build_progress_bar("#{@klass}: job #{@job_index + 1}/#{@jobs.size}, step #{@step_index + 1}/#{@job.steps.size}, #{@step.jobs || "?"} jobs, token #{redis_queue.token}, %a, %c/%C (%p%) => #{@step.action}")
104
104
 
105
105
  total = 0
106
106
 
data/lib/kraps/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Kraps
2
- VERSION = "0.7.0"
2
+ VERSION = "0.8.0"
3
3
  end
data/lib/kraps/worker.rb CHANGED
@@ -157,7 +157,7 @@ module Kraps
157
157
  implementation = Object.new
158
158
  implementation.define_singleton_method(:map) do |&block|
159
159
  combine_method.call(enum1, enum2) do |key, value1, value2|
160
- block.call(key, current_step.block.call(key, value1, value2))
160
+ current_step.block.call(key, value1, value2, block)
161
161
  end
162
162
  end
163
163
 
@@ -270,19 +270,7 @@ module Kraps
270
270
  end
271
271
 
272
272
  def download_all(token:, partition:)
273
- temp_paths = TempPaths.new
274
-
275
- files = Kraps.driver.list(prefix: Kraps.driver.with_prefix("#{token}/#{partition}/")).sort
276
-
277
- temp_paths_index = files.each_with_object({}) do |file, hash|
278
- hash[file] = temp_paths.add
279
- end
280
-
281
- Parallelizer.each(files, @concurrency) do |file|
282
- Kraps.driver.download(file, temp_paths_index[file].path)
283
- end
284
-
285
- temp_paths
273
+ Downloader.download_all(prefix: Kraps.driver.with_prefix("#{token}/#{partition}/"), concurrency: @concurrency)
286
274
  end
287
275
 
288
276
  def jobs
data/lib/kraps.rb CHANGED
@@ -19,6 +19,7 @@ require_relative "kraps/runner"
19
19
  require_relative "kraps/step"
20
20
  require_relative "kraps/frame"
21
21
  require_relative "kraps/worker"
22
+ require_relative "kraps/downloader"
22
23
 
23
24
  module Kraps
24
25
  class Error < StandardError; end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kraps
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.0
4
+ version: 0.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Benjamin Vetter
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-12-02 00:00:00.000000000 Z
11
+ date: 2023-02-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: attachie
@@ -128,6 +128,7 @@ files:
128
128
  - docker-compose.yml
129
129
  - lib/kraps.rb
130
130
  - lib/kraps/actions.rb
131
+ - lib/kraps/downloader.rb
131
132
  - lib/kraps/drivers.rb
132
133
  - lib/kraps/frame.rb
133
134
  - lib/kraps/hash_partitioner.rb