kraps 0.7.0 → 0.8.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 19635ced3e745d44313ed3bc416ef73eb555134c591725f4dab7b38208e21393
- data.tar.gz: bb7f679c7e2cd053744d1c7d857629de1912a305880d0538bdab418de3861ba1
+ metadata.gz: d261c779e82209152e26decbc6c5a6c5c5ddb0fb40803884383617635727d3b2
+ data.tar.gz: 1b9c6fa8db7a7811cbac5a7a5db518e1f3ee75df583521b64417341e830425f4
  SHA512:
- metadata.gz: 91273ba54ea33c6d5cb1b4f335ad8039c35601953fdf1e6b9b2ac3117ceb25d81e9be569e5c8b70deda22b53a4c72f04d60ffb4b251badf5c5a64d13d399f36c
- data.tar.gz: 4e563257fcba0c9f457b363da4f43d000ee239a5179e2f965071bb3df27e362cdf7bc9950a1624520cb862acfb0fde89e96b0a4987d79a93524230c4b84619cd
+ metadata.gz: dcb05139042149be087b1a2c7f14a31cd5e28dedb1517aca83299f63b90046e4d05e0ab19dfaeede329e784880623abda19675252cdeaad04f8ccd87249afde5
+ data.tar.gz: 10fd07c322c659ae21a682832eba30416c830f9d2146af685d69168ad5137045ef4268c0a43cee4e879bb875edf900ca740bbe4cbfe8b91b34ad3df40763bce0
data/CHANGELOG.md CHANGED
@@ -1,5 +1,11 @@
  # CHANGELOG
 
+ ## v0.8.0
+
+ * Use number of partitions of previous step for `jobs` option by default
+ * Changed `combine` to receive a `collector`
+ * Added mandatory `concurrency` argument to `load`
+
  ## v0.7.0
 
  * Added a `jobs` option to the actions to limit the concurrency
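Note: the second and third entries are breaking changes for existing pipelines. An illustrative sketch of the call-site updates, mirroring the README changes further down; `other_job`, `MyKrapsWorker` and the merge logic are the README's own placeholders:

```ruby
# Sketch only: assumes `job` and `other_job` are existing Kraps::Job instances.

# 0.7.0: the combine block returned the combined value directly
# job.combine(other_job, worker: MyKrapsWorker) do |key, value1, value2|
#   (value1 || {}).merge(value2 || {})
# end

# 0.8.0: the block receives a collector and emits results through it
job = job.combine(other_job, worker: MyKrapsWorker) do |key, value1, value2, collector|
  collector.call(key, (value1 || {}).merge(value2 || {}))
end

# 0.8.0: load additionally requires the concurrency used for downloading chunks
job = job.load(prefix: "path/to/dump", partitions: 32, concurrency: 8, partitioner: Kraps::HashPartitioner.new, worker: MyKrapsWorker)
```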
data/README.md CHANGED
@@ -232,10 +232,11 @@ return the new key-value pair, but the `collector` must be used instead.
  The `jobs` argument can be useful when you need to access an external data
  source, like a relational database and you want to limit the number of workers
  accessing the store concurrently to avoid overloading it. If you don't specify
- it, it will be identical to the number of partitions. It is recommended to only
- use it for steps where you need to throttle the concurrency, because it will of
- course slow down the processing. The `jobs` argument only applies to the
- current step. The following steps don't inherit the argument, but reset it.
+ it, it will be identical to the number of partitions of the previous step. It
+ is recommended to only use it for steps where you need to throttle the
+ concurrency, because it will of course slow down the processing. The `jobs`
+ argument only applies to the current step. The following steps don't inherit
+ the argument, but reset it.
 
  * `map_partitions`: Maps the key value pairs to other key value pairs, but the
  block receives all data of each partition as an enumerable and sorted by key.
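For illustration, a sketch of the throttling use case described in the hunk above; the block bodies are placeholders, and the second step falls back to the new default (the previous step's partition count) because it sets no `jobs`:

```ruby
# Sketch: throttle only the step that talks to an external database.
job = job.map(jobs: 4) do |key, value, collector|
  # e.g. enrich the value from a relational database (placeholder)
  collector.call(key, value)
end

# The next step does not inherit `jobs`; it defaults to the number of
# partitions of the previous step again.
job = job.map do |key, value, collector|
  collector.call(key, value)
end
```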
@@ -273,8 +274,8 @@ most of the time, this is not neccessary and the key can simply be ignored.
  passed job result are completely omitted.
 
  ```ruby
- job.combine(other_job, worker: MyKrapsWorker, jobs: 8) do |key, value1, value2|
- (value1 || {}).merge(value2 || {})
+ job.combine(other_job, worker: MyKrapsWorker, jobs: 8) do |key, value1, value2, collector|
+ collector.call(key, (value1 || {}).merge(value2 || {}))
  end
  ```
 
@@ -316,10 +317,12 @@ It creates a folder for every partition and stores one or more chunks in there.
  * `load`: Loads the previously dumped data
 
  ```ruby
- job.load(prefix: "path/to/dump", partitions: 32, partitioner: Kraps::HashPartitioner.new, worker: MyKrapsWorker)
+ job.load(prefix: "path/to/dump", partitions: 32, concurrency: 8, partitioner: Kraps::HashPartitioner.new, worker: MyKrapsWorker)
  ```
 
- The number of partitions and the partitioner must be specified.
+ The number of partitions, the partitioner and concurrency must be specified.
+ The concurrency specifies the number of threads used for downloading chunks in
+ parallel.
 
  Please note that every API method accepts a `before` callable:
 
data/lib/kraps/downloader.rb ADDED
@@ -0,0 +1,19 @@
+ module Kraps
+   class Downloader
+     def self.download_all(prefix:, concurrency:)
+       temp_paths = TempPaths.new
+
+       files = Kraps.driver.list(prefix: prefix).sort
+
+       temp_paths_index = files.each_with_object({}) do |file, hash|
+         hash[file] = temp_paths.add
+       end
+
+       Parallelizer.each(files, concurrency) do |file|
+         Kraps.driver.download(file, temp_paths_index[file].path)
+       end
+
+       temp_paths
+     end
+   end
+ end
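The newly extracted `Kraps::Downloader` is shared by the worker and by `Job#load` (see the hunks below). A rough usage sketch; the prefix is a placeholder, and the `TempPaths` iteration and cleanup calls are assumed from how `Job#load` uses them:

```ruby
# Sketch: fetch every chunk stored below a prefix with 8 download threads.
begin
  temp_paths = Kraps::Downloader.download_all(prefix: "path/to/dump/0/", concurrency: 8)

  temp_paths.each do |temp_path|
    File.open(temp_path.path) do |stream|
      stream.each_line { |line| puts line } # process each stored line (placeholder)
    end
  end
ensure
  temp_paths&.delete # remove the temporary files again
end
```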
data/lib/kraps/job.rb CHANGED
@@ -30,12 +30,14 @@ module Kraps
  def map(partitions: nil, partitioner: nil, jobs: nil, worker: @worker, before: nil, &block)
  fresh.tap do |job|
  job.instance_eval do
+ jobs = [jobs, @partitions].compact.min
+
  @partitions = partitions if partitions
  @partitioner = partitioner if partitioner
 
  @steps << Step.new(
  action: Actions::MAP,
- jobs: [jobs, @partitions].compact.min,
+ jobs: jobs,
  partitions: @partitions,
  partitioner: @partitioner,
  worker: worker,
@@ -49,12 +51,14 @@ module Kraps
  def map_partitions(partitions: nil, partitioner: nil, jobs: nil, worker: @worker, before: nil, &block)
  fresh.tap do |job|
  job.instance_eval do
+ jobs = [jobs, @partitions].compact.min
+
  @partitions = partitions if partitions
  @partitioner = partitioner if partitioner
 
  @steps << Step.new(
  action: Actions::MAP_PARTITIONS,
- jobs: [jobs, @partitions].compact.min,
+ jobs: jobs,
  partitions: @partitions,
  partitioner: @partitioner,
  worker: worker,
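In both `#map` and `#map_partitions` the `[jobs, @partitions].compact.min` clamp now runs before `@partitions` is overwritten, so an unspecified `jobs` defaults to the previous step's partition count rather than the step's new one. A sketch with illustrative numbers (the `parallelize` block is a placeholder):

```ruby
# Previous step: 8 partitions.
job = job.parallelize(partitions: 8, partitioner: Kraps::HashPartitioner.new) do |collector|
  (0...8).each { |item| collector.call(item) }
end

# No `jobs` given: 0.8.0 clamps the default to the 8 partitions produced by the
# previous step (only 8 input partitions exist to process), whereas 0.7.0
# clamped against this step's new partition count of 64.
job = job.map(partitions: 64) do |key, value, collector|
  collector.call(key, value)
end
```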
@@ -135,7 +139,7 @@ module Kraps
  end
  end
 
- def load(prefix:, partitions:, partitioner:, worker: @worker)
+ def load(prefix:, partitions:, partitioner:, concurrency:, worker: @worker)
  job = parallelize(partitions: partitions, partitioner: proc { |key, _| key }, worker: worker) do |collector|
  (0...partitions).each do |partition|
  collector.call(partition)
@@ -143,20 +147,19 @@ module Kraps
  end
 
  job.map_partitions(partitioner: partitioner, worker: worker) do |partition, _, collector|
- tempfile = Tempfile.new
-
- path = File.join(prefix, partition.to_s, "chunk.json")
- next unless Kraps.driver.exists?(path)
+ temp_paths = Downloader.download_all(prefix: File.join(prefix, partition.to_s, "/"), concurrency: concurrency)
 
- Kraps.driver.download(path, tempfile.path)
+ temp_paths.each do |temp_path|
+ File.open(temp_path.path) do |stream|
+ stream.each_line do |line|
+ key, value = JSON.parse(line)
 
- tempfile.each_line do |line|
- key, value = JSON.parse(line)
-
- collector.call(key, value)
+ collector.call(key, value)
+ end
+ end
  end
  ensure
- tempfile&.close(true)
+ temp_paths&.delete
  end
  end
 
data/lib/kraps/runner.rb CHANGED
@@ -100,7 +100,7 @@ module Kraps
 
  def push_and_wait(enum:, job_count: nil)
  redis_queue = RedisQueue.new(redis: Kraps.redis, token: SecureRandom.hex, namespace: Kraps.namespace, ttl: Kraps.job_ttl)
- progress_bar = build_progress_bar("#{@klass}: job #{@job_index + 1}/#{@jobs.size}, step #{@step_index + 1}/#{@job.steps.size}, token #{redis_queue.token}, %a, %c/%C (%p%) => #{@step.action}")
+ progress_bar = build_progress_bar("#{@klass}: job #{@job_index + 1}/#{@jobs.size}, step #{@step_index + 1}/#{@job.steps.size}, #{@step.jobs || "?"} jobs, token #{redis_queue.token}, %a, %c/%C (%p%) => #{@step.action}")
 
  total = 0
 
data/lib/kraps/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Kraps
- VERSION = "0.7.0"
+ VERSION = "0.8.0"
  end
data/lib/kraps/worker.rb CHANGED
@@ -157,7 +157,7 @@ module Kraps
  implementation = Object.new
  implementation.define_singleton_method(:map) do |&block|
  combine_method.call(enum1, enum2) do |key, value1, value2|
- block.call(key, current_step.block.call(key, value1, value2))
+ current_step.block.call(key, value1, value2, block)
  end
  end
 
@@ -270,19 +270,7 @@ module Kraps
  end
 
  def download_all(token:, partition:)
- temp_paths = TempPaths.new
-
- files = Kraps.driver.list(prefix: Kraps.driver.with_prefix("#{token}/#{partition}/")).sort
-
- temp_paths_index = files.each_with_object({}) do |file, hash|
- hash[file] = temp_paths.add
- end
-
- Parallelizer.each(files, @concurrency) do |file|
- Kraps.driver.download(file, temp_paths_index[file].path)
- end
-
- temp_paths
+ Downloader.download_all(prefix: Kraps.driver.with_prefix("#{token}/#{partition}/"), concurrency: @concurrency)
  end
 
  def jobs
data/lib/kraps.rb CHANGED
@@ -19,6 +19,7 @@ require_relative "kraps/runner"
  require_relative "kraps/step"
  require_relative "kraps/frame"
  require_relative "kraps/worker"
+ require_relative "kraps/downloader"
 
  module Kraps
  class Error < StandardError; end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: kraps
  version: !ruby/object:Gem::Version
- version: 0.7.0
+ version: 0.8.0
  platform: ruby
  authors:
  - Benjamin Vetter
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2022-12-02 00:00:00.000000000 Z
+ date: 2023-02-13 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: attachie
@@ -128,6 +128,7 @@ files:
  - docker-compose.yml
  - lib/kraps.rb
  - lib/kraps/actions.rb
+ - lib/kraps/downloader.rb
  - lib/kraps/drivers.rb
  - lib/kraps/frame.rb
  - lib/kraps/hash_partitioner.rb