kraps 0.5.0 → 0.7.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 921ae08326c96216136418861b88af7f11bce519c924cd1813216165f7f02690
4
- data.tar.gz: '0913d31d3caeea0be664bc714e9d0da58227f515c047be31359e96040bc0c141'
3
+ metadata.gz: 19635ced3e745d44313ed3bc416ef73eb555134c591725f4dab7b38208e21393
4
+ data.tar.gz: bb7f679c7e2cd053744d1c7d857629de1912a305880d0538bdab418de3861ba1
5
5
  SHA512:
6
- metadata.gz: d8e43e5229fc310019801e62a2e278470a1eb37b50e4aca27b9c64edb6666115f0f25c7a7375790516e2726fcf10980cdac1523c54dde8d3527a39fd919a2a5a
7
- data.tar.gz: 30b1a9edcdd4f7ff476bfa4c070aef31debd727500e27a08b59f1df2663362c60e3cc3a3c860455d568abd994bb56a216f7eedf8baea6cc06ca73b1d0bdf9a07
6
+ metadata.gz: 91273ba54ea33c6d5cb1b4f335ad8039c35601953fdf1e6b9b2ac3117ceb25d81e9be569e5c8b70deda22b53a4c72f04d60ffb4b251badf5c5a64d13d399f36c
7
+ data.tar.gz: 4e563257fcba0c9f457b363da4f43d000ee239a5179e2f965071bb3df27e362cdf7bc9950a1624520cb862acfb0fde89e96b0a4987d79a93524230c4b84619cd
data/.rubocop.yml CHANGED
@@ -80,3 +80,6 @@ Style/WordArray:
80
80
 
81
81
  Style/RedundantEach:
82
82
  Enabled: false
83
+
84
+ Lint/NonLocalExitFromIterator:
85
+ Enabled: false
data/CHANGELOG.md CHANGED
@@ -1,5 +1,19 @@
1
1
  # CHANGELOG
2
2
 
3
+ ## v0.7.0
4
+
5
+ * Added a `jobs` option to the actions to limit the concurrency
6
+ when e.g. accessing external data stores and to avoid overloading
7
+ them
8
+ * Added a queue using redis for the jobs to avoid starving workers
9
+ * Removed `distributed_job` dependency
10
+
11
+ ## v0.6.0
12
+
13
+ * Added `map_partitions`
14
+ * Added `combine`
15
+ * Added `dump` and `load`
16
+
3
17
  ## v0.5.0
4
18
 
5
19
  * Added a `before` option to specify a callable to run before
data/Gemfile.lock CHANGED
@@ -1,9 +1,8 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kraps (0.5.0)
4
+ kraps (0.7.0)
5
5
  attachie
6
- distributed_job
7
6
  map-reduce-ruby (>= 3.0.0)
8
7
  redis
9
8
  ruby-progressbar
@@ -41,8 +40,6 @@ GEM
41
40
  concurrent-ruby (1.1.10)
42
41
  connection_pool (2.3.0)
43
42
  diff-lcs (1.5.0)
44
- distributed_job (3.1.0)
45
- redis (>= 4.1.0)
46
43
  i18n (1.12.0)
47
44
  concurrent-ruby (~> 1.0)
48
45
  jmespath (1.6.1)
@@ -62,7 +59,7 @@ GEM
62
59
  rake (13.0.6)
63
60
  redis (5.0.5)
64
61
  redis-client (>= 0.9.0)
65
- redis-client (0.11.1)
62
+ redis-client (0.11.2)
66
63
  connection_pool
67
64
  regexp_parser (2.6.0)
68
65
  rexml (3.2.5)
data/README.md CHANGED
@@ -3,11 +3,12 @@
3
3
  **Easily process big data in ruby**
4
4
 
5
5
  Kraps allows to process and perform calculations on very large datasets in
6
- parallel using a map/reduce framework and runs on a background job framework
7
- you already have. You just need some space on your filesystem, S3 as a storage
8
- layer with temporary lifecycle policy enabled, the already mentioned background
9
- job framework (like sidekiq, shoryuken, etc) and redis to keep track of the
10
- progress. Most things you most likely already have in place anyways.
6
+ parallel using a map/reduce framework similar to [spark](https://spark.apache.org/),
7
+ but runs on a background job framework you already have. You just need some
8
+ space on your filesystem, S3 as a storage layer with temporary lifecycle policy
9
+ enabled, the already mentioned background job framework (like sidekiq,
10
+ shoryuken, etc) and redis to keep track of the progress. Most things you most
11
+ likely already have in place anyways.
11
12
 
12
13
  ## Installation
13
14
 
@@ -29,7 +30,7 @@ Kraps.configure(
29
30
  driver: Kraps::Drivers::S3Driver.new(s3_client: Aws::S3::Client.new("..."), bucket: "some-bucket", prefix: "temp/kraps/"),
30
31
  redis: Redis.new,
31
32
  namespace: "my-application", # An optional namespace to be used for redis keys, default: nil
32
- job_ttl: 24.hours, # Job information in redis will automatically be removed after this amount of time, default: 24 hours
33
+ job_ttl: 7.days, # Job information in redis will automatically be removed after this amount of time, default: 4 days
33
34
  show_progress: true # Whether or not to show the progress in the terminal when executing jobs, default: true
34
35
  enqueuer: ->(worker, json) { worker.perform_async(json) } # Allows to customize the enqueueing of worker jobs
35
36
  )
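
The `enqueuer` is what hands the serialized payload to your background job framework. A rough sketch of customizing it, assuming sidekiq is used (the queue name `kraps` is just an example):

```ruby
Kraps.configure(
  driver: Kraps::Drivers::S3Driver.new(s3_client: Aws::S3::Client.new("..."), bucket: "some-bucket", prefix: "temp/kraps/"),
  redis: Redis.new,
  # Push Kraps jobs to a dedicated queue instead of the worker's default queue
  enqueuer: ->(worker, json) { worker.set(queue: "kraps").perform_async(json) }
)
```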
@@ -115,13 +116,13 @@ be able to give 300-400 megabytes to Kraps then, but now divide this by 10 and
115
116
  specify a `memory_limit` of around `30.megabytes`, better less. The
116
117
  `memory_limit` affects how much chunks will be written to disk depending on the
117
118
  data size you are processing and how big these chunks are. The smaller the
118
- value, the more chunks and the more chunks, the more runs Kraps need to merge
119
- the chunks. It can affect the performance The `chunk_limit` ensures that only
120
- the specified amount of chunks are processed in a single run. A run basically
121
- means: it takes up to `chunk_limit` chunks, reduces them and pushes the result
122
- as a new chunk to the list of chunks to process. Thus, if your number of file
123
- descriptors is unlimited, you want to set it to a higher number to avoid the
124
- overhead of multiple runs. `concurrency` tells Kraps how much threads to use to
119
+ value, the more chunks. The more chunks, the more runs Kraps needs to merge
120
+ the chunks. The `chunk_limit` ensures that only the specified amount of chunks
121
+ are processed in a single run. A run basically means: it takes up to
122
+ `chunk_limit` chunks, reduces them and pushes the result as a new chunk to the
123
+ list of chunks to process. Thus, if your number of file descriptors is
124
+ unlimited, you want to set it to a higher number to avoid the overhead of
125
+ multiple runs. `concurrency` tells Kraps how many threads to use to
125
126
  concurrently upload/download files from the storage layer. Finally, `retries`
126
127
  specifies how often Kraps should retry the job step in case of errors. Kraps
127
128
  will sleep for 5 seconds between those retries. Please note that it's not yet
@@ -130,7 +131,6 @@ Kraps. Please note, however, that `parallelize` is not covered by `retries`
130
131
  yet, as the block passed to `parallelize` is executed by the runner, not the
131
132
  workers.
132
133
 
133
-
134
134
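
A minimal sketch of how these options fit together in a worker, assuming sidekiq and purely illustrative values for the limits:

```ruby
class MyKrapsWorker
  include Sidekiq::Worker

  def perform(json)
    # memory_limit, chunk_limit and concurrency are example values, see above
    Kraps::Worker.new(json, memory_limit: 32.megabytes, chunk_limit: 64, concurrency: 8).call(retries: 3)
  end
end
```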
  Now, executing your job is super easy:
135
135
 
136
136
  ```ruby
@@ -182,11 +182,11 @@ https://github.com/mrkamel/map-reduce-ruby/#limitations-for-keys
182
182
  ## Storage
183
183
 
184
184
  Kraps stores temporary results of steps in a storage layer. Currently, only S3
185
- is supported besides a in memory driver used for testing purposes. Please be
185
+ is supported besides an in-memory driver used for testing purposes. Please be
186
186
  aware that Kraps does not clean up any files from the storage layer, as it
187
- would be a safe thing to do in case of errors anyways. Instead, Kraps relies on
188
- lifecycle features of modern object storage systems. Therefore, it is recommend
189
- to e.g. configure a lifecycle policy to delete any files after e.g. 7 days
187
+ would not be a safe thing to do in case of errors anyways. Instead, Kraps
188
+ relies on lifecycle features of modern object storage systems. Therefore, it is
189
+ required to configure a lifecycle policy to delete any files after e.g. 7 days
190
190
  either for a whole bucket or for a certain prefix like e.g. `temp/` and tell
191
191
  Kraps about the prefix to use (e.g. `temp/kraps/`).
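
A hedged sketch of such a lifecycle rule using the aws-sdk-s3 gem (bucket name, prefix and the 7 day expiry are example values; the same rule can be created via the AWS console, terraform, etc.):

```ruby
Aws::S3::Client.new.put_bucket_lifecycle_configuration(
  bucket: "some-bucket",
  lifecycle_configuration: {
    rules: [
      {
        id: "expire-kraps-temp-files",
        status: "Enabled",
        filter: { prefix: "temp/" },  # only expire objects below the prefix
        expiration: { days: 7 }       # delete them after 7 days
      }
    ]
  }
)
```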
192
192
 
@@ -220,7 +220,7 @@ items are used as keys and the values are set to `nil`.
220
220
  * `map`: Maps the key value pairs to other key value pairs
221
221
 
222
222
  ```ruby
223
- job.map(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker) do |key, value, collector|
223
+ job.map(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker, jobs: 8) do |key, value, collector|
224
224
  collector.call("changed #{key}", "changed #{value}")
225
225
  end
226
226
  ```
@@ -229,10 +229,31 @@ The block gets each key-value pair passed and the `collector` block can be
229
229
  called as often as neccessary. This is also the reason why `map` can not simply
230
230
  return the new key-value pair, but the `collector` must be used instead.
231
231
 
232
+ The `jobs` argument can be useful when you need to access an external data
233
+ source, like a relational database, and you want to limit the number of workers
234
+ accessing the store concurrently to avoid overloading it. If you don't specify
235
+ it, it defaults to the number of partitions. It is recommended to only
236
+ use it for steps where you need to throttle the concurrency, because it will of
237
+ course slow down the processing. The `jobs` argument only applies to the
238
+ current step. The following steps don't inherit the argument, but reset it.
239
+
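
As an illustration (the `ExternalStore` lookup is made up for the example), only the step talking to the external store is throttled, while the following step falls back to one job per partition:

```ruby
# At most 4 workers hit the external store at the same time in this step
job = job.map(jobs: 4) do |key, value, collector|
  collector.call(key, ExternalStore.lookup(value)) # hypothetical external lookup
end

# The reduce step does not inherit jobs: 4
job = job.reduce do |_key, value1, value2|
  value1 + value2
end
```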
240
+ * `map_partitions`: Maps the key value pairs to other key value pairs, but the
241
+ block receives all data of each partition as an enumerable, sorted by key.
242
+ Please be aware that you should not call `to_a` or similar on the enumerable.
243
+ Prefer `map` over `map_partitions` when possible.
244
+
245
+ ```ruby
246
+ job.map_partitions(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker, jobs: 8) do |pairs, collector|
247
+ pairs.each do |key, value|
248
+ collector.call("changed #{key}", "changed #{value}")
249
+ end
250
+ end
251
+ ```
252
+
232
253
  * `reduce`: Reduces the values of pairs having the same key
233
254
 
234
255
  ```ruby
235
- job.reduce(worker: MyKrapsWorker) do |key, value1, value2|
256
+ job.reduce(worker: MyKrapsWorker, jobs: 8) do |key, value1, value2|
236
257
  value1 + value2
237
258
  end
238
259
  ```
@@ -245,26 +266,61 @@ The `key` itself is also passed to the block for the case that you need to
245
266
  customize the reduce calculation according to the value of the key. However,
246
267
  most of the time, this is not neccessary and the key can simply be ignored.
247
268
 
269
+ * `combine`: Combines the results of 2 jobs by combining every key available
270
+ in the current job result with the corresponding key from the passed job
271
+ result. When the passed job result does not have the corresponding key,
272
+ `nil` will be passed to the block. Keys which are only available in the
273
+ passed job result are completely omitted.
274
+
275
+ ```ruby
276
+ job.combine(other_job, worker: MyKrapsWorker, jobs: 8) do |key, value1, value2|
277
+ (value1 || {}).merge(value2 || {})
278
+ end
279
+ ```
280
+
281
+ Please note that the keys, partitioners and the number of partitions must match
282
+ for the jobs to be combined. Further note that the results of `other_job` must
283
+ be reduced, meaning that every key must be unique. Finally, `other_job` does
284
+ not necessarily need to be listed in the array of jobs returned by the `call` method,
285
+ since Kraps detects the dependency on its own.
286
+
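
A small worked example of these semantics (the values are invented for illustration):

```ruby
# current job (reduced):  { "a" => 1, "b" => 2 }
# other_job   (reduced):  { "a" => 10, "c" => 30 }
#
# the combine block is called with:
#   ("a", 1, 10)
#   ("b", 2, nil)  # keys missing from other_job yield nil
#   "c" is omitted, because it only exists in other_job
```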
248
287
  * `repartition`: Used to change the partitioning
249
288
 
250
289
  ```ruby
251
- job.repartition(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker)
290
+ job.repartition(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker, jobs: 8)
252
291
  ```
253
292
 
254
293
  Repartitions all data into the specified number of partitions and using the
255
294
  specified partitioner.
256
295
 
257
296
  * `each_partition`: Passes the partition number and all data of each partition
258
- as a lazy enumerable
297
+ as an enumerable, sorted by key. Please be aware that you should not call
298
+ `to_a` or similar on the enumerable.
259
299
 
260
300
  ```ruby
261
- job.each_partition do |partition, pairs|
301
+ job.each_partition(jobs: 8) do |partition, pairs|
262
302
  pairs.each do |key, value|
263
303
  # ...
264
304
  end
265
305
  end
266
306
  ```
267
307
 
308
+ * `dump`: Stores all current data per partition under the specified prefix
309
+
310
+ ```ruby
311
+ job.dump(prefix: "path/to/dump", worker: MyKrapsWorker)
312
+ ```
313
+
314
+ It creates a folder for every partition and stores one or more chunks in there.
315
+
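
Assuming a prefix of `path/to/dump`, the resulting layout on the storage layer looks roughly like this (one folder per partition):

```
path/to/dump/0/chunk.json
path/to/dump/1/chunk.json
path/to/dump/2/chunk.json
...
```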
316
+ * `load`: Loads the previously dumped data
317
+
318
+ ```ruby
319
+ job.load(prefix: "path/to/dump", partitions: 32, partitioner: Kraps::HashPartitioner.new, worker: MyKrapsWorker)
320
+ ```
321
+
322
+ The number of partitions and the partitioner must be specified.
323
+
268
324
  Please note that every API method accepts a `before` callable:
269
325
 
270
326
  ```ruby
@@ -326,13 +382,58 @@ When you execute the job, Kraps will execute the jobs one after another and as
326
382
  the jobs build up on each other, Kraps will execute the steps shared by both
327
383
  jobs only once.
328
384
 
385
+ ## Testing
386
+
387
+ Kraps ships with an in-memory fake driver for storage, which you can use for
388
+ testing purposes instead of the s3 driver:
389
+
390
+ ```ruby
391
+ Kraps.configure(
392
+ driver: Kraps::Drivers::FakeDriver.new(bucket: "kraps"),
393
+ # ...
394
+ )
+ ```
395
+
396
+ This is of course much faster than using s3 or some s3 compatible service.
397
+ Moreover, when testing large Kraps jobs you may want to test intermediate
398
+ steps. You can use `#dump` for this purpose and test that the data dumped is
399
+ correct.
400
+
401
+ ```ruby
402
+ job = job.dump(prefix: "path/to/dump")
403
+ ```
404
+
405
+ and in your tests do
406
+
407
+ ```ruby
408
+ Kraps.driver.value("path/to/dump/0/chunk.json") # => data of partition 0
409
+ Kraps.driver.value("path/to/dump/1/chunk.json") # => data of partition 1
410
+ # ...
411
+ ```
412
+
413
+ The data is stored in lines; each line is a JSON encoded array of key and
414
+ value.
415
+
416
+ ```ruby
417
+ data = Kraps.driver.value("path/to/dump/0/chunk.json").lines.map do |line|
418
+ JSON.parse(line) # => [key, value]
419
+ end
420
+ ```
421
+
422
+ The API of the driver is:
423
+
424
+ * `store(name, data_or_io, options = {})`: Stores `data_or_io` as `name`
425
+ * `list(prefix: nil)`: Lists all objects or all objects matching the `prefix`
426
+ * `value(name)`: Returns the object content of `name`
427
+ * `download(name, path)`: Downloads the object `name` to `path` in your
428
+ filesystem
429
+ * `exists?(name)`: Returns `true`/`false`
430
+ * `flush`: Removes all objects from the fake storage
431
+
329
432
  ## Dependencies
330
433
 
331
434
  Kraps is built on top of
332
435
  [map-reduce-ruby](https://github.com/mrkamel/map-reduce-ruby) for the
333
436
  map/reduce framework,
334
- [distributed_job](https://github.com/mrkamel/distributed_job)
335
- to keep track of the job/step status,
336
437
  [attachie](https://github.com/mrkamel/attachie) to interact with the storage
337
438
  layer (s3),
338
439
  [ruby-progressbar](https://github.com/jfelchner/ruby-progressbar) to
data/docker-compose.yml CHANGED
@@ -1,6 +1,6 @@
1
1
  version: '2'
2
2
  services:
3
- elasticsearch:
3
+ redis:
4
4
  image: redis
5
5
  ports:
6
6
  - 6379:6379
data/lib/kraps/actions.rb CHANGED
@@ -3,7 +3,9 @@ module Kraps
3
3
  ALL = [
4
4
  PARALLELIZE = "parallelize",
5
5
  MAP = "map",
6
+ MAP_PARTITIONS = "map_partitions",
6
7
  REDUCE = "reduce",
8
+ COMBINE = "combine",
7
9
  EACH_PARTITION = "each_partition"
8
10
  ]
9
11
  end
data/lib/kraps/drivers.rb CHANGED
@@ -8,6 +8,26 @@ module Kraps
8
8
  def with_prefix(path)
9
9
  File.join(*[@prefix, path].compact)
10
10
  end
11
+
12
+ def list(prefix: nil)
13
+ driver.list(bucket, prefix: prefix)
14
+ end
15
+
16
+ def value(name)
17
+ driver.value(name, bucket)
18
+ end
19
+
20
+ def download(name, path)
21
+ driver.download(name, bucket, path)
22
+ end
23
+
24
+ def exists?(name)
25
+ driver.exists?(name, bucket)
26
+ end
27
+
28
+ def store(name, data_or_io, options = {})
29
+ driver.store(name, data_or_io, bucket, options)
30
+ end
11
31
  end
12
32
 
13
33
  class S3Driver
@@ -32,6 +52,10 @@ module Kraps
32
52
  @bucket = bucket
33
53
  @prefix = prefix
34
54
  end
55
+
56
+ def flush
57
+ driver.flush
58
+ end
35
59
  end
36
60
  end
37
61
  end
data/lib/kraps/job.rb CHANGED
@@ -27,7 +27,7 @@ module Kraps
27
27
  end
28
28
  end
29
29
 
30
- def map(partitions: nil, partitioner: nil, worker: @worker, before: nil, &block)
30
+ def map(partitions: nil, partitioner: nil, jobs: nil, worker: @worker, before: nil, &block)
31
31
  fresh.tap do |job|
32
32
  job.instance_eval do
33
33
  @partitions = partitions if partitions
@@ -35,6 +35,7 @@ module Kraps
35
35
 
36
36
  @steps << Step.new(
37
37
  action: Actions::MAP,
38
+ jobs: [jobs, @partitions].compact.min,
38
39
  partitions: @partitions,
39
40
  partitioner: @partitioner,
40
41
  worker: worker,
@@ -45,11 +46,31 @@ module Kraps
45
46
  end
46
47
  end
47
48
 
48
- def reduce(worker: @worker, before: nil, &block)
49
+ def map_partitions(partitions: nil, partitioner: nil, jobs: nil, worker: @worker, before: nil, &block)
50
+ fresh.tap do |job|
51
+ job.instance_eval do
52
+ @partitions = partitions if partitions
53
+ @partitioner = partitioner if partitioner
54
+
55
+ @steps << Step.new(
56
+ action: Actions::MAP_PARTITIONS,
57
+ jobs: [jobs, @partitions].compact.min,
58
+ partitions: @partitions,
59
+ partitioner: @partitioner,
60
+ worker: worker,
61
+ before: before,
62
+ block: block
63
+ )
64
+ end
65
+ end
66
+ end
67
+
68
+ def reduce(jobs: nil, worker: @worker, before: nil, &block)
49
69
  fresh.tap do |job|
50
70
  job.instance_eval do
51
71
  @steps << Step.new(
52
72
  action: Actions::REDUCE,
73
+ jobs: [jobs, @partitions].compact.min,
53
74
  partitions: @partitions,
54
75
  partitioner: @partitioner,
55
76
  worker: worker,
@@ -60,11 +81,30 @@ module Kraps
60
81
  end
61
82
  end
62
83
 
63
- def each_partition(worker: @worker, before: nil, &block)
84
+ def combine(other_job, jobs: nil, worker: @worker, before: nil, &block)
85
+ fresh.tap do |job|
86
+ job.instance_eval do
87
+ @steps << Step.new(
88
+ action: Actions::COMBINE,
89
+ jobs: [jobs, @partitions].compact.min,
90
+ partitions: @partitions,
91
+ partitioner: @partitioner,
92
+ worker: worker,
93
+ before: before,
94
+ block: block,
95
+ dependency: other_job,
96
+ options: { combine_step_index: other_job.steps.size - 1 }
97
+ )
98
+ end
99
+ end
100
+ end
101
+
102
+ def each_partition(jobs: nil, worker: @worker, before: nil, &block)
64
103
  fresh.tap do |job|
65
104
  job.instance_eval do
66
105
  @steps << Step.new(
67
106
  action: Actions::EACH_PARTITION,
107
+ jobs: [jobs, @partitions].compact.min,
68
108
  partitions: @partitions,
69
109
  partitioner: @partitioner,
70
110
  worker: worker,
@@ -75,12 +115,51 @@ module Kraps
75
115
  end
76
116
  end
77
117
 
78
- def repartition(partitions:, partitioner: nil, worker: @worker, before: nil)
79
- map(partitions: partitions, partitioner: partitioner, worker: worker, before: before) do |key, value, collector|
118
+ def repartition(partitions:, jobs: nil, partitioner: nil, worker: @worker, before: nil)
119
+ map(jobs: jobs, partitions: partitions, partitioner: partitioner, worker: worker, before: before) do |key, value, collector|
80
120
  collector.call(key, value)
81
121
  end
82
122
  end
83
123
 
124
+ def dump(prefix:, worker: @worker)
125
+ each_partition(worker: worker) do |partition, pairs|
126
+ tempfile = Tempfile.new
127
+
128
+ pairs.each do |pair|
129
+ tempfile.puts(JSON.generate(pair))
130
+ end
131
+
132
+ Kraps.driver.store(File.join(prefix, partition.to_s, "chunk.json"), tempfile.tap(&:rewind))
133
+ ensure
134
+ tempfile&.close(true)
135
+ end
136
+ end
137
+
138
+ def load(prefix:, partitions:, partitioner:, worker: @worker)
139
+ job = parallelize(partitions: partitions, partitioner: proc { |key, _| key }, worker: worker) do |collector|
140
+ (0...partitions).each do |partition|
141
+ collector.call(partition)
142
+ end
143
+ end
144
+
145
+ job.map_partitions(partitioner: partitioner, worker: worker) do |partition, _, collector|
146
+ tempfile = Tempfile.new
147
+
148
+ path = File.join(prefix, partition.to_s, "chunk.json")
149
+ next unless Kraps.driver.exists?(path)
150
+
151
+ Kraps.driver.download(path, tempfile.path)
152
+
153
+ tempfile.each_line do |line|
154
+ key, value = JSON.parse(line)
155
+
156
+ collector.call(key, value)
157
+ end
158
+ ensure
159
+ tempfile&.close(true)
160
+ end
161
+ end
162
+
84
163
  def fresh
85
164
  dup.tap do |job|
86
165
  job.instance_variable_set(:@steps, @steps.dup)
@@ -0,0 +1,13 @@
1
+ module Kraps
2
+ class JobResolver
3
+ def call(jobs)
4
+ resolve_dependencies(Array(jobs)).uniq
5
+ end
6
+
7
+ private
8
+
9
+ def resolve_dependencies(jobs)
10
+ jobs.map { |job| [resolve_dependencies(job.steps.map(&:dependency).compact), job] }.flatten
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,151 @@
1
+ module Kraps
2
+ class RedisQueue
3
+ VISIBILITY_TIMEOUT = 60
4
+
5
+ attr_reader :token
6
+
7
+ def initialize(redis:, token:, namespace:, ttl:)
8
+ @redis = redis
9
+ @token = token
10
+ @namespace = namespace
11
+ @ttl = ttl
12
+ end
13
+
14
+ def size
15
+ @size_script ||= <<~SCRIPT
16
+ local queue_key, pending_key, status_key, ttl, job = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4]), ARGV[5]
17
+
18
+ redis.call('expire', queue_key, ttl)
19
+ redis.call('expire', pending_key, ttl)
20
+ redis.call('expire', status_key, ttl)
21
+
22
+ return redis.call('llen', queue_key) + redis.call('zcard', pending_key)
23
+ SCRIPT
24
+
25
+ @redis.eval(@size_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl])
26
+ end
27
+
28
+ def enqueue(payload)
29
+ @enqueue_script ||= <<~SCRIPT
30
+ local queue_key, pending_key, status_key, ttl, job = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4]), ARGV[5]
31
+
32
+ redis.call('rpush', queue_key, job)
33
+
34
+ redis.call('expire', queue_key, ttl)
35
+ redis.call('expire', pending_key, ttl)
36
+ redis.call('expire', status_key, ttl)
37
+ SCRIPT
38
+
39
+ @redis.eval(@enqueue_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl, JSON.generate(payload)])
40
+ end
41
+
42
+ def dequeue
43
+ @dequeue_script ||= <<~SCRIPT
44
+ local queue_key, pending_key, status_key, ttl, visibility_timeout = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4]), tonumber(ARGV[5])
45
+
46
+ local zitem = redis.call('zrange', pending_key, 0, 0, 'WITHSCORES')
47
+ local job = zitem[1]
48
+
49
+ if not zitem[2] or tonumber(zitem[2]) > tonumber(redis.call('time')[1]) then
50
+ job = redis.call('lpop', queue_key)
51
+ end
52
+
53
+ redis.call('expire', queue_key, ttl)
54
+ redis.call('expire', pending_key, ttl)
55
+ redis.call('expire', status_key, ttl)
56
+
57
+ if not job then return nil end
58
+
59
+ redis.call('zadd', pending_key, tonumber(redis.call('time')[1]) + visibility_timeout, job)
60
+ redis.call('expire', pending_key, ttl)
61
+
62
+ return job
63
+ SCRIPT
64
+
65
+ job = @redis.eval(@dequeue_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl, VISIBILITY_TIMEOUT])
66
+
67
+ unless job
68
+ yield(nil)
69
+ return
70
+ end
71
+
72
+ keep_alive(job) do
73
+ yield(JSON.parse(job)) if job
74
+ end
75
+
76
+ @remove_script ||= <<~SCRIPT
77
+ local queue_key, pending_key, status_key, ttl, job = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4]), ARGV[5]
78
+
79
+ redis.call('zrem', pending_key, job)
80
+
81
+ redis.call('expire', queue_key, ttl)
82
+ redis.call('expire', pending_key, ttl)
83
+ redis.call('expire', status_key, ttl)
84
+ SCRIPT
85
+
86
+ @redis.eval(@remove_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl, job])
87
+ end
88
+
89
+ def stop
90
+ @stop_script ||= <<~SCRIPT
91
+ local queue_key, pending_key, status_key, ttl = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4])
92
+
93
+ redis.call('hset', status_key, 'stopped', 1)
94
+
95
+ redis.call('expire', queue_key, ttl)
96
+ redis.call('expire', pending_key, ttl)
97
+ redis.call('expire', status_key, ttl)
98
+ SCRIPT
99
+
100
+ @redis.eval(@stop_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl])
101
+ end
102
+
103
+ def stopped?
104
+ @stopped_script ||= <<~SCRIPT
105
+ local queue_key, pending_key, status_key, ttl = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4])
106
+
107
+ redis.call('expire', queue_key, ttl)
108
+ redis.call('expire', pending_key, ttl)
109
+ redis.call('expire', status_key, ttl)
110
+
111
+ return redis.call('hget', status_key, 'stopped')
112
+ SCRIPT
113
+
114
+ @redis.eval(@stopped_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl]).to_i == 1
115
+ end
116
+
117
+ private
118
+
119
+ def keep_alive(job)
120
+ @keep_alive_script ||= <<~SCRIPT
121
+ local queue_key, pending_key, status_key, ttl, job, visibility_timeout = ARGV[1], ARGV[2], ARGV[3], tonumber(ARGV[4]), ARGV[5], tonumber(ARGV[6])
122
+
123
+ redis.call('zadd', pending_key, tonumber(redis.call('time')[1]) + visibility_timeout, job)
124
+
125
+ redis.call('expire', queue_key, ttl)
126
+ redis.call('expire', pending_key, ttl)
127
+ redis.call('expire', status_key, ttl)
128
+ SCRIPT
129
+
130
+ interval = Interval.new(5) do
131
+ @redis.eval(@keep_alive_script, argv: [redis_queue_key, redis_pending_key, redis_status_key, @ttl, job, VISIBILITY_TIMEOUT])
132
+ end
133
+
134
+ yield
135
+ ensure
136
+ interval&.stop
137
+ end
138
+
139
+ def redis_queue_key
140
+ [@namespace, "kraps", "queue", @token].compact.join(":")
141
+ end
142
+
143
+ def redis_pending_key
144
+ [@namespace, "kraps", "pending", @token].compact.join(":")
145
+ end
146
+
147
+ def redis_status_key
148
+ [@namespace, "kraps", "status", @token].compact.join(":")
149
+ end
150
+ end
151
+ end
data/lib/kraps/runner.rb CHANGED
@@ -5,7 +5,7 @@ module Kraps
5
5
  end
6
6
 
7
7
  def call(*args, **kwargs)
8
- Array(@klass.new.call(*args, **kwargs)).tap do |jobs|
8
+ JobResolver.new.call(@klass.new.call(*args, **kwargs)).tap do |jobs|
9
9
  jobs.each_with_index do |job, job_index|
10
10
  job.steps.each_with_index.inject(nil) do |frame, (_, step_index)|
11
11
  StepRunner.new(
@@ -45,107 +45,101 @@ module Kraps
45
45
 
46
46
  def perform_parallelize
47
47
  enum = Enumerator.new do |yielder|
48
- collector = proc { |item| yielder << item }
48
+ collector = proc { |item| yielder << { item: item } }
49
49
 
50
50
  @step.block.call(collector)
51
51
  end
52
52
 
53
- with_distributed_job do |distributed_job|
54
- push_and_wait(distributed_job, enum) do |item, part|
55
- enqueue(token: distributed_job.token, part: part, item: item)
56
- end
53
+ token = push_and_wait(enum: enum)
57
54
 
58
- Frame.new(token: distributed_job.token, partitions: @step.partitions)
59
- end
55
+ Frame.new(token: token, partitions: @step.partitions)
60
56
  end
61
57
 
62
58
  def perform_map
63
- with_distributed_job do |distributed_job|
64
- push_and_wait(distributed_job, 0...@frame.partitions) do |partition, part|
65
- enqueue(token: distributed_job.token, part: part, partition: partition)
66
- end
59
+ enum = (0...@frame.partitions).map { |partition| { partition: partition } }
60
+ token = push_and_wait(job_count: @step.jobs, enum: enum)
67
61
 
68
- Frame.new(token: distributed_job.token, partitions: @step.partitions)
69
- end
62
+ Frame.new(token: token, partitions: @step.partitions)
63
+ end
64
+
65
+ def perform_map_partitions
66
+ enum = (0...@frame.partitions).map { |partition| { partition: partition } }
67
+ token = push_and_wait(job_count: @step.jobs, enum: enum)
68
+
69
+ Frame.new(token: token, partitions: @step.partitions)
70
70
  end
71
71
 
72
72
  def perform_reduce
73
- with_distributed_job do |distributed_job|
74
- push_and_wait(distributed_job, 0...@frame.partitions) do |partition, part|
75
- enqueue(token: distributed_job.token, part: part, partition: partition)
76
- end
73
+ enum = (0...@frame.partitions).map { |partition| { partition: partition } }
74
+ token = push_and_wait(job_count: @step.jobs, enum: enum)
77
75
 
78
- Frame.new(token: distributed_job.token, partitions: @step.partitions)
79
- end
76
+ Frame.new(token: token, partitions: @step.partitions)
80
77
  end
81
78
 
82
- def perform_each_partition
83
- with_distributed_job do |distributed_job|
84
- push_and_wait(distributed_job, 0...@frame.partitions) do |partition, part|
85
- enqueue(token: distributed_job.token, part: part, partition: partition)
86
- end
79
+ def perform_combine
80
+ combine_job = @step.dependency
81
+ combine_step = combine_job.steps[@step.options[:combine_step_index]]
82
+
83
+ raise(IncompatibleFrame, "Incompatible number of partitions") if combine_step.partitions != @step.partitions
87
84
 
88
- @frame
85
+ enum = (0...@frame.partitions).map do |partition|
86
+ { partition: partition, combine_frame: combine_step.frame.to_h }
89
87
  end
90
- end
91
88
 
92
- def enqueue(token:, part:, **rest)
93
- Kraps.enqueuer.call(
94
- @step.worker,
95
- JSON.generate(
96
- job_index: @job_index,
97
- step_index: @step_index,
98
- frame: @frame.to_h,
99
- token: token,
100
- part: part,
101
- klass: @klass,
102
- args: @args,
103
- kwargs: @kwargs,
104
- **rest
105
- )
106
- )
89
+ token = push_and_wait(job_count: @step.jobs, enum: enum)
90
+
91
+ Frame.new(token: token, partitions: @step.partitions)
107
92
  end
108
93
 
109
- def with_distributed_job
110
- distributed_job = Kraps.distributed_job_client.build(token: SecureRandom.hex)
94
+ def perform_each_partition
95
+ enum = (0...@frame.partitions).map { |partition| { partition: partition } }
96
+ push_and_wait(job_count: @step.jobs, enum: enum)
111
97
 
112
- yield(distributed_job)
113
- rescue Interrupt
114
- distributed_job&.stop
115
- raise
98
+ @frame
116
99
  end
117
100
 
118
- def push_and_wait(distributed_job, enum)
119
- progress_bar = build_progress_bar("#{@klass}: job #{@job_index + 1}/#{@jobs.size}, step #{@step_index + 1}/#{@job.steps.size}, token #{distributed_job.token}, %a, %c/%C (%p%) => #{@step.action}")
101
+ def push_and_wait(enum:, job_count: nil)
102
+ redis_queue = RedisQueue.new(redis: Kraps.redis, token: SecureRandom.hex, namespace: Kraps.namespace, ttl: Kraps.job_ttl)
103
+ progress_bar = build_progress_bar("#{@klass}: job #{@job_index + 1}/#{@jobs.size}, step #{@step_index + 1}/#{@job.steps.size}, token #{redis_queue.token}, %a, %c/%C (%p%) => #{@step.action}")
120
104
 
121
- begin
122
- total = 0
105
+ total = 0
123
106
 
124
- interval = Interval.new(1) do
125
- progress_bar.total = total
126
- end
107
+ interval = Interval.new(1) do
108
+ # The interval is used to continuously update the progress bar even
109
+ # when push_all is used and to avoid sessions being terminated due
110
+ # to inactivity etc
127
111
 
128
- distributed_job.push_each(enum) do |item, part|
129
- total += 1
130
- interval.fire(timeout: 1)
112
+ progress_bar.total = total
113
+ progress_bar.progress = [progress_bar.total - redis_queue.size, 0].max
114
+ end
131
115
 
132
- yield(item, part)
133
- end
134
- ensure
135
- interval&.stop
116
+ enum.each_with_index do |item, part|
117
+ total += 1
118
+
119
+ redis_queue.enqueue(item.merge(part: part))
136
120
  end
137
121
 
138
- loop do
139
- progress_bar.total = distributed_job.total
140
- progress_bar.progress = progress_bar.total - distributed_job.count
122
+ (job_count || total).times do
123
+ break if redis_queue.stopped?
124
+
125
+ Kraps.enqueuer.call(@step.worker, JSON.generate(job_index: @job_index, step_index: @step_index, frame: @frame.to_h, token: redis_queue.token, klass: @klass, args: @args, kwargs: @kwargs))
126
+ end
141
127
 
142
- break if distributed_job.finished? || distributed_job.stopped?
128
+ loop do
129
+ break if redis_queue.size.zero?
130
+ break if redis_queue.stopped?
143
131
 
144
132
  sleep(1)
145
133
  end
146
134
 
147
- raise(JobStopped, "The job was stopped") if distributed_job.stopped?
135
+ raise(JobStopped, "The job was stopped") if redis_queue.stopped?
136
+
137
+ interval.fire(timeout: 1)
138
+
139
+ redis_queue.token
148
140
  ensure
141
+ redis_queue&.stop
142
+ interval&.stop
149
143
  progress_bar&.stop
150
144
  end
151
145
 
data/lib/kraps/step.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Kraps
2
- Step = Struct.new(:action, :partitioner, :partitions, :block, :worker, :before, :frame, keyword_init: true)
2
+ Step = Struct.new(:action, :partitioner, :partitions, :jobs, :block, :worker, :before, :frame, :dependency, :options, keyword_init: true)
3
3
  end
@@ -1,29 +1,3 @@
1
1
  module Kraps
2
- class TempPath
3
- attr_reader :path
4
-
5
- def initialize(prefix: nil, suffix: nil)
6
- @path = File.join(Dir.tmpdir, [prefix, SecureRandom.hex[0, 16], Process.pid, suffix].compact.join("."))
7
-
8
- File.open(@path, File::CREAT | File::EXCL) {}
9
-
10
- ObjectSpace.define_finalizer(self, self.class.finalize(@path))
11
-
12
- return unless block_given?
13
-
14
- begin
15
- yield
16
- ensure
17
- unlink
18
- end
19
- end
20
-
21
- def unlink
22
- FileUtils.rm_f(@path)
23
- end
24
-
25
- def self.finalize(path)
26
- proc { FileUtils.rm_f(path) }
27
- end
28
- end
2
+ TempPath = MapReduce::TempPath
29
3
  end
@@ -17,9 +17,9 @@ module Kraps
17
17
  end
18
18
  end
19
19
 
20
- def unlink
20
+ def delete
21
21
  synchronize do
22
- @temp_paths.each(&:unlink)
22
+ @temp_paths.each(&:delete)
23
23
  end
24
24
  end
25
25
 
data/lib/kraps/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Kraps
2
- VERSION = "0.5.0"
2
+ VERSION = "0.7.0"
3
3
  end
data/lib/kraps/worker.rb CHANGED
@@ -1,29 +1,32 @@
1
1
  module Kraps
2
2
  class Worker
3
- def initialize(json, memory_limit:, chunk_limit:, concurrency:)
3
+ include MapReduce::Mergeable
4
+
5
+ def initialize(json, memory_limit:, chunk_limit:, concurrency:, logger: Logger.new("/dev/null"))
4
6
  @args = JSON.parse(json)
5
7
  @memory_limit = memory_limit
6
8
  @chunk_limit = chunk_limit
7
9
  @concurrency = concurrency
10
+ @logger = logger
8
11
  end
9
12
 
10
13
  def call(retries: 3)
11
- return if distributed_job.stopped?
14
+ return if redis_queue.stopped?
12
15
 
13
16
  raise(InvalidAction, "Invalid action #{step.action}") unless Actions::ALL.include?(step.action)
14
17
 
15
- with_retries(retries) do # TODO: allow to use queue based retries
16
- step.before&.call
17
-
18
- send(:"perform_#{step.action}")
18
+ dequeue do |payload|
19
+ with_retries(retries) do # TODO: allow to use queue based retries
20
+ step.before&.call
19
21
 
20
- distributed_job.done(@args["part"])
22
+ send(:"perform_#{step.action}", payload)
23
+ end
21
24
  end
22
25
  end
23
26
 
24
27
  private
25
28
 
26
- def perform_parallelize
29
+ def perform_parallelize(payload)
27
30
  implementation = Class.new do
28
31
  def map(key)
29
32
  yield(key, nil)
@@ -31,29 +34,19 @@ module Kraps
31
34
  end
32
35
 
33
36
  mapper = MapReduce::Mapper.new(implementation.new, partitioner: partitioner, memory_limit: @memory_limit)
34
- mapper.map(@args["item"])
37
+ mapper.map(payload["item"])
35
38
 
36
39
  mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
37
40
  Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
38
41
  File.open(path) do |stream|
39
- Kraps.driver.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{@args["part"]}.json"), stream, Kraps.driver.bucket)
42
+ Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{payload["part"]}.json"), stream)
40
43
  end
41
44
  end
42
45
  end
43
46
  end
44
47
 
45
- def perform_map
46
- temp_paths = TempPaths.new
47
-
48
- files = Kraps.driver.driver.list(Kraps.driver.bucket, prefix: Kraps.driver.with_prefix("#{@args["frame"]["token"]}/#{@args["partition"]}/")).sort
49
-
50
- temp_paths_index = files.each_with_object({}) do |file, hash|
51
- hash[file] = temp_paths.add
52
- end
53
-
54
- Parallelizer.each(files, @concurrency) do |file|
55
- Kraps.driver.driver.download(file, Kraps.driver.bucket, temp_paths_index[file].path)
56
- end
48
+ def perform_map(payload)
49
+ temp_paths = download_all(token: @args["frame"]["token"], partition: payload["partition"])
57
50
 
58
51
  current_step = step
59
52
 
@@ -85,17 +78,48 @@ module Kraps
85
78
  mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
86
79
  Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
87
80
  File.open(path) do |stream|
88
- Kraps.driver.driver.store(
89
- Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{@args["part"]}.json"), stream, Kraps.driver.bucket
90
- )
81
+ Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{payload["partition"]}.json"), stream)
91
82
  end
92
83
  end
93
84
  end
94
85
  ensure
95
- temp_paths&.unlink
86
+ temp_paths&.delete
96
87
  end
97
88
 
98
- def perform_reduce
89
+ def perform_map_partitions(payload)
90
+ temp_paths = download_all(token: @args["frame"]["token"], partition: payload["partition"])
91
+
92
+ current_step = step
93
+ current_partition = payload["partition"]
94
+
95
+ implementation = Object.new
96
+ implementation.define_singleton_method(:map) do |enum, &block|
97
+ current_step.block.call(current_partition, enum, block)
98
+ end
99
+
100
+ subsequent_step = next_step
101
+
102
+ if subsequent_step&.action == Actions::REDUCE
103
+ implementation.define_singleton_method(:reduce) do |key, value1, value2|
104
+ subsequent_step.block.call(key, value1, value2)
105
+ end
106
+ end
107
+
108
+ mapper = MapReduce::Mapper.new(implementation, partitioner: partitioner, memory_limit: @memory_limit)
109
+ mapper.map(k_way_merge(temp_paths.each.to_a, chunk_limit: @chunk_limit))
110
+
111
+ mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
112
+ Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
113
+ File.open(path) do |stream|
114
+ Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{payload["partition"]}.json"), stream)
115
+ end
116
+ end
117
+ end
118
+ ensure
119
+ temp_paths&.delete
120
+ end
121
+
122
+ def perform_reduce(payload)
99
123
  current_step = step
100
124
 
101
125
  implementation = Object.new
@@ -105,8 +129,8 @@ module Kraps
105
129
 
106
130
  reducer = MapReduce::Reducer.new(implementation)
107
131
 
108
- Parallelizer.each(Kraps.driver.driver.list(Kraps.driver.bucket, prefix: Kraps.driver.with_prefix("#{@args["frame"]["token"]}/#{@args["partition"]}/")), @concurrency) do |file|
109
- Kraps.driver.driver.download(file, Kraps.driver.bucket, reducer.add_chunk)
132
+ Parallelizer.each(Kraps.driver.list(prefix: Kraps.driver.with_prefix("#{@args["frame"]["token"]}/#{payload["partition"]}/")), @concurrency) do |file|
133
+ Kraps.driver.download(file, reducer.add_chunk)
110
134
  end
111
135
 
112
136
  tempfile = Tempfile.new
@@ -115,35 +139,96 @@ module Kraps
115
139
  tempfile.puts(JSON.generate([key, value]))
116
140
  end
117
141
 
118
- Kraps.driver.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{@args["partition"]}/chunk.#{@args["part"]}.json"), tempfile.tap(&:rewind), Kraps.driver.bucket)
142
+ Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{payload["partition"]}/chunk.#{payload["partition"]}.json"), tempfile.tap(&:rewind))
119
143
  ensure
120
144
  tempfile&.close(true)
121
145
  end
122
146
 
123
- def perform_each_partition
124
- temp_paths = TempPaths.new
147
+ def perform_combine(payload)
148
+ temp_paths1 = download_all(token: @args["frame"]["token"], partition: payload["partition"])
149
+ temp_paths2 = download_all(token: payload["combine_frame"]["token"], partition: payload["partition"])
125
150
 
126
- files = Kraps.driver.driver.list(Kraps.driver.bucket, prefix: Kraps.driver.with_prefix("#{@args["frame"]["token"]}/#{@args["partition"]}/")).sort
151
+ enum1 = k_way_merge(temp_paths1.each.to_a, chunk_limit: @chunk_limit)
152
+ enum2 = k_way_merge(temp_paths2.each.to_a, chunk_limit: @chunk_limit)
127
153
 
128
- temp_paths_index = files.each_with_object({}) do |file, hash|
129
- hash[file] = temp_paths.add
154
+ combine_method = method(:combine)
155
+ current_step = step
156
+
157
+ implementation = Object.new
158
+ implementation.define_singleton_method(:map) do |&block|
159
+ combine_method.call(enum1, enum2) do |key, value1, value2|
160
+ block.call(key, current_step.block.call(key, value1, value2))
161
+ end
130
162
  end
131
163
 
132
- Parallelizer.each(files, @concurrency) do |file|
133
- Kraps.driver.driver.download(file, Kraps.driver.bucket, temp_paths_index[file].path)
164
+ mapper = MapReduce::Mapper.new(implementation, partitioner: partitioner, memory_limit: @memory_limit)
165
+ mapper.map
166
+
167
+ mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
168
+ Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
169
+ File.open(path) do |stream|
170
+ Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{payload["partition"]}.json"), stream)
171
+ end
172
+ end
134
173
  end
174
+ ensure
175
+ temp_paths1&.delete
176
+ temp_paths2&.delete
177
+ end
135
178
 
136
- enum = Enumerator::Lazy.new(temp_paths) do |yielder, temp_path|
137
- File.open(temp_path.path) do |stream|
138
- stream.each_line do |line|
139
- yielder << JSON.parse(line)
179
+ def combine(enum1, enum2)
180
+ current1 = begin; enum1.next; rescue StopIteration; nil; end
181
+ current2 = begin; enum2.next; rescue StopIteration; nil; end
182
+
183
+ loop do
184
+ return if current1.nil? && current2.nil?
185
+ return if current1.nil?
186
+
187
+ if current2.nil?
188
+ yield(current1[0], current1[1], nil)
189
+
190
+ current1 = begin; enum1.next; rescue StopIteration; nil; end
191
+ elsif current1[0] == current2[0]
192
+ loop do
193
+ yield(current1[0], current1[1], current2[1])
194
+
195
+ current1 = begin; enum1.next; rescue StopIteration; nil; end
196
+
197
+ break if current1.nil?
198
+ break if current1[0] != current2[0]
199
+ end
200
+
201
+ current2 = begin; enum2.next; rescue StopIteration; nil; end
202
+ else
203
+ res = current1[0] <=> current2[0]
204
+
205
+ if res < 0
206
+ yield(current1[0], current1[1], nil)
207
+
208
+ current1 = begin; enum1.next; rescue StopIteration; nil; end
209
+ else
210
+ current2 = begin; enum2.next; rescue StopIteration; nil; end
140
211
  end
141
212
  end
142
213
  end
214
+ end
215
+
216
+ def perform_each_partition(payload)
217
+ temp_paths = TempPaths.new
218
+
219
+ files = Kraps.driver.list(prefix: Kraps.driver.with_prefix("#{@args["frame"]["token"]}/#{payload["partition"]}/")).sort
220
+
221
+ temp_paths_index = files.each_with_object({}) do |file, hash|
222
+ hash[file] = temp_paths.add
223
+ end
143
224
 
144
- step.block.call(@args["partition"], enum)
225
+ Parallelizer.each(files, @concurrency) do |file|
226
+ Kraps.driver.download(file, temp_paths_index[file].path)
227
+ end
228
+
229
+ step.block.call(payload["partition"], k_way_merge(temp_paths.each.to_a, chunk_limit: @chunk_limit))
145
230
  ensure
146
- temp_paths&.unlink
231
+ temp_paths&.delete
147
232
  end
148
233
 
149
234
  def with_retries(num_retries)
@@ -152,14 +237,16 @@ module Kraps
152
237
  begin
153
238
  yield
154
239
  rescue Kraps::Error
155
- distributed_job.stop
240
+ redis_queue.stop
156
241
  raise
157
- rescue StandardError
242
+ rescue StandardError => e
158
243
  if retries >= num_retries
159
- distributed_job.stop
244
+ redis_queue.stop
160
245
  raise
161
246
  end
162
247
 
248
+ @logger.error(e)
249
+
163
250
  sleep(5)
164
251
  retries += 1
165
252
 
@@ -167,8 +254,39 @@ module Kraps
167
254
  end
168
255
  end
169
256
 
257
+ def dequeue
258
+ loop do
259
+ break if redis_queue.stopped?
260
+ break if redis_queue.size.zero?
261
+
262
+ redis_queue.dequeue do |payload|
263
+ payload ? yield(payload) : sleep(1)
264
+ end
265
+ end
266
+ end
267
+
268
+ def redis_queue
269
+ @redis_queue ||= RedisQueue.new(redis: Kraps.redis, token: @args["token"], namespace: Kraps.namespace, ttl: Kraps.job_ttl)
270
+ end
271
+
272
+ def download_all(token:, partition:)
273
+ temp_paths = TempPaths.new
274
+
275
+ files = Kraps.driver.list(prefix: Kraps.driver.with_prefix("#{token}/#{partition}/")).sort
276
+
277
+ temp_paths_index = files.each_with_object({}) do |file, hash|
278
+ hash[file] = temp_paths.add
279
+ end
280
+
281
+ Parallelizer.each(files, @concurrency) do |file|
282
+ Kraps.driver.download(file, temp_paths_index[file].path)
283
+ end
284
+
285
+ temp_paths
286
+ end
287
+
170
288
  def jobs
171
- @jobs ||= Array(@args["klass"].constantize.new.call(*@args["args"], **@args["kwargs"].transform_keys(&:to_sym)))
289
+ @jobs ||= JobResolver.new.call(@args["klass"].constantize.new.call(*@args["args"], **@args["kwargs"].transform_keys(&:to_sym)))
172
290
  end
173
291
 
174
292
  def job
@@ -198,9 +316,5 @@ module Kraps
198
316
  def partitioner
199
317
  @partitioner ||= proc { |key| step.partitioner.call(key, step.partitions) }
200
318
  end
201
-
202
- def distributed_job
203
- @distributed_job ||= Kraps.distributed_job_client.build(token: @args["token"])
204
- end
205
319
  end
206
320
  end
data/lib/kraps.rb CHANGED
@@ -1,32 +1,37 @@
1
+ require "ruby-progressbar"
2
+ require "ruby-progressbar/outputs/null"
3
+ require "map_reduce"
4
+ require "redis"
5
+
1
6
  require_relative "kraps/version"
2
7
  require_relative "kraps/drivers"
3
8
  require_relative "kraps/actions"
4
9
  require_relative "kraps/parallelizer"
5
10
  require_relative "kraps/hash_partitioner"
11
+ require_relative "kraps/redis_queue"
6
12
  require_relative "kraps/temp_path"
7
13
  require_relative "kraps/temp_paths"
8
14
  require_relative "kraps/timeout_queue"
9
15
  require_relative "kraps/interval"
10
16
  require_relative "kraps/job"
17
+ require_relative "kraps/job_resolver"
11
18
  require_relative "kraps/runner"
12
19
  require_relative "kraps/step"
13
20
  require_relative "kraps/frame"
14
21
  require_relative "kraps/worker"
15
- require "distributed_job"
16
- require "ruby-progressbar"
17
- require "ruby-progressbar/outputs/null"
18
- require "map_reduce"
19
- require "redis"
20
22
 
21
23
  module Kraps
22
24
  class Error < StandardError; end
23
25
  class InvalidAction < Error; end
24
26
  class InvalidStep < Error; end
25
27
  class JobStopped < Error; end
28
+ class IncompatibleFrame < Error; end
26
29
 
27
- def self.configure(driver:, redis: Redis.new, namespace: nil, job_ttl: 24 * 60 * 60, show_progress: true, enqueuer: ->(worker, json) { worker.perform_async(json) })
30
+ def self.configure(driver:, redis: Redis.new, namespace: nil, job_ttl: 4 * 24 * 60 * 60, show_progress: true, enqueuer: ->(worker, json) { worker.perform_async(json) })
28
31
  @driver = driver
29
- @distributed_job_client = DistributedJob::Client.new(redis: redis, namespace: namespace, default_ttl: job_ttl)
32
+ @redis = redis
33
+ @namespace = namespace
34
+ @job_ttl = job_ttl.to_i
30
35
  @show_progress = show_progress
31
36
  @enqueuer = enqueuer
32
37
  end
@@ -35,8 +40,16 @@ module Kraps
35
40
  @driver
36
41
  end
37
42
 
38
- def self.distributed_job_client
39
- @distributed_job_client
43
+ def self.redis
44
+ @redis
45
+ end
46
+
47
+ def self.namespace
48
+ @namespace
49
+ end
50
+
51
+ def self.job_ttl
52
+ @job_ttl
40
53
  end
41
54
 
42
55
  def self.show_progress?
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kraps
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Benjamin Vetter
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-11-10 00:00:00.000000000 Z
11
+ date: 2022-12-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: attachie
@@ -24,20 +24,6 @@ dependencies:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
- - !ruby/object:Gem::Dependency
28
- name: distributed_job
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: '0'
34
- type: :runtime
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - ">="
39
- - !ruby/object:Gem::Version
40
- version: '0'
41
27
  - !ruby/object:Gem::Dependency
42
28
  name: map-reduce-ruby
43
29
  requirement: !ruby/object:Gem::Requirement
@@ -147,7 +133,9 @@ files:
147
133
  - lib/kraps/hash_partitioner.rb
148
134
  - lib/kraps/interval.rb
149
135
  - lib/kraps/job.rb
136
+ - lib/kraps/job_resolver.rb
150
137
  - lib/kraps/parallelizer.rb
138
+ - lib/kraps/redis_queue.rb
151
139
  - lib/kraps/runner.rb
152
140
  - lib/kraps/step.rb
153
141
  - lib/kraps/temp_path.rb