kraps 0.8.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +9 -0
- data/CHANGELOG.md +34 -0
- data/Gemfile +4 -3
- data/README.md +29 -6
- data/lib/kraps/actions.rb +2 -1
- data/lib/kraps/job.rb +18 -0
- data/lib/kraps/runner.rb +16 -1
- data/lib/kraps/version.rb +1 -1
- data/lib/kraps/worker.rb +52 -1
- metadata +2 -45
- data/Gemfile.lock +0 -108
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6ac0098a3c2f8acf49552f5b72621619345f9b59c83edd58368d7a999605f817
|
4
|
+
data.tar.gz: 3d522dd1149b57c9dce596ea4b38aa67ba5f2bc18a96bbc2840e80096e9eb192
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e859a2ecd0e635a067bde3cf3d51f76dc64702d91975342c590a43370d5a8a49da5c2272b6806550420537c74b3c7129ee0c76f58ff42ef0b154d7e8d0904417
|
7
|
+
data.tar.gz: 8e93aa69360b48b46f96a94cc2dd49d841b72fc858688c93071d46b99b86d50bf3b0758dd7f071cb50fd6ddc13cf532ed00d1ed1e4eee277d064568a11931f22
|
data/.rubocop.yml
CHANGED
@@ -16,6 +16,12 @@ Lint/UnreachableLoop:
|
|
16
16
|
Metrics/BlockLength:
|
17
17
|
Enabled: false
|
18
18
|
|
19
|
+
Style/HashEachMethods:
|
20
|
+
Enabled: false
|
21
|
+
|
22
|
+
Style/ZeroLengthPredicate:
|
23
|
+
Enabled: false
|
24
|
+
|
19
25
|
Gemspec/RequiredRubyVersion:
|
20
26
|
Enabled: false
|
21
27
|
|
@@ -48,6 +54,9 @@ Style/StringLiteralsInInterpolation:
|
|
48
54
|
Enabled: true
|
49
55
|
EnforcedStyle: double_quotes
|
50
56
|
|
57
|
+
Style/RedundantSelfAssignment:
|
58
|
+
Enabled: false
|
59
|
+
|
51
60
|
Layout/LineLength:
|
52
61
|
Max: 250
|
53
62
|
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,39 @@
|
|
1
1
|
# CHANGELOG
|
2
2
|
|
3
|
+
## v0.10.0
|
4
|
+
|
5
|
+
* `append` operation added
|
6
|
+
|
7
|
+
## v0.9.0
|
8
|
+
|
9
|
+
* Arguments are no longer passed to the `call` method, but to the
|
10
|
+
initializer instead
|
11
|
+
|
12
|
+
Before:
|
13
|
+
|
14
|
+
```ruby
|
15
|
+
class MyJob
|
16
|
+
def call(arg1, arg2)
|
17
|
+
# ...
|
18
|
+
end
|
19
|
+
end
|
20
|
+
```
|
21
|
+
|
22
|
+
After:
|
23
|
+
|
24
|
+
```ruby
|
25
|
+
class MyJob
|
26
|
+
def initialize(arg1, arg2)
|
27
|
+
@arg1 = arg1
|
28
|
+
@arg2 = arg2
|
29
|
+
end
|
30
|
+
|
31
|
+
def call
|
32
|
+
# ...
|
33
|
+
end
|
34
|
+
end
|
35
|
+
```
|
36
|
+
|
3
37
|
## v0.8.0
|
4
38
|
|
5
39
|
* Use number of partitions of previous step for `jobs` option by default
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -38,16 +38,21 @@ Kraps.configure(
|
|
38
38
|
|
39
39
|
Afterwards, create a job class, which tells Kraps what your job should do.
|
40
40
|
Therefore, you create some class with a `call` method, and optionally some
|
41
|
-
arguments. Let's create a simple job, which reads
|
42
|
-
how often search queries have been searched:
|
41
|
+
arguments passed to its initializer. Let's create a simple job, which reads
|
42
|
+
search log files to analyze how often search queries have been searched:
|
43
43
|
|
44
44
|
```ruby
|
45
45
|
class SearchLogCounter
|
46
|
-
def call(start_date:, end_date:)
|
46
|
+
def initialize(start_date:, end_date:)
|
47
|
+
@start_date = start_date
|
48
|
+
@end_date = end_date
|
49
|
+
end
|
50
|
+
|
51
|
+
def call
|
47
52
|
job = Kraps::Job.new(worker: MyKrapsWorker)
|
48
53
|
|
49
54
|
job = job.parallelize(partitions: 128) do |collector|
|
50
|
-
(Date.parse(start_date)..Date.parse(end_date)).each do |date|
|
55
|
+
(Date.parse(@start_date)..Date.parse(@end_date)).each do |date|
|
51
56
|
collector.call(date.to_s)
|
52
57
|
end
|
53
58
|
end
|
@@ -214,6 +219,10 @@ job.parallelize(partitions: 128, partitioner: partitioner, worker: MyKrapsWorker
|
|
214
219
|
end
|
215
220
|
```
|
216
221
|
|
222
|
+
Please note that `parallelize` itself is not parallelized but rather
|
223
|
+
parallelizes the data you feed into Kraps within `parallelize` by splitting it
|
224
|
+
into the number of `partitions` specified.
|
225
|
+
|
217
226
|
The block must use the collector to feed Kraps with individual items. The
|
218
227
|
items are used as keys and the values are set to `nil`.
|
219
228
|
|
@@ -267,6 +276,15 @@ The `key` itself is also passed to the block for the case that you need to
|
|
267
276
|
customize the reduce calculation according to the value of the key. However,
|
268
277
|
most of the time, this is not necessary and the key can simply be ignored.
|
269
278
|
|
279
|
+
* `append`: Appends the results of 2 jobs, such that all key-value pairs
|
280
|
+
of both jobs will be in the result. `append` does not accept any block.
|
281
|
+
|
282
|
+
```ruby
|
283
|
+
job.append(other_job, worker: MyKrapsWorker, jobs: 8)
|
284
|
+
```
|
285
|
+
Please note that the partitioners and the number of partitions must match for
|
286
|
+
the jobs to be appended.
|
287
|
+
|
270
288
|
* `combine`: Combines the results of 2 jobs by combining every key available
|
271
289
|
in the current job result with the corresponding key from the passed job
|
272
290
|
result. When the passed job result does not have the corresponding key,
|
@@ -345,11 +363,16 @@ of searches made:
|
|
345
363
|
|
346
364
|
```ruby
|
347
365
|
class SearchLogCounter
|
348
|
-
def call(start_date:, end_date:)
|
366
|
+
def initialize(start_date:, end_date:)
|
367
|
+
@start_date = start_date
|
368
|
+
@end_date = end_date
|
369
|
+
end
|
370
|
+
|
371
|
+
def call
|
349
372
|
count_job = Kraps::Job.new(worker: SomeBackgroundWorker)
|
350
373
|
|
351
374
|
count_job = count_job.parallelize(partitions: 128) do |collector|
|
352
|
-
(Date.parse(start_date)..Date.parse(end_date)).each do |date|
|
375
|
+
(Date.parse(@start_date)..Date.parse(@end_date)).each do |date|
|
353
376
|
collector.call(date.to_s)
|
354
377
|
end
|
355
378
|
end
|
data/lib/kraps/actions.rb
CHANGED
data/lib/kraps/job.rb
CHANGED
@@ -103,6 +103,24 @@ module Kraps
|
|
103
103
|
end
|
104
104
|
end
|
105
105
|
|
106
|
+
def append(other_job, jobs: nil, worker: @worker, before: nil, &block)
|
107
|
+
fresh.tap do |job|
|
108
|
+
job.instance_eval do
|
109
|
+
@steps << Step.new(
|
110
|
+
action: Actions::APPEND,
|
111
|
+
jobs: [jobs, @partitions].compact.min,
|
112
|
+
partitions: @partitions,
|
113
|
+
partitioner: @partitioner,
|
114
|
+
worker: worker,
|
115
|
+
before: before,
|
116
|
+
block: block,
|
117
|
+
dependency: other_job,
|
118
|
+
options: { append_step_index: other_job.steps.size - 1 }
|
119
|
+
)
|
120
|
+
end
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
106
124
|
def each_partition(jobs: nil, worker: @worker, before: nil, &block)
|
107
125
|
fresh.tap do |job|
|
108
126
|
job.instance_eval do
|
data/lib/kraps/runner.rb
CHANGED
@@ -5,7 +5,7 @@ module Kraps
|
|
5
5
|
end
|
6
6
|
|
7
7
|
def call(*args, **kwargs)
|
8
|
-
JobResolver.new.call(@klass.new.call(*args, **kwargs)).tap do |jobs|
|
8
|
+
JobResolver.new.call(@klass.new(*args, **kwargs).call).tap do |jobs|
|
9
9
|
jobs.each_with_index do |job, job_index|
|
10
10
|
job.steps.each_with_index.inject(nil) do |frame, (_, step_index)|
|
11
11
|
StepRunner.new(
|
@@ -91,6 +91,21 @@ module Kraps
|
|
91
91
|
Frame.new(token: token, partitions: @step.partitions)
|
92
92
|
end
|
93
93
|
|
94
|
+
def perform_append
|
95
|
+
append_job = @step.dependency
|
96
|
+
append_step = append_job.steps[@step.options[:append_step_index]]
|
97
|
+
|
98
|
+
raise(IncompatibleFrame, "Incompatible number of partitions") if append_step.partitions != @step.partitions
|
99
|
+
|
100
|
+
enum = (0...@frame.partitions).map do |partition|
|
101
|
+
{ partition: partition, append_frame: append_step.frame.to_h }
|
102
|
+
end
|
103
|
+
|
104
|
+
token = push_and_wait(job_count: @step.jobs, enum: enum)
|
105
|
+
|
106
|
+
Frame.new(token: token, partitions: @step.partitions)
|
107
|
+
end
|
108
|
+
|
94
109
|
def perform_each_partition
|
95
110
|
enum = (0...@frame.partitions).map { |partition| { partition: partition } }
|
96
111
|
push_and_wait(job_count: @step.jobs, enum: enum)
|
data/lib/kraps/version.rb
CHANGED
data/lib/kraps/worker.rb
CHANGED
@@ -86,6 +86,57 @@ module Kraps
|
|
86
86
|
temp_paths&.delete
|
87
87
|
end
|
88
88
|
|
89
|
+
def perform_append(payload)
|
90
|
+
temp_paths1 = download_all(token: @args["frame"]["token"], partition: payload["partition"])
|
91
|
+
temp_paths2 = download_all(token: payload["append_frame"]["token"], partition: payload["partition"])
|
92
|
+
|
93
|
+
implementation = Object.new
|
94
|
+
implementation.define_singleton_method(:map) do |key, value, &block|
|
95
|
+
block.call(key, value)
|
96
|
+
end
|
97
|
+
|
98
|
+
subsequent_step = next_step
|
99
|
+
|
100
|
+
if subsequent_step&.action == Actions::REDUCE
|
101
|
+
implementation.define_singleton_method(:reduce) do |key, value1, value2|
|
102
|
+
subsequent_step.block.call(key, value1, value2)
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
mapper = MapReduce::Mapper.new(implementation, partitioner: partitioner, memory_limit: @memory_limit)
|
107
|
+
|
108
|
+
temp_paths1.each do |temp_path|
|
109
|
+
File.open(temp_path.path) do |stream|
|
110
|
+
stream.each_line do |line|
|
111
|
+
key, value = JSON.parse(line)
|
112
|
+
|
113
|
+
mapper.map(key, value)
|
114
|
+
end
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
118
|
+
temp_paths2.each do |temp_path|
|
119
|
+
File.open(temp_path.path) do |stream|
|
120
|
+
stream.each_line do |line|
|
121
|
+
key, value = JSON.parse(line)
|
122
|
+
|
123
|
+
mapper.map(key, value)
|
124
|
+
end
|
125
|
+
end
|
126
|
+
end
|
127
|
+
|
128
|
+
mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
|
129
|
+
Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
|
130
|
+
File.open(path) do |stream|
|
131
|
+
Kraps.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{payload["partition"]}.json"), stream)
|
132
|
+
end
|
133
|
+
end
|
134
|
+
end
|
135
|
+
ensure
|
136
|
+
temp_paths1&.delete
|
137
|
+
temp_paths2&.delete
|
138
|
+
end
|
139
|
+
|
89
140
|
def perform_map_partitions(payload)
|
90
141
|
temp_paths = download_all(token: @args["frame"]["token"], partition: payload["partition"])
|
91
142
|
|
@@ -274,7 +325,7 @@ module Kraps
|
|
274
325
|
end
|
275
326
|
|
276
327
|
def jobs
|
277
|
-
@jobs ||= JobResolver.new.call(@args["klass"].constantize.new.call(*@args["args"], **@args["kwargs"].transform_keys(&:to_sym)))
|
328
|
+
@jobs ||= JobResolver.new.call(@args["klass"].constantize.new(*@args["args"], **@args["kwargs"].transform_keys(&:to_sym)).call)
|
278
329
|
end
|
279
330
|
|
280
331
|
def job
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kraps
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.0
|
4
|
+
version: 0.10.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Benjamin Vetter
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-04-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: attachie
|
@@ -66,48 +66,6 @@ dependencies:
|
|
66
66
|
- - ">="
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: bundler
|
71
|
-
requirement: !ruby/object:Gem::Requirement
|
72
|
-
requirements:
|
73
|
-
- - ">="
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: '0'
|
76
|
-
type: :development
|
77
|
-
prerelease: false
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
79
|
-
requirements:
|
80
|
-
- - ">="
|
81
|
-
- !ruby/object:Gem::Version
|
82
|
-
version: '0'
|
83
|
-
- !ruby/object:Gem::Dependency
|
84
|
-
name: rspec
|
85
|
-
requirement: !ruby/object:Gem::Requirement
|
86
|
-
requirements:
|
87
|
-
- - ">="
|
88
|
-
- !ruby/object:Gem::Version
|
89
|
-
version: '0'
|
90
|
-
type: :development
|
91
|
-
prerelease: false
|
92
|
-
version_requirements: !ruby/object:Gem::Requirement
|
93
|
-
requirements:
|
94
|
-
- - ">="
|
95
|
-
- !ruby/object:Gem::Version
|
96
|
-
version: '0'
|
97
|
-
- !ruby/object:Gem::Dependency
|
98
|
-
name: rubocop
|
99
|
-
requirement: !ruby/object:Gem::Requirement
|
100
|
-
requirements:
|
101
|
-
- - ">="
|
102
|
-
- !ruby/object:Gem::Version
|
103
|
-
version: '0'
|
104
|
-
type: :development
|
105
|
-
prerelease: false
|
106
|
-
version_requirements: !ruby/object:Gem::Requirement
|
107
|
-
requirements:
|
108
|
-
- - ">="
|
109
|
-
- !ruby/object:Gem::Version
|
110
|
-
version: '0'
|
111
69
|
description: Kraps allows to process and perform calculations on very large datasets
|
112
70
|
in parallel
|
113
71
|
email:
|
@@ -121,7 +79,6 @@ files:
|
|
121
79
|
- CHANGELOG.md
|
122
80
|
- CODE_OF_CONDUCT.md
|
123
81
|
- Gemfile
|
124
|
-
- Gemfile.lock
|
125
82
|
- LICENSE.txt
|
126
83
|
- README.md
|
127
84
|
- Rakefile
|
data/Gemfile.lock
DELETED
@@ -1,108 +0,0 @@
|
|
1
|
-
PATH
|
2
|
-
remote: .
|
3
|
-
specs:
|
4
|
-
kraps (0.7.0)
|
5
|
-
attachie
|
6
|
-
map-reduce-ruby (>= 3.0.0)
|
7
|
-
redis
|
8
|
-
ruby-progressbar
|
9
|
-
|
10
|
-
GEM
|
11
|
-
remote: https://rubygems.org/
|
12
|
-
specs:
|
13
|
-
activesupport (7.0.4)
|
14
|
-
concurrent-ruby (~> 1.0, >= 1.0.2)
|
15
|
-
i18n (>= 1.6, < 2)
|
16
|
-
minitest (>= 5.1)
|
17
|
-
tzinfo (~> 2.0)
|
18
|
-
ast (2.4.2)
|
19
|
-
attachie (1.2.0)
|
20
|
-
activesupport
|
21
|
-
aws-sdk-s3
|
22
|
-
connection_pool
|
23
|
-
mime-types
|
24
|
-
aws-eventstream (1.2.0)
|
25
|
-
aws-partitions (1.657.0)
|
26
|
-
aws-sdk-core (3.166.0)
|
27
|
-
aws-eventstream (~> 1, >= 1.0.2)
|
28
|
-
aws-partitions (~> 1, >= 1.651.0)
|
29
|
-
aws-sigv4 (~> 1.5)
|
30
|
-
jmespath (~> 1, >= 1.6.1)
|
31
|
-
aws-sdk-kms (1.59.0)
|
32
|
-
aws-sdk-core (~> 3, >= 3.165.0)
|
33
|
-
aws-sigv4 (~> 1.1)
|
34
|
-
aws-sdk-s3 (1.117.1)
|
35
|
-
aws-sdk-core (~> 3, >= 3.165.0)
|
36
|
-
aws-sdk-kms (~> 1)
|
37
|
-
aws-sigv4 (~> 1.4)
|
38
|
-
aws-sigv4 (1.5.2)
|
39
|
-
aws-eventstream (~> 1, >= 1.0.2)
|
40
|
-
concurrent-ruby (1.1.10)
|
41
|
-
connection_pool (2.3.0)
|
42
|
-
diff-lcs (1.5.0)
|
43
|
-
i18n (1.12.0)
|
44
|
-
concurrent-ruby (~> 1.0)
|
45
|
-
jmespath (1.6.1)
|
46
|
-
json (2.6.2)
|
47
|
-
lazy_priority_queue (0.1.1)
|
48
|
-
map-reduce-ruby (3.0.0)
|
49
|
-
json
|
50
|
-
lazy_priority_queue
|
51
|
-
mime-types (3.4.1)
|
52
|
-
mime-types-data (~> 3.2015)
|
53
|
-
mime-types-data (3.2022.0105)
|
54
|
-
minitest (5.16.3)
|
55
|
-
parallel (1.22.1)
|
56
|
-
parser (3.1.2.1)
|
57
|
-
ast (~> 2.4.1)
|
58
|
-
rainbow (3.1.1)
|
59
|
-
rake (13.0.6)
|
60
|
-
redis (5.0.5)
|
61
|
-
redis-client (>= 0.9.0)
|
62
|
-
redis-client (0.11.2)
|
63
|
-
connection_pool
|
64
|
-
regexp_parser (2.6.0)
|
65
|
-
rexml (3.2.5)
|
66
|
-
rspec (3.12.0)
|
67
|
-
rspec-core (~> 3.12.0)
|
68
|
-
rspec-expectations (~> 3.12.0)
|
69
|
-
rspec-mocks (~> 3.12.0)
|
70
|
-
rspec-core (3.12.0)
|
71
|
-
rspec-support (~> 3.12.0)
|
72
|
-
rspec-expectations (3.12.0)
|
73
|
-
diff-lcs (>= 1.2.0, < 2.0)
|
74
|
-
rspec-support (~> 3.12.0)
|
75
|
-
rspec-mocks (3.12.0)
|
76
|
-
diff-lcs (>= 1.2.0, < 2.0)
|
77
|
-
rspec-support (~> 3.12.0)
|
78
|
-
rspec-support (3.12.0)
|
79
|
-
rubocop (1.38.0)
|
80
|
-
json (~> 2.3)
|
81
|
-
parallel (~> 1.10)
|
82
|
-
parser (>= 3.1.2.1)
|
83
|
-
rainbow (>= 2.2.2, < 4.0)
|
84
|
-
regexp_parser (>= 1.8, < 3.0)
|
85
|
-
rexml (>= 3.2.5, < 4.0)
|
86
|
-
rubocop-ast (>= 1.23.0, < 2.0)
|
87
|
-
ruby-progressbar (~> 1.7)
|
88
|
-
unicode-display_width (>= 1.4.0, < 3.0)
|
89
|
-
rubocop-ast (1.23.0)
|
90
|
-
parser (>= 3.1.1.0)
|
91
|
-
ruby-progressbar (1.11.0)
|
92
|
-
tzinfo (2.0.5)
|
93
|
-
concurrent-ruby (~> 1.0)
|
94
|
-
unicode-display_width (2.3.0)
|
95
|
-
|
96
|
-
PLATFORMS
|
97
|
-
ruby
|
98
|
-
x86_64-linux
|
99
|
-
|
100
|
-
DEPENDENCIES
|
101
|
-
bundler
|
102
|
-
kraps!
|
103
|
-
rake (~> 13.0)
|
104
|
-
rspec (~> 3.0)
|
105
|
-
rubocop
|
106
|
-
|
107
|
-
BUNDLED WITH
|
108
|
-
2.3.24
|