kraps 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +3 -0
- data/CHANGELOG.md +10 -0
- data/Gemfile.lock +28 -30
- data/README.md +6 -5
- data/lib/kraps/hash_partitioner.rb +7 -0
- data/lib/kraps/job.rb +3 -3
- data/lib/kraps/version.rb +1 -1
- data/lib/kraps/worker.rb +15 -7
- data/lib/kraps.rb +1 -0
- metadata +6 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f5bb601e7ee415b95b4b258a0241c25e6fe19eb3e772c06d4149afbfcfbe6c3d
|
4
|
+
data.tar.gz: cb948c05947e48d2d8e970eebbc6e2c4a5b0a88cb162ad87bf0743196f6bcaef
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1d1c5a16205c5584626fed5bca9b6c7dd6fae3b4f3c725b158e7740f6fa05a17abdcb483b43cbdad813576e2fc2c7621b89b94d61b32776d85ae774f5a4332d1
|
7
|
+
data.tar.gz: 2670dbc002633e801d8cf98fc8454c8881295f72b505bd4baf6cf0c8685a8c97a8a2dbf26e8a617c74b452ef627e820807e7af6e05b20a627fb99ce2eb216a1a
|
data/.rubocop.yml
CHANGED
data/CHANGELOG.md
ADDED
data/Gemfile.lock
CHANGED
@@ -1,22 +1,21 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
kraps (0.
|
4
|
+
kraps (0.2.0)
|
5
5
|
attachie
|
6
6
|
distributed_job
|
7
|
-
map-reduce-ruby (>=
|
7
|
+
map-reduce-ruby (>= 3.0.0)
|
8
8
|
redis
|
9
9
|
ruby-progressbar
|
10
10
|
|
11
11
|
GEM
|
12
12
|
remote: https://rubygems.org/
|
13
13
|
specs:
|
14
|
-
activesupport (
|
14
|
+
activesupport (7.0.4)
|
15
15
|
concurrent-ruby (~> 1.0, >= 1.0.2)
|
16
16
|
i18n (>= 1.6, < 2)
|
17
17
|
minitest (>= 5.1)
|
18
18
|
tzinfo (~> 2.0)
|
19
|
-
zeitwerk (~> 2.3)
|
20
19
|
ast (2.4.2)
|
21
20
|
attachie (1.2.0)
|
22
21
|
activesupport
|
@@ -24,17 +23,17 @@ GEM
|
|
24
23
|
connection_pool
|
25
24
|
mime-types
|
26
25
|
aws-eventstream (1.2.0)
|
27
|
-
aws-partitions (1.
|
28
|
-
aws-sdk-core (3.
|
26
|
+
aws-partitions (1.654.0)
|
27
|
+
aws-sdk-core (3.166.0)
|
29
28
|
aws-eventstream (~> 1, >= 1.0.2)
|
30
|
-
aws-partitions (~> 1, >= 1.
|
31
|
-
aws-sigv4 (~> 1.
|
29
|
+
aws-partitions (~> 1, >= 1.651.0)
|
30
|
+
aws-sigv4 (~> 1.5)
|
32
31
|
jmespath (~> 1, >= 1.6.1)
|
33
|
-
aws-sdk-kms (1.
|
34
|
-
aws-sdk-core (~> 3, >= 3.
|
32
|
+
aws-sdk-kms (1.59.0)
|
33
|
+
aws-sdk-core (~> 3, >= 3.165.0)
|
35
34
|
aws-sigv4 (~> 1.1)
|
36
|
-
aws-sdk-s3 (1.
|
37
|
-
aws-sdk-core (~> 3, >= 3.
|
35
|
+
aws-sdk-s3 (1.117.1)
|
36
|
+
aws-sdk-core (~> 3, >= 3.165.0)
|
38
37
|
aws-sdk-kms (~> 1)
|
39
38
|
aws-sigv4 (~> 1.4)
|
40
39
|
aws-sigv4 (1.5.2)
|
@@ -49,7 +48,7 @@ GEM
|
|
49
48
|
jmespath (1.6.1)
|
50
49
|
json (2.6.2)
|
51
50
|
lazy_priority_queue (0.1.1)
|
52
|
-
map-reduce-ruby (
|
51
|
+
map-reduce-ruby (3.0.0)
|
53
52
|
json
|
54
53
|
lazy_priority_queue
|
55
54
|
mime-types (3.4.1)
|
@@ -63,40 +62,39 @@ GEM
|
|
63
62
|
rake (13.0.6)
|
64
63
|
redis (5.0.5)
|
65
64
|
redis-client (>= 0.9.0)
|
66
|
-
redis-client (0.
|
65
|
+
redis-client (0.11.0)
|
67
66
|
connection_pool
|
68
|
-
regexp_parser (2.
|
67
|
+
regexp_parser (2.6.0)
|
69
68
|
rexml (3.2.5)
|
70
|
-
rspec (3.
|
71
|
-
rspec-core (~> 3.
|
72
|
-
rspec-expectations (~> 3.
|
73
|
-
rspec-mocks (~> 3.
|
74
|
-
rspec-core (3.
|
75
|
-
rspec-support (~> 3.
|
76
|
-
rspec-expectations (3.
|
69
|
+
rspec (3.12.0)
|
70
|
+
rspec-core (~> 3.12.0)
|
71
|
+
rspec-expectations (~> 3.12.0)
|
72
|
+
rspec-mocks (~> 3.12.0)
|
73
|
+
rspec-core (3.12.0)
|
74
|
+
rspec-support (~> 3.12.0)
|
75
|
+
rspec-expectations (3.12.0)
|
77
76
|
diff-lcs (>= 1.2.0, < 2.0)
|
78
|
-
rspec-support (~> 3.
|
79
|
-
rspec-mocks (3.
|
77
|
+
rspec-support (~> 3.12.0)
|
78
|
+
rspec-mocks (3.12.0)
|
80
79
|
diff-lcs (>= 1.2.0, < 2.0)
|
81
|
-
rspec-support (~> 3.
|
82
|
-
rspec-support (3.
|
83
|
-
rubocop (1.
|
80
|
+
rspec-support (~> 3.12.0)
|
81
|
+
rspec-support (3.12.0)
|
82
|
+
rubocop (1.38.0)
|
84
83
|
json (~> 2.3)
|
85
84
|
parallel (~> 1.10)
|
86
85
|
parser (>= 3.1.2.1)
|
87
86
|
rainbow (>= 2.2.2, < 4.0)
|
88
87
|
regexp_parser (>= 1.8, < 3.0)
|
89
88
|
rexml (>= 3.2.5, < 4.0)
|
90
|
-
rubocop-ast (>= 1.
|
89
|
+
rubocop-ast (>= 1.23.0, < 2.0)
|
91
90
|
ruby-progressbar (~> 1.7)
|
92
91
|
unicode-display_width (>= 1.4.0, < 3.0)
|
93
|
-
rubocop-ast (1.
|
92
|
+
rubocop-ast (1.23.0)
|
94
93
|
parser (>= 3.1.1.0)
|
95
94
|
ruby-progressbar (1.11.0)
|
96
95
|
tzinfo (2.0.5)
|
97
96
|
concurrent-ruby (~> 1.0)
|
98
97
|
unicode-display_width (2.3.0)
|
99
|
-
zeitwerk (2.6.1)
|
100
98
|
|
101
99
|
PLATFORMS
|
102
100
|
ruby
|
data/README.md
CHANGED
@@ -143,17 +143,18 @@ split. Kraps assigns every `key` to a partition, either using a custom
|
|
143
143
|
`partitioner` or the default built-in hash partitioner. The hash partitioner
|
144
144
|
simply calculates a hash of your key modulo the number of partitions and the
|
145
145
|
resulting partition number is the partition where the respective key is
|
146
|
-
assigned to. A partitioner is a callable which gets the key
|
147
|
-
returns a partition number. The built in hash
|
148
|
-
one:
|
146
|
+
assigned to. A partitioner is a callable which gets the key and the number of
|
147
|
+
partitions as arguments and returns a partition number. The built-in hash
|
148
|
+
partitioner looks similar to this one:
|
149
149
|
|
150
150
|
```ruby
|
151
|
-
partitioner = proc { |key| Digest::SHA1.hexdigest(key.inspect)[0..4].to_i(16) %
|
151
|
+
partitioner = proc { |key, num_partitions| Digest::SHA1.hexdigest(key.inspect)[0..4].to_i(16) % num_partitions }
|
152
152
|
```
|
153
153
|
|
154
154
|
Please note, it's important that the partitioner and the specified number of
|
155
155
|
partitions stay in sync. When you use a custom partitioner, please make sure
|
156
|
-
that the partitioner
|
156
|
+
that the partitioner correctly returns a partition number in the range of
|
157
|
+
`0...num_partitions`.
|
157
158
|
|
158
159
|
## Datatypes
|
159
160
|
|
data/lib/kraps/job.rb
CHANGED
@@ -6,10 +6,10 @@ module Kraps
|
|
6
6
|
@worker = worker
|
7
7
|
@steps = []
|
8
8
|
@partitions = 0
|
9
|
-
@partitioner =
|
9
|
+
@partitioner = HashPartitioner.new
|
10
10
|
end
|
11
11
|
|
12
|
-
def parallelize(partitions:, partitioner:
|
12
|
+
def parallelize(partitions:, partitioner: HashPartitioner.new, worker: @worker, &block)
|
13
13
|
fresh.tap do |job|
|
14
14
|
job.instance_eval do
|
15
15
|
@partitions = partitions
|
@@ -24,7 +24,7 @@ module Kraps
|
|
24
24
|
fresh.tap do |job|
|
25
25
|
job.instance_eval do
|
26
26
|
@partitions = partitions if partitions
|
27
|
-
@partitioner = partitioner
|
27
|
+
@partitioner = partitioner if partitioner
|
28
28
|
|
29
29
|
@steps << Step.new(action: Actions::MAP, args: { partitions: @partitions, partitioner: @partitioner, worker: worker }, block: block)
|
30
30
|
end
|
data/lib/kraps/version.rb
CHANGED
data/lib/kraps/worker.rb
CHANGED
@@ -31,8 +31,12 @@ module Kraps
|
|
31
31
|
mapper = MapReduce::Mapper.new(implementation.new, partitioner: partitioner, memory_limit: @memory_limit)
|
32
32
|
mapper.map(@args["item"])
|
33
33
|
|
34
|
-
mapper.shuffle do |
|
35
|
-
|
34
|
+
mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
|
35
|
+
Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
|
36
|
+
File.open(path) do |stream|
|
37
|
+
Kraps.driver.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{@args["part"]}.json"), stream, Kraps.driver.bucket)
|
38
|
+
end
|
39
|
+
end
|
36
40
|
end
|
37
41
|
end
|
38
42
|
|
@@ -68,10 +72,14 @@ module Kraps
|
|
68
72
|
end
|
69
73
|
end
|
70
74
|
|
71
|
-
mapper.shuffle do |
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
+
mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
|
76
|
+
Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
|
77
|
+
File.open(path) do |stream|
|
78
|
+
Kraps.driver.driver.store(
|
79
|
+
Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{@args["part"]}.json"), stream, Kraps.driver.bucket
|
80
|
+
)
|
81
|
+
end
|
82
|
+
end
|
75
83
|
end
|
76
84
|
ensure
|
77
85
|
temp_paths&.unlink
|
@@ -173,7 +181,7 @@ module Kraps
|
|
173
181
|
end
|
174
182
|
|
175
183
|
def partitioner
|
176
|
-
@partitioner ||= step.args[:partitioner]
|
184
|
+
@partitioner ||= proc { |key| step.args[:partitioner].call(key, step.args[:partitions]) }
|
177
185
|
end
|
178
186
|
|
179
187
|
def distributed_job
|
data/lib/kraps.rb
CHANGED
@@ -2,6 +2,7 @@ require_relative "kraps/version"
|
|
2
2
|
require_relative "kraps/drivers"
|
3
3
|
require_relative "kraps/actions"
|
4
4
|
require_relative "kraps/parallelizer"
|
5
|
+
require_relative "kraps/hash_partitioner"
|
5
6
|
require_relative "kraps/temp_path"
|
6
7
|
require_relative "kraps/temp_paths"
|
7
8
|
require_relative "kraps/timeout_queue"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kraps
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Benjamin Vetter
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-11-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: attachie
|
@@ -44,14 +44,14 @@ dependencies:
|
|
44
44
|
requirements:
|
45
45
|
- - ">="
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
47
|
+
version: 3.0.0
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
54
|
+
version: 3.0.0
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: redis
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -132,6 +132,7 @@ extra_rdoc_files: []
|
|
132
132
|
files:
|
133
133
|
- ".rspec"
|
134
134
|
- ".rubocop.yml"
|
135
|
+
- CHANGELOG.md
|
135
136
|
- CODE_OF_CONDUCT.md
|
136
137
|
- Gemfile
|
137
138
|
- Gemfile.lock
|
@@ -143,6 +144,7 @@ files:
|
|
143
144
|
- lib/kraps/actions.rb
|
144
145
|
- lib/kraps/drivers.rb
|
145
146
|
- lib/kraps/frame.rb
|
147
|
+
- lib/kraps/hash_partitioner.rb
|
146
148
|
- lib/kraps/interval.rb
|
147
149
|
- lib/kraps/job.rb
|
148
150
|
- lib/kraps/parallelizer.rb
|