kraps 0.1.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d1f08b6fa0f725c63e3750f4b3bf04479622b40160a6364f708d91d37c0b1948
4
- data.tar.gz: '0681d837d852846cc6c115fe9dae3075a0e5b3bb8b8eae2d90d8a23ec26581e3'
3
+ metadata.gz: f5bb601e7ee415b95b4b258a0241c25e6fe19eb3e772c06d4149afbfcfbe6c3d
4
+ data.tar.gz: cb948c05947e48d2d8e970eebbc6e2c4a5b0a88cb162ad87bf0743196f6bcaef
5
5
  SHA512:
6
- metadata.gz: 354ab3129ef1713c8229af54945251069c98d681e2db5c716d93b5925576b601751c23c1d502cb72ddaea5df5fc91e6eceb4590a619de730ae65f0762662da21
7
- data.tar.gz: 0647fc85f445bc634f70e2e10feab325c3df6aec3a30d2af4b1a792e82b9adf1a31e8bb339465e10944bce927933956e8d88e72b37fb7d84027ec569441781d6
6
+ metadata.gz: 1d1c5a16205c5584626fed5bca9b6c7dd6fae3b4f3c725b158e7740f6fa05a17abdcb483b43cbdad813576e2fc2c7621b89b94d61b32776d85ae774f5a4332d1
7
+ data.tar.gz: 2670dbc002633e801d8cf98fc8454c8881295f72b505bd4baf6cf0c8685a8c97a8a2dbf26e8a617c74b452ef627e820807e7af6e05b20a627fb99ce2eb216a1a
data/.rubocop.yml CHANGED
@@ -77,3 +77,6 @@ Lint/EmptyClass:
77
77
 
78
78
  Style/WordArray:
79
79
  Enabled: false
80
+
81
+ Style/RedundantEach:
82
+ Enabled: false
data/CHANGELOG.md ADDED
@@ -0,0 +1,10 @@
1
+ # CHANGELOG
2
+
3
+ ## v0.3.0
4
+
5
+ * Changed partitioners to receive the number of partitions
6
+ as the second parameter
7
+
8
+ ## v0.2.0
9
+
10
+ * Updated map-reduce-ruby to allow concurrent uploads
data/Gemfile.lock CHANGED
@@ -1,22 +1,21 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kraps (0.1.0)
4
+ kraps (0.2.0)
5
5
  attachie
6
6
  distributed_job
7
- map-reduce-ruby (>= 2.1.1)
7
+ map-reduce-ruby (>= 3.0.0)
8
8
  redis
9
9
  ruby-progressbar
10
10
 
11
11
  GEM
12
12
  remote: https://rubygems.org/
13
13
  specs:
14
- activesupport (6.1.7)
14
+ activesupport (7.0.4)
15
15
  concurrent-ruby (~> 1.0, >= 1.0.2)
16
16
  i18n (>= 1.6, < 2)
17
17
  minitest (>= 5.1)
18
18
  tzinfo (~> 2.0)
19
- zeitwerk (~> 2.3)
20
19
  ast (2.4.2)
21
20
  attachie (1.2.0)
22
21
  activesupport
@@ -24,17 +23,17 @@ GEM
24
23
  connection_pool
25
24
  mime-types
26
25
  aws-eventstream (1.2.0)
27
- aws-partitions (1.649.0)
28
- aws-sdk-core (3.164.0)
26
+ aws-partitions (1.654.0)
27
+ aws-sdk-core (3.166.0)
29
28
  aws-eventstream (~> 1, >= 1.0.2)
30
- aws-partitions (~> 1, >= 1.525.0)
31
- aws-sigv4 (~> 1.1)
29
+ aws-partitions (~> 1, >= 1.651.0)
30
+ aws-sigv4 (~> 1.5)
32
31
  jmespath (~> 1, >= 1.6.1)
33
- aws-sdk-kms (1.58.0)
34
- aws-sdk-core (~> 3, >= 3.127.0)
32
+ aws-sdk-kms (1.59.0)
33
+ aws-sdk-core (~> 3, >= 3.165.0)
35
34
  aws-sigv4 (~> 1.1)
36
- aws-sdk-s3 (1.116.0)
37
- aws-sdk-core (~> 3, >= 3.127.0)
35
+ aws-sdk-s3 (1.117.1)
36
+ aws-sdk-core (~> 3, >= 3.165.0)
38
37
  aws-sdk-kms (~> 1)
39
38
  aws-sigv4 (~> 1.4)
40
39
  aws-sigv4 (1.5.2)
@@ -49,7 +48,7 @@ GEM
49
48
  jmespath (1.6.1)
50
49
  json (2.6.2)
51
50
  lazy_priority_queue (0.1.1)
52
- map-reduce-ruby (2.1.1)
51
+ map-reduce-ruby (3.0.0)
53
52
  json
54
53
  lazy_priority_queue
55
54
  mime-types (3.4.1)
@@ -63,40 +62,39 @@ GEM
63
62
  rake (13.0.6)
64
63
  redis (5.0.5)
65
64
  redis-client (>= 0.9.0)
66
- redis-client (0.10.0)
65
+ redis-client (0.11.0)
67
66
  connection_pool
68
- regexp_parser (2.5.0)
67
+ regexp_parser (2.6.0)
69
68
  rexml (3.2.5)
70
- rspec (3.11.0)
71
- rspec-core (~> 3.11.0)
72
- rspec-expectations (~> 3.11.0)
73
- rspec-mocks (~> 3.11.0)
74
- rspec-core (3.11.0)
75
- rspec-support (~> 3.11.0)
76
- rspec-expectations (3.11.1)
69
+ rspec (3.12.0)
70
+ rspec-core (~> 3.12.0)
71
+ rspec-expectations (~> 3.12.0)
72
+ rspec-mocks (~> 3.12.0)
73
+ rspec-core (3.12.0)
74
+ rspec-support (~> 3.12.0)
75
+ rspec-expectations (3.12.0)
77
76
  diff-lcs (>= 1.2.0, < 2.0)
78
- rspec-support (~> 3.11.0)
79
- rspec-mocks (3.11.1)
77
+ rspec-support (~> 3.12.0)
78
+ rspec-mocks (3.12.0)
80
79
  diff-lcs (>= 1.2.0, < 2.0)
81
- rspec-support (~> 3.11.0)
82
- rspec-support (3.11.1)
83
- rubocop (1.36.0)
80
+ rspec-support (~> 3.12.0)
81
+ rspec-support (3.12.0)
82
+ rubocop (1.38.0)
84
83
  json (~> 2.3)
85
84
  parallel (~> 1.10)
86
85
  parser (>= 3.1.2.1)
87
86
  rainbow (>= 2.2.2, < 4.0)
88
87
  regexp_parser (>= 1.8, < 3.0)
89
88
  rexml (>= 3.2.5, < 4.0)
90
- rubocop-ast (>= 1.20.1, < 2.0)
89
+ rubocop-ast (>= 1.23.0, < 2.0)
91
90
  ruby-progressbar (~> 1.7)
92
91
  unicode-display_width (>= 1.4.0, < 3.0)
93
- rubocop-ast (1.21.0)
92
+ rubocop-ast (1.23.0)
94
93
  parser (>= 3.1.1.0)
95
94
  ruby-progressbar (1.11.0)
96
95
  tzinfo (2.0.5)
97
96
  concurrent-ruby (~> 1.0)
98
97
  unicode-display_width (2.3.0)
99
- zeitwerk (2.6.1)
100
98
 
101
99
  PLATFORMS
102
100
  ruby
data/README.md CHANGED
@@ -143,17 +143,18 @@ split. Kraps assigns every `key` to a partition, either using a custom
143
143
  `partitioner` or the default built in hash partitioner. The hash partitioner
144
144
  simply calculates a hash of your key modulo the number of partitions and the
145
145
  resulting partition number is the partition where the respective key is
146
- assigned to. A partitioner is a callable which gets the key as argument and
147
- returns a partition number. The built in hash partitioner looks similar to this
148
- one:
146
+ assigned to. A partitioner is a callable which gets the key and the number of
147
+ partitions as arguments and returns a partition number. The built in hash
148
+ partitioner looks similar to this one:
149
149
 
150
150
  ```ruby
151
- partitioner = proc { |key| Digest::SHA1.hexdigest(key.inspect)[0..4].to_i(16) % 128 } # 128 partitions
151
+ partitioner = proc { |key, num_partitions| Digest::SHA1.hexdigest(key.inspect)[0..4].to_i(16) % num_partitions }
152
152
  ```
153
153
 
154
154
  Please note, it's important that the partitioner and the specified number of
155
155
  partitions stay in sync. When you use a custom partitioner, please make sure
156
- that the partitioner operates on the same number of partitions you specify.
156
+ that the partitioner correctly returns a partition number in the range of
157
+ `0...num_partitions`.
157
158
 
158
159
  ## Datatypes
159
160
 
data/lib/kraps/hash_partitioner.rb ADDED
@@ -0,0 +1,7 @@
1
+ module Kraps
2
+ class HashPartitioner
3
+ def call(key, num_partitions)
4
+ Digest::SHA1.hexdigest(JSON.generate(key))[0..4].to_i(16) % num_partitions
5
+ end
6
+ end
7
+ end
data/lib/kraps/job.rb CHANGED
@@ -6,10 +6,10 @@ module Kraps
6
6
  @worker = worker
7
7
  @steps = []
8
8
  @partitions = 0
9
- @partitioner = MapReduce::HashPartitioner.new(@partitions)
9
+ @partitioner = HashPartitioner.new
10
10
  end
11
11
 
12
- def parallelize(partitions:, partitioner: MapReduce::HashPartitioner.new(partitions), worker: @worker, &block)
12
+ def parallelize(partitions:, partitioner: HashPartitioner.new, worker: @worker, &block)
13
13
  fresh.tap do |job|
14
14
  job.instance_eval do
15
15
  @partitions = partitions
@@ -24,7 +24,7 @@ module Kraps
24
24
  fresh.tap do |job|
25
25
  job.instance_eval do
26
26
  @partitions = partitions if partitions
27
- @partitioner = partitioner || MapReduce::HashPartitioner.new(partitions) if partitioner || partitions
27
+ @partitioner = partitioner if partitioner
28
28
 
29
29
  @steps << Step.new(action: Actions::MAP, args: { partitions: @partitions, partitioner: @partitioner, worker: worker }, block: block)
30
30
  end
data/lib/kraps/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Kraps
2
- VERSION = "0.1.0"
2
+ VERSION = "0.3.0"
3
3
  end
data/lib/kraps/worker.rb CHANGED
@@ -31,8 +31,12 @@ module Kraps
31
31
  mapper = MapReduce::Mapper.new(implementation.new, partitioner: partitioner, memory_limit: @memory_limit)
32
32
  mapper.map(@args["item"])
33
33
 
34
- mapper.shuffle do |partition, tempfile| # TODO: upload in parallel
35
- Kraps.driver.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{@args["part"]}.json"), tempfile, Kraps.driver.bucket)
34
+ mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
35
+ Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
36
+ File.open(path) do |stream|
37
+ Kraps.driver.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{@args["part"]}.json"), stream, Kraps.driver.bucket)
38
+ end
39
+ end
36
40
  end
37
41
  end
38
42
 
@@ -68,10 +72,14 @@ module Kraps
68
72
  end
69
73
  end
70
74
 
71
- mapper.shuffle do |partition, tempfile| # TODO: upload in parallel
72
- Kraps.driver.driver.store(
73
- Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{@args["part"]}.json"), tempfile, Kraps.driver.bucket
74
- )
75
+ mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
76
+ Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
77
+ File.open(path) do |stream|
78
+ Kraps.driver.driver.store(
79
+ Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{@args["part"]}.json"), stream, Kraps.driver.bucket
80
+ )
81
+ end
82
+ end
75
83
  end
76
84
  ensure
77
85
  temp_paths&.unlink
@@ -173,7 +181,7 @@ module Kraps
173
181
  end
174
182
 
175
183
  def partitioner
176
- @partitioner ||= step.args[:partitioner]
184
+ @partitioner ||= proc { |key| step.args[:partitioner].call(key, step.args[:partitions]) }
177
185
  end
178
186
 
179
187
  def distributed_job
data/lib/kraps.rb CHANGED
@@ -2,6 +2,7 @@ require_relative "kraps/version"
2
2
  require_relative "kraps/drivers"
3
3
  require_relative "kraps/actions"
4
4
  require_relative "kraps/parallelizer"
5
+ require_relative "kraps/hash_partitioner"
5
6
  require_relative "kraps/temp_path"
6
7
  require_relative "kraps/temp_paths"
7
8
  require_relative "kraps/timeout_queue"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kraps
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Benjamin Vetter
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-10-28 00:00:00.000000000 Z
11
+ date: 2022-11-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: attachie
@@ -44,14 +44,14 @@ dependencies:
44
44
  requirements:
45
45
  - - ">="
46
46
  - !ruby/object:Gem::Version
47
- version: 2.1.1
47
+ version: 3.0.0
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - ">="
53
53
  - !ruby/object:Gem::Version
54
- version: 2.1.1
54
+ version: 3.0.0
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: redis
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -132,6 +132,7 @@ extra_rdoc_files: []
132
132
  files:
133
133
  - ".rspec"
134
134
  - ".rubocop.yml"
135
+ - CHANGELOG.md
135
136
  - CODE_OF_CONDUCT.md
136
137
  - Gemfile
137
138
  - Gemfile.lock
@@ -143,6 +144,7 @@ files:
143
144
  - lib/kraps/actions.rb
144
145
  - lib/kraps/drivers.rb
145
146
  - lib/kraps/frame.rb
147
+ - lib/kraps/hash_partitioner.rb
146
148
  - lib/kraps/interval.rb
147
149
  - lib/kraps/job.rb
148
150
  - lib/kraps/parallelizer.rb