kraps 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d1f08b6fa0f725c63e3750f4b3bf04479622b40160a6364f708d91d37c0b1948
4
- data.tar.gz: '0681d837d852846cc6c115fe9dae3075a0e5b3bb8b8eae2d90d8a23ec26581e3'
3
+ metadata.gz: f5bb601e7ee415b95b4b258a0241c25e6fe19eb3e772c06d4149afbfcfbe6c3d
4
+ data.tar.gz: cb948c05947e48d2d8e970eebbc6e2c4a5b0a88cb162ad87bf0743196f6bcaef
5
5
  SHA512:
6
- metadata.gz: 354ab3129ef1713c8229af54945251069c98d681e2db5c716d93b5925576b601751c23c1d502cb72ddaea5df5fc91e6eceb4590a619de730ae65f0762662da21
7
- data.tar.gz: 0647fc85f445bc634f70e2e10feab325c3df6aec3a30d2af4b1a792e82b9adf1a31e8bb339465e10944bce927933956e8d88e72b37fb7d84027ec569441781d6
6
+ metadata.gz: 1d1c5a16205c5584626fed5bca9b6c7dd6fae3b4f3c725b158e7740f6fa05a17abdcb483b43cbdad813576e2fc2c7621b89b94d61b32776d85ae774f5a4332d1
7
+ data.tar.gz: 2670dbc002633e801d8cf98fc8454c8881295f72b505bd4baf6cf0c8685a8c97a8a2dbf26e8a617c74b452ef627e820807e7af6e05b20a627fb99ce2eb216a1a
data/.rubocop.yml CHANGED
@@ -77,3 +77,6 @@ Lint/EmptyClass:
77
77
 
78
78
  Style/WordArray:
79
79
  Enabled: false
80
+
81
+ Style/RedundantEach:
82
+ Enabled: false
data/CHANGELOG.md ADDED
@@ -0,0 +1,10 @@
1
+ # CHANGELOG
2
+
3
+ ## v0.3.0
4
+
5
+ * Changed partitioners to receive the number of partitions
6
+ as second parameter
7
+
8
+ ## v0.2.0
9
+
10
+ * Updated map-reduce-ruby to allow concurrent uploads
data/Gemfile.lock CHANGED
@@ -1,22 +1,21 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kraps (0.1.0)
4
+ kraps (0.2.0)
5
5
  attachie
6
6
  distributed_job
7
- map-reduce-ruby (>= 2.1.1)
7
+ map-reduce-ruby (>= 3.0.0)
8
8
  redis
9
9
  ruby-progressbar
10
10
 
11
11
  GEM
12
12
  remote: https://rubygems.org/
13
13
  specs:
14
- activesupport (6.1.7)
14
+ activesupport (7.0.4)
15
15
  concurrent-ruby (~> 1.0, >= 1.0.2)
16
16
  i18n (>= 1.6, < 2)
17
17
  minitest (>= 5.1)
18
18
  tzinfo (~> 2.0)
19
- zeitwerk (~> 2.3)
20
19
  ast (2.4.2)
21
20
  attachie (1.2.0)
22
21
  activesupport
@@ -24,17 +23,17 @@ GEM
24
23
  connection_pool
25
24
  mime-types
26
25
  aws-eventstream (1.2.0)
27
- aws-partitions (1.649.0)
28
- aws-sdk-core (3.164.0)
26
+ aws-partitions (1.654.0)
27
+ aws-sdk-core (3.166.0)
29
28
  aws-eventstream (~> 1, >= 1.0.2)
30
- aws-partitions (~> 1, >= 1.525.0)
31
- aws-sigv4 (~> 1.1)
29
+ aws-partitions (~> 1, >= 1.651.0)
30
+ aws-sigv4 (~> 1.5)
32
31
  jmespath (~> 1, >= 1.6.1)
33
- aws-sdk-kms (1.58.0)
34
- aws-sdk-core (~> 3, >= 3.127.0)
32
+ aws-sdk-kms (1.59.0)
33
+ aws-sdk-core (~> 3, >= 3.165.0)
35
34
  aws-sigv4 (~> 1.1)
36
- aws-sdk-s3 (1.116.0)
37
- aws-sdk-core (~> 3, >= 3.127.0)
35
+ aws-sdk-s3 (1.117.1)
36
+ aws-sdk-core (~> 3, >= 3.165.0)
38
37
  aws-sdk-kms (~> 1)
39
38
  aws-sigv4 (~> 1.4)
40
39
  aws-sigv4 (1.5.2)
@@ -49,7 +48,7 @@ GEM
49
48
  jmespath (1.6.1)
50
49
  json (2.6.2)
51
50
  lazy_priority_queue (0.1.1)
52
- map-reduce-ruby (2.1.1)
51
+ map-reduce-ruby (3.0.0)
53
52
  json
54
53
  lazy_priority_queue
55
54
  mime-types (3.4.1)
@@ -63,40 +62,39 @@ GEM
63
62
  rake (13.0.6)
64
63
  redis (5.0.5)
65
64
  redis-client (>= 0.9.0)
66
- redis-client (0.10.0)
65
+ redis-client (0.11.0)
67
66
  connection_pool
68
- regexp_parser (2.5.0)
67
+ regexp_parser (2.6.0)
69
68
  rexml (3.2.5)
70
- rspec (3.11.0)
71
- rspec-core (~> 3.11.0)
72
- rspec-expectations (~> 3.11.0)
73
- rspec-mocks (~> 3.11.0)
74
- rspec-core (3.11.0)
75
- rspec-support (~> 3.11.0)
76
- rspec-expectations (3.11.1)
69
+ rspec (3.12.0)
70
+ rspec-core (~> 3.12.0)
71
+ rspec-expectations (~> 3.12.0)
72
+ rspec-mocks (~> 3.12.0)
73
+ rspec-core (3.12.0)
74
+ rspec-support (~> 3.12.0)
75
+ rspec-expectations (3.12.0)
77
76
  diff-lcs (>= 1.2.0, < 2.0)
78
- rspec-support (~> 3.11.0)
79
- rspec-mocks (3.11.1)
77
+ rspec-support (~> 3.12.0)
78
+ rspec-mocks (3.12.0)
80
79
  diff-lcs (>= 1.2.0, < 2.0)
81
- rspec-support (~> 3.11.0)
82
- rspec-support (3.11.1)
83
- rubocop (1.36.0)
80
+ rspec-support (~> 3.12.0)
81
+ rspec-support (3.12.0)
82
+ rubocop (1.38.0)
84
83
  json (~> 2.3)
85
84
  parallel (~> 1.10)
86
85
  parser (>= 3.1.2.1)
87
86
  rainbow (>= 2.2.2, < 4.0)
88
87
  regexp_parser (>= 1.8, < 3.0)
89
88
  rexml (>= 3.2.5, < 4.0)
90
- rubocop-ast (>= 1.20.1, < 2.0)
89
+ rubocop-ast (>= 1.23.0, < 2.0)
91
90
  ruby-progressbar (~> 1.7)
92
91
  unicode-display_width (>= 1.4.0, < 3.0)
93
- rubocop-ast (1.21.0)
92
+ rubocop-ast (1.23.0)
94
93
  parser (>= 3.1.1.0)
95
94
  ruby-progressbar (1.11.0)
96
95
  tzinfo (2.0.5)
97
96
  concurrent-ruby (~> 1.0)
98
97
  unicode-display_width (2.3.0)
99
- zeitwerk (2.6.1)
100
98
 
101
99
  PLATFORMS
102
100
  ruby
data/README.md CHANGED
@@ -143,17 +143,18 @@ split. Kraps assigns every `key` to a partition, either using a custom
143
143
  `partitioner` or the default built in hash partitioner. The hash partitioner
144
144
  simply calculates a hash of your key modulo the number of partitions and the
145
145
  resulting partition number is the partition where the respective key is
146
- assigned to. A partitioner is a callable which gets the key as argument and
147
- returns a partition number. The built in hash partitioner looks similar to this
148
- one:
146
+ assigned to. A partitioner is a callable which gets the key and the number of
147
+ partitions as arguments and returns a partition number. The built in hash
148
+ partitioner looks similar to this one:
149
149
 
150
150
  ```ruby
151
- partitioner = proc { |key| Digest::SHA1.hexdigest(key.inspect)[0..4].to_i(16) % 128 } # 128 partitions
151
+ partitioner = proc { |key, num_partitions| Digest::SHA1.hexdigest(key.inspect)[0..4].to_i(16) % num_partitions }
152
152
  ```
153
153
 
154
154
  Please note, it's important that the partitioner and the specified number of
155
155
  partitions stay in sync. When you use a custom partitioner, please make sure
156
- that the partitioner operates on the same number of partitions you specify.
156
+ that the partitioner correctly returns a partition number in the range of
157
+ `0...num_partitions`.
157
158
 
158
159
  ## Datatypes
159
160
 
data/lib/kraps/hash_partitioner.rb ADDED
@@ -0,0 +1,7 @@
1
+ module Kraps
2
+ class HashPartitioner
3
+ def call(key, num_partitions)
4
+ Digest::SHA1.hexdigest(JSON.generate(key))[0..4].to_i(16) % num_partitions
5
+ end
6
+ end
7
+ end
data/lib/kraps/job.rb CHANGED
@@ -6,10 +6,10 @@ module Kraps
6
6
  @worker = worker
7
7
  @steps = []
8
8
  @partitions = 0
9
- @partitioner = MapReduce::HashPartitioner.new(@partitions)
9
+ @partitioner = HashPartitioner.new
10
10
  end
11
11
 
12
- def parallelize(partitions:, partitioner: MapReduce::HashPartitioner.new(partitions), worker: @worker, &block)
12
+ def parallelize(partitions:, partitioner: HashPartitioner.new, worker: @worker, &block)
13
13
  fresh.tap do |job|
14
14
  job.instance_eval do
15
15
  @partitions = partitions
@@ -24,7 +24,7 @@ module Kraps
24
24
  fresh.tap do |job|
25
25
  job.instance_eval do
26
26
  @partitions = partitions if partitions
27
- @partitioner = partitioner || MapReduce::HashPartitioner.new(partitions) if partitioner || partitions
27
+ @partitioner = partitioner if partitioner
28
28
 
29
29
  @steps << Step.new(action: Actions::MAP, args: { partitions: @partitions, partitioner: @partitioner, worker: worker }, block: block)
30
30
  end
data/lib/kraps/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Kraps
2
- VERSION = "0.1.0"
2
+ VERSION = "0.3.0"
3
3
  end
data/lib/kraps/worker.rb CHANGED
@@ -31,8 +31,12 @@ module Kraps
31
31
  mapper = MapReduce::Mapper.new(implementation.new, partitioner: partitioner, memory_limit: @memory_limit)
32
32
  mapper.map(@args["item"])
33
33
 
34
- mapper.shuffle do |partition, tempfile| # TODO: upload in parallel
35
- Kraps.driver.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{@args["part"]}.json"), tempfile, Kraps.driver.bucket)
34
+ mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
35
+ Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
36
+ File.open(path) do |stream|
37
+ Kraps.driver.driver.store(Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{@args["part"]}.json"), stream, Kraps.driver.bucket)
38
+ end
39
+ end
36
40
  end
37
41
  end
38
42
 
@@ -68,10 +72,14 @@ module Kraps
68
72
  end
69
73
  end
70
74
 
71
- mapper.shuffle do |partition, tempfile| # TODO: upload in parallel
72
- Kraps.driver.driver.store(
73
- Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{@args["part"]}.json"), tempfile, Kraps.driver.bucket
74
- )
75
+ mapper.shuffle(chunk_limit: @chunk_limit) do |partitions|
76
+ Parallelizer.each(partitions.to_a, @concurrency) do |partition, path|
77
+ File.open(path) do |stream|
78
+ Kraps.driver.driver.store(
79
+ Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{@args["part"]}.json"), stream, Kraps.driver.bucket
80
+ )
81
+ end
82
+ end
75
83
  end
76
84
  ensure
77
85
  temp_paths&.unlink
@@ -173,7 +181,7 @@ module Kraps
173
181
  end
174
182
 
175
183
  def partitioner
176
- @partitioner ||= step.args[:partitioner]
184
+ @partitioner ||= proc { |key| step.args[:partitioner].call(key, step.args[:partitions]) }
177
185
  end
178
186
 
179
187
  def distributed_job
data/lib/kraps.rb CHANGED
@@ -2,6 +2,7 @@ require_relative "kraps/version"
2
2
  require_relative "kraps/drivers"
3
3
  require_relative "kraps/actions"
4
4
  require_relative "kraps/parallelizer"
5
+ require_relative "kraps/hash_partitioner"
5
6
  require_relative "kraps/temp_path"
6
7
  require_relative "kraps/temp_paths"
7
8
  require_relative "kraps/timeout_queue"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kraps
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Benjamin Vetter
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-10-28 00:00:00.000000000 Z
11
+ date: 2022-11-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: attachie
@@ -44,14 +44,14 @@ dependencies:
44
44
  requirements:
45
45
  - - ">="
46
46
  - !ruby/object:Gem::Version
47
- version: 2.1.1
47
+ version: 3.0.0
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - ">="
53
53
  - !ruby/object:Gem::Version
54
- version: 2.1.1
54
+ version: 3.0.0
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: redis
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -132,6 +132,7 @@ extra_rdoc_files: []
132
132
  files:
133
133
  - ".rspec"
134
134
  - ".rubocop.yml"
135
+ - CHANGELOG.md
135
136
  - CODE_OF_CONDUCT.md
136
137
  - Gemfile
137
138
  - Gemfile.lock
@@ -143,6 +144,7 @@ files:
143
144
  - lib/kraps/actions.rb
144
145
  - lib/kraps/drivers.rb
145
146
  - lib/kraps/frame.rb
147
+ - lib/kraps/hash_partitioner.rb
146
148
  - lib/kraps/interval.rb
147
149
  - lib/kraps/job.rb
148
150
  - lib/kraps/parallelizer.rb