ruby-kafka-ec2 0.1.2 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d731fe0da3282726a8bb898f98f0ab3fe0133463662d9e7bc3c536c965bfc278
4
- data.tar.gz: e260eb39399056cc48cdf8b7b985b037cdb60eafc8f495771d7949e7eb581a52
3
+ metadata.gz: 66c71213189c16f43593597889adfb2ef3d0f4757cbf6ae7bb600310a2f88855
4
+ data.tar.gz: 71a0256485b92b88ed891e76bd3c3cbd85420b70332835cd33d4aeaf748e207d
5
5
  SHA512:
6
- metadata.gz: 3f31a1d280d2f49b864fbc4068a4f198952180101973d2a93c708ff621238f83476dac72e8c302427f3413a6a9761f81b5a703b8020aee3edba5bf2a42e0a967
7
- data.tar.gz: 9d4cda171d25754e42bda7a90c4d7b9c8a77fa8fc984a5546ccb1cbc10b4ccf10999f75c6dbaa7ecd4fdecdde285a3ccec0ead4071326564f6527382853edbe0
6
+ metadata.gz: c88ff1e2fe4ebd92fe6b9a13a87fd5e9582c09228ae17b29cdbd4c0186f83e22d6c96bb6f193fbd5ae2ed624979fe3c33a80c6db88e72a0800b2f878ffbfc7b1
7
+ data.tar.gz: 78cd8b945be174b64cdd261686683521160ed7c2c7334c6586726d1803abea4e0aad96d8470304ef324fe7b04281b7048d1591794771178c2bf71d0597f07ca7
data/README.md CHANGED
@@ -118,6 +118,17 @@ assignment_strategy_factory = Kafka::EC2::MixedInstanceAssignmentStrategyFactory
118
118
  )
119
119
  ```
120
120
 
121
+ The strategy also accepts a `partition_weights` option, which is useful when a topic has skewed partitions. Suppose the partition with ID 0 of the topic "foo" receives twice as many records as the other partitions. To reduce the number of partitions assigned to the consumer that consumes the partition with ID 0, specify `partition_weights` as shown below:
122
+
123
+ ```ruby
124
+ assignment_strategy_factory = Kafka::EC2::MixedInstanceAssignmentStrategyFactory.new(
125
+ partition_weights: {
126
+ "foo" => {
127
+ 0 => 2,
128
+ },
129
+ }
130
+ )
131
+ ```
121
132
 
122
133
  ## Development
123
134
 
@@ -23,11 +23,12 @@ module Kafka
23
23
  # instance_family_weights or availability_zone_weights. If the object is a proc,
24
24
  # it must return such a hash and the proc is called every time the method "assign"
25
25
  # is called.
26
- def initialize(cluster:, instance_family_weights: {}, availability_zone_weights: {}, weights: {})
26
+ def initialize(cluster:, instance_family_weights: {}, availability_zone_weights: {}, weights: {}, partition_weights: {})
27
27
  @cluster = cluster
28
28
  @instance_family_weights = instance_family_weights
29
29
  @availability_zone_weights = availability_zone_weights
30
30
  @weights = weights
31
+ @partition_weights = partition_weights
31
32
  end
32
33
 
33
34
  # Assign the topic partitions to the group members.
@@ -41,6 +42,7 @@ module Kafka
41
42
  instance_id_to_capacity = Hash.new(0)
42
43
  instance_id_to_member_ids = Hash.new { |h, k| h[k] = [] }
43
44
  total_capacity = 0
45
+ member_id_to_instance_id = {}
44
46
 
45
47
  instance_family_to_capacity = @instance_family_weights.is_a?(Proc) ? @instance_family_weights.call() : @instance_family_weights
46
48
  az_to_capacity = @availability_zone_weights.is_a?(Proc) ? @availability_zone_weights.call() : @availability_zone_weights
@@ -50,6 +52,7 @@ module Kafka
50
52
 
51
53
  instance_id, instance_type, az = member_id_to_metadata[member_id].split(",")
52
54
  instance_id_to_member_ids[instance_id] << member_id
55
+ member_id_to_instance_id[member_id] = instance_id
53
56
  capacity = calculate_capacity(instance_type, az, instance_family_to_capacity, az_to_capacity, weights)
54
57
  instance_id_to_capacity[instance_id] += capacity
55
58
  total_capacity += capacity
@@ -64,25 +67,48 @@ module Kafka
64
67
  Array.new(partitions.count) { topic }.zip(partitions)
65
68
  end
66
69
 
67
- partition_count_per_capacity = topic_partitions.size / total_capacity
70
+ partition_weights = build_partition_weights(topics)
71
+ partition_weight_per_capacity = topic_partitions.sum { |topic, partition| partition_weights.dig(topic, partition) } / total_capacity
72
+
68
73
  last_index = 0
69
- instance_id_to_capacity.sort_by { |_, capacity| -capacity }.each do |instance_id, capacity|
70
- partition_count = (capacity * partition_count_per_capacity).round
74
+ member_id_to_acceptable_partition_weight = {}
75
+ instance_id_to_total_acceptable_partition_weight = Hash.new(0)
76
+ instance_id_to_capacity.each do |instance_id, capacity|
71
77
  member_ids = instance_id_to_member_ids[instance_id]
72
- topic_partitions[last_index, partition_count]&.each_with_index do |(topic, partition), index|
73
- member_id = member_ids[index % member_ids.size]
74
- group_assignment[member_id].assign(topic, [partition])
75
- end
78
+ member_ids.each do |member_id|
79
+ acceptable_partition_weight = capacity * partition_weight_per_capacity / member_ids.size
80
+ while last_index < topic_partitions.size
81
+ topic, partition = topic_partitions[last_index]
82
+ partition_weight = partition_weights.dig(topic, partition)
83
+ break if acceptable_partition_weight - partition_weight < 0
84
+
85
+ group_assignment[member_id].assign(topic, [partition])
86
+ acceptable_partition_weight -= partition_weight
76
87
 
77
- last_index += partition_count
88
+ last_index += 1
89
+ end
90
+
91
+ member_id_to_acceptable_partition_weight[member_id] = acceptable_partition_weight
92
+ instance_id_to_total_acceptable_partition_weight[instance_id] += acceptable_partition_weight
93
+ end
78
94
  end
79
95
 
80
- if last_index < topic_partitions.size
81
- member_ids = instance_id_to_member_ids.values.flatten
82
- topic_partitions[last_index, topic_partitions.size].each_with_index do |(topic, partition), index|
83
- member_id = member_ids[index % member_ids.size]
84
- group_assignment[member_id].assign(topic, [partition])
96
+ while last_index < topic_partitions.size
97
+ max_acceptable_partition_weight = member_id_to_acceptable_partition_weight.values.max
98
+ member_ids = member_id_to_acceptable_partition_weight.select { |_, w| w == max_acceptable_partition_weight }.keys
99
+ if member_ids.size == 1
100
+ member_id = member_ids.first
101
+ else
102
+ member_id = member_ids.max_by { |id| instance_id_to_total_acceptable_partition_weight[member_id_to_instance_id[id]] }
85
103
  end
104
+ topic, partition = topic_partitions[last_index]
105
+ group_assignment[member_id].assign(topic, [partition])
106
+
107
+ partition_weight = partition_weights.dig(topic, partition)
108
+ member_id_to_acceptable_partition_weight[member_id] -= partition_weight
109
+ instance_id_to_total_acceptable_partition_weight[member_id_to_instance_id[member_id]] -= partition_weight
110
+
111
+ last_index += 1
86
112
  end
87
113
 
88
114
  group_assignment
@@ -97,7 +123,18 @@ module Kafka
97
123
  instance_family, _ = instance_type.split(".")
98
124
 
99
125
  capacity = weights.dig(az, instance_family) || weights.dig(instance_family, az)
100
- capacity || instance_family_to_capacity.fetch(instance_family, 1) * az_to_capacity.fetch(az, 1)
126
+ (capacity || instance_family_to_capacity.fetch(instance_family, 1) * az_to_capacity.fetch(az, 1)).to_f
127
+ end
128
+
129
+ def build_partition_weights(topics)
130
+ # Duplicate the weights so that neither @partition_weights nor the hash returned by the @partition_weights proc is mutated
131
+ weights = (@partition_weights.is_a?(Proc) ? @partition_weights.call() : @partition_weights).dup
132
+ topics.each do |t|
133
+ weights[t] = weights[t].dup || {}
134
+ weights[t].default = 1
135
+ end
136
+
137
+ weights
101
138
  end
102
139
  end
103
140
  end
@@ -9,9 +9,11 @@ module Kafka
9
9
  # @param availability_zone_weights [Hash, Proc]
10
10
  # @param weights [Hash, Proc]
11
11
  # @see Kafka::EC2::MixedInstanceAssignmentStrategy#initialize
12
- def initialize(instance_family_weights: {}, availability_zone_weights: {}, weights: {})
12
+ def initialize(instance_family_weights: {}, availability_zone_weights: {}, weights: {}, partition_weights: {})
13
13
  @instance_family_weights = instance_family_weights
14
14
  @availability_zone_weights = availability_zone_weights
15
+ @weights = weights
16
+ @partition_weights = partition_weights
15
17
  end
16
18
 
17
19
  def create(cluster:)
@@ -19,6 +21,8 @@ module Kafka
19
21
  cluster: cluster,
20
22
  instance_family_weights: @instance_family_weights,
21
23
  availability_zone_weights: @availability_zone_weights,
24
+ weights: @weights,
25
+ partition_weights: @partition_weights,
22
26
  )
23
27
  end
24
28
  end
@@ -1,5 +1,5 @@
1
1
  module Kafka
2
2
  class EC2
3
- VERSION = "0.1.2"
3
+ VERSION = "0.1.7"
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-kafka-ec2
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - abicky
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-05 00:00:00.000000000 Z
11
+ date: 2021-03-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-kafka
@@ -116,7 +116,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
116
116
  - !ruby/object:Gem::Version
117
117
  version: '0'
118
118
  requirements: []
119
- rubygems_version: 3.1.2
119
+ rubygems_version: 3.1.4
120
120
  signing_key:
121
121
  specification_version: 4
122
122
  summary: An extension of ruby-kafka for EC2