ruby-kafka-ec2 0.1.2 → 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d731fe0da3282726a8bb898f98f0ab3fe0133463662d9e7bc3c536c965bfc278
4
- data.tar.gz: e260eb39399056cc48cdf8b7b985b037cdb60eafc8f495771d7949e7eb581a52
3
+ metadata.gz: 66c71213189c16f43593597889adfb2ef3d0f4757cbf6ae7bb600310a2f88855
4
+ data.tar.gz: 71a0256485b92b88ed891e76bd3c3cbd85420b70332835cd33d4aeaf748e207d
5
5
  SHA512:
6
- metadata.gz: 3f31a1d280d2f49b864fbc4068a4f198952180101973d2a93c708ff621238f83476dac72e8c302427f3413a6a9761f81b5a703b8020aee3edba5bf2a42e0a967
7
- data.tar.gz: 9d4cda171d25754e42bda7a90c4d7b9c8a77fa8fc984a5546ccb1cbc10b4ccf10999f75c6dbaa7ecd4fdecdde285a3ccec0ead4071326564f6527382853edbe0
6
+ metadata.gz: c88ff1e2fe4ebd92fe6b9a13a87fd5e9582c09228ae17b29cdbd4c0186f83e22d6c96bb6f193fbd5ae2ed624979fe3c33a80c6db88e72a0800b2f878ffbfc7b1
7
+ data.tar.gz: 78cd8b945be174b64cdd261686683521160ed7c2c7334c6586726d1803abea4e0aad96d8470304ef324fe7b04281b7048d1591794771178c2bf71d0597f07ca7
data/README.md CHANGED
@@ -118,6 +118,17 @@ assignment_strategy_factory = Kafka::EC2::MixedInstanceAssignmentStrategyFactory
118
118
  )
119
119
  ```
120
120
 
121
+ The strategy also accepts the option `partition_weights`, which is useful when a topic has skewed partitions. Suppose the partition with ID 0 of the topic "foo" receives twice as many records as the other partitions. To reduce the number of partitions assigned to the consumer that consumes the partition with ID 0, specify `partition_weights` as shown below:
122
+
123
+ ```ruby
124
+ assignment_strategy_factory = Kafka::EC2::MixedInstanceAssignmentStrategyFactory.new(
125
+ partition_weights: {
126
+ "foo" => {
127
+ 0 => 2,
128
+ },
129
+ }
130
+ )
131
+ ```
121
132
 
122
133
  ## Development
123
134
 
@@ -23,11 +23,12 @@ module Kafka
23
23
  # instance_family_weights or availability_zone_weights. If the object is a proc,
24
24
  # it must return such a hash and the proc is called every time the method "assign"
25
25
  # is called.
26
- def initialize(cluster:, instance_family_weights: {}, availability_zone_weights: {}, weights: {})
26
+ def initialize(cluster:, instance_family_weights: {}, availability_zone_weights: {}, weights: {}, partition_weights: {})
27
27
  @cluster = cluster
28
28
  @instance_family_weights = instance_family_weights
29
29
  @availability_zone_weights = availability_zone_weights
30
30
  @weights = weights
31
+ @partition_weights = partition_weights
31
32
  end
32
33
 
33
34
  # Assign the topic partitions to the group members.
@@ -41,6 +42,7 @@ module Kafka
41
42
  instance_id_to_capacity = Hash.new(0)
42
43
  instance_id_to_member_ids = Hash.new { |h, k| h[k] = [] }
43
44
  total_capacity = 0
45
+ member_id_to_instance_id = {}
44
46
 
45
47
  instance_family_to_capacity = @instance_family_weights.is_a?(Proc) ? @instance_family_weights.call() : @instance_family_weights
46
48
  az_to_capacity = @availability_zone_weights.is_a?(Proc) ? @availability_zone_weights.call() : @availability_zone_weights
@@ -50,6 +52,7 @@ module Kafka
50
52
 
51
53
  instance_id, instance_type, az = member_id_to_metadata[member_id].split(",")
52
54
  instance_id_to_member_ids[instance_id] << member_id
55
+ member_id_to_instance_id[member_id] = instance_id
53
56
  capacity = calculate_capacity(instance_type, az, instance_family_to_capacity, az_to_capacity, weights)
54
57
  instance_id_to_capacity[instance_id] += capacity
55
58
  total_capacity += capacity
@@ -64,25 +67,48 @@ module Kafka
64
67
  Array.new(partitions.count) { topic }.zip(partitions)
65
68
  end
66
69
 
67
- partition_count_per_capacity = topic_partitions.size / total_capacity
70
+ partition_weights = build_partition_weights(topics)
71
+ partition_weight_per_capacity = topic_partitions.sum { |topic, partition| partition_weights.dig(topic, partition) } / total_capacity
72
+
68
73
  last_index = 0
69
- instance_id_to_capacity.sort_by { |_, capacity| -capacity }.each do |instance_id, capacity|
70
- partition_count = (capacity * partition_count_per_capacity).round
74
+ member_id_to_acceptable_partition_weight = {}
75
+ instance_id_to_total_acceptable_partition_weight = Hash.new(0)
76
+ instance_id_to_capacity.each do |instance_id, capacity|
71
77
  member_ids = instance_id_to_member_ids[instance_id]
72
- topic_partitions[last_index, partition_count]&.each_with_index do |(topic, partition), index|
73
- member_id = member_ids[index % member_ids.size]
74
- group_assignment[member_id].assign(topic, [partition])
75
- end
78
+ member_ids.each do |member_id|
79
+ acceptable_partition_weight = capacity * partition_weight_per_capacity / member_ids.size
80
+ while last_index < topic_partitions.size
81
+ topic, partition = topic_partitions[last_index]
82
+ partition_weight = partition_weights.dig(topic, partition)
83
+ break if acceptable_partition_weight - partition_weight < 0
84
+
85
+ group_assignment[member_id].assign(topic, [partition])
86
+ acceptable_partition_weight -= partition_weight
76
87
 
77
- last_index += partition_count
88
+ last_index += 1
89
+ end
90
+
91
+ member_id_to_acceptable_partition_weight[member_id] = acceptable_partition_weight
92
+ instance_id_to_total_acceptable_partition_weight[instance_id] += acceptable_partition_weight
93
+ end
78
94
  end
79
95
 
80
- if last_index < topic_partitions.size
81
- member_ids = instance_id_to_member_ids.values.flatten
82
- topic_partitions[last_index, topic_partitions.size].each_with_index do |(topic, partition), index|
83
- member_id = member_ids[index % member_ids.size]
84
- group_assignment[member_id].assign(topic, [partition])
96
+ while last_index < topic_partitions.size
97
+ max_acceptable_partition_weight = member_id_to_acceptable_partition_weight.values.max
98
+ member_ids = member_id_to_acceptable_partition_weight.select { |_, w| w == max_acceptable_partition_weight }.keys
99
+ if member_ids.size == 1
100
+ member_id = member_ids.first
101
+ else
102
+ member_id = member_ids.max_by { |id| instance_id_to_total_acceptable_partition_weight[member_id_to_instance_id[id]] }
85
103
  end
104
+ topic, partition = topic_partitions[last_index]
105
+ group_assignment[member_id].assign(topic, [partition])
106
+
107
+ partition_weight = partition_weights.dig(topic, partition)
108
+ member_id_to_acceptable_partition_weight[member_id] -= partition_weight
109
+ instance_id_to_total_acceptable_partition_weight[member_id_to_instance_id[member_id]] -= partition_weight
110
+
111
+ last_index += 1
86
112
  end
87
113
 
88
114
  group_assignment
@@ -97,7 +123,18 @@ module Kafka
97
123
  instance_family, _ = instance_type.split(".")
98
124
 
99
125
  capacity = weights.dig(az, instance_family) || weights.dig(instance_family, az)
100
- capacity || instance_family_to_capacity.fetch(instance_family, 1) * az_to_capacity.fetch(az, 1)
126
+ (capacity || instance_family_to_capacity.fetch(instance_family, 1) * az_to_capacity.fetch(az, 1)).to_f
127
+ end
128
+
129
+ def build_partition_weights(topics)
130
+ # Duplicate the weights so that neither @partition_weights nor the hash returned by calling it is mutated
131
+ weights = (@partition_weights.is_a?(Proc) ? @partition_weights.call() : @partition_weights).dup
132
+ topics.each do |t|
133
+ weights[t] = weights[t].dup || {}
134
+ weights[t].default = 1
135
+ end
136
+
137
+ weights
101
138
  end
102
139
  end
103
140
  end
@@ -9,9 +9,11 @@ module Kafka
9
9
  # @param availability_zone_weights [Hash, Proc]
10
10
  # @param weights [Hash, Proc]
11
11
  # @see Kafka::EC2::MixedInstanceAssignmentStrategy#initialize
12
- def initialize(instance_family_weights: {}, availability_zone_weights: {}, weights: {})
12
+ def initialize(instance_family_weights: {}, availability_zone_weights: {}, weights: {}, partition_weights: {})
13
13
  @instance_family_weights = instance_family_weights
14
14
  @availability_zone_weights = availability_zone_weights
15
+ @weights = weights
16
+ @partition_weights = partition_weights
15
17
  end
16
18
 
17
19
  def create(cluster:)
@@ -19,6 +21,8 @@ module Kafka
19
21
  cluster: cluster,
20
22
  instance_family_weights: @instance_family_weights,
21
23
  availability_zone_weights: @availability_zone_weights,
24
+ weights: @weights,
25
+ partition_weights: @partition_weights,
22
26
  )
23
27
  end
24
28
  end
@@ -1,5 +1,5 @@
1
1
  module Kafka
2
2
  class EC2
3
- VERSION = "0.1.2"
3
+ VERSION = "0.1.7"
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-kafka-ec2
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - abicky
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-05 00:00:00.000000000 Z
11
+ date: 2021-03-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-kafka
@@ -116,7 +116,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
116
116
  - !ruby/object:Gem::Version
117
117
  version: '0'
118
118
  requirements: []
119
- rubygems_version: 3.1.2
119
+ rubygems_version: 3.1.4
120
120
  signing_key:
121
121
  specification_version: 4
122
122
  summary: An extension of ruby-kafka for EC2