ruby-kafka 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,130 @@
1
+ require "set"
2
+ require "kafka/broker_pool"
3
+
4
module Kafka

  # A cluster represents the state of a Kafka cluster. It needs to be initialized
  # with a non-empty list of seed brokers. The first seed broker that the cluster can connect
  # to will be asked for the cluster metadata, which allows the cluster to map topic
  # partitions to the current leader for those partitions.
  class Cluster

    # Port used for a seed broker that is given without an explicit port.
    # 9092 is the port a Kafka broker listens on by default.
    DEFAULT_PORT = 9092

    # Initializes a Cluster with a set of seed brokers.
    #
    # The cluster will try to fetch cluster metadata from one of the brokers.
    # The metadata is fetched on first use, not by the constructor.
    #
    # @param seed_brokers [Array<String>] broker addresses of the form
    #   `host:port`; a bare `host` is connected to on {DEFAULT_PORT}.
    # @param broker_pool [Kafka::BrokerPool]
    # @param logger [Logger]
    # @raise [ArgumentError] if `seed_brokers` is empty.
    def initialize(seed_brokers:, broker_pool:, logger:)
      if seed_brokers.empty?
        raise ArgumentError, "At least one seed broker must be configured"
      end

      @logger = logger
      @seed_brokers = seed_brokers
      @broker_pool = broker_pool
      @cluster_info = nil
      @stale = true

      # This is the set of topics we need metadata for. If empty, metadata for
      # all topics will be fetched.
      @target_topics = Set.new
    end

    # Adds topics to the set of topics that metadata is requested for. The
    # metadata is refreshed right away if at least one topic was not already
    # in the set; otherwise nothing happens.
    #
    # @param topics [Enumerable<String>]
    def add_target_topics(topics)
      new_topics = Set.new(topics) - @target_topics

      unless new_topics.empty?
        @logger.info "New topics added to target list: #{new_topics.to_a.join(', ')}"

        @target_topics.merge(new_topics)

        refresh_metadata!
      end
    end

    # Flags the cached metadata as stale. The metadata itself is kept until
    # {#refresh_metadata_if_necessary!} or {#refresh_metadata!} is called.
    def mark_as_stale!
      @stale = true
    end

    # Discards the cached metadata and fetches it again from a seed broker.
    #
    # @raise [ConnectionError] if none of the seed brokers are available.
    def refresh_metadata!
      @cluster_info = nil
      cluster_info
    end

    # Refreshes the metadata only if it has been marked as stale (which is also
    # the initial state of a new cluster).
    def refresh_metadata_if_necessary!
      refresh_metadata! if @stale
    end

    # Finds the broker acting as the leader of the given topic and partition.
    #
    # @param topic [String]
    # @param partition [Integer]
    # @return [Broker] the broker that's currently leader.
    def get_leader(topic, partition)
      connect_to_broker(get_leader_id(topic, partition))
    end

    # Looks up the partitions of a topic in the cluster metadata.
    #
    # @param topic [String]
    def partitions_for(topic)
      cluster_info.partitions_for(topic)
    end

    # @return [Array] the names of the topics listed in the cluster metadata.
    def topics
      cluster_info.topics.map(&:topic_name)
    end

    # Closes the broker pool.
    def disconnect
      @broker_pool.close
    end

    private

    def get_leader_id(topic, partition)
      cluster_info.find_leader_id(topic, partition)
    end

    # The cluster metadata, fetched on first access and memoized afterwards.
    def cluster_info
      @cluster_info ||= fetch_cluster_info
    end

    # Fetches the cluster metadata.
    #
    # This is used to update the partition leadership information, among other things.
    # The method will go through each node listed in `seed_brokers`, connecting to the
    # first one that is available. This node will be queried for the cluster metadata.
    #
    # @raise [ConnectionError] if none of the nodes in `seed_brokers` are available.
    # @return [Protocol::MetadataResponse] the cluster metadata.
    def fetch_cluster_info
      @seed_brokers.each do |node|
        @logger.info "Fetching cluster metadata from #{node}"

        begin
          host, port = parse_seed_broker(node)

          broker = @broker_pool.connect(host, port)
          cluster_info = broker.fetch_metadata(topics: @target_topics)

          @stale = false

          @logger.info "Discovered cluster metadata; nodes: #{cluster_info.brokers.join(', ')}"

          return cluster_info
        rescue Error => e
          @logger.error "Failed to fetch metadata from #{node}: #{e}"
        ensure
          # The seed connection is only needed for this one request, so it is
          # closed whether or not the request succeeded.
          broker.disconnect unless broker.nil?
        end
      end

      raise ConnectionError, "Could not connect to any of the seed brokers: #{@seed_brokers.join(', ')}"
    end

    # Splits a seed broker address into host and port.
    #
    # A missing or empty port would otherwise turn into port 0, which can never
    # be connected to, so DEFAULT_PORT is used in that case.
    #
    # @param node [String] `host:port` or just `host`.
    # @return [Array(String, Integer)] the host and the port.
    def parse_seed_broker(node)
      host, port = node.split(":", 2)
      port = port.nil? || port.empty? ? DEFAULT_PORT : port.to_i

      [host, port]
    end

    def connect_to_broker(broker_id)
      info = cluster_info.find_broker(broker_id)

      @broker_pool.connect(info.host, info.port, node_id: info.node_id)
    end
  end
end
@@ -88,8 +88,6 @@ module Kafka
88
88
  wait_for_response(response_class, notification) unless response_class.nil?
89
89
  end
90
90
  rescue Errno::EPIPE, Errno::ECONNRESET, Errno::ETIMEDOUT, EOFError => e
91
- @logger.error "Connection error: #{e}"
92
-
93
91
  close
94
92
 
95
93
  raise ConnectionError, "Connection error: #{e}"
@@ -98,7 +96,7 @@ module Kafka
98
96
  private
99
97
 
100
98
  def open
101
- @logger.info "Opening connection to #{@host}:#{@port} with client id #{@client_id}..."
99
+ @logger.debug "Opening connection to #{@host}:#{@port} with client id #{@client_id}..."
102
100
 
103
101
  @socket = SocketWithTimeout.new(@host, @port, connect_timeout: @connect_timeout, timeout: @socket_timeout)
104
102
 
@@ -0,0 +1,127 @@
1
module Kafka

  # Fetches messages from one or more partitions.
  #
  #     operation = Kafka::FetchOperation.new(
  #       cluster: cluster,
  #       logger: logger,
  #       min_bytes: 1,
  #       max_wait_time: 10,
  #     )
  #
  #     # These calls will schedule fetches from the specified topics/partitions.
  #     operation.fetch_from_partition("greetings", 42, offset: :latest, max_bytes: 100000)
  #     operation.fetch_from_partition("goodbyes", 13, offset: :latest, max_bytes: 100000)
  #
  #     operation.execute
  #
  class FetchOperation
    # @param cluster the cluster used to look up partition leaders.
    # @param logger the logger that debug output is written to.
    # @param min_bytes passed on unchanged to the brokers' fetch request.
    # @param max_wait_time the maximum wait in seconds; it is converted to
    #   milliseconds when the request is built.
    def initialize(cluster:, logger:, min_bytes:, max_wait_time:)
      @cluster = cluster
      @logger = logger
      @min_bytes = min_bytes
      @max_wait_time = max_wait_time
      @topics = {}
    end

    # Schedules a fetch from a single partition. Scheduling the same
    # topic/partition again replaces the earlier entry.
    #
    # @param topic [String]
    # @param partition [Integer]
    # @param offset [Integer, Symbol] an offset, or `:earliest` / `:latest`,
    #   which are stored as the placeholders -2 / -1 and resolved in #execute.
    # @param max_bytes passed on unchanged to the brokers' fetch request.
    def fetch_from_partition(topic, partition, offset:, max_bytes:)
      fetch_offset =
        if offset == :earliest
          -2
        elsif offset == :latest
          -1
        else
          offset
        end

      scheduled = (@topics[topic] ||= {})
      scheduled[partition] = { fetch_offset: fetch_offset, max_bytes: max_bytes }
    end

    # Performs all scheduled fetches, one request per leader broker.
    #
    # If the leader of a partition turns out to be unavailable or to have
    # moved, the cluster is marked as stale and the error is re-raised.
    #
    # @return [Array<FetchedMessage>] the messages from every response.
    def execute
      @cluster.add_target_topics(@topics.keys)
      @cluster.refresh_metadata_if_necessary!

      group_by_leader.flat_map do |broker, topics|
        resolve_offsets(broker, topics)

        response = broker.fetch_messages(
          max_wait_time: @max_wait_time * 1000, # Kafka expects ms, not secs
          min_bytes: @min_bytes,
          topics: topics,
        )

        extract_messages(response)
      end
    rescue Kafka::LeaderNotAvailable, Kafka::NotLeaderForPartition
      @cluster.mark_as_stale!

      raise
    end

    private

    # Groups the scheduled fetches by the broker that leads each partition.
    #
    # @return [Hash] broker => { topic => { partition => options } }
    def group_by_leader
      by_broker = {}

      @topics.each do |topic, partitions|
        partitions.each do |partition, options|
          leader = @cluster.get_leader(topic, partition)

          topics_for_leader = (by_broker[leader] ||= {})
          (topics_for_leader[topic] ||= {})[partition] = options
        end
      end

      by_broker
    end

    # Turns a fetch response into a flat list of messages, raising if any
    # partition in the response carries an error code.
    def extract_messages(response)
      response.topics.flat_map do |fetched_topic|
        fetched_topic.partitions.flat_map do |fetched_partition|
          Protocol.handle_error(fetched_partition.error_code)

          fetched_partition.messages.map do |offset, message|
            FetchedMessage.new(
              value: message.value,
              key: message.key,
              topic: fetched_topic.name,
              partition: fetched_partition.partition,
              offset: offset,
            )
          end
        end
      end
    end

    # Replaces negative placeholder offsets with offsets looked up on the
    # broker. The `topics` hash is updated in place; an offset the broker
    # response has no value for becomes 0.
    def resolve_offsets(broker, topics)
      unresolved = {}

      topics.each do |topic, partitions|
        partitions.each do |partition, options|
          requested = options.fetch(:fetch_offset)
          next if requested >= 0

          @logger.debug "Resolving offset `#{requested}` for #{topic}/#{partition}..."

          (unresolved[topic] ||= []) << {
            partition: partition,
            time: requested,
            max_offsets: 1,
          }
        end
      end

      return topics if unresolved.empty?

      response = broker.list_offsets(topics: unresolved)

      unresolved.each do |topic, requests|
        requests.each do |request|
          partition = request.fetch(:partition)
          resolved = response.offset_for(topic, partition)

          @logger.debug "Offset for #{topic}/#{partition} is #{resolved.inspect}"

          topics[topic][partition][:fetch_offset] = resolved || 0
        end
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
module Kafka
  # A single message returned by a fetch, together with the topic, partition
  # and offset it was read from. Only readers are exposed.
  class FetchedMessage

    # @return [String] the value of the message.
    attr_reader :value

    # @return [String] the key of the message.
    attr_reader :key

    # @return [String] the name of the topic that the message was written to.
    attr_reader :topic

    # @return [Integer] the partition number that the message was written to.
    attr_reader :partition

    # @return [Integer] the offset of the message in the partition.
    attr_reader :offset

    # @param value [String] the value of the message.
    # @param key [String] the key of the message.
    # @param topic [String] the name of the topic.
    # @param partition [Integer] the partition number.
    # @param offset [Integer] the offset of the message in the partition.
    def initialize(value:, key:, topic:, partition:, offset:)
      @value, @key, @topic, @partition, @offset = value, key, topic, partition, offset
    end
  end
end
@@ -6,7 +6,7 @@ module Kafka
6
6
  end
7
7
 
8
8
  if defined?(ActiveSupport::Notifications)
9
- Instrumentation = ActiveSupport::Notifications
9
+ Instrumentation = ActiveSupport::Notifications
10
10
  else
11
11
  Instrumentation = NullInstrumentation
12
12
  end
@@ -1,3 +1,5 @@
1
+ require "kafka/protocol/message"
2
+
1
3
  module Kafka
2
4
 
3
5
  # Buffers messages for specific topics/partitions.
@@ -11,8 +13,9 @@ module Kafka
11
13
  @size = 0
12
14
  end
13
15
 
14
- def write(message, topic:, partition:)
16
+ def write(value:, key:, topic:, partition:)
15
17
  @size += 1
18
+ message = Protocol::Message.new(key: key, value: value)
16
19
  buffer_for(topic, partition) << message
17
20
  end
18
21
 
@@ -50,6 +53,10 @@ module Kafka
50
53
  @buffer.delete(topic) if @buffer[topic].empty?
51
54
  end
52
55
 
56
+ def message_count_for_partition(topic:, partition:)
57
+ buffer_for(topic, partition).count
58
+ end
59
+
53
60
  # Clears messages across all topics and partitions.
54
61
  #
55
62
  # @return [nil]
@@ -4,24 +4,29 @@ module Kafka
4
4
 
5
5
  # Assigns partitions to messages.
6
6
  class Partitioner
7
- def initialize(partitions)
8
- @partitions = partitions
9
- end
10
7
 
11
- # Assigns a partition number based on a key.
8
+ # Assigns a partition number based on a partition key. If no explicit
9
+ # partition key is provided, the message key will be used instead.
12
10
  #
13
11
  # If the key is nil, then a random partition is selected. Otherwise, a digest
14
12
  # of the key is used to deterministically find a partition. As long as the
15
13
  # number of partitions doesn't change, the same key will always be assigned
16
14
  # to the same partition.
17
15
  #
18
- # @param key [String, nil] the key to base the partition assignment on, or nil.
16
+ # @param partition_count [Integer] the number of partitions in the topic.
17
+ # @param message [Kafka::PendingMessage] the message that should be assigned
18
+ # a partition.
19
19
  # @return [Integer] the partition number.
20
- def partition_for_key(key)
20
+ def self.partition_for_key(partition_count, message)
21
+ raise ArgumentError if partition_count == 0
22
+
23
+ # If no explicit partition key is specified we use the message key instead.
24
+ key = message.partition_key || message.key
25
+
21
26
  if key.nil?
22
- rand(@partitions.count)
27
+ rand(partition_count)
23
28
  else
24
- Zlib.crc32(key) % @partitions.count
29
+ Zlib.crc32(key) % partition_count
25
30
  end
26
31
  end
27
32
  end
@@ -0,0 +1,13 @@
1
module Kafka
  # A message together with the routing details it was created with.
  # Only readers are exposed.
  class PendingMessage
    # The message value.
    attr_reader :value

    # The message key.
    attr_reader :key

    # The topic the message was created with.
    attr_reader :topic

    # The partition the message was created with.
    attr_reader :partition

    # The partition key the message was created with.
    attr_reader :partition_key

    # All five keywords are required and stored as given.
    def initialize(value:, key:, topic:, partition:, partition_key:)
      @key, @value, @topic, @partition, @partition_key = key, value, topic, partition, partition_key
    end
  end
end
@@ -0,0 +1,116 @@
1
module Kafka
  # A produce operation attempts to send all messages in a buffer to the Kafka cluster.
  # Since topics and partitions are spread among all brokers in a cluster, this usually
  # involves sending requests to several or all of the brokers.
  #
  # ## Instrumentation
  #
  # When executing the operation, an `append_message_set.kafka` notification will be
  # emitted for each message set that was successfully appended to a topic partition.
  # The following keys will be found in the payload:
  #
  # * `:topic` — the topic that was written to.
  # * `:partition` — the partition that the message set was appended to.
  # * `:offset` — the offset of the first message in the message set.
  # * `:message_count` — the number of messages that were appended.
  #
  # If there was an error appending the message set, the key `:exception` will be set
  # in the payload. In that case, the message set will most likely not have been
  # appended and will possibly be retried later. Check this key before reporting the
  # operation as successful.
  #
  class ProduceOperation
    # @param cluster the cluster used to look up partition leaders.
    # @param buffer the buffer holding the messages to send; partitions that
    #   were written successfully are cleared from it.
    # @param required_acks passed on unchanged to the brokers' produce request.
    # @param ack_timeout the acknowledgement timeout in seconds; it is converted
    #   to milliseconds when the request is built.
    # @param logger the logger that progress and errors are written to.
    def initialize(cluster:, buffer:, required_acks:, ack_timeout:, logger:)
      @cluster = cluster
      @buffer = buffer
      @required_acks = required_acks
      @ack_timeout = ack_timeout
      @logger = logger
    end

    # Groups the buffered messages by partition leader and sends one produce
    # request per leader. Failures are logged rather than raised; messages that
    # could not be sent stay in the buffer.
    def execute
      send_to_leaders(group_by_leader)
    end

    private

    # Builds one message buffer per leader broker. Partitions whose leader
    # cannot be found are left out and the cluster is marked as stale.
    #
    # @return [Hash] broker => MessageBuffer
    def group_by_leader
      grouped = {}

      @buffer.each do |topic, partition, messages|
        begin
          leader = @cluster.get_leader(topic, partition)

          @logger.debug "Current leader for #{topic}/#{partition} is node #{leader}"

          (grouped[leader] ||= MessageBuffer.new).concat(messages, topic: topic, partition: partition)
        rescue Kafka::Error => e
          @logger.error "Could not connect to leader for partition #{topic}/#{partition}: #{e}"

          # We can't send the messages right now, so we'll just keep them in the buffer.
          # We'll mark the cluster as stale in order to force a metadata refresh.
          @cluster.mark_as_stale!
        end
      end

      grouped
    end

    # Sends each leader its message set and processes the response, if any.
    def send_to_leaders(grouped)
      grouped.each do |leader, message_set|
        begin
          @logger.info "Sending #{message_set.size} messages to #{leader}"

          response = leader.produce(
            messages_for_topics: message_set.to_h,
            required_acks: @required_acks,
            timeout: @ack_timeout * 1000, # Kafka expects the timeout in milliseconds.
          )

          handle_response(response) if response
        rescue ConnectionError => e
          @logger.error "Could not connect to broker #{leader}: #{e}"

          # Mark the cluster as stale in order to force a cluster metadata refresh.
          @cluster.mark_as_stale!
        end
      end
    end

    # Processes every partition entry of a produce response.
    def handle_response(response)
      response.each_partition do |topic_info, partition_info|
        handle_partition(topic_info.topic, partition_info)
      end
    end

    # Emits the notification for one partition and either clears its messages
    # from the buffer (on success) or logs the error. Leadership errors also
    # mark the cluster as stale.
    def handle_partition(topic, partition_info)
      partition = partition_info.partition
      offset = partition_info.offset
      message_count = @buffer.message_count_for_partition(topic: topic, partition: partition)

      begin
        payload = {
          topic: topic,
          partition: partition,
          offset: offset,
          message_count: message_count,
        }

        # The error check runs inside the instrumented block so that a failure
        # is raised from within the notification.
        Instrumentation.instrument("append_message_set.kafka", payload) do
          Protocol.handle_error(partition_info.error_code)
        end
      rescue Kafka::CorruptMessage
        @logger.error "Corrupt message when writing to #{topic}/#{partition}"
      rescue Kafka::UnknownTopicOrPartition
        @logger.error "Unknown topic or partition #{topic}/#{partition}"
      rescue Kafka::LeaderNotAvailable
        @logger.error "Leader currently not available for #{topic}/#{partition}"
        @cluster.mark_as_stale!
      rescue Kafka::NotLeaderForPartition
        @logger.error "Broker not currently leader for #{topic}/#{partition}"
        @cluster.mark_as_stale!
      rescue Kafka::RequestTimedOut
        @logger.error "Timed out while writing to #{topic}/#{partition}"
      rescue Kafka::NotEnoughReplicas
        @logger.error "Not enough in-sync replicas for #{topic}/#{partition}"
      rescue Kafka::NotEnoughReplicasAfterAppend
        @logger.error "Messages written, but to fewer in-sync replicas than required for #{topic}/#{partition}"
      else
        @logger.debug "Successfully appended #{message_count} messages to #{topic}/#{partition} at offset #{offset}"

        # The messages were successfully written; clear them from the buffer.
        @buffer.clear_messages(topic: topic, partition: partition)
      end
    end
  end
end