ruby-kafka 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 50a1c28cf71285d37c57e3dbe7a5d156f891576c
-  data.tar.gz: f8a139dc4061ec8a771f86c0a5ca280692ddd5cc
+  metadata.gz: cc49e2526024ee3dd1cb52b5abfef38dbc2e37ea
+  data.tar.gz: a3c0111bccd3e8daf83eec647baec2fa9921fbf8
 SHA512:
-  metadata.gz: 6d3245db50893aba63b50b600903dc0baf345017c940dcaa36b0e2fbe0f50fe231bc15f9af2dc083d4969fd63ba0493781570811a3ba4ba0e7df21b06e0fd993
-  data.tar.gz: d5cddd687cc85f02b0826e95205114e2f64cffd9dad48ef692e717dbb3f7cac41ce2b0afd71ee271e8590e27b9eac42d8f67f247c71fe760dec2915191361273
+  metadata.gz: 94547594275850a3bacd22758fc0ccce8956c83f0e6e7c46dd1c4a5fb4aa5b52d6199383862cb5087a90b89ee2e0ca63b486082b0d44da2e0994f6cc76e794a7
+  data.tar.gz: 69d7522e0ecd82057fd4b5879273026e6669441e0803c8d28bee83213a66bdbdd13f08051d4b3bb0725c3f19ad9df4dedde7c4a6c0236fac56079b1a666c7ed7
data/lib/kafka/async_producer.rb CHANGED
@@ -18,6 +18,8 @@ module Kafka
   # By default, automatic delivery is disabled and you'll have to call
   # {#deliver_messages} manually.
   #
+  # ## Buffer Overflow and Backpressure
+  #
   # The calling thread communicates with the background thread doing the actual
   # work using a thread safe queue. While the background thread is busy delivering
   # messages, new messages will be buffered in the queue. In order to avoid
@@ -26,6 +28,17 @@ module Kafka
   # number of messages that is allowed to be buffered. You can configure this
   # value by setting `max_queue_size`.
   #
+  # If you produce messages faster than the background producer thread can
+  # deliver them to Kafka you will eventually fill the producer's buffer. Once
+  # this happens, the background thread will stop popping messages off the
+  # queue until it can successfully deliver the buffered messages. The queue
+  # will therefore grow in size, potentially hitting the `max_queue_size` limit.
+  # Once this happens, calls to {#produce} will raise a {BufferOverflow} error.
+  #
+  # Depending on your use case you may want to slow down the rate of messages
+  # being produced or perhaps halt your application completely until the
+  # producer can deliver the buffered messages and clear the message queue.
+  #
   # ## Example
   #
   #     producer = kafka.async_producer(
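The backpressure described above has to be handled by the calling thread. A minimal sketch of one way to do that, assuming `kafka` is a `Kafka` client; the queue size, sleep interval, and topic are illustrative choices, not recommendations:

```ruby
# Sketch: back off when the async producer's queue is full.
producer = kafka.async_producer(max_queue_size: 1000)

begin
  producer.produce("hello", topic: "greetings")
rescue Kafka::BufferOverflow
  # The background thread can't keep up; wait for it to drain the queue,
  # then try again. Halting or dropping messages are equally valid policies.
  sleep 1
  retry
end
```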
@@ -91,6 +104,7 @@ module Kafka
     # @return [nil]
     def produce(*args)
       raise BufferOverflow if @queue.size >= @max_queue_size
+
       @queue << [:produce, args]
 
       nil
@@ -146,7 +160,7 @@ module Kafka
 
           case operation
           when :produce
-            @producer.produce(*payload)
+            produce(*payload)
             deliver_messages if threshold_reached?
           when :deliver_messages
             deliver_messages
@@ -166,6 +180,13 @@ module Kafka
 
       private
 
+      def produce(*args)
+        @producer.produce(*args)
+      rescue BufferOverflow
+        deliver_messages
+        retry
+      end
+
       def deliver_messages
         @producer.deliver_messages
       rescue DeliveryFailed
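Design note: the worker's new private `produce` wrapper means a full buffer in the underlying synchronous producer no longer propagates and kills the background thread. Instead the worker flushes with `deliver_messages` and retries; while it is blocked doing so it stops popping messages off the thread-safe queue, which is exactly the backpressure mechanism the documentation above describes, eventually surfacing to callers as `BufferOverflow` once `max_queue_size` is reached.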
data/lib/kafka/message_buffer.rb CHANGED
@@ -6,22 +6,28 @@ module Kafka
   class MessageBuffer
     include Enumerable
 
-    attr_reader :size
+    attr_reader :size, :bytesize
 
     def initialize
       @buffer = {}
       @size = 0
+      @bytesize = 0
     end
 
     def write(value:, key:, topic:, partition:)
-      @size += 1
       message = Protocol::Message.new(key: key, value: value)
+
       buffer_for(topic, partition) << message
+
+      @size += 1
+      @bytesize += message.bytesize
     end
 
     def concat(messages, topic:, partition:)
-      @size += messages.count
       buffer_for(topic, partition).concat(messages)
+
+      @size += messages.count
+      @bytesize += messages.map(&:bytesize).reduce(:+)
     end
 
     def to_h
@@ -48,6 +54,7 @@ module Kafka
     # @return [nil]
     def clear_messages(topic:, partition:)
       @size -= @buffer[topic][partition].count
+      @bytesize -= @buffer[topic][partition].map(&:bytesize).reduce(:+)
 
       @buffer[topic].delete(partition)
       @buffer.delete(topic) if @buffer[topic].empty?
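A rough sketch of the new byte accounting. `MessageBuffer` is an internal class, so this is illustrative only; the topic and values are made up:

```ruby
require "kafka"

buffer = Kafka::MessageBuffer.new

buffer.write(value: "hello", key: nil, topic: "greetings", partition: 0)
buffer.write(value: "world", key: "k", topic: "greetings", partition: 1)

buffer.size     # => 2
buffer.bytesize # => 11, i.e. "hello" (5) + "world" (5) + "k" (1)

buffer.clear_messages(topic: "greetings", partition: 0)
buffer.size     # => 1
buffer.bytesize # => 6
```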
data/lib/kafka/pending_message.rb CHANGED
@@ -2,12 +2,16 @@ module Kafka
   class PendingMessage
     attr_reader :value, :key, :topic, :partition, :partition_key
 
+    attr_reader :bytesize
+
     def initialize(value:, key:, topic:, partition:, partition_key:)
       @key = key
       @value = value
       @topic = topic
       @partition = partition
       @partition_key = partition_key
+
+      @bytesize = key.to_s.bytesize + value.to_s.bytesize
     end
   end
 end
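`bytesize` counts only the key and value payloads, with a `nil` key contributing zero bytes, e.g. (illustrative values):

```ruby
message = Kafka::PendingMessage.new(
  value: "hello",
  key: nil,
  topic: "greetings",
  partition: nil,
  partition_key: nil,
)

message.bytesize # => 5, i.e. "".bytesize + "hello".bytesize
```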
data/lib/kafka/pending_message_queue.rb ADDED
@@ -0,0 +1,43 @@
+module Kafka
+
+  # A pending message queue holds messages that have not yet been assigned to
+  # a partition. It's designed to only remove messages once they've been
+  # successfully handled.
+  class PendingMessageQueue
+    attr_reader :size, :bytesize
+
+    def initialize
+      @messages = []
+      @size = 0
+      @bytesize = 0
+    end
+
+    def write(message)
+      @messages << message
+      @size += 1
+      @bytesize += message.bytesize
+    end
+
+    def empty?
+      @messages.empty?
+    end
+
+    # Yields each message in the queue to the provided block, removing the
+    # message after the block has processed it. If the block raises an
+    # exception, the message will be retained in the queue.
+    #
+    # @yieldparam [PendingMessage] message
+    # @return [nil]
+    def dequeue_each(&block)
+      until @messages.empty?
+        message = @messages.first
+
+        yield message
+
+        @size -= 1
+        @bytesize -= message.bytesize
+        @messages.shift
+      end
+    end
+  end
+end
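A small sketch of the retention guarantee documented on `dequeue_each`. The `FakeMessage` struct is a made-up stand-in for `Kafka::PendingMessage`; anything responding to `bytesize` will do:

```ruby
FakeMessage = Struct.new(:value) do
  def bytesize
    value.bytesize
  end
end

queue = Kafka::PendingMessageQueue.new
queue.write(FakeMessage.new("one"))
queue.write(FakeMessage.new("two"))

begin
  queue.dequeue_each do |message|
    raise "broker unavailable" if message.value == "two"
  end
rescue RuntimeError
  # "one" was handled and removed; "two" stays buffered for the next attempt.
end

queue.size     # => 1
queue.bytesize # => 3
```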
data/lib/kafka/produce_operation.rb CHANGED
@@ -25,12 +25,13 @@ module Kafka
   # * `sent_message_count` – the number of messages that were successfully sent.
   #
   class ProduceOperation
-    def initialize(cluster:, buffer:, compression_codec:, required_acks:, ack_timeout:, logger:)
+    def initialize(cluster:, buffer:, compression_codec:, compression_threshold:, required_acks:, ack_timeout:, logger:)
       @cluster = cluster
       @buffer = buffer
       @required_acks = required_acks
       @ack_timeout = ack_timeout
       @compression_codec = compression_codec
+      @compression_threshold = compression_threshold
       @logger = logger
     end
 
@@ -77,7 +78,12 @@
       messages_for_topics = {}
 
       message_buffer.each do |topic, partition, messages|
-        message_set = Protocol::MessageSet.new(messages: messages, compression_codec: @compression_codec)
+        message_set = Protocol::MessageSet.new(
+          messages: messages,
+          compression_codec: @compression_codec,
+          compression_threshold: @compression_threshold,
+        )
+
         messages_for_topics[topic] ||= {}
         messages_for_topics[topic][partition] = message_set
       end
data/lib/kafka/producer.rb CHANGED
@@ -1,6 +1,8 @@
+require "set"
 require "kafka/partitioner"
 require "kafka/message_buffer"
 require "kafka/produce_operation"
+require "kafka/pending_message_queue"
 require "kafka/pending_message"
 require "kafka/compression"
 
@@ -47,10 +49,42 @@ module Kafka
   # not, we do another round of requests, this time with just the remaining messages.
   # We do this for as long as `max_retries` permits.
   #
+  # ## Compression
+  #
+  # Depending on what kind of data you produce, enabling compression may yield improved
+  # bandwidth and space usage. Compression in Kafka is done on entire message sets
+  # rather than on individual messages. This improves the compression rate and generally
+  # means that compression works better the larger your buffers get, since the message
+  # sets will be larger by the time they're compressed.
+  #
+  # Since many workloads have variations in throughput and distribution across partitions,
+  # it's possible to configure a threshold for when to enable compression by setting
+  # `compression_threshold`. Only if the defined number of messages are buffered for a
+  # partition will the messages be compressed.
+  #
+  # Compression is enabled by passing the `compression_codec` parameter with the
+  # name of one of the algorithms allowed by Kafka:
+  #
+  # * `:snappy` for [Snappy](http://google.github.io/snappy/) compression.
+  # * `:gzip` for [gzip](https://en.wikipedia.org/wiki/Gzip) compression.
+  #
+  # By default, all message sets will be compressed if you specify a compression
+  # codec. To increase the compression threshold, set `compression_threshold` to
+  # an integer value higher than one.
+  #
   # ## Instrumentation
   #
+  # Whenever {#produce} is called, the notification `produce_message.producer.kafka`
+  # will be emitted with the following payload:
+  #
+  # * `value` – the message value.
+  # * `key` – the message key.
+  # * `topic` – the topic that was produced to.
+  # * `buffer_size` – the buffer size after adding the message.
+  # * `max_buffer_size` – the maximum allowed buffer size for the producer.
+  #
   # After {#deliver_messages} completes, the notification
-  # `deliver_messages.producer.kafka` will be emitted.
+  # `deliver_messages.producer.kafka` will be emitted with the following payload:
   #
   # * `message_count` – the total number of messages that the producer tried to
   #   deliver. Note that not all messages may get delivered.
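Taken together, the new documentation translates into roughly the following. This is a sketch: it assumes `kafka` is a `Kafka` client whose `#producer` method forwards these options to `Producer#initialize`, and that ActiveSupport is available so the notifications are emitted through ActiveSupport::Notifications:

```ruby
producer = kafka.producer(
  compression_codec: :snappy,
  compression_threshold: 10, # only compress sets of at least 10 messages per partition
)

require "active_support/notifications"

ActiveSupport::Notifications.subscribe("produce_message.producer.kafka") do |name, start, finish, id, payload|
  puts "#{payload[:topic]}: buffered #{payload[:buffer_size]}/#{payload[:max_buffer_size]} messages"
end
```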
@@ -116,9 +150,21 @@
     # @param retry_backoff [Integer] the number of seconds to wait between retries.
     #
     # @param max_buffer_size [Integer] the number of messages allowed in the buffer
-    #   before new writes will raise BufferOverflow exceptions.
+    #   before new writes will raise {BufferOverflow} exceptions.
+    #
+    # @param max_buffer_bytesize [Integer] the maximum size of the buffer in bytes.
+    #   attempting to produce messages when the buffer reaches this size will
+    #   result in {BufferOverflow} being raised.
+    #
+    # @param compression_codec [Symbol, nil] the name of the compression codec to
+    #   use, or nil if no compression should be performed. Valid codecs: `:snappy`
+    #   and `:gzip`.
+    #
+    # @param compression_threshold [Integer] the number of messages that needs to
+    #   be in a message set before it should be compressed. Note that message sets
+    #   are per-partition rather than per-topic or per-producer.
     #
-    def initialize(cluster:, logger:, compression_codec: nil, ack_timeout: 5, required_acks: 1, max_retries: 2, retry_backoff: 1, max_buffer_size: 1000)
+    def initialize(cluster:, logger:, compression_codec: nil, compression_threshold: 1, ack_timeout: 5, required_acks: 1, max_retries: 2, retry_backoff: 1, max_buffer_size: 1000, max_buffer_bytesize: 10_000_000)
       @cluster = cluster
       @logger = logger
       @required_acks = required_acks
@@ -126,13 +172,18 @@
       @max_retries = max_retries
       @retry_backoff = retry_backoff
       @max_buffer_size = max_buffer_size
+      @max_buffer_bytesize = max_buffer_bytesize
       @compression_codec = Compression.find_codec(compression_codec)
+      @compression_threshold = compression_threshold
+
+      # The set of topics that are produced to.
+      @target_topics = Set.new
 
       # A buffer organized by topic/partition.
       @buffer = MessageBuffer.new
 
       # Messages added by `#produce` but not yet assigned a partition.
-      @pending_messages = []
+      @pending_message_queue = PendingMessageQueue.new
     end
 
     # Produces a message to the specified topic. Note that messages are buffered in
@@ -165,11 +216,7 @@
     # @raise [BufferOverflow] if the maximum buffer size has been reached.
     # @return [nil]
     def produce(value, key: nil, topic:, partition: nil, partition_key: nil)
-      unless buffer_size < @max_buffer_size
-        raise BufferOverflow, "Max buffer size #{@max_buffer_size} exceeded"
-      end
-
-      @pending_messages << PendingMessage.new(
+      message = PendingMessage.new(
        value: value,
        key: key,
        topic: topic,
@@ -177,6 +224,25 @@
        partition_key: partition_key,
      )
 
+      if buffer_size >= @max_buffer_size
+        raise BufferOverflow, "Max buffer size (#{@max_buffer_size} messages) exceeded"
+      end
+
+      if buffer_bytesize + message.bytesize >= @max_buffer_bytesize
+        raise BufferOverflow, "Max buffer bytesize (#{@max_buffer_bytesize} bytes) exceeded"
+      end
+
+      @target_topics.add(topic)
+      @pending_message_queue.write(message)
+
+      Instrumentation.instrument("produce_message.producer.kafka", {
+        value: value,
+        key: key,
+        topic: topic,
+        buffer_size: buffer_size,
+        max_buffer_size: @max_buffer_size,
+      })
+
       nil
     end
 
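With the two checks above, a full buffer surfaces as `BufferOverflow` before the message is enqueued. A minimal sketch of handling that with the synchronous producer, mirroring what the async worker now does internally; the topic and payload are illustrative:

```ruby
begin
  producer.produce("a" * 1_000, topic: "events")
rescue Kafka::BufferOverflow
  # Either the message-count or the byte-size limit was hit; flushing the
  # buffer with #deliver_messages makes room for new writes.
  producer.deliver_messages
  retry
end
```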
@@ -211,7 +277,11 @@
     #
     # @return [Integer] buffer size.
     def buffer_size
-      @pending_messages.size + @buffer.size
+      @pending_message_queue.size + @buffer.size
+    end
+
+    def buffer_bytesize
+      @pending_message_queue.bytesize + @buffer.bytesize
     end
 
     # Closes all connections to the brokers.
@@ -226,9 +296,7 @@
     def deliver_messages_with_retries(notification)
       attempt = 0
 
-      # Make sure we get metadata for this topic.
-      target_topics = @pending_messages.map(&:topic).uniq
-      @cluster.add_target_topics(target_topics)
+      @cluster.add_target_topics(@target_topics)
 
       operation = ProduceOperation.new(
         cluster: @cluster,
@@ -236,6 +304,7 @@
         required_acks: @required_acks,
         ack_timeout: @ack_timeout,
         compression_codec: @compression_codec,
+        compression_threshold: @compression_threshold,
         logger: @logger,
       )
 
@@ -249,7 +318,7 @@
         assign_partitions!
         operation.execute
 
-        if @pending_messages.empty? && @buffer.empty?
+        if buffer_size.zero?
           break
         elsif attempt <= @max_retries
           @logger.warn "Failed to send all messages; attempting retry #{attempt} of #{@max_retries} after #{@retry_backoff}s"
@@ -276,10 +345,7 @@
     end
 
     def assign_partitions!
-      until @pending_messages.empty?
-        # We want to keep the message in the first-stage buffer in case there's an error.
-        message = @pending_messages.first
-
+      @pending_message_queue.dequeue_each do |message|
        partition = message.partition
 
        if partition.nil?
@@ -293,9 +359,6 @@
          topic: message.topic,
          partition: partition,
        )
-
-        # Now it's safe to remove the message from the first-stage buffer.
-        @pending_messages.shift
      end
    rescue Kafka::Error => e
      @logger.error "Failed to assign pending message to a partition: #{e}"
data/lib/kafka/protocol/message.rb CHANGED
@@ -18,11 +18,15 @@ module Kafka
 
       attr_reader :key, :value, :attributes, :offset
 
+      attr_reader :bytesize
+
       def initialize(value:, key: nil, attributes: 0, offset: -1)
         @key = key
         @value = value
         @attributes = attributes
         @offset = offset
+
+        @bytesize = @key.to_s.bytesize + @value.to_s.bytesize
       end
 
       def encode(encoder)
data/lib/kafka/protocol/message_set.rb CHANGED
@@ -3,9 +3,14 @@ module Kafka
     class MessageSet
       attr_reader :messages
 
-      def initialize(messages: [], compression_codec: nil)
+      def initialize(messages: [], compression_codec: nil, compression_threshold: 1)
         @messages = messages
         @compression_codec = compression_codec
+        @compression_threshold = compression_threshold
+      end
+
+      def size
+        @messages.size
       end
 
       def ==(other)
@@ -13,10 +18,10 @@ module Kafka
       end
 
       def encode(encoder)
-        if @compression_codec.nil?
-          encode_without_compression(encoder)
-        else
+        if compress?
           encode_with_compression(encoder)
+        else
+          encode_without_compression(encoder)
         end
       end
 
@@ -39,6 +44,10 @@
 
       private
 
+      def compress?
+        !@compression_codec.nil? && size >= @compression_threshold
+      end
+
       def encode_with_compression(encoder)
         codec = @compression_codec
 
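`compress?` ties the pieces together: a codec must be configured and the set must reach the threshold. `Protocol::MessageSet` is an internal protocol class, so the following is purely illustrative (and assumes the snappy gem is installed):

```ruby
codec = Kafka::Compression.find_codec(:snappy)
messages = 50.times.map {|i| Kafka::Protocol::Message.new(value: "message #{i}") }

small_set = Kafka::Protocol::MessageSet.new(
  messages: messages.first(3),
  compression_codec: codec,
  compression_threshold: 10,
)
# 3 < 10, so small_set is encoded without compression.

large_set = Kafka::Protocol::MessageSet.new(
  messages: messages,
  compression_codec: codec,
  compression_threshold: 10,
)
# 50 >= 10, so large_set is compressed as a whole message set.
```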
data/lib/kafka/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Kafka
-  VERSION = "0.1.6"
+  VERSION = "0.1.7"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: ruby-kafka
 version: !ruby/object:Gem::Version
-  version: 0.1.6
+  version: 0.1.7
 platform: ruby
 authors:
 - Daniel Schierbeck
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-02-22 00:00:00.000000000 Z
+date: 2016-02-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -179,6 +179,7 @@ files:
 - lib/kafka/message_buffer.rb
 - lib/kafka/partitioner.rb
 - lib/kafka/pending_message.rb
+- lib/kafka/pending_message_queue.rb
 - lib/kafka/produce_operation.rb
 - lib/kafka/producer.rb
 - lib/kafka/protocol.rb