karafka 2.2.7 → 2.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. checksums.yaml +4 -4
  2. checksums.yaml.gz.sig +0 -0
  3. data/CHANGELOG.md +12 -0
  4. data/Gemfile.lock +24 -14
  5. data/bin/karafka +2 -3
  6. data/docker-compose.yml +3 -1
  7. data/karafka.gemspec +1 -2
  8. data/lib/karafka/base_consumer.rb +1 -0
  9. data/lib/karafka/cli/base.rb +45 -34
  10. data/lib/karafka/cli/console.rb +5 -4
  11. data/lib/karafka/cli/help.rb +24 -0
  12. data/lib/karafka/cli/info.rb +2 -2
  13. data/lib/karafka/cli/install.rb +4 -4
  14. data/lib/karafka/cli/server.rb +68 -33
  15. data/lib/karafka/cli/topics.rb +1 -1
  16. data/lib/karafka/cli.rb +23 -19
  17. data/lib/karafka/connection/client.rb +9 -4
  18. data/lib/karafka/connection/rebalance_manager.rb +36 -21
  19. data/lib/karafka/errors.rb +3 -0
  20. data/lib/karafka/instrumentation/callbacks/rebalance.rb +64 -0
  21. data/lib/karafka/instrumentation/notifications.rb +5 -1
  22. data/lib/karafka/instrumentation/vendors/appsignal/base.rb +30 -0
  23. data/lib/karafka/instrumentation/vendors/appsignal/client.rb +122 -0
  24. data/lib/karafka/instrumentation/vendors/appsignal/dashboard.json +222 -0
  25. data/lib/karafka/instrumentation/vendors/appsignal/errors_listener.rb +30 -0
  26. data/lib/karafka/instrumentation/vendors/appsignal/metrics_listener.rb +331 -0
  27. data/lib/karafka/instrumentation/vendors/datadog/metrics_listener.rb +2 -2
  28. data/lib/karafka/patches/rdkafka/bindings.rb +22 -39
  29. data/lib/karafka/patches/rdkafka/opaque.rb +36 -0
  30. data/lib/karafka/pro/processing/coordinator.rb +6 -7
  31. data/lib/karafka/pro/processing/strategies/vp/default.rb +20 -0
  32. data/lib/karafka/version.rb +1 -1
  33. data/lib/karafka.rb +1 -1
  34. data.tar.gz.sig +0 -0
  35. metadata +12 -18
  36. metadata.gz.sig +0 -0
@@ -2,7 +2,7 @@
2
2
 
3
3
  module Karafka
4
4
  module Connection
5
- # Manager for tracking changes in the partitions assignment.
5
+ # Manager for tracking changes in the partitions assignment after the assignment is done.
6
6
  #
7
7
  # We need tracking of those to clean up consumers that will no longer process given partitions
8
8
  # as they were taken away.
@@ -17,6 +17,10 @@ module Karafka
17
17
  #
18
18
  # @note For cooperative-sticky `#assigned_partitions` holds only the recently assigned
19
19
  # partitions, not all the partitions that are owned
20
+ #
21
+ # @note We need the `subscription_group_id` reference because we have a global pipeline
22
+ # for notifications and we need to make sure we track changes only for things that are of
23
+ # relevance to our subscription group
20
24
  class RebalanceManager
21
25
  # Empty array for internal usage not to create new objects
22
26
  EMPTY_ARRAY = [].freeze
@@ -25,12 +29,17 @@ module Karafka
25
29
 
26
30
  private_constant :EMPTY_ARRAY
27
31
 
32
+ # @param subscription_group_id [String] subscription group id
28
33
  # @return [RebalanceManager]
29
- def initialize
34
+ def initialize(subscription_group_id)
30
35
  @assigned_partitions = {}
31
36
  @revoked_partitions = {}
32
37
  @changed = false
33
38
  @active = false
39
+ @subscription_group_id = subscription_group_id
40
+
41
+ # Connects itself to the instrumentation pipeline so rebalances can be tracked
42
+ ::Karafka.monitor.subscribe(self)
34
43
  end
35
44
 
36
45
  # Resets the rebalance manager state
@@ -55,36 +64,42 @@ module Karafka
55
64
  @active
56
65
  end
57
66
 
58
- # Callback that kicks in inside of rdkafka, when new partitions are assigned.
67
+ # We consider as lost only partitions that were taken away and not re-assigned back to us
68
+ def lost_partitions
69
+ lost_partitions = {}
70
+
71
+ revoked_partitions.each do |topic, partitions|
72
+ lost_partitions[topic] = partitions - assigned_partitions.fetch(topic, EMPTY_ARRAY)
73
+ end
74
+
75
+ lost_partitions
76
+ end
77
+
78
+ # Callback that kicks in inside of rdkafka, when new partitions were assigned.
59
79
  #
60
80
  # @private
61
- # @param partitions [Rdkafka::Consumer::TopicPartitionList]
62
- def on_partitions_assigned(partitions)
81
+ # @param event [Karafka::Core::Monitoring::Event]
82
+ def on_rebalance_partitions_assigned(event)
83
+ # Apply changes only for our subscription group
84
+ return unless event[:subscription_group_id] == @subscription_group_id
85
+
63
86
  @active = true
64
- @assigned_partitions = partitions.to_h.transform_values { |part| part.map(&:partition) }
87
+ @assigned_partitions = event[:tpl].to_h.transform_values { |part| part.map(&:partition) }
65
88
  @changed = true
66
89
  end
67
90
 
68
- # Callback that kicks in inside of rdkafka, when partitions are revoked.
91
+ # Callback that kicks in inside of rdkafka, when partitions were revoked.
69
92
  #
70
93
  # @private
71
- # @param partitions [Rdkafka::Consumer::TopicPartitionList]
72
- def on_partitions_revoked(partitions)
94
+ # @param event [Karafka::Core::Monitoring::Event]
95
+ def on_rebalance_partitions_revoked(event)
96
+ # Apply changes only for our subscription group
97
+ return unless event[:subscription_group_id] == @subscription_group_id
98
+
73
99
  @active = true
74
- @revoked_partitions = partitions.to_h.transform_values { |part| part.map(&:partition) }
100
+ @revoked_partitions = event[:tpl].to_h.transform_values { |part| part.map(&:partition) }
75
101
  @changed = true
76
102
  end
77
-
78
- # We consider as lost only partitions that were taken away and not re-assigned back to us
79
- def lost_partitions
80
- lost_partitions = {}
81
-
82
- revoked_partitions.each do |topic, partitions|
83
- lost_partitions[topic] = partitions - assigned_partitions.fetch(topic, EMPTY_ARRAY)
84
- end
85
-
86
- lost_partitions
87
- end
88
103
  end
89
104
  end
90
105
  end
@@ -60,5 +60,8 @@ module Karafka
60
60
  # Raised when we run operations that require certain result but despite successfully finishing
61
61
  # it is not yet available due to some synchronization mechanisms and caches
62
62
  ResultNotVisibleError = Class.new(BaseError)
63
+
64
+ # Raised when there is an attempt to run an unrecognized CLI command
65
+ UnrecognizedCommandError = Class.new(BaseError)
63
66
  end
64
67
  end
@@ -0,0 +1,64 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Karafka
4
+ module Instrumentation
5
+ module Callbacks
6
+ # Callback that connects to the librdkafka rebalance callback and converts those events into
7
+ # our internal events
8
+ class Rebalance
9
+ # @param subscription_group_id [String] id of the current subscription group instance
10
+ # @param consumer_group_id [String] id of the current consumer group
11
+ def initialize(subscription_group_id, consumer_group_id)
12
+ @subscription_group_id = subscription_group_id
13
+ @consumer_group_id = consumer_group_id
14
+ end
15
+
16
+ # Publishes an event that partitions are going to be revoked.
17
+ # At this stage we can still commit offsets, etc.
18
+ #
19
+ # @param tpl [Rdkafka::Consumer::TopicPartitionList]
20
+ def on_partitions_revoke(tpl)
21
+ instrument('partitions_revoke', tpl)
22
+ end
23
+
24
+ # Publishes an event that partitions are going to be assigned
25
+ #
26
+ # @param tpl [Rdkafka::Consumer::TopicPartitionList]
27
+ def on_partitions_assign(tpl)
28
+ instrument('partitions_assign', tpl)
29
+ end
30
+
31
+ # Publishes an event that partitions were revoked. This is after we've lost them, so no
32
+ # option to commit offsets.
33
+ #
34
+ # @param tpl [Rdkafka::Consumer::TopicPartitionList]
35
+ def on_partitions_revoked(tpl)
36
+ instrument('partitions_revoked', tpl)
37
+ end
38
+
39
+ # Publishes an event that partitions were assigned.
40
+ #
41
+ # @param tpl [Rdkafka::Consumer::TopicPartitionList]
42
+ def on_partitions_assigned(tpl)
43
+ instrument('partitions_assigned', tpl)
44
+ end
45
+
46
+ private
47
+
48
+ # Publishes info that a rebalance event of a given type has happened
49
+ #
50
+ # @param name [String] name of the event
51
+ # @param tpl [Rdkafka::Consumer::TopicPartitionList]
52
+ def instrument(name, tpl)
53
+ ::Karafka.monitor.instrument(
54
+ "rebalance.#{name}",
55
+ caller: self,
56
+ subscription_group_id: @subscription_group_id,
57
+ consumer_group_id: @consumer_group_id,
58
+ tpl: tpl
59
+ )
60
+ end
61
+ end
62
+ end
63
+ end
64
+ end
@@ -35,10 +35,14 @@ module Karafka
35
35
  connection.listener.fetch_loop
36
36
  connection.listener.fetch_loop.received
37
37
 
38
- connection.client.rebalance_callback
39
38
  connection.client.poll.error
40
39
  connection.client.unsubscribe.error
41
40
 
41
+ rebalance.partitions_assign
42
+ rebalance.partitions_assigned
43
+ rebalance.partitions_revoke
44
+ rebalance.partitions_revoked
45
+
42
46
  consumer.consume
43
47
  consumer.consumed
44
48
  consumer.consuming.pause
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'client'
4
+
5
+ module Karafka
6
+ module Instrumentation
7
+ module Vendors
8
+ # Namespace for Appsignal instrumentation
9
+ module Appsignal
10
+ # Base for all the instrumentation listeners
11
+ class Base
12
+ include ::Karafka::Core::Configurable
13
+ extend Forwardable
14
+
15
+ # @param block [Proc] configuration block
16
+ def initialize(&block)
17
+ configure
18
+ setup(&block) if block
19
+ end
20
+
21
+ # @param block [Proc] configuration block
22
+ # @note We define this alias to be consistent with `Karafka#setup`
23
+ def setup(&block)
24
+ configure(&block)
25
+ end
26
+ end
27
+ end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,122 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Karafka
4
+ module Instrumentation
5
+ module Vendors
6
+ module Appsignal
7
+ # Appsignal client wrapper
8
+ # We wrap the native client so we can inject our own stub in specs when needed
9
+ #
10
+ # It also abstracts away the notion of transactions and their management
11
+ #
12
+ # @note This client is abstract, it has no notion of Karafka whatsoever
13
+ class Client
14
+ # Starts an appsignal transaction with a given action name
15
+ #
16
+ # @param action_name [String] action name. For processing this should be equal to
17
+ # consumer class + method name
18
+ def start_transaction(action_name)
19
+ transaction = ::Appsignal::Transaction.create(
20
+ SecureRandom.uuid,
21
+ ::Appsignal::Transaction::BACKGROUND_JOB,
22
+ ::Appsignal::Transaction::GenericRequest.new({})
23
+ )
24
+
25
+ transaction.set_action_if_nil(action_name)
26
+ end
27
+
28
+ # Stops the current transaction (if any)
29
+ def stop_transaction
30
+ return unless transaction?
31
+
32
+ ::Appsignal::Transaction.complete_current!
33
+ end
34
+
35
+ # Sets metadata on a current transaction (if any)
36
+ #
37
+ # @param metadata_hash [Hash] hash with metadata we want to set
38
+ def metadata=(metadata_hash)
39
+ return unless transaction?
40
+
41
+ transaction = ::Appsignal::Transaction.current
42
+
43
+ stringify_hash(metadata_hash).each do |key, value|
44
+ transaction.set_metadata(key, value)
45
+ end
46
+ end
47
+
48
+ # Increments counter with the given value and tags
49
+ #
50
+ # @param key [String] key we want to use
51
+ # @param value [Integer] increment value
52
+ # @param tags [Hash] additional extra tags
53
+ def count(key, value, tags)
54
+ ::Appsignal.increment_counter(
55
+ key,
56
+ value,
57
+ stringify_hash(tags)
58
+ )
59
+ end
60
+
61
+ # Sets gauge with the given value and tags
62
+ #
63
+ # @param key [String] key we want to use
64
+ # @param value [Integer] gauge value
65
+ # @param tags [Hash] additional extra tags
66
+ def gauge(key, value, tags)
67
+ ::Appsignal.set_gauge(
68
+ key,
69
+ value,
70
+ stringify_hash(tags)
71
+ )
72
+ end
73
+
74
+ # Sends the error that occurred to Appsignal
75
+ #
76
+ # @param error [Object] error we want to ship to Appsignal
77
+ def send_error(error)
78
+ # If we have an active transaction we should use it instead of creating a generic one
79
+ # That way proper namespace and other data may be transferred
80
+ #
81
+ # In case there is no transaction, a new generic background job one will be used
82
+ if transaction?
83
+ transaction.set_error(error)
84
+ else
85
+ ::Appsignal.send_error(error) do |transaction|
86
+ transaction.set_namespace(::Appsignal::Transaction::BACKGROUND_JOB)
87
+ end
88
+ end
89
+ end
90
+
91
+ # Registers the probe under a given name
92
+ # @param name [Symbol] probe name
93
+ # @param probe [Proc] code to run every minute
94
+ def register_probe(name, probe)
95
+ ::Appsignal::Minutely.probes.register(name, probe)
96
+ end
97
+
98
+ private
99
+
100
+ # @return [Boolean] do we have a transaction
101
+ def transaction?
102
+ ::Appsignal::Transaction.current?
103
+ end
104
+
105
+ # @return [::Appsignal::Transaction, nil] transaction or nil if not started
106
+ def transaction
107
+ ::Appsignal::Transaction.current
108
+ end
109
+
110
+ # Converts both keys and values of a hash into strings
111
+ # @param hash [Hash]
112
+ # @return [Hash]
113
+ def stringify_hash(hash)
114
+ hash
115
+ .transform_values(&:to_s)
116
+ .transform_keys!(&:to_s)
117
+ end
118
+ end
119
+ end
120
+ end
121
+ end
122
+ end
@@ -0,0 +1,222 @@
1
+ {
2
+ "title": "Karafka",
3
+ "description": "This dashboard gives an overview of the overall Karafka status.\nCheckout topics lag, throughput/performance per consumer,\nprocess/worker counts, and many other things.\n",
4
+ "visuals": [
5
+ {
6
+ "title": "Consumed messages",
7
+ "description": "",
8
+ "line_label": "%topic%[%consumer_group%]",
9
+ "display": "LINE",
10
+ "format": "number",
11
+ "draw_null_as_zero": true,
12
+ "metrics": [
13
+ {
14
+ "name": "karafka_consumer_messages",
15
+ "fields": [
16
+ {
17
+ "field": "COUNTER"
18
+ }
19
+ ],
20
+ "tags": [
21
+ {
22
+ "key": "consumer_group",
23
+ "value": "*"
24
+ },
25
+ {
26
+ "key": "topic",
27
+ "value": "*"
28
+ }
29
+ ]
30
+ }
31
+ ],
32
+ "type": "timeseries"
33
+ },
34
+ {
35
+ "title": "Consumed batches",
36
+ "line_label": "%topic%[%consumer_group%]",
37
+ "display": "LINE",
38
+ "format": "number",
39
+ "draw_null_as_zero": true,
40
+ "metrics": [
41
+ {
42
+ "name": "karafka_consumer_batches",
43
+ "fields": [
44
+ {
45
+ "field": "COUNTER"
46
+ }
47
+ ],
48
+ "tags": [
49
+ {
50
+ "key": "consumer_group",
51
+ "value": "*"
52
+ },
53
+ {
54
+ "key": "topic",
55
+ "value": "*"
56
+ }
57
+ ]
58
+ }
59
+ ],
60
+ "type": "timeseries"
61
+ },
62
+ {
63
+ "title": "Lags",
64
+ "line_label": "%topic%[%consumer_group%]",
65
+ "display": "LINE",
66
+ "format": "number",
67
+ "draw_null_as_zero": true,
68
+ "metrics": [
69
+ {
70
+ "name": "karafka_consumer_aggregated_lag",
71
+ "fields": [
72
+ {
73
+ "field": "GAUGE"
74
+ }
75
+ ],
76
+ "tags": [
77
+ {
78
+ "key": "consumer_group",
79
+ "value": "*"
80
+ },
81
+ {
82
+ "key": "topic",
83
+ "value": "*"
84
+ }
85
+ ]
86
+ }
87
+ ],
88
+ "type": "timeseries"
89
+ },
90
+ {
91
+ "title": "Errors and DLQ",
92
+ "description": "",
93
+ "line_label": "%name% %topic%[%consumer_group%]",
94
+ "display": "LINE",
95
+ "format": "number",
96
+ "draw_null_as_zero": true,
97
+ "metrics": [
98
+ {
99
+ "name": "karafka_consumer_errors",
100
+ "fields": [
101
+ {
102
+ "field": "COUNTER"
103
+ }
104
+ ],
105
+ "tags": [
106
+ {
107
+ "key": "consumer_group",
108
+ "value": "*"
109
+ },
110
+ {
111
+ "key": "topic",
112
+ "value": "*"
113
+ }
114
+ ]
115
+ },
116
+ {
117
+ "name": "karafka_consumer_dead",
118
+ "fields": [
119
+ {
120
+ "field": "COUNTER"
121
+ }
122
+ ],
123
+ "tags": [
124
+ {
125
+ "key": "consumer_group",
126
+ "value": "*"
127
+ },
128
+ {
129
+ "key": "topic",
130
+ "value": "*"
131
+ }
132
+ ]
133
+ }
134
+ ],
135
+ "type": "timeseries"
136
+ },
137
+ {
138
+ "title": "Connection stability",
139
+ "description": "",
140
+ "line_label": "%name%",
141
+ "display": "LINE",
142
+ "format": "number",
143
+ "draw_null_as_zero": true,
144
+ "metrics": [
145
+ {
146
+ "name": "karafka_connection_disconnects",
147
+ "fields": [
148
+ {
149
+ "field": "COUNTER"
150
+ }
151
+ ],
152
+ "tags": []
153
+ },
154
+ {
155
+ "name": "karafka_connection_connects",
156
+ "fields": [
157
+ {
158
+ "field": "COUNTER"
159
+ }
160
+ ],
161
+ "tags": []
162
+ },
163
+ {
164
+ "name": "karafka_requests_retries",
165
+ "fields": [
166
+ {
167
+ "field": "COUNTER"
168
+ }
169
+ ],
170
+ "tags": []
171
+ },
172
+ {
173
+ "name": "karafka_transmission_errors",
174
+ "fields": [
175
+ {
176
+ "field": "COUNTER"
177
+ }
178
+ ],
179
+ "tags": []
180
+ },
181
+ {
182
+ "name": "karafka_receive_errors",
183
+ "fields": [
184
+ {
185
+ "field": "COUNTER"
186
+ }
187
+ ],
188
+ "tags": []
189
+ }
190
+ ],
191
+ "type": "timeseries"
192
+ },
193
+ {
194
+ "title": "Concurrency",
195
+ "line_label": "%name%",
196
+ "display": "LINE",
197
+ "format": "number",
198
+ "draw_null_as_zero": true,
199
+ "metrics": [
200
+ {
201
+ "name": "karafka_processes_count",
202
+ "fields": [
203
+ {
204
+ "field": "COUNTER"
205
+ }
206
+ ],
207
+ "tags": []
208
+ },
209
+ {
210
+ "name": "karafka_threads_count",
211
+ "fields": [
212
+ {
213
+ "field": "COUNTER"
214
+ }
215
+ ],
216
+ "tags": []
217
+ }
218
+ ],
219
+ "type": "timeseries"
220
+ }
221
+ ]
222
+ }
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'base'
4
+
5
+ module Karafka
6
+ module Instrumentation
7
+ module Vendors
8
+ # Namespace for Appsignal instrumentation
9
+ module Appsignal
10
+ # Listener for reporting errors from both consumers and producers
11
+ # Since we have the same API for WaterDrop and Karafka, we can use one listener with
12
+ # independent instances
13
+ class ErrorsListener < Base
14
+ def_delegators :config, :client
15
+
16
+ setting :client, default: Client.new
17
+
18
+ configure
19
+
20
+ # Sends error details to Appsignal
21
+ #
22
+ # @param event [Karafka::Core::Monitoring::Event]
23
+ def on_error_occurred(event)
24
+ client.send_error(event[:error])
25
+ end
26
+ end
27
+ end
28
+ end
29
+ end
30
+ end