karafka 2.0.0.rc2 → 2.0.0.rc5

Files changed (43)
  1. checksums.yaml +4 -4
  2. checksums.yaml.gz.sig +0 -0
  3. data/CHANGELOG.md +29 -2
  4. data/CONTRIBUTING.md +4 -8
  5. data/Gemfile.lock +9 -21
  6. data/LICENSE-COMM +1 -1
  7. data/README.md +46 -8
  8. data/bin/integrations +1 -1
  9. data/config/errors.yml +4 -0
  10. data/docker-compose.yml +3 -0
  11. data/karafka.gemspec +5 -5
  12. data/lib/karafka/active_job/consumer.rb +2 -0
  13. data/lib/karafka/connection/client.rb +9 -4
  14. data/lib/karafka/connection/listener.rb +21 -7
  15. data/lib/karafka/contracts/base.rb +1 -1
  16. data/lib/karafka/errors.rb +0 -3
  17. data/lib/karafka/instrumentation/callbacks/statistics.rb +1 -2
  18. data/lib/karafka/instrumentation/logger_listener.rb +27 -3
  19. data/lib/karafka/instrumentation/monitor.rb +14 -59
  20. data/lib/karafka/instrumentation/notifications.rb +52 -0
  21. data/lib/karafka/instrumentation/vendors/datadog/dashboard.json +1 -0
  22. data/lib/karafka/instrumentation/vendors/datadog/listener.rb +232 -0
  23. data/lib/karafka/patches/rdkafka/consumer.rb +1 -1
  24. data/lib/karafka/pro/active_job/dispatcher.rb +5 -2
  25. data/lib/karafka/pro/active_job/job_options_contract.rb +1 -0
  26. data/lib/karafka/pro/base_consumer.rb +2 -2
  27. data/lib/karafka/pro/contracts/base.rb +21 -0
  28. data/lib/karafka/pro/contracts/consumer_group.rb +34 -0
  29. data/lib/karafka/pro/contracts/consumer_group_topic.rb +33 -0
  30. data/lib/karafka/pro/loader.rb +21 -3
  31. data/lib/karafka/pro/processing/partitioner.rb +22 -3
  32. data/lib/karafka/pro/routing/builder_extensions.rb +30 -0
  33. data/lib/karafka/pro/routing/{extensions.rb → topic_extensions.rb} +2 -2
  34. data/lib/karafka/process.rb +1 -0
  35. data/lib/karafka/processing/jobs_queue.rb +11 -0
  36. data/lib/karafka/processing/worker.rb +4 -2
  37. data/lib/karafka/server.rb +3 -0
  38. data/lib/karafka/setup/config.rb +3 -3
  39. data/lib/karafka/version.rb +1 -1
  40. data/lib/karafka.rb +3 -2
  41. data.tar.gz.sig +0 -0
  42. metadata +27 -14
  43. metadata.gz.sig +0 -0
@@ -0,0 +1 @@
+ {"title":"Karafka monitoring dashboard","description":"","widgets":[{"id":7444969424381053,"definition":{"title":"Stability & errors","type":"group","show_title":true,"layout_type":"ordered","widgets":[{"id":8304008422587936,"definition":{"title":"Client connects and disconnects","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Connects","formula":"query1"},{"alias":"Disconnects","formula":"query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.connection.connects{*} by {host}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.connection.disconnects{*} by {host}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}]},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":3722865443336921,"definition":{"title":"Errors encountered (any)","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"in-karafka errors","formula":"query1"},{"alias":"librdkafka consume errors","formula":"query2"},{"alias":"librdkafka receive errors","formula":"query3"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.error_occurred{*} by {type}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consume.errors{*}.as_count()","data_source":"metrics","name":"query2"},{"query":"sum:karafka.receive.errors{*}.as_count()","data_source":"metrics","name":"query3"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":5477381252952760,"definition":{"title":"Processing errors","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"formula":"query1"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.error_occurred{type:consumer.consume.error} by {partition,topic}.as_count()","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":2357301680769076,"definition":{"title":"Processing errors rate per topic","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"% error rate per topic","formula":"(query1 / (query1 + query2)) * 100"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.error_occurred{type:consumer.consume.error} by {topic,partition}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consumer.batches{*} by {topic,partition}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}]},"layout":{"x":0,"y":2,"width":4,"height":2}},{"id":3902930069982135,"definition":{"title":"Batches successful vs failures","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Successfully processed batch","formula":"query1"},{"alias":"Batch 
processing with error","formula":"query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.batches{*} by {partition,topic}.as_count()","data_source":"metrics","name":"query1"},{"query":"avg:karafka.error_occurred{type:consumer.consume.error} by {partition,topic}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":2,"width":4,"height":2}},{"id":718749162159145,"definition":{"title":"Consumer instances revocations and shutdowns","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Consumer instances revokations","formula":"query1"},{"alias":"Consumer instances shutdowns","formula":"query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.revoked{*}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consumer.shutdown{*}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":2,"width":4,"height":2}}]},"layout":{"x":0,"y":0,"width":12,"height":5}},{"id":5988438511387100,"definition":{"title":"Workers poll","type":"group","show_title":true,"layout_type":"ordered","widgets":[{"id":8769294644934352,"definition":{"title":"Enqueued jobs","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Enqueued jobs","formula":"query1"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.worker.enqueued_jobs.avg{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":2714502141463873,"definition":{"title":"Workers usage","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Busy workers (p95)","formula":"query1"},{"alias":"Total workers","formula":"query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.worker.processing.95percentile{*}","data_source":"metrics","name":"query1"},{"query":"sum:karafka.worker.total_threads{*}","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":5370086629441984,"definition":{"title":"Workers % utilization","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"% workers utilization","formula":"(query1 / query2) * 100"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.worker.processing.95percentile{*}","data_source":"metrics","name":"query1"},{"query":"sum:karafka.worker.total_threads{*}","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":0,"width":4,"height":2}}]},"layout":{"x":0,"y":5,"width":12,"height":3}},{"id":8544040083223278,"definition":{"title":"Throughput 
","type":"group","show_title":true,"layout_type":"ordered","widgets":[{"id":3740207481939733,"definition":{"title":"Offset lag changes","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"formula":"derivative(query1)"}],"response_format":"timeseries","queries":[{"query":"max:karafka.consumer.offset{*} by {topic,partition}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":6319110548544878,"definition":{"title":"Batches processed per topic","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"formula":"query1"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.batches{*} by {partition,topic}.as_count()","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":6232784865331443,"definition":{"title":"Messages consumed per topic","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Messages consumed","formula":"query1"},{"alias":"Average batch size","formula":"query1 / query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.messages{*} by {partition,topic}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consumer.batches{*} by {partition,topic}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":2321394598982770,"definition":{"title":"Consumption lag (in seconds)","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Consumption lag in s (max)","formula":"query2 / 1000"},{"alias":"Consumption lag in s (avg)","formula":"query3 / 1000"},{"alias":"Consumption lag in s (p95)","formula":"query1 / 1000"}],"response_format":"timeseries","queries":[{"query":"max:karafka.consumer.consumption_lag.max{*}","data_source":"metrics","name":"query2"},{"query":"max:karafka.consumer.consumption_lag.avg{*}","data_source":"metrics","name":"query3"},{"query":"max:karafka.consumer.consumption_lag.95percentile{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":0,"y":2,"width":4,"height":2}},{"id":1062074781483741,"definition":{"title":"Processing lag (in ms)","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Processing lag in ms (p95)","formula":"query1"},{"alias":"Processing lag in ms (max)","formula":"query2"},{"alias":"Processing lag in ms 
(avg)","formula":"query3"}],"response_format":"timeseries","queries":[{"query":"max:karafka.consumer.processing_lag.95percentile{*}","data_source":"metrics","name":"query1"},{"query":"max:karafka.consumer.processing_lag.max{*}","data_source":"metrics","name":"query2"},{"query":"max:karafka.consumer.processing_lag.avg{*}","data_source":"metrics","name":"query3"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":2,"width":4,"height":2}},{"id":7497794728674267,"definition":{"title":"Batch processing time","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"formula":"query1"},{"formula":"query2"},{"formula":"query3"}],"response_format":"timeseries","queries":[{"query":"max:karafka.consumer.consumed.time_taken.95percentile{*} by {topic,partition}","data_source":"metrics","name":"query1"},{"query":"max:karafka.consumer.consumed.time_taken.max{*} by {topic,partition}","data_source":"metrics","name":"query2"},{"query":"max:karafka.consumer.consumed.time_taken.avg{*} by {topic,partition}","data_source":"metrics","name":"query3"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":2,"width":4,"height":2}},{"id":4192833027984161,"definition":{"title":"Batch size per topic","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Batch size p95","formula":"query1"},{"alias":"Batch size avg","formula":"query2"},{"alias":"Batch size max","formula":"query3"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.batch_size.95percentile{*} by {partition,topic}","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consumer.batch_size.avg{*} by {partition,topic}","data_source":"metrics","name":"query2"},{"query":"sum:karafka.consumer.batch_size.max{*} by {partition,topic}","data_source":"metrics","name":"query3"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":0,"y":4,"width":4,"height":2}},{"id":4741598444771147,"definition":{"title":"Messages consumed overall","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Messages consumed","formula":"query1"},{"alias":"Average batch size","formula":"query1 / query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.messages{*}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consumer.batches{*}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":4,"width":4,"height":2}},{"id":4502534794102513,"definition":{"title":"Polling times (ms)","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"time":{},"type":"timeseries","requests":[{"formulas":[{"alias":"p95 ms polling time","formula":"query1"},{"alias":"max ms polling time","formula":"query2"},{"alias":"average ms polling 
time","formula":"query3"}],"queries":[{"name":"query1","data_source":"metrics","query":"avg:karafka.listener.polling.time_taken.95percentile{*}"},{"name":"query2","data_source":"metrics","query":"avg:karafka.listener.polling.time_taken.max{*}"},{"name":"query3","data_source":"metrics","query":"avg:karafka.listener.polling.time_taken.avg{*}"}],"response_format":"timeseries","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":4,"width":4,"height":2}}]},"layout":{"x":0,"y":0,"width":12,"height":7,"is_column_break":true}}],"template_variables":[],"layout_type":"ordered","is_read_only":false,"notify_list":[],"reflow_type":"fixed","id":"s3u-z47-i6u"}
@@ -0,0 +1,232 @@
+ # frozen_string_literal: true
+
+ module Karafka
+   module Instrumentation
+     # Namespace for vendor specific instrumentation
+     module Vendors
+       # Datadog specific instrumentation
+       module Datadog
+         # Listener that can be used to subscribe to Karafka to receive stats via StatsD
+         # and/or Datadog
+         #
+         # @note You need to set up the `dogstatsd-ruby` client and assign it
+         class Listener
+           include ::Karafka::Core::Configurable
+           extend Forwardable
+
+           def_delegators :config, :client, :rd_kafka_metrics, :namespace, :default_tags
+
+           # Value object for storing a single rdkafka metric publishing details
+           RdKafkaMetric = Struct.new(:type, :scope, :name, :key_location)
+
+           # Namespace under which the DD metrics should be published
+           setting :namespace, default: 'karafka'
+
+           # Datadog client that we should use to publish the metrics
+           setting :client
+
+           # Default tags we want to publish (for example hostname)
+           # Format as follows (example for hostname): `["host:#{Socket.gethostname}"]`
+           setting :default_tags, default: []
+
+           # All the rdkafka metrics we want to publish
+           #
+           # By default we publish quite a lot so this can be tuned
+           # Note that the ones with `_d` come from Karafka, not rdkafka or Kafka
+           setting :rd_kafka_metrics, default: [
+             # Client metrics
+             RdKafkaMetric.new(:count, :root, 'messages.consumed', 'rxmsgs_d'),
+             RdKafkaMetric.new(:count, :root, 'messages.consumed.bytes', 'rxmsg_bytes'),
+
+             # Broker metrics
+             RdKafkaMetric.new(:count, :brokers, 'consume.attempts', 'txretries_d'),
+             RdKafkaMetric.new(:count, :brokers, 'consume.errors', 'txerrs_d'),
+             RdKafkaMetric.new(:count, :brokers, 'receive.errors', 'rxerrs_d'),
+             RdKafkaMetric.new(:count, :brokers, 'connection.connects', 'connects_d'),
+             RdKafkaMetric.new(:count, :brokers, 'connection.disconnects', 'disconnects_d'),
+             RdKafkaMetric.new(:gauge, :brokers, 'network.latency.avg', %w[rtt avg]),
+             RdKafkaMetric.new(:gauge, :brokers, 'network.latency.p95', %w[rtt p95]),
+             RdKafkaMetric.new(:gauge, :brokers, 'network.latency.p99', %w[rtt p99])
+           ].freeze
+
+           configure
+
+           # @param block [Proc] configuration block
+           def initialize(&block)
+             configure
+             setup(&block) if block
+           end
+
+           # @param block [Proc] configuration block
+           # @note We define this alias to be consistent with `WaterDrop#setup`
+           def setup(&block)
+             configure(&block)
+           end
+
+           # Hooks up to WaterDrop instrumentation for emitted statistics
+           #
+           # @param event [Dry::Events::Event]
+           def on_statistics_emitted(event)
+             statistics = event[:statistics]
+
+             rd_kafka_metrics.each do |metric|
+               report_metric(metric, statistics)
+             end
+           end
+
+           # Increases the errors count by 1
+           #
+           # @param event [Dry::Events::Event]
+           def on_error_occurred(event)
+             extra_tags = ["type:#{event[:type]}"]
+
+             if event.payload[:caller].respond_to?(:messages)
+               metadata = event.payload[:caller].messages.metadata
+
+               extra_tags += [
+                 "topic:#{metadata.topic}",
+                 "partition:#{metadata.partition}"
+               ]
+             end
+
+             count('error_occurred', 1, tags: default_tags + extra_tags)
+           end
+
+           # Reports how many messages we've polled and how much time we spent on it
+           #
+           # @param event [Dry::Events::Event]
+           def on_connection_listener_fetch_loop_received(event)
+             time_taken = event[:time]
+             messages_count = event[:messages_buffer].size
+
+             histogram('listener.polling.time_taken', time_taken, tags: default_tags)
+             histogram('listener.polling.messages', messages_count, tags: default_tags)
+           end
+
+           # Here we report the majority of things related to processing as we have access to the
+           # consumer
+           # @param event [Dry::Events::Event]
+           def on_consumer_consumed(event)
+             messages = event.payload[:caller].messages
+             metadata = messages.metadata
+
+             tags = default_tags + [
+               "topic:#{metadata.topic}",
+               "partition:#{metadata.partition}"
+             ]
+
+             count('consumer.messages', messages.count, tags: tags)
+             count('consumer.batches', 1, tags: tags)
+             gauge('consumer.offset', metadata.last_offset, tags: tags)
+             histogram('consumer.consumed.time_taken', event[:time], tags: tags)
+             histogram('consumer.batch_size', messages.count, tags: tags)
+             histogram('consumer.processing_lag', metadata.processing_lag, tags: tags)
+             histogram('consumer.consumption_lag', metadata.consumption_lag, tags: tags)
+           end
+
+           # @param event [Dry::Events::Event]
+           def on_consumer_revoked(event)
+             messages = event.payload[:caller].messages
+             metadata = messages.metadata
+
+             tags = default_tags + [
+               "topic:#{metadata.topic}",
+               "partition:#{metadata.partition}"
+             ]
+
+             count('consumer.revoked', 1, tags: tags)
+           end
+
+           # @param event [Dry::Events::Event]
+           def on_consumer_shutdown(event)
+             messages = event.payload[:caller].messages
+             metadata = messages.metadata
+
+             tags = default_tags + [
+               "topic:#{metadata.topic}",
+               "partition:#{metadata.partition}"
+             ]
+
+             count('consumer.shutdown', 1, tags: tags)
+           end
+
+           # Worker related metrics
+           # @param event [Dry::Events::Event]
+           def on_worker_process(event)
+             jq_stats = event[:jobs_queue].statistics
+
+             gauge('worker.total_threads', Karafka::App.config.concurrency, tags: default_tags)
+             histogram('worker.processing', jq_stats[:processing], tags: default_tags)
+             histogram('worker.enqueued_jobs', jq_stats[:enqueued], tags: default_tags)
+           end
+
+           # We report this metric before and after processing for higher accuracy
+           # Without this, the utilization would not be fully reflected
+           # @param event [Dry::Events::Event]
+           def on_worker_processed(event)
+             jq_stats = event[:jobs_queue].statistics
+
+             histogram('worker.processing', jq_stats[:processing], tags: default_tags)
+           end
+
+           private
+
+           %i[
+             count
+             gauge
+             histogram
+             increment
+             decrement
+           ].each do |metric_type|
+             class_eval <<~METHODS, __FILE__, __LINE__ + 1
+               def #{metric_type}(key, *args)
+                 client.#{metric_type}(
+                   namespaced_metric(key),
+                   *args
+                 )
+               end
+             METHODS
+           end
+
+           # Wraps metric name in listener's namespace
+           # @param metric_name [String] RdKafkaMetric name
+           # @return [String]
+           def namespaced_metric(metric_name)
+             "#{namespace}.#{metric_name}"
+           end
+
+           # Reports a given metric's statistics to Datadog
+           # @param metric [RdKafkaMetric] metric value object
+           # @param statistics [Hash] hash with all the statistics emitted
+           def report_metric(metric, statistics)
+             case metric.scope
+             when :root
+               public_send(
+                 metric.type,
+                 metric.name,
+                 statistics.fetch(*metric.key_location),
+                 tags: default_tags
+               )
+             when :brokers
+               statistics.fetch('brokers').each_value do |broker_statistics|
+                 # Skip bootstrap nodes
+                 # Bootstrap nodes have nodeid -1, other nodes have positive
+                 # node ids
+                 next if broker_statistics['nodeid'] == -1
+
+                 public_send(
+                   metric.type,
+                   metric.name,
+                   broker_statistics.dig(*metric.key_location),
+                   tags: default_tags + ["broker:#{broker_statistics['nodename']}"]
+                 )
+               end
+             else
+               raise ArgumentError, metric.scope
+             end
+           end
+         end
+       end
+     end
+   end
+ end
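
The listener above is configuration-driven: once it is instantiated with a `dogstatsd-ruby` client, it can be subscribed to the Karafka monitor so its `on_*` handlers receive the instrumentation events. A minimal wiring sketch (the host, port and tags are placeholders, not part of this diff):

    require 'socket'
    require 'datadog/statsd'

    listener = ::Karafka::Instrumentation::Vendors::Datadog::Listener.new do |config|
      # Point the client at your local DogStatsD agent
      config.client = Datadog::Statsd.new('localhost', 8125)
      config.default_tags = ["host:#{Socket.gethostname}"]
    end

    Karafka.monitor.subscribe(listener)
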
@@ -12,7 +12,7 @@ module Karafka
  # @note We need this to make sure that we allocate proper dispatched events only to
  # callback listeners that should publish them
  def name
- ::Rdkafka::Bindings.rd_kafka_name(@native_kafka)
+ @name ||= ::Rdkafka::Bindings.rd_kafka_name(@native_kafka)
  end
  end
  end
@@ -23,7 +23,9 @@ module Karafka
  dispatch_method: :produce_async,
  # We don't create a dummy proc based partitioner as we would have to evaluate it with
  # each job.
- partitioner: nil
+ partitioner: nil,
+ # Allows for usage of `:key` or `:partition_key`
+ partition_key_type: :key
  }.freeze

  private_constant :DEFAULTS
@@ -45,11 +47,12 @@ module Karafka
  # @return [Hash] hash with dispatch details to which we merge topic and payload
  def dispatch_details(job)
  partitioner = fetch_option(job, :partitioner, DEFAULTS)
+ key_type = fetch_option(job, :partition_key_type, DEFAULTS)

  return {} unless partitioner

  {
- partition_key: partitioner.call(job)
+ key_type => partitioner.call(job)
  }
  end
  end
@@ -25,6 +25,7 @@ module Karafka
  optional(:dispatch_method) { |val| %i[produce_async produce_sync].include?(val) }
  optional(:partitioner) { |val| val.respond_to?(:call) }
+ optional(:partition_key_type) { |val| %i[key partition_key].include?(val) }
  end
  end
  end
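
Taken together, the dispatcher and contract changes let a Pro ActiveJob class decide whether the partitioner result is sent as the message `:key` or as the `:partition_key`. A hedged sketch of how this could look from a job class (assumes the Pro ActiveJob adapter's `karafka_options` API; the job and arguments are illustrative):

    class EventsJob < ActiveJob::Base
      queue_as :events

      karafka_options(
        dispatch_method: :produce_async,
        # Keep one user's jobs in one partition by keying on the first argument
        partitioner: ->(job) { job.arguments.first.to_s },
        partition_key_type: :key
      )

      def perform(user_id); end
    end
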
@@ -36,7 +36,7 @@ module Karafka
  end
  end

- # Runs extra logic after consumption that is related to handling long running jobs
+ # Runs extra logic after consumption that is related to handling long-running jobs
  # @note This overwrites the '#on_after_consume' from the base consumer
  def on_after_consume
  coordinator.on_finished do |first_group_message, last_group_message|
@@ -59,7 +59,7 @@ module Karafka
  # Mark as consumed only if manual offset management is not on
  mark_as_consumed(last_message) unless topic.manual_offset_management? || revoked?

- # If this is not a long running job there is nothing for us to do here
+ # If this is not a long-running job there is nothing for us to do here
  return unless topic.long_running_job?

  # Once processing is done, we move to the new offset based on commits
@@ -0,0 +1,21 @@
+ # frozen_string_literal: true
+
+ # This Karafka component is a Pro component.
+ # All of the commercial components are present in the lib/karafka/pro directory of this
+ # repository and their usage requires commercial license agreement.
+ #
+ # Karafka has also commercial-friendly license, commercial support and commercial components.
+ #
+ # By sending a pull request to the pro components, you are agreeing to transfer the copyright of
+ # your code to Maciej Mensfeld.
+
+ module Karafka
+   module Pro
+     # Namespace for Karafka Pro related contracts
+     module Contracts
+       # Base contract for Pro components contracts
+       class Base < ::Karafka::Contracts::Base
+       end
+     end
+   end
+ end
@@ -0,0 +1,34 @@
+ # frozen_string_literal: true
+
+ # This Karafka component is a Pro component.
+ # All of the commercial components are present in the lib/karafka/pro directory of this
+ # repository and their usage requires commercial license agreement.
+ #
+ # Karafka has also commercial-friendly license, commercial support and commercial components.
+ #
+ # By sending a pull request to the pro components, you are agreeing to transfer the copyright of
+ # your code to Maciej Mensfeld.
+
+ module Karafka
+   module Pro
+     module Contracts
+       # Contract for validating correct Pro components setup on a consumer group and topic levels
+       class ConsumerGroup < Base
+         virtual do |data, errors|
+           next unless errors.empty?
+           next unless data.key?(:topics)
+
+           fetched_errors = []
+
+           data.fetch(:topics).each do |topic|
+             ConsumerGroupTopic.new.call(topic).errors.each do |key, value|
+               fetched_errors << [[topic, key].flatten, value]
+             end
+           end
+
+           fetched_errors
+         end
+       end
+     end
+   end
+ end
@@ -0,0 +1,33 @@
+ # frozen_string_literal: true
+
+ # This Karafka component is a Pro component.
+ # All of the commercial components are present in the lib/karafka/pro directory of this
+ # repository and their usage requires commercial license agreement.
+ #
+ # Karafka has also commercial-friendly license, commercial support and commercial components.
+ #
+ # By sending a pull request to the pro components, you are agreeing to transfer the copyright of
+ # your code to Maciej Mensfeld.
+
+ module Karafka
+   module Pro
+     module Contracts
+       # Contract for validating correct Pro components setup on a topic level
+       class ConsumerGroupTopic < Base
+         configure do |config|
+           config.error_messages = YAML.safe_load(
+             File.read(
+               File.join(Karafka.gem_root, 'config', 'errors.yml')
+             )
+           ).fetch('en').fetch('validations').fetch('pro_consumer_group_topic')
+         end
+
+         virtual do |data|
+           next if data[:consumer] < Karafka::Pro::BaseConsumer
+
+           [[%i[consumer], :consumer_format]]
+         end
+       end
+     end
+   end
+ end
@@ -22,7 +22,11 @@ module Karafka
  processing/jobs_builder
  processing/coordinator
  processing/partitioner
- routing/extensions
+ contracts/base
+ contracts/consumer_group
+ contracts/consumer_group_topic
+ routing/topic_extensions
+ routing/builder_extensions
  active_job/consumer
  active_job/dispatcher
  active_job/job_options_contract
@@ -37,6 +41,16 @@ module Karafka
  def setup(config)
  COMPONENTS.each { |component| require_relative(component) }

+ reconfigure(config)
+
+ load_routing_extensions
+ end
+
+ private
+
+ # Sets proper config options to use pro components
+ # @param config [WaterDrop::Configurable::Node] root config node
+ def reconfigure(config)
  icfg = config.internal

  icfg.processing.coordinator_class = Processing::Coordinator
@@ -48,10 +62,14 @@ module Karafka
  icfg.active_job.dispatcher = ActiveJob::Dispatcher.new
  icfg.active_job.job_options_contract = ActiveJob::JobOptionsContract.new

- ::Karafka::Routing::Topic.include(Routing::Extensions)
-
  config.monitor.subscribe(PerformanceTracker.instance)
  end
+
+ # Loads routing extensions
+ def load_routing_extensions
+ ::Karafka::Routing::Topic.include(Routing::TopicExtensions)
+ ::Karafka::Routing::Builder.prepend(Routing::BuilderExtensions)
+ end
  end
  end
  end
@@ -27,9 +27,28 @@ module Karafka
  # process the data. With one thread it is not worth partitioning the work as the work
  # itself will be assigned to one thread (pointless work)
  if ktopic.virtual_partitioner? && @concurrency > 1
- messages
-   .group_by { |msg| ktopic.virtual_partitioner.call(msg).hash.abs % @concurrency }
-   .each { |group_id, messages_group| yield(group_id, messages_group) }
+ # We need to reduce it to the number of threads, so the group_id is not a direct effect
+ # of the end user action. Otherwise the persistence layer for consumers would cache
+ # it forever and it would cause memory leaks
+ groupings = messages
+   .group_by { |msg| ktopic.virtual_partitioner.call(msg) }
+   .values
+
+ # Reduce the max concurrency to a size that matches the concurrency
+ # As mentioned above we cannot use the partitioning keys directly as it could cause
+ # memory leaks
+ #
+ # The algorithm here is simple: we assume that the most costly in terms of processing
+ # will be the biggest group, and we merge the smallest ones until there are at most
+ # as many groups as the concurrency
+ while groupings.size > @concurrency
+   groupings.sort_by! { |grouping| -grouping.size }
+
+   # Offset order needs to be maintained for virtual partitions
+   groupings << (groupings.pop + groupings.pop).sort_by!(&:offset)
+ end
+
+ groupings.each_with_index { |messages_group, index| yield(index, messages_group) }
  else
  # When no virtual partitioner, works as regular one
  yield(0, messages)
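
The reduction loop above can be read in isolation: the groups produced by the virtual partitioner are merged, smallest-first, until their count no longer exceeds the concurrency, and each merged group is re-sorted so offset order is preserved. A standalone illustration (integers stand in for message offsets; all names are illustrative only):

    concurrency = 3
    groupings = { 'a' => [1, 4], 'b' => [2], 'c' => [3, 6, 7], 'd' => [5] }.values

    while groupings.size > concurrency
      # A descending size sort leaves the two smallest groups at the end
      groupings.sort_by! { |grouping| -grouping.size }
      groupings << (groupings.pop + groupings.pop).sort
    end

    groupings.each_with_index { |group, index| puts "virtual partition #{index}: #{group.inspect}" }
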
@@ -0,0 +1,30 @@
+ # frozen_string_literal: true
+
+ # This Karafka component is a Pro component.
+ # All of the commercial components are present in the lib/karafka/pro directory of this
+ # repository and their usage requires commercial license agreement.
+ #
+ # Karafka has also commercial-friendly license, commercial support and commercial components.
+ #
+ # By sending a pull request to the pro components, you are agreeing to transfer the copyright of
+ # your code to Maciej Mensfeld.
+
+ module Karafka
+   module Pro
+     # Pro routing components
+     module Routing
+       # Routing extensions for the builder to be able to validate correct usage of Pro components
+       module BuilderExtensions
+         # Validate consumer groups with pro contracts
+         # @param block [Proc] routing defining block
+         def draw(&block)
+           super
+
+           each do |consumer_group|
+             ::Karafka::Pro::Contracts::ConsumerGroup.new.validate!(consumer_group.to_h)
+           end
+         end
+       end
+     end
+   end
+ end
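
With this extension prepended, `draw` runs the Pro consumer group contract right after the routing block is evaluated, so an invalid Pro setup fails fast at boot. A sketch of the call site this guards (consumer group, topic and consumer names are illustrative):

    Karafka::App.routes.draw do
      consumer_group :events do
        topic :visits do
          # When Pro is loaded, the contract expects a Karafka::Pro::BaseConsumer descendant
          consumer VisitsConsumer
        end
      end
    end
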
@@ -14,7 +14,7 @@ module Karafka
  # Pro routing components
  module Routing
  # Routing extensions that allow to configure some extra PRO routing options
- module Extensions
+ module TopicExtensions
  class << self
  # @param base [Class] class we extend
  def included(base)
@@ -28,7 +28,7 @@ module Karafka
  virtual_partitioner != nil
  end

- # @return [Boolean] is a given job on a topic a long running one
+ # @return [Boolean] is a given job on a topic a long-running one
  def long_running_job?
  @long_running_job || false
  end
@@ -9,6 +9,7 @@ module Karafka
  SIGINT
  SIGQUIT
  SIGTERM
+ SIGTTIN
  ].freeze

  HANDLED_SIGNALS.each do |signal|
@@ -119,6 +119,17 @@ module Karafka
  @semaphores[group_id].pop while wait?(group_id)
  end

+ # - `processing` - number of jobs that are currently being processed (active work)
+ # - `enqueued` - number of jobs in the queue that are waiting to be picked up by a worker
+ #
+ # @return [Hash] hash with basic usage statistics of this queue.
+ def statistics
+   {
+     processing: size - @queue.size,
+     enqueued: @queue.size
+   }.freeze
+ end
+
  private

  # @param group_id [String] id of the group in which jobs we're interested.
@@ -47,9 +47,11 @@ module Karafka
  job = @jobs_queue.pop

  if job
- Karafka.monitor.instrument('worker.process', caller: self, job: job)
+ instrument_details = { caller: self, job: job, jobs_queue: @jobs_queue }

- Karafka.monitor.instrument('worker.processed', caller: self, job: job) do
+ Karafka.monitor.instrument('worker.process', instrument_details)
+
+ Karafka.monitor.instrument('worker.processed', instrument_details) do
  job.before_call

  # If a job is marked as non blocking, we can run a tick in the job queue and if there
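
Because the worker now passes the jobs queue in the instrumentation payload, any subscriber can read the queue statistics introduced above, not only the Datadog listener. A minimal sketch of such a subscriber (the logging is illustrative):

    Karafka.monitor.subscribe('worker.processed') do |event|
      stats = event[:jobs_queue].statistics
      puts "jobs processing=#{stats[:processing]} enqueued=#{stats[:enqueued]}"
    end
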
@@ -104,6 +104,9 @@ module Karafka
  # We're done waiting, let's kill them!
  workers.each(&:terminate)
  listeners.each(&:terminate)
+ # We always need to shut down clients to make sure we do not force the GC to close the consumer,
+ # as that can cause memory leaks and crashes.
+ listeners.each(&:shutdown)

  Karafka::App.producer.close

@@ -12,7 +12,7 @@ module Karafka
  # enough and will still keep the code simple
  # @see Karafka::Setup::Configurators::Base for more details about configurators api
  class Config
- extend ::WaterDrop::Configurable
+ extend ::Karafka::Core::Configurable

  # Defaults for kafka settings, that will be overwritten only if not present already
  KAFKA_DEFAULTS = {
@@ -60,9 +60,9 @@ module Karafka
  # option [Boolean] should we leave offset management to the user
  setting :manual_offset_management, default: false
  # options max_messages [Integer] how many messages do we want to fetch from Kafka in one go
- setting :max_messages, default: 1_000
+ setting :max_messages, default: 100
  # option [Integer] number of milliseconds we can wait while fetching data
- setting :max_wait_time, default: 5_000
+ setting :max_wait_time, default: 1_000
  # option shutdown_timeout [Integer] the number of milliseconds after which Karafka no
  # longer waits for the consumers to stop gracefully but instead we force terminate
  # everything.
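
The lower `max_messages` and `max_wait_time` defaults favour smaller, more frequent polls. Applications that prefer the previous behaviour can restore it in their setup block; a sketch assuming a typical app class and broker address:

    class KarafkaApp < Karafka::App
      setup do |config|
        config.kafka = { 'bootstrap.servers': 'localhost:9092' }
        # Restore the pre-rc5 defaults
        config.max_messages = 1_000
        config.max_wait_time = 5_000
      end
    end
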
@@ -3,5 +3,5 @@
  # Main module namespace
  module Karafka
  # Current Karafka version
- VERSION = '2.0.0.rc2'
+ VERSION = '2.0.0.rc5'
  end