karafka 2.0.0.rc2 → 2.0.0.rc5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +4 -4
  2. checksums.yaml.gz.sig +0 -0
  3. data/CHANGELOG.md +29 -2
  4. data/CONTRIBUTING.md +4 -8
  5. data/Gemfile.lock +9 -21
  6. data/LICENSE-COMM +1 -1
  7. data/README.md +46 -8
  8. data/bin/integrations +1 -1
  9. data/config/errors.yml +4 -0
  10. data/docker-compose.yml +3 -0
  11. data/karafka.gemspec +5 -5
  12. data/lib/karafka/active_job/consumer.rb +2 -0
  13. data/lib/karafka/connection/client.rb +9 -4
  14. data/lib/karafka/connection/listener.rb +21 -7
  15. data/lib/karafka/contracts/base.rb +1 -1
  16. data/lib/karafka/errors.rb +0 -3
  17. data/lib/karafka/instrumentation/callbacks/statistics.rb +1 -2
  18. data/lib/karafka/instrumentation/logger_listener.rb +27 -3
  19. data/lib/karafka/instrumentation/monitor.rb +14 -59
  20. data/lib/karafka/instrumentation/notifications.rb +52 -0
  21. data/lib/karafka/instrumentation/vendors/datadog/dashboard.json +1 -0
  22. data/lib/karafka/instrumentation/vendors/datadog/listener.rb +232 -0
  23. data/lib/karafka/patches/rdkafka/consumer.rb +1 -1
  24. data/lib/karafka/pro/active_job/dispatcher.rb +5 -2
  25. data/lib/karafka/pro/active_job/job_options_contract.rb +1 -0
  26. data/lib/karafka/pro/base_consumer.rb +2 -2
  27. data/lib/karafka/pro/contracts/base.rb +21 -0
  28. data/lib/karafka/pro/contracts/consumer_group.rb +34 -0
  29. data/lib/karafka/pro/contracts/consumer_group_topic.rb +33 -0
  30. data/lib/karafka/pro/loader.rb +21 -3
  31. data/lib/karafka/pro/processing/partitioner.rb +22 -3
  32. data/lib/karafka/pro/routing/builder_extensions.rb +30 -0
  33. data/lib/karafka/pro/routing/{extensions.rb → topic_extensions.rb} +2 -2
  34. data/lib/karafka/process.rb +1 -0
  35. data/lib/karafka/processing/jobs_queue.rb +11 -0
  36. data/lib/karafka/processing/worker.rb +4 -2
  37. data/lib/karafka/server.rb +3 -0
  38. data/lib/karafka/setup/config.rb +3 -3
  39. data/lib/karafka/version.rb +1 -1
  40. data/lib/karafka.rb +3 -2
  41. data.tar.gz.sig +0 -0
  42. metadata +27 -14
  43. metadata.gz.sig +0 -0
@@ -0,0 +1 @@
1
+ {"title":"Karafka monitoring dashboard","description":"","widgets":[{"id":7444969424381053,"definition":{"title":"Stability & errors","type":"group","show_title":true,"layout_type":"ordered","widgets":[{"id":8304008422587936,"definition":{"title":"Client connects and disconnects","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Connects","formula":"query1"},{"alias":"Disconnects","formula":"query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.connection.connects{*} by {host}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.connection.disconnects{*} by {host}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}]},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":3722865443336921,"definition":{"title":"Errors encountered (any)","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"in-karafka errors","formula":"query1"},{"alias":"librdkafka consume errors","formula":"query2"},{"alias":"librdkafka receive errors","formula":"query3"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.error_occurred{*} by {type}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consume.errors{*}.as_count()","data_source":"metrics","name":"query2"},{"query":"sum:karafka.receive.errors{*}.as_count()","data_source":"metrics","name":"query3"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":5477381252952760,"definition":{"title":"Processing errors","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"formula":"query1"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.error_occurred{type:consumer.consume.error} by {partition,topic}.as_count()","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":2357301680769076,"definition":{"title":"Processing errors rate per topic","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"% error rate per topic","formula":"(query1 / (query1 + query2)) * 100"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.error_occurred{type:consumer.consume.error} by {topic,partition}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consumer.batches{*} by {topic,partition}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}]},"layout":{"x":0,"y":2,"width":4,"height":2}},{"id":3902930069982135,"definition":{"title":"Batches successful vs failures","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Successfully processed batch","formula":"query1"},{"alias":"Batch processing with error","formula":"query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.batches{*} by {partition,topic}.as_count()","data_source":"metrics","name":"query1"},{"query":"avg:karafka.error_occurred{type:consumer.consume.error} by {partition,topic}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":2,"width":4,"height":2}},{"id":718749162159145,"definition":{"title":"Consumer instances revocations and shutdowns","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Consumer instances revokations","formula":"query1"},{"alias":"Consumer instances shutdowns","formula":"query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.revoked{*}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consumer.shutdown{*}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":2,"width":4,"height":2}}]},"layout":{"x":0,"y":0,"width":12,"height":5}},{"id":5988438511387100,"definition":{"title":"Workers poll","type":"group","show_title":true,"layout_type":"ordered","widgets":[{"id":8769294644934352,"definition":{"title":"Enqueued jobs","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Enqueued jobs","formula":"query1"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.worker.enqueued_jobs.avg{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":2714502141463873,"definition":{"title":"Workers usage","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Busy workers (p95)","formula":"query1"},{"alias":"Total workers","formula":"query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.worker.processing.95percentile{*}","data_source":"metrics","name":"query1"},{"query":"sum:karafka.worker.total_threads{*}","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":5370086629441984,"definition":{"title":"Workers % utilization","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"% workers utilization","formula":"(query1 / query2) * 100"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.worker.processing.95percentile{*}","data_source":"metrics","name":"query1"},{"query":"sum:karafka.worker.total_threads{*}","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":0,"width":4,"height":2}}]},"layout":{"x":0,"y":5,"width":12,"height":3}},{"id":8544040083223278,"definition":{"title":"Throughput ","type":"group","show_title":true,"layout_type":"ordered","widgets":[{"id":3740207481939733,"definition":{"title":"Offset lag changes","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"formula":"derivative(query1)"}],"response_format":"timeseries","queries":[{"query":"max:karafka.consumer.offset{*} by {topic,partition}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":6319110548544878,"definition":{"title":"Batches processed per topic","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"formula":"query1"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.batches{*} by {partition,topic}.as_count()","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":6232784865331443,"definition":{"title":"Messages consumed per topic","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Messages consumed","formula":"query1"},{"alias":"Average batch size","formula":"query1 / query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.messages{*} by {partition,topic}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consumer.batches{*} by {partition,topic}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":2321394598982770,"definition":{"title":"Consumption lag (in seconds)","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Consumption lag in s (max)","formula":"query2 / 1000"},{"alias":"Consumption lag in s (avg)","formula":"query3 / 1000"},{"alias":"Consumption lag in s (p95)","formula":"query1 / 1000"}],"response_format":"timeseries","queries":[{"query":"max:karafka.consumer.consumption_lag.max{*}","data_source":"metrics","name":"query2"},{"query":"max:karafka.consumer.consumption_lag.avg{*}","data_source":"metrics","name":"query3"},{"query":"max:karafka.consumer.consumption_lag.95percentile{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":0,"y":2,"width":4,"height":2}},{"id":1062074781483741,"definition":{"title":"Processing lag (in ms)","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Processing lag in ms (p95)","formula":"query1"},{"alias":"Processing lag in ms (max)","formula":"query2"},{"alias":"Processing lag in ms (avg)","formula":"query3"}],"response_format":"timeseries","queries":[{"query":"max:karafka.consumer.processing_lag.95percentile{*}","data_source":"metrics","name":"query1"},{"query":"max:karafka.consumer.processing_lag.max{*}","data_source":"metrics","name":"query2"},{"query":"max:karafka.consumer.processing_lag.avg{*}","data_source":"metrics","name":"query3"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":2,"width":4,"height":2}},{"id":7497794728674267,"definition":{"title":"Batch processing time","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"formula":"query1"},{"formula":"query2"},{"formula":"query3"}],"response_format":"timeseries","queries":[{"query":"max:karafka.consumer.consumed.time_taken.95percentile{*} by {topic,partition}","data_source":"metrics","name":"query1"},{"query":"max:karafka.consumer.consumed.time_taken.max{*} by {topic,partition}","data_source":"metrics","name":"query2"},{"query":"max:karafka.consumer.consumed.time_taken.avg{*} by {topic,partition}","data_source":"metrics","name":"query3"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":2,"width":4,"height":2}},{"id":4192833027984161,"definition":{"title":"Batch size per topic","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Batch size p95","formula":"query1"},{"alias":"Batch size avg","formula":"query2"},{"alias":"Batch size max","formula":"query3"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.batch_size.95percentile{*} by {partition,topic}","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consumer.batch_size.avg{*} by {partition,topic}","data_source":"metrics","name":"query2"},{"query":"sum:karafka.consumer.batch_size.max{*} by {partition,topic}","data_source":"metrics","name":"query3"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":0,"y":4,"width":4,"height":2}},{"id":4741598444771147,"definition":{"title":"Messages consumed overall","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Messages consumed","formula":"query1"},{"alias":"Average batch size","formula":"query1 / query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.messages{*}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consumer.batches{*}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":4,"width":4,"height":2}},{"id":4502534794102513,"definition":{"title":"Polling times (ms)","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"time":{},"type":"timeseries","requests":[{"formulas":[{"alias":"p95 ms polling time","formula":"query1"},{"alias":"max ms polling time","formula":"query2"},{"alias":"average ms polling time","formula":"query3"}],"queries":[{"name":"query1","data_source":"metrics","query":"avg:karafka.listener.polling.time_taken.95percentile{*}"},{"name":"query2","data_source":"metrics","query":"avg:karafka.listener.polling.time_taken.max{*}"},{"name":"query3","data_source":"metrics","query":"avg:karafka.listener.polling.time_taken.avg{*}"}],"response_format":"timeseries","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":4,"width":4,"height":2}}]},"layout":{"x":0,"y":0,"width":12,"height":7,"is_column_break":true}}],"template_variables":[],"layout_type":"ordered","is_read_only":false,"notify_list":[],"reflow_type":"fixed","id":"s3u-z47-i6u"}
@@ -0,0 +1,232 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Karafka
4
+ module Instrumentation
5
+ # Namespace for vendor specific instrumentation
6
+ module Vendors
7
+ # Datadog specific instrumentation
8
+ module Datadog
9
+ # Listener that can be used to subscribe to Karafka to receive stats via StatsD
10
+ # and/or Datadog
11
+ #
12
+ # @note You need to setup the `dogstatsd-ruby` client and assign it
13
+ class Listener
14
+ include ::Karafka::Core::Configurable
15
+ extend Forwardable
16
+
17
+ def_delegators :config, :client, :rd_kafka_metrics, :namespace, :default_tags
18
+
19
+ # Value object for storing a single rdkafka metric publishing details
20
+ RdKafkaMetric = Struct.new(:type, :scope, :name, :key_location)
21
+
22
+ # Namespace under which the DD metrics should be published
23
+ setting :namespace, default: 'karafka'
24
+
25
+ # Datadog client that we should use to publish the metrics
26
+ setting :client
27
+
28
+ # Default tags we want to publish (for example hostname)
29
+ # Format as followed (example for hostname): `["host:#{Socket.gethostname}"]`
30
+ setting :default_tags, default: []
31
+
32
+ # All the rdkafka metrics we want to publish
33
+ #
34
+ # By default we publish quite a lot so this can be tuned
35
+ # Note, that the once with `_d` come from Karafka, not rdkafka or Kafka
36
+ setting :rd_kafka_metrics, default: [
37
+ # Client metrics
38
+ RdKafkaMetric.new(:count, :root, 'messages.consumed', 'rxmsgs_d'),
39
+ RdKafkaMetric.new(:count, :root, 'messages.consumed.bytes', 'rxmsg_bytes'),
40
+
41
+ # Broker metrics
42
+ RdKafkaMetric.new(:count, :brokers, 'consume.attempts', 'txretries_d'),
43
+ RdKafkaMetric.new(:count, :brokers, 'consume.errors', 'txerrs_d'),
44
+ RdKafkaMetric.new(:count, :brokers, 'receive.errors', 'rxerrs_d'),
45
+ RdKafkaMetric.new(:count, :brokers, 'connection.connects', 'connects_d'),
46
+ RdKafkaMetric.new(:count, :brokers, 'connection.disconnects', 'disconnects_d'),
47
+ RdKafkaMetric.new(:gauge, :brokers, 'network.latency.avg', %w[rtt avg]),
48
+ RdKafkaMetric.new(:gauge, :brokers, 'network.latency.p95', %w[rtt p95]),
49
+ RdKafkaMetric.new(:gauge, :brokers, 'network.latency.p99', %w[rtt p99])
50
+ ].freeze
51
+
52
+ configure
53
+
54
+ # @param block [Proc] configuration block
55
+ def initialize(&block)
56
+ configure
57
+ setup(&block) if block
58
+ end
59
+
60
+ # @param block [Proc] configuration block
61
+ # @note We define this alias to be consistent with `WaterDrop#setup`
62
+ def setup(&block)
63
+ configure(&block)
64
+ end
65
+
66
+ # Hooks up to WaterDrop instrumentation for emitted statistics
67
+ #
68
+ # @param event [Dry::Events::Event]
69
+ def on_statistics_emitted(event)
70
+ statistics = event[:statistics]
71
+
72
+ rd_kafka_metrics.each do |metric|
73
+ report_metric(metric, statistics)
74
+ end
75
+ end
76
+
77
+ # Increases the errors count by 1
78
+ #
79
+ # @param event [Dry::Events::Event]
80
+ def on_error_occurred(event)
81
+ extra_tags = ["type:#{event[:type]}"]
82
+
83
+ if event.payload[:caller].respond_to?(:messages)
84
+ metadata = event.payload[:caller].messages.metadata
85
+
86
+ extra_tags += [
87
+ "topic:#{metadata.topic}",
88
+ "partition:#{metadata.partition}"
89
+ ]
90
+ end
91
+
92
+ count('error_occurred', 1, tags: default_tags + extra_tags)
93
+ end
94
+
95
+ # Reports how many messages we've polled and how much time did we spend on it
96
+ #
97
+ # @param event [Dry::Events::Event]
98
+ def on_connection_listener_fetch_loop_received(event)
99
+ time_taken = event[:time]
100
+ messages_count = event[:messages_buffer].size
101
+
102
+ histogram('listener.polling.time_taken', time_taken, tags: default_tags)
103
+ histogram('listener.polling.messages', messages_count, tags: default_tags)
104
+ end
105
+
106
+ # Here we report majority of things related to processing as we have access to the
107
+ # consumer
108
+ # @param event [Dry::Events::Event]
109
+ def on_consumer_consumed(event)
110
+ messages = event.payload[:caller].messages
111
+ metadata = messages.metadata
112
+
113
+ tags = default_tags + [
114
+ "topic:#{metadata.topic}",
115
+ "partition:#{metadata.partition}"
116
+ ]
117
+
118
+ count('consumer.messages', messages.count, tags: tags)
119
+ count('consumer.batches', 1, tags: tags)
120
+ gauge('consumer.offset', metadata.last_offset, tags: tags)
121
+ histogram('consumer.consumed.time_taken', event[:time], tags: tags)
122
+ histogram('consumer.batch_size', messages.count, tags: tags)
123
+ histogram('consumer.processing_lag', metadata.processing_lag, tags: tags)
124
+ histogram('consumer.consumption_lag', metadata.consumption_lag, tags: tags)
125
+ end
126
+
127
+ # @param event [Dry::Events::Event]
128
+ def on_consumer_revoked(event)
129
+ messages = event.payload[:caller].messages
130
+ metadata = messages.metadata
131
+
132
+ tags = default_tags + [
133
+ "topic:#{metadata.topic}",
134
+ "partition:#{metadata.partition}"
135
+ ]
136
+
137
+ count('consumer.revoked', 1, tags: tags)
138
+ end
139
+
140
+ # @param event [Dry::Events::Event]
141
+ def on_consumer_shutdown(event)
142
+ messages = event.payload[:caller].messages
143
+ metadata = messages.metadata
144
+
145
+ tags = default_tags + [
146
+ "topic:#{metadata.topic}",
147
+ "partition:#{metadata.partition}"
148
+ ]
149
+
150
+ count('consumer.shutdown', 1, tags: tags)
151
+ end
152
+
153
+ # Worker related metrics
154
+ # @param event [Dry::Events::Event]
155
+ def on_worker_process(event)
156
+ jq_stats = event[:jobs_queue].statistics
157
+
158
+ gauge('worker.total_threads', Karafka::App.config.concurrency, tags: default_tags)
159
+ histogram('worker.processing', jq_stats[:processing], tags: default_tags)
160
+ histogram('worker.enqueued_jobs', jq_stats[:enqueued], tags: default_tags)
161
+ end
162
+
163
+ # We report this metric before and after processing for higher accuracy
164
+ # Without this, the utilization would not be fully reflected
165
+ # @param event [Dry::Events::Event]
166
+ def on_worker_processed(event)
167
+ jq_stats = event[:jobs_queue].statistics
168
+
169
+ histogram('worker.processing', jq_stats[:processing], tags: default_tags)
170
+ end
171
+
172
+ private
173
+
174
+ %i[
175
+ count
176
+ gauge
177
+ histogram
178
+ increment
179
+ decrement
180
+ ].each do |metric_type|
181
+ class_eval <<~METHODS, __FILE__, __LINE__ + 1
182
+ def #{metric_type}(key, *args)
183
+ client.#{metric_type}(
184
+ namespaced_metric(key),
185
+ *args
186
+ )
187
+ end
188
+ METHODS
189
+ end
190
+
191
+ # Wraps metric name in listener's namespace
192
+ # @param metric_name [String] RdKafkaMetric name
193
+ # @return [String]
194
+ def namespaced_metric(metric_name)
195
+ "#{namespace}.#{metric_name}"
196
+ end
197
+
198
+ # Reports a given metric statistics to Datadog
199
+ # @param metric [RdKafkaMetric] metric value object
200
+ # @param statistics [Hash] hash with all the statistics emitted
201
+ def report_metric(metric, statistics)
202
+ case metric.scope
203
+ when :root
204
+ public_send(
205
+ metric.type,
206
+ metric.name,
207
+ statistics.fetch(*metric.key_location),
208
+ tags: default_tags
209
+ )
210
+ when :brokers
211
+ statistics.fetch('brokers').each_value do |broker_statistics|
212
+ # Skip bootstrap nodes
213
+ # Bootstrap nodes have nodeid -1, other nodes have positive
214
+ # node ids
215
+ next if broker_statistics['nodeid'] == -1
216
+
217
+ public_send(
218
+ metric.type,
219
+ metric.name,
220
+ broker_statistics.dig(*metric.key_location),
221
+ tags: default_tags + ["broker:#{broker_statistics['nodename']}"]
222
+ )
223
+ end
224
+ else
225
+ raise ArgumentError, metric.scope
226
+ end
227
+ end
228
+ end
229
+ end
230
+ end
231
+ end
232
+ end
@@ -12,7 +12,7 @@ module Karafka
12
12
  # @note We need this to make sure that we allocate proper dispatched events only to
13
13
  # callback listeners that should publish them
14
14
  def name
15
- ::Rdkafka::Bindings.rd_kafka_name(@native_kafka)
15
+ @name ||= ::Rdkafka::Bindings.rd_kafka_name(@native_kafka)
16
16
  end
17
17
  end
18
18
  end
@@ -23,7 +23,9 @@ module Karafka
23
23
  dispatch_method: :produce_async,
24
24
  # We don't create a dummy proc based partitioner as we would have to evaluate it with
25
25
  # each job.
26
- partitioner: nil
26
+ partitioner: nil,
27
+ # Allows for usage of `:key` or `:partition_key`
28
+ partition_key_type: :key
27
29
  }.freeze
28
30
 
29
31
  private_constant :DEFAULTS
@@ -45,11 +47,12 @@ module Karafka
45
47
  # @return [Hash] hash with dispatch details to which we merge topic and payload
46
48
  def dispatch_details(job)
47
49
  partitioner = fetch_option(job, :partitioner, DEFAULTS)
50
+ key_type = fetch_option(job, :partition_key_type, DEFAULTS)
48
51
 
49
52
  return {} unless partitioner
50
53
 
51
54
  {
52
- partition_key: partitioner.call(job)
55
+ key_type => partitioner.call(job)
53
56
  }
54
57
  end
55
58
  end
@@ -25,6 +25,7 @@ module Karafka
25
25
 
26
26
  optional(:dispatch_method) { |val| %i[produce_async produce_sync].include?(val) }
27
27
  optional(:partitioner) { |val| val.respond_to?(:call) }
28
+ optional(:partition_key_type) { |val| %i[key partition_key].include?(val) }
28
29
  end
29
30
  end
30
31
  end
@@ -36,7 +36,7 @@ module Karafka
36
36
  end
37
37
  end
38
38
 
39
- # Runs extra logic after consumption that is related to handling long running jobs
39
+ # Runs extra logic after consumption that is related to handling long-running jobs
40
40
  # @note This overwrites the '#on_after_consume' from the base consumer
41
41
  def on_after_consume
42
42
  coordinator.on_finished do |first_group_message, last_group_message|
@@ -59,7 +59,7 @@ module Karafka
59
59
  # Mark as consumed only if manual offset management is not on
60
60
  mark_as_consumed(last_message) unless topic.manual_offset_management? || revoked?
61
61
 
62
- # If this is not a long running job there is nothing for us to do here
62
+ # If this is not a long-running job there is nothing for us to do here
63
63
  return unless topic.long_running_job?
64
64
 
65
65
  # Once processing is done, we move to the new offset based on commits
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ # This Karafka component is a Pro component.
4
+ # All of the commercial components are present in the lib/karafka/pro directory of this
5
+ # repository and their usage requires commercial license agreement.
6
+ #
7
+ # Karafka has also commercial-friendly license, commercial support and commercial components.
8
+ #
9
+ # By sending a pull request to the pro components, you are agreeing to transfer the copyright of
10
+ # your code to Maciej Mensfeld.
11
+
12
+ module Karafka
13
+ module Pro
14
+ # Namespace for Karafka Pro related contracts
15
+ module Contracts
16
+ # Base contract for Pro components contracts
17
+ class Base < ::Karafka::Contracts::Base
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
+ # This Karafka component is a Pro component.
4
+ # All of the commercial components are present in the lib/karafka/pro directory of this
5
+ # repository and their usage requires commercial license agreement.
6
+ #
7
+ # Karafka has also commercial-friendly license, commercial support and commercial components.
8
+ #
9
+ # By sending a pull request to the pro components, you are agreeing to transfer the copyright of
10
+ # your code to Maciej Mensfeld.
11
+
12
+ module Karafka
13
+ module Pro
14
+ module Contracts
15
+ # Contract for validating correct Pro components setup on a consumer group and topic levels
16
+ class ConsumerGroup < Base
17
+ virtual do |data, errors|
18
+ next unless errors.empty?
19
+ next unless data.key?(:topics)
20
+
21
+ fetched_errors = []
22
+
23
+ data.fetch(:topics).each do |topic|
24
+ ConsumerGroupTopic.new.call(topic).errors.each do |key, value|
25
+ fetched_errors << [[topic, key].flatten, value]
26
+ end
27
+ end
28
+
29
+ fetched_errors
30
+ end
31
+ end
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
+ # This Karafka component is a Pro component.
4
+ # All of the commercial components are present in the lib/karafka/pro directory of this
5
+ # repository and their usage requires commercial license agreement.
6
+ #
7
+ # Karafka has also commercial-friendly license, commercial support and commercial components.
8
+ #
9
+ # By sending a pull request to the pro components, you are agreeing to transfer the copyright of
10
+ # your code to Maciej Mensfeld.
11
+
12
+ module Karafka
13
+ module Pro
14
+ module Contracts
15
+ # Contract for validating correct Pro components setup on a topic levels
16
+ class ConsumerGroupTopic < Base
17
+ configure do |config|
18
+ config.error_messages = YAML.safe_load(
19
+ File.read(
20
+ File.join(Karafka.gem_root, 'config', 'errors.yml')
21
+ )
22
+ ).fetch('en').fetch('validations').fetch('pro_consumer_group_topic')
23
+ end
24
+
25
+ virtual do |data|
26
+ next if data[:consumer] < Karafka::Pro::BaseConsumer
27
+
28
+ [[%i[consumer], :consumer_format]]
29
+ end
30
+ end
31
+ end
32
+ end
33
+ end
@@ -22,7 +22,11 @@ module Karafka
22
22
  processing/jobs_builder
23
23
  processing/coordinator
24
24
  processing/partitioner
25
- routing/extensions
25
+ contracts/base
26
+ contracts/consumer_group
27
+ contracts/consumer_group_topic
28
+ routing/topic_extensions
29
+ routing/builder_extensions
26
30
  active_job/consumer
27
31
  active_job/dispatcher
28
32
  active_job/job_options_contract
@@ -37,6 +41,16 @@ module Karafka
37
41
  def setup(config)
38
42
  COMPONENTS.each { |component| require_relative(component) }
39
43
 
44
+ reconfigure(config)
45
+
46
+ load_routing_extensions
47
+ end
48
+
49
+ private
50
+
51
+ # Sets proper config options to use pro components
52
+ # @param config [WaterDrop::Configurable::Node] root config node
53
+ def reconfigure(config)
40
54
  icfg = config.internal
41
55
 
42
56
  icfg.processing.coordinator_class = Processing::Coordinator
@@ -48,10 +62,14 @@ module Karafka
48
62
  icfg.active_job.dispatcher = ActiveJob::Dispatcher.new
49
63
  icfg.active_job.job_options_contract = ActiveJob::JobOptionsContract.new
50
64
 
51
- ::Karafka::Routing::Topic.include(Routing::Extensions)
52
-
53
65
  config.monitor.subscribe(PerformanceTracker.instance)
54
66
  end
67
+
68
+ # Loads routing extensions
69
+ def load_routing_extensions
70
+ ::Karafka::Routing::Topic.include(Routing::TopicExtensions)
71
+ ::Karafka::Routing::Builder.prepend(Routing::BuilderExtensions)
72
+ end
55
73
  end
56
74
  end
57
75
  end
@@ -27,9 +27,28 @@ module Karafka
27
27
  # process the data. With one thread it is not worth partitioning the work as the work
28
28
  # itself will be assigned to one thread (pointless work)
29
29
  if ktopic.virtual_partitioner? && @concurrency > 1
30
- messages
31
- .group_by { |msg| ktopic.virtual_partitioner.call(msg).hash.abs % @concurrency }
32
- .each { |group_id, messages_group| yield(group_id, messages_group) }
30
+ # We need to reduce it to number of threads, so the group_id is not a direct effect
31
+ # of the end user action. Otherwise the persistence layer for consumers would cache
32
+ # it forever and it would cause memory leaks
33
+ groupings = messages
34
+ .group_by { |msg| ktopic.virtual_partitioner.call(msg) }
35
+ .values
36
+
37
+ # Reduce the max concurrency to a size that matches the concurrency
38
+ # As mentioned above we cannot use the partitioning keys directly as it could cause
39
+ # memory leaks
40
+ #
41
+ # The algorithm here is simple, we assume that the most costly in terms of processing,
42
+ # will be processing of the biggest group and we reduce the smallest once to have
43
+ # max of groups equal to concurrency
44
+ while groupings.size > @concurrency
45
+ groupings.sort_by! { |grouping| -grouping.size }
46
+
47
+ # Offset order needs to be maintained for virtual partitions
48
+ groupings << (groupings.pop + groupings.pop).sort_by!(&:offset)
49
+ end
50
+
51
+ groupings.each_with_index { |messages_group, index| yield(index, messages_group) }
33
52
  else
34
53
  # When no virtual partitioner, works as regular one
35
54
  yield(0, messages)
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ # This Karafka component is a Pro component.
4
+ # All of the commercial components are present in the lib/karafka/pro directory of this
5
+ # repository and their usage requires commercial license agreement.
6
+ #
7
+ # Karafka has also commercial-friendly license, commercial support and commercial components.
8
+ #
9
+ # By sending a pull request to the pro components, you are agreeing to transfer the copyright of
10
+ # your code to Maciej Mensfeld.
11
+
12
+ module Karafka
13
+ module Pro
14
+ # Pro routing components
15
+ module Routing
16
+ # Routing extensions for builder to be able to validate Pro components correct usage
17
+ module BuilderExtensions
18
+ # Validate consumer groups with pro contracts
19
+ # @param block [Proc] routing defining block
20
+ def draw(&block)
21
+ super
22
+
23
+ each do |consumer_group|
24
+ ::Karafka::Pro::Contracts::ConsumerGroup.new.validate!(consumer_group.to_h)
25
+ end
26
+ end
27
+ end
28
+ end
29
+ end
30
+ end
@@ -14,7 +14,7 @@ module Karafka
14
14
  # Pro routing components
15
15
  module Routing
16
16
  # Routing extensions that allow to configure some extra PRO routing options
17
- module Extensions
17
+ module TopicExtensions
18
18
  class << self
19
19
  # @param base [Class] class we extend
20
20
  def included(base)
@@ -28,7 +28,7 @@ module Karafka
28
28
  virtual_partitioner != nil
29
29
  end
30
30
 
31
- # @return [Boolean] is a given job on a topic a long running one
31
+ # @return [Boolean] is a given job on a topic a long-running one
32
32
  def long_running_job?
33
33
  @long_running_job || false
34
34
  end
@@ -9,6 +9,7 @@ module Karafka
9
9
  SIGINT
10
10
  SIGQUIT
11
11
  SIGTERM
12
+ SIGTTIN
12
13
  ].freeze
13
14
 
14
15
  HANDLED_SIGNALS.each do |signal|
@@ -119,6 +119,17 @@ module Karafka
119
119
  @semaphores[group_id].pop while wait?(group_id)
120
120
  end
121
121
 
122
+ # - `processing` - number of jobs that are currently being processed (active work)
123
+ # - `enqueued` - number of jobs in the queue that are waiting to be picked up by a worker
124
+ #
125
+ # @return [Hash] hash with basic usage statistics of this queue.
126
+ def statistics
127
+ {
128
+ processing: size - @queue.size,
129
+ enqueued: @queue.size
130
+ }.freeze
131
+ end
132
+
122
133
  private
123
134
 
124
135
  # @param group_id [String] id of the group in which jobs we're interested.
@@ -47,9 +47,11 @@ module Karafka
47
47
  job = @jobs_queue.pop
48
48
 
49
49
  if job
50
- Karafka.monitor.instrument('worker.process', caller: self, job: job)
50
+ instrument_details = { caller: self, job: job, jobs_queue: @jobs_queue }
51
51
 
52
- Karafka.monitor.instrument('worker.processed', caller: self, job: job) do
52
+ Karafka.monitor.instrument('worker.process', instrument_details)
53
+
54
+ Karafka.monitor.instrument('worker.processed', instrument_details) do
53
55
  job.before_call
54
56
 
55
57
  # If a job is marked as non blocking, we can run a tick in the job queue and if there
@@ -104,6 +104,9 @@ module Karafka
104
104
  # We're done waiting, lets kill them!
105
105
  workers.each(&:terminate)
106
106
  listeners.each(&:terminate)
107
+ # We always need to shutdown clients to make sure we do not force the GC to close consumer.
108
+ # This can cause memory leaks and crashes.
109
+ listeners.each(&:shutdown)
107
110
 
108
111
  Karafka::App.producer.close
109
112
 
@@ -12,7 +12,7 @@ module Karafka
12
12
  # enough and will still keep the code simple
13
13
  # @see Karafka::Setup::Configurators::Base for more details about configurators api
14
14
  class Config
15
- extend ::WaterDrop::Configurable
15
+ extend ::Karafka::Core::Configurable
16
16
 
17
17
  # Defaults for kafka settings, that will be overwritten only if not present already
18
18
  KAFKA_DEFAULTS = {
@@ -60,9 +60,9 @@ module Karafka
60
60
  # option [Boolean] should we leave offset management to the user
61
61
  setting :manual_offset_management, default: false
62
62
  # options max_messages [Integer] how many messages do we want to fetch from Kafka in one go
63
- setting :max_messages, default: 1_000
63
+ setting :max_messages, default: 100
64
64
  # option [Integer] number of milliseconds we can wait while fetching data
65
- setting :max_wait_time, default: 5_000
65
+ setting :max_wait_time, default: 1_000
66
66
  # option shutdown_timeout [Integer] the number of milliseconds after which Karafka no
67
67
  # longer waits for the consumers to stop gracefully but instead we force terminate
68
68
  # everything.
@@ -3,5 +3,5 @@
3
3
  # Main module namespace
4
4
  module Karafka
5
5
  # Current Karafka version
6
- VERSION = '2.0.0.rc2'
6
+ VERSION = '2.0.0.rc5'
7
7
  end