karafka 2.0.0.rc2 → 2.0.0.rc5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/CHANGELOG.md +29 -2
- data/CONTRIBUTING.md +4 -8
- data/Gemfile.lock +9 -21
- data/LICENSE-COMM +1 -1
- data/README.md +46 -8
- data/bin/integrations +1 -1
- data/config/errors.yml +4 -0
- data/docker-compose.yml +3 -0
- data/karafka.gemspec +5 -5
- data/lib/karafka/active_job/consumer.rb +2 -0
- data/lib/karafka/connection/client.rb +9 -4
- data/lib/karafka/connection/listener.rb +21 -7
- data/lib/karafka/contracts/base.rb +1 -1
- data/lib/karafka/errors.rb +0 -3
- data/lib/karafka/instrumentation/callbacks/statistics.rb +1 -2
- data/lib/karafka/instrumentation/logger_listener.rb +27 -3
- data/lib/karafka/instrumentation/monitor.rb +14 -59
- data/lib/karafka/instrumentation/notifications.rb +52 -0
- data/lib/karafka/instrumentation/vendors/datadog/dashboard.json +1 -0
- data/lib/karafka/instrumentation/vendors/datadog/listener.rb +232 -0
- data/lib/karafka/patches/rdkafka/consumer.rb +1 -1
- data/lib/karafka/pro/active_job/dispatcher.rb +5 -2
- data/lib/karafka/pro/active_job/job_options_contract.rb +1 -0
- data/lib/karafka/pro/base_consumer.rb +2 -2
- data/lib/karafka/pro/contracts/base.rb +21 -0
- data/lib/karafka/pro/contracts/consumer_group.rb +34 -0
- data/lib/karafka/pro/contracts/consumer_group_topic.rb +33 -0
- data/lib/karafka/pro/loader.rb +21 -3
- data/lib/karafka/pro/processing/partitioner.rb +22 -3
- data/lib/karafka/pro/routing/builder_extensions.rb +30 -0
- data/lib/karafka/pro/routing/{extensions.rb → topic_extensions.rb} +2 -2
- data/lib/karafka/process.rb +1 -0
- data/lib/karafka/processing/jobs_queue.rb +11 -0
- data/lib/karafka/processing/worker.rb +4 -2
- data/lib/karafka/server.rb +3 -0
- data/lib/karafka/setup/config.rb +3 -3
- data/lib/karafka/version.rb +1 -1
- data/lib/karafka.rb +3 -2
- data.tar.gz.sig +0 -0
- metadata +27 -14
- metadata.gz.sig +0 -0
@@ -0,0 +1 @@
|
|
1
|
+
{"title":"Karafka monitoring dashboard","description":"","widgets":[{"id":7444969424381053,"definition":{"title":"Stability & errors","type":"group","show_title":true,"layout_type":"ordered","widgets":[{"id":8304008422587936,"definition":{"title":"Client connects and disconnects","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Connects","formula":"query1"},{"alias":"Disconnects","formula":"query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.connection.connects{*} by {host}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.connection.disconnects{*} by {host}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}]},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":3722865443336921,"definition":{"title":"Errors encountered (any)","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"in-karafka errors","formula":"query1"},{"alias":"librdkafka consume errors","formula":"query2"},{"alias":"librdkafka receive errors","formula":"query3"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.error_occurred{*} by {type}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consume.errors{*}.as_count()","data_source":"metrics","name":"query2"},{"query":"sum:karafka.receive.errors{*}.as_count()","data_source":"metrics","name":"query3"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":5477381252952760,"definition":{"title":"Processing errors","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"formula":"query1"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.error_occurred{type:consumer.consume.error} by {partition,topic}.as_count()","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":2357301680769076,"definition":{"title":"Processing errors rate per topic","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"% error rate per topic","formula":"(query1 / (query1 + query2)) * 100"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.error_occurred{type:consumer.consume.error} by {topic,partition}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consumer.batches{*} by {topic,partition}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}]},"layout":{"x":0,"y":2,"width":4,"height":2}},{"id":3902930069982135,"definition":{"title":"Batches successful vs failures","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Successfully processed batch","formula":"query1"},{"alias":"Batch processing with error","formula":"query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.batches{*} by {partition,topic}.as_count()","data_source":"metrics","name":"query1"},{"query":"avg:karafka.error_occurred{type:consumer.consume.error} by {partition,topic}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":2,"width":4,"height":2}},{"id":718749162159145,"definition":{"title":"Consumer instances revocations and shutdowns","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Consumer instances revokations","formula":"query1"},{"alias":"Consumer instances shutdowns","formula":"query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.revoked{*}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consumer.shutdown{*}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":2,"width":4,"height":2}}]},"layout":{"x":0,"y":0,"width":12,"height":5}},{"id":5988438511387100,"definition":{"title":"Workers poll","type":"group","show_title":true,"layout_type":"ordered","widgets":[{"id":8769294644934352,"definition":{"title":"Enqueued jobs","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Enqueued jobs","formula":"query1"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.worker.enqueued_jobs.avg{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":2714502141463873,"definition":{"title":"Workers usage","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Busy workers (p95)","formula":"query1"},{"alias":"Total workers","formula":"query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.worker.processing.95percentile{*}","data_source":"metrics","name":"query1"},{"query":"sum:karafka.worker.total_threads{*}","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":5370086629441984,"definition":{"title":"Workers % utilization","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"% workers utilization","formula":"(query1 / query2) * 100"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.worker.processing.95percentile{*}","data_source":"metrics","name":"query1"},{"query":"sum:karafka.worker.total_threads{*}","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":0,"width":4,"height":2}}]},"layout":{"x":0,"y":5,"width":12,"height":3}},{"id":8544040083223278,"definition":{"title":"Throughput ","type":"group","show_title":true,"layout_type":"ordered","widgets":[{"id":3740207481939733,"definition":{"title":"Offset lag changes","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"formula":"derivative(query1)"}],"response_format":"timeseries","queries":[{"query":"max:karafka.consumer.offset{*} by {topic,partition}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":6319110548544878,"definition":{"title":"Batches processed per topic","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"formula":"query1"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.batches{*} by {partition,topic}.as_count()","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":6232784865331443,"definition":{"title":"Messages consumed per topic","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Messages consumed","formula":"query1"},{"alias":"Average batch size","formula":"query1 / query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.messages{*} by {partition,topic}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consumer.batches{*} by {partition,topic}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":2321394598982770,"definition":{"title":"Consumption lag (in seconds)","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Consumption lag in s (max)","formula":"query2 / 1000"},{"alias":"Consumption lag in s (avg)","formula":"query3 / 1000"},{"alias":"Consumption lag in s (p95)","formula":"query1 / 1000"}],"response_format":"timeseries","queries":[{"query":"max:karafka.consumer.consumption_lag.max{*}","data_source":"metrics","name":"query2"},{"query":"max:karafka.consumer.consumption_lag.avg{*}","data_source":"metrics","name":"query3"},{"query":"max:karafka.consumer.consumption_lag.95percentile{*}","data_source":"metrics","name":"query1"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":0,"y":2,"width":4,"height":2}},{"id":1062074781483741,"definition":{"title":"Processing lag (in ms)","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Processing lag in ms (p95)","formula":"query1"},{"alias":"Processing lag in ms (max)","formula":"query2"},{"alias":"Processing lag in ms (avg)","formula":"query3"}],"response_format":"timeseries","queries":[{"query":"max:karafka.consumer.processing_lag.95percentile{*}","data_source":"metrics","name":"query1"},{"query":"max:karafka.consumer.processing_lag.max{*}","data_source":"metrics","name":"query2"},{"query":"max:karafka.consumer.processing_lag.avg{*}","data_source":"metrics","name":"query3"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":2,"width":4,"height":2}},{"id":7497794728674267,"definition":{"title":"Batch processing time","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"formula":"query1"},{"formula":"query2"},{"formula":"query3"}],"response_format":"timeseries","queries":[{"query":"max:karafka.consumer.consumed.time_taken.95percentile{*} by {topic,partition}","data_source":"metrics","name":"query1"},{"query":"max:karafka.consumer.consumed.time_taken.max{*} by {topic,partition}","data_source":"metrics","name":"query2"},{"query":"max:karafka.consumer.consumed.time_taken.avg{*} by {topic,partition}","data_source":"metrics","name":"query3"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":2,"width":4,"height":2}},{"id":4192833027984161,"definition":{"title":"Batch size per topic","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Batch size p95","formula":"query1"},{"alias":"Batch size avg","formula":"query2"},{"alias":"Batch size max","formula":"query3"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.batch_size.95percentile{*} by {partition,topic}","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consumer.batch_size.avg{*} by {partition,topic}","data_source":"metrics","name":"query2"},{"query":"sum:karafka.consumer.batch_size.max{*} by {partition,topic}","data_source":"metrics","name":"query3"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":0,"y":4,"width":4,"height":2}},{"id":4741598444771147,"definition":{"title":"Messages consumed overall","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"formulas":[{"alias":"Messages consumed","formula":"query1"},{"alias":"Average batch size","formula":"query1 / query2"}],"response_format":"timeseries","queries":[{"query":"sum:karafka.consumer.messages{*}.as_count()","data_source":"metrics","name":"query1"},{"query":"sum:karafka.consumer.batches{*}.as_count()","data_source":"metrics","name":"query2"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":4,"y":4,"width":4,"height":2}},{"id":4502534794102513,"definition":{"title":"Polling times (ms)","title_size":"16","title_align":"left","show_legend":true,"legend_layout":"auto","legend_columns":["avg","min","max","value","sum"],"time":{},"type":"timeseries","requests":[{"formulas":[{"alias":"p95 ms polling time","formula":"query1"},{"alias":"max ms polling time","formula":"query2"},{"alias":"average ms polling time","formula":"query3"}],"queries":[{"name":"query1","data_source":"metrics","query":"avg:karafka.listener.polling.time_taken.95percentile{*}"},{"name":"query2","data_source":"metrics","query":"avg:karafka.listener.polling.time_taken.max{*}"},{"name":"query3","data_source":"metrics","query":"avg:karafka.listener.polling.time_taken.avg{*}"}],"response_format":"timeseries","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}]},"layout":{"x":8,"y":4,"width":4,"height":2}}]},"layout":{"x":0,"y":0,"width":12,"height":7,"is_column_break":true}}],"template_variables":[],"layout_type":"ordered","is_read_only":false,"notify_list":[],"reflow_type":"fixed","id":"s3u-z47-i6u"}
|
@@ -0,0 +1,232 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Karafka
|
4
|
+
module Instrumentation
|
5
|
+
# Namespace for vendor specific instrumentation
|
6
|
+
module Vendors
|
7
|
+
# Datadog specific instrumentation
|
8
|
+
module Datadog
|
9
|
+
# Listener that can be used to subscribe to Karafka to receive stats via StatsD
|
10
|
+
# and/or Datadog
|
11
|
+
#
|
12
|
+
# @note You need to setup the `dogstatsd-ruby` client and assign it
|
13
|
+
class Listener
|
14
|
+
include ::Karafka::Core::Configurable
|
15
|
+
extend Forwardable
|
16
|
+
|
17
|
+
def_delegators :config, :client, :rd_kafka_metrics, :namespace, :default_tags
|
18
|
+
|
19
|
+
# Value object for storing a single rdkafka metric publishing details
|
20
|
+
RdKafkaMetric = Struct.new(:type, :scope, :name, :key_location)
|
21
|
+
|
22
|
+
# Namespace under which the DD metrics should be published
|
23
|
+
setting :namespace, default: 'karafka'
|
24
|
+
|
25
|
+
# Datadog client that we should use to publish the metrics
|
26
|
+
setting :client
|
27
|
+
|
28
|
+
# Default tags we want to publish (for example hostname)
|
29
|
+
# Format as followed (example for hostname): `["host:#{Socket.gethostname}"]`
|
30
|
+
setting :default_tags, default: []
|
31
|
+
|
32
|
+
# All the rdkafka metrics we want to publish
|
33
|
+
#
|
34
|
+
# By default we publish quite a lot so this can be tuned
|
35
|
+
# Note, that the once with `_d` come from Karafka, not rdkafka or Kafka
|
36
|
+
setting :rd_kafka_metrics, default: [
|
37
|
+
# Client metrics
|
38
|
+
RdKafkaMetric.new(:count, :root, 'messages.consumed', 'rxmsgs_d'),
|
39
|
+
RdKafkaMetric.new(:count, :root, 'messages.consumed.bytes', 'rxmsg_bytes'),
|
40
|
+
|
41
|
+
# Broker metrics
|
42
|
+
RdKafkaMetric.new(:count, :brokers, 'consume.attempts', 'txretries_d'),
|
43
|
+
RdKafkaMetric.new(:count, :brokers, 'consume.errors', 'txerrs_d'),
|
44
|
+
RdKafkaMetric.new(:count, :brokers, 'receive.errors', 'rxerrs_d'),
|
45
|
+
RdKafkaMetric.new(:count, :brokers, 'connection.connects', 'connects_d'),
|
46
|
+
RdKafkaMetric.new(:count, :brokers, 'connection.disconnects', 'disconnects_d'),
|
47
|
+
RdKafkaMetric.new(:gauge, :brokers, 'network.latency.avg', %w[rtt avg]),
|
48
|
+
RdKafkaMetric.new(:gauge, :brokers, 'network.latency.p95', %w[rtt p95]),
|
49
|
+
RdKafkaMetric.new(:gauge, :brokers, 'network.latency.p99', %w[rtt p99])
|
50
|
+
].freeze
|
51
|
+
|
52
|
+
configure
|
53
|
+
|
54
|
+
# @param block [Proc] configuration block
|
55
|
+
def initialize(&block)
|
56
|
+
configure
|
57
|
+
setup(&block) if block
|
58
|
+
end
|
59
|
+
|
60
|
+
# @param block [Proc] configuration block
|
61
|
+
# @note We define this alias to be consistent with `WaterDrop#setup`
|
62
|
+
def setup(&block)
|
63
|
+
configure(&block)
|
64
|
+
end
|
65
|
+
|
66
|
+
# Hooks up to WaterDrop instrumentation for emitted statistics
|
67
|
+
#
|
68
|
+
# @param event [Dry::Events::Event]
|
69
|
+
def on_statistics_emitted(event)
|
70
|
+
statistics = event[:statistics]
|
71
|
+
|
72
|
+
rd_kafka_metrics.each do |metric|
|
73
|
+
report_metric(metric, statistics)
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
# Increases the errors count by 1
|
78
|
+
#
|
79
|
+
# @param event [Dry::Events::Event]
|
80
|
+
def on_error_occurred(event)
|
81
|
+
extra_tags = ["type:#{event[:type]}"]
|
82
|
+
|
83
|
+
if event.payload[:caller].respond_to?(:messages)
|
84
|
+
metadata = event.payload[:caller].messages.metadata
|
85
|
+
|
86
|
+
extra_tags += [
|
87
|
+
"topic:#{metadata.topic}",
|
88
|
+
"partition:#{metadata.partition}"
|
89
|
+
]
|
90
|
+
end
|
91
|
+
|
92
|
+
count('error_occurred', 1, tags: default_tags + extra_tags)
|
93
|
+
end
|
94
|
+
|
95
|
+
# Reports how many messages we've polled and how much time did we spend on it
|
96
|
+
#
|
97
|
+
# @param event [Dry::Events::Event]
|
98
|
+
def on_connection_listener_fetch_loop_received(event)
|
99
|
+
time_taken = event[:time]
|
100
|
+
messages_count = event[:messages_buffer].size
|
101
|
+
|
102
|
+
histogram('listener.polling.time_taken', time_taken, tags: default_tags)
|
103
|
+
histogram('listener.polling.messages', messages_count, tags: default_tags)
|
104
|
+
end
|
105
|
+
|
106
|
+
# Here we report majority of things related to processing as we have access to the
|
107
|
+
# consumer
|
108
|
+
# @param event [Dry::Events::Event]
|
109
|
+
def on_consumer_consumed(event)
|
110
|
+
messages = event.payload[:caller].messages
|
111
|
+
metadata = messages.metadata
|
112
|
+
|
113
|
+
tags = default_tags + [
|
114
|
+
"topic:#{metadata.topic}",
|
115
|
+
"partition:#{metadata.partition}"
|
116
|
+
]
|
117
|
+
|
118
|
+
count('consumer.messages', messages.count, tags: tags)
|
119
|
+
count('consumer.batches', 1, tags: tags)
|
120
|
+
gauge('consumer.offset', metadata.last_offset, tags: tags)
|
121
|
+
histogram('consumer.consumed.time_taken', event[:time], tags: tags)
|
122
|
+
histogram('consumer.batch_size', messages.count, tags: tags)
|
123
|
+
histogram('consumer.processing_lag', metadata.processing_lag, tags: tags)
|
124
|
+
histogram('consumer.consumption_lag', metadata.consumption_lag, tags: tags)
|
125
|
+
end
|
126
|
+
|
127
|
+
# @param event [Dry::Events::Event]
|
128
|
+
def on_consumer_revoked(event)
|
129
|
+
messages = event.payload[:caller].messages
|
130
|
+
metadata = messages.metadata
|
131
|
+
|
132
|
+
tags = default_tags + [
|
133
|
+
"topic:#{metadata.topic}",
|
134
|
+
"partition:#{metadata.partition}"
|
135
|
+
]
|
136
|
+
|
137
|
+
count('consumer.revoked', 1, tags: tags)
|
138
|
+
end
|
139
|
+
|
140
|
+
# @param event [Dry::Events::Event]
|
141
|
+
def on_consumer_shutdown(event)
|
142
|
+
messages = event.payload[:caller].messages
|
143
|
+
metadata = messages.metadata
|
144
|
+
|
145
|
+
tags = default_tags + [
|
146
|
+
"topic:#{metadata.topic}",
|
147
|
+
"partition:#{metadata.partition}"
|
148
|
+
]
|
149
|
+
|
150
|
+
count('consumer.shutdown', 1, tags: tags)
|
151
|
+
end
|
152
|
+
|
153
|
+
# Worker related metrics
|
154
|
+
# @param event [Dry::Events::Event]
|
155
|
+
def on_worker_process(event)
|
156
|
+
jq_stats = event[:jobs_queue].statistics
|
157
|
+
|
158
|
+
gauge('worker.total_threads', Karafka::App.config.concurrency, tags: default_tags)
|
159
|
+
histogram('worker.processing', jq_stats[:processing], tags: default_tags)
|
160
|
+
histogram('worker.enqueued_jobs', jq_stats[:enqueued], tags: default_tags)
|
161
|
+
end
|
162
|
+
|
163
|
+
# We report this metric before and after processing for higher accuracy
|
164
|
+
# Without this, the utilization would not be fully reflected
|
165
|
+
# @param event [Dry::Events::Event]
|
166
|
+
def on_worker_processed(event)
|
167
|
+
jq_stats = event[:jobs_queue].statistics
|
168
|
+
|
169
|
+
histogram('worker.processing', jq_stats[:processing], tags: default_tags)
|
170
|
+
end
|
171
|
+
|
172
|
+
private
|
173
|
+
|
174
|
+
%i[
|
175
|
+
count
|
176
|
+
gauge
|
177
|
+
histogram
|
178
|
+
increment
|
179
|
+
decrement
|
180
|
+
].each do |metric_type|
|
181
|
+
class_eval <<~METHODS, __FILE__, __LINE__ + 1
|
182
|
+
def #{metric_type}(key, *args)
|
183
|
+
client.#{metric_type}(
|
184
|
+
namespaced_metric(key),
|
185
|
+
*args
|
186
|
+
)
|
187
|
+
end
|
188
|
+
METHODS
|
189
|
+
end
|
190
|
+
|
191
|
+
# Wraps metric name in listener's namespace
|
192
|
+
# @param metric_name [String] RdKafkaMetric name
|
193
|
+
# @return [String]
|
194
|
+
def namespaced_metric(metric_name)
|
195
|
+
"#{namespace}.#{metric_name}"
|
196
|
+
end
|
197
|
+
|
198
|
+
# Reports a given metric statistics to Datadog
|
199
|
+
# @param metric [RdKafkaMetric] metric value object
|
200
|
+
# @param statistics [Hash] hash with all the statistics emitted
|
201
|
+
def report_metric(metric, statistics)
|
202
|
+
case metric.scope
|
203
|
+
when :root
|
204
|
+
public_send(
|
205
|
+
metric.type,
|
206
|
+
metric.name,
|
207
|
+
statistics.fetch(*metric.key_location),
|
208
|
+
tags: default_tags
|
209
|
+
)
|
210
|
+
when :brokers
|
211
|
+
statistics.fetch('brokers').each_value do |broker_statistics|
|
212
|
+
# Skip bootstrap nodes
|
213
|
+
# Bootstrap nodes have nodeid -1, other nodes have positive
|
214
|
+
# node ids
|
215
|
+
next if broker_statistics['nodeid'] == -1
|
216
|
+
|
217
|
+
public_send(
|
218
|
+
metric.type,
|
219
|
+
metric.name,
|
220
|
+
broker_statistics.dig(*metric.key_location),
|
221
|
+
tags: default_tags + ["broker:#{broker_statistics['nodename']}"]
|
222
|
+
)
|
223
|
+
end
|
224
|
+
else
|
225
|
+
raise ArgumentError, metric.scope
|
226
|
+
end
|
227
|
+
end
|
228
|
+
end
|
229
|
+
end
|
230
|
+
end
|
231
|
+
end
|
232
|
+
end
|
@@ -12,7 +12,7 @@ module Karafka
|
|
12
12
|
# @note We need this to make sure that we allocate proper dispatched events only to
|
13
13
|
# callback listeners that should publish them
|
14
14
|
def name
|
15
|
-
::Rdkafka::Bindings.rd_kafka_name(@native_kafka)
|
15
|
+
@name ||= ::Rdkafka::Bindings.rd_kafka_name(@native_kafka)
|
16
16
|
end
|
17
17
|
end
|
18
18
|
end
|
@@ -23,7 +23,9 @@ module Karafka
|
|
23
23
|
dispatch_method: :produce_async,
|
24
24
|
# We don't create a dummy proc based partitioner as we would have to evaluate it with
|
25
25
|
# each job.
|
26
|
-
partitioner: nil
|
26
|
+
partitioner: nil,
|
27
|
+
# Allows for usage of `:key` or `:partition_key`
|
28
|
+
partition_key_type: :key
|
27
29
|
}.freeze
|
28
30
|
|
29
31
|
private_constant :DEFAULTS
|
@@ -45,11 +47,12 @@ module Karafka
|
|
45
47
|
# @return [Hash] hash with dispatch details to which we merge topic and payload
|
46
48
|
def dispatch_details(job)
|
47
49
|
partitioner = fetch_option(job, :partitioner, DEFAULTS)
|
50
|
+
key_type = fetch_option(job, :partition_key_type, DEFAULTS)
|
48
51
|
|
49
52
|
return {} unless partitioner
|
50
53
|
|
51
54
|
{
|
52
|
-
|
55
|
+
key_type => partitioner.call(job)
|
53
56
|
}
|
54
57
|
end
|
55
58
|
end
|
@@ -36,7 +36,7 @@ module Karafka
|
|
36
36
|
end
|
37
37
|
end
|
38
38
|
|
39
|
-
# Runs extra logic after consumption that is related to handling long
|
39
|
+
# Runs extra logic after consumption that is related to handling long-running jobs
|
40
40
|
# @note This overwrites the '#on_after_consume' from the base consumer
|
41
41
|
def on_after_consume
|
42
42
|
coordinator.on_finished do |first_group_message, last_group_message|
|
@@ -59,7 +59,7 @@ module Karafka
|
|
59
59
|
# Mark as consumed only if manual offset management is not on
|
60
60
|
mark_as_consumed(last_message) unless topic.manual_offset_management? || revoked?
|
61
61
|
|
62
|
-
# If this is not a long
|
62
|
+
# If this is not a long-running job there is nothing for us to do here
|
63
63
|
return unless topic.long_running_job?
|
64
64
|
|
65
65
|
# Once processing is done, we move to the new offset based on commits
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# This Karafka component is a Pro component.
|
4
|
+
# All of the commercial components are present in the lib/karafka/pro directory of this
|
5
|
+
# repository and their usage requires commercial license agreement.
|
6
|
+
#
|
7
|
+
# Karafka has also commercial-friendly license, commercial support and commercial components.
|
8
|
+
#
|
9
|
+
# By sending a pull request to the pro components, you are agreeing to transfer the copyright of
|
10
|
+
# your code to Maciej Mensfeld.
|
11
|
+
|
12
|
+
module Karafka
|
13
|
+
module Pro
|
14
|
+
# Namespace for Karafka Pro related contracts
|
15
|
+
module Contracts
|
16
|
+
# Base contract for Pro components contracts
|
17
|
+
class Base < ::Karafka::Contracts::Base
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# This Karafka component is a Pro component.
|
4
|
+
# All of the commercial components are present in the lib/karafka/pro directory of this
|
5
|
+
# repository and their usage requires commercial license agreement.
|
6
|
+
#
|
7
|
+
# Karafka has also commercial-friendly license, commercial support and commercial components.
|
8
|
+
#
|
9
|
+
# By sending a pull request to the pro components, you are agreeing to transfer the copyright of
|
10
|
+
# your code to Maciej Mensfeld.
|
11
|
+
|
12
|
+
module Karafka
|
13
|
+
module Pro
|
14
|
+
module Contracts
|
15
|
+
# Contract for validating correct Pro components setup on a consumer group and topic levels
|
16
|
+
class ConsumerGroup < Base
|
17
|
+
virtual do |data, errors|
|
18
|
+
next unless errors.empty?
|
19
|
+
next unless data.key?(:topics)
|
20
|
+
|
21
|
+
fetched_errors = []
|
22
|
+
|
23
|
+
data.fetch(:topics).each do |topic|
|
24
|
+
ConsumerGroupTopic.new.call(topic).errors.each do |key, value|
|
25
|
+
fetched_errors << [[topic, key].flatten, value]
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
fetched_errors
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# This Karafka component is a Pro component.
|
4
|
+
# All of the commercial components are present in the lib/karafka/pro directory of this
|
5
|
+
# repository and their usage requires commercial license agreement.
|
6
|
+
#
|
7
|
+
# Karafka has also commercial-friendly license, commercial support and commercial components.
|
8
|
+
#
|
9
|
+
# By sending a pull request to the pro components, you are agreeing to transfer the copyright of
|
10
|
+
# your code to Maciej Mensfeld.
|
11
|
+
|
12
|
+
module Karafka
|
13
|
+
module Pro
|
14
|
+
module Contracts
|
15
|
+
# Contract for validating correct Pro components setup on a topic levels
|
16
|
+
class ConsumerGroupTopic < Base
|
17
|
+
configure do |config|
|
18
|
+
config.error_messages = YAML.safe_load(
|
19
|
+
File.read(
|
20
|
+
File.join(Karafka.gem_root, 'config', 'errors.yml')
|
21
|
+
)
|
22
|
+
).fetch('en').fetch('validations').fetch('pro_consumer_group_topic')
|
23
|
+
end
|
24
|
+
|
25
|
+
virtual do |data|
|
26
|
+
next if data[:consumer] < Karafka::Pro::BaseConsumer
|
27
|
+
|
28
|
+
[[%i[consumer], :consumer_format]]
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
data/lib/karafka/pro/loader.rb
CHANGED
@@ -22,7 +22,11 @@ module Karafka
|
|
22
22
|
processing/jobs_builder
|
23
23
|
processing/coordinator
|
24
24
|
processing/partitioner
|
25
|
-
|
25
|
+
contracts/base
|
26
|
+
contracts/consumer_group
|
27
|
+
contracts/consumer_group_topic
|
28
|
+
routing/topic_extensions
|
29
|
+
routing/builder_extensions
|
26
30
|
active_job/consumer
|
27
31
|
active_job/dispatcher
|
28
32
|
active_job/job_options_contract
|
@@ -37,6 +41,16 @@ module Karafka
|
|
37
41
|
def setup(config)
|
38
42
|
COMPONENTS.each { |component| require_relative(component) }
|
39
43
|
|
44
|
+
reconfigure(config)
|
45
|
+
|
46
|
+
load_routing_extensions
|
47
|
+
end
|
48
|
+
|
49
|
+
private
|
50
|
+
|
51
|
+
# Sets proper config options to use pro components
|
52
|
+
# @param config [WaterDrop::Configurable::Node] root config node
|
53
|
+
def reconfigure(config)
|
40
54
|
icfg = config.internal
|
41
55
|
|
42
56
|
icfg.processing.coordinator_class = Processing::Coordinator
|
@@ -48,10 +62,14 @@ module Karafka
|
|
48
62
|
icfg.active_job.dispatcher = ActiveJob::Dispatcher.new
|
49
63
|
icfg.active_job.job_options_contract = ActiveJob::JobOptionsContract.new
|
50
64
|
|
51
|
-
::Karafka::Routing::Topic.include(Routing::Extensions)
|
52
|
-
|
53
65
|
config.monitor.subscribe(PerformanceTracker.instance)
|
54
66
|
end
|
67
|
+
|
68
|
+
# Loads routing extensions
|
69
|
+
def load_routing_extensions
|
70
|
+
::Karafka::Routing::Topic.include(Routing::TopicExtensions)
|
71
|
+
::Karafka::Routing::Builder.prepend(Routing::BuilderExtensions)
|
72
|
+
end
|
55
73
|
end
|
56
74
|
end
|
57
75
|
end
|
@@ -27,9 +27,28 @@ module Karafka
|
|
27
27
|
# process the data. With one thread it is not worth partitioning the work as the work
|
28
28
|
# itself will be assigned to one thread (pointless work)
|
29
29
|
if ktopic.virtual_partitioner? && @concurrency > 1
|
30
|
-
|
31
|
-
|
32
|
-
|
30
|
+
# We need to reduce it to number of threads, so the group_id is not a direct effect
|
31
|
+
# of the end user action. Otherwise the persistence layer for consumers would cache
|
32
|
+
# it forever and it would cause memory leaks
|
33
|
+
groupings = messages
|
34
|
+
.group_by { |msg| ktopic.virtual_partitioner.call(msg) }
|
35
|
+
.values
|
36
|
+
|
37
|
+
# Reduce the max concurrency to a size that matches the concurrency
|
38
|
+
# As mentioned above we cannot use the partitioning keys directly as it could cause
|
39
|
+
# memory leaks
|
40
|
+
#
|
41
|
+
# The algorithm here is simple, we assume that the most costly in terms of processing,
|
42
|
+
# will be processing of the biggest group and we reduce the smallest once to have
|
43
|
+
# max of groups equal to concurrency
|
44
|
+
while groupings.size > @concurrency
|
45
|
+
groupings.sort_by! { |grouping| -grouping.size }
|
46
|
+
|
47
|
+
# Offset order needs to be maintained for virtual partitions
|
48
|
+
groupings << (groupings.pop + groupings.pop).sort_by!(&:offset)
|
49
|
+
end
|
50
|
+
|
51
|
+
groupings.each_with_index { |messages_group, index| yield(index, messages_group) }
|
33
52
|
else
|
34
53
|
# When no virtual partitioner, works as regular one
|
35
54
|
yield(0, messages)
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# This Karafka component is a Pro component.
|
4
|
+
# All of the commercial components are present in the lib/karafka/pro directory of this
|
5
|
+
# repository and their usage requires commercial license agreement.
|
6
|
+
#
|
7
|
+
# Karafka has also commercial-friendly license, commercial support and commercial components.
|
8
|
+
#
|
9
|
+
# By sending a pull request to the pro components, you are agreeing to transfer the copyright of
|
10
|
+
# your code to Maciej Mensfeld.
|
11
|
+
|
12
|
+
module Karafka
|
13
|
+
module Pro
|
14
|
+
# Pro routing components
|
15
|
+
module Routing
|
16
|
+
# Routing extensions for builder to be able to validate Pro components correct usage
|
17
|
+
module BuilderExtensions
|
18
|
+
# Validate consumer groups with pro contracts
|
19
|
+
# @param block [Proc] routing defining block
|
20
|
+
def draw(&block)
|
21
|
+
super
|
22
|
+
|
23
|
+
each do |consumer_group|
|
24
|
+
::Karafka::Pro::Contracts::ConsumerGroup.new.validate!(consumer_group.to_h)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -14,7 +14,7 @@ module Karafka
|
|
14
14
|
# Pro routing components
|
15
15
|
module Routing
|
16
16
|
# Routing extensions that allow to configure some extra PRO routing options
|
17
|
-
module
|
17
|
+
module TopicExtensions
|
18
18
|
class << self
|
19
19
|
# @param base [Class] class we extend
|
20
20
|
def included(base)
|
@@ -28,7 +28,7 @@ module Karafka
|
|
28
28
|
virtual_partitioner != nil
|
29
29
|
end
|
30
30
|
|
31
|
-
# @return [Boolean] is a given job on a topic a long
|
31
|
+
# @return [Boolean] is a given job on a topic a long-running one
|
32
32
|
def long_running_job?
|
33
33
|
@long_running_job || false
|
34
34
|
end
|
data/lib/karafka/process.rb
CHANGED
@@ -119,6 +119,17 @@ module Karafka
|
|
119
119
|
@semaphores[group_id].pop while wait?(group_id)
|
120
120
|
end
|
121
121
|
|
122
|
+
# - `processing` - number of jobs that are currently being processed (active work)
|
123
|
+
# - `enqueued` - number of jobs in the queue that are waiting to be picked up by a worker
|
124
|
+
#
|
125
|
+
# @return [Hash] hash with basic usage statistics of this queue.
|
126
|
+
def statistics
|
127
|
+
{
|
128
|
+
processing: size - @queue.size,
|
129
|
+
enqueued: @queue.size
|
130
|
+
}.freeze
|
131
|
+
end
|
132
|
+
|
122
133
|
private
|
123
134
|
|
124
135
|
# @param group_id [String] id of the group in which jobs we're interested.
|
@@ -47,9 +47,11 @@ module Karafka
|
|
47
47
|
job = @jobs_queue.pop
|
48
48
|
|
49
49
|
if job
|
50
|
-
|
50
|
+
instrument_details = { caller: self, job: job, jobs_queue: @jobs_queue }
|
51
51
|
|
52
|
-
Karafka.monitor.instrument('worker.
|
52
|
+
Karafka.monitor.instrument('worker.process', instrument_details)
|
53
|
+
|
54
|
+
Karafka.monitor.instrument('worker.processed', instrument_details) do
|
53
55
|
job.before_call
|
54
56
|
|
55
57
|
# If a job is marked as non blocking, we can run a tick in the job queue and if there
|
data/lib/karafka/server.rb
CHANGED
@@ -104,6 +104,9 @@ module Karafka
|
|
104
104
|
# We're done waiting, lets kill them!
|
105
105
|
workers.each(&:terminate)
|
106
106
|
listeners.each(&:terminate)
|
107
|
+
# We always need to shutdown clients to make sure we do not force the GC to close consumer.
|
108
|
+
# This can cause memory leaks and crashes.
|
109
|
+
listeners.each(&:shutdown)
|
107
110
|
|
108
111
|
Karafka::App.producer.close
|
109
112
|
|
data/lib/karafka/setup/config.rb
CHANGED
@@ -12,7 +12,7 @@ module Karafka
|
|
12
12
|
# enough and will still keep the code simple
|
13
13
|
# @see Karafka::Setup::Configurators::Base for more details about configurators api
|
14
14
|
class Config
|
15
|
-
extend ::
|
15
|
+
extend ::Karafka::Core::Configurable
|
16
16
|
|
17
17
|
# Defaults for kafka settings, that will be overwritten only if not present already
|
18
18
|
KAFKA_DEFAULTS = {
|
@@ -60,9 +60,9 @@ module Karafka
|
|
60
60
|
# option [Boolean] should we leave offset management to the user
|
61
61
|
setting :manual_offset_management, default: false
|
62
62
|
# options max_messages [Integer] how many messages do we want to fetch from Kafka in one go
|
63
|
-
setting :max_messages, default:
|
63
|
+
setting :max_messages, default: 100
|
64
64
|
# option [Integer] number of milliseconds we can wait while fetching data
|
65
|
-
setting :max_wait_time, default:
|
65
|
+
setting :max_wait_time, default: 1_000
|
66
66
|
# option shutdown_timeout [Integer] the number of milliseconds after which Karafka no
|
67
67
|
# longer waits for the consumers to stop gracefully but instead we force terminate
|
68
68
|
# everything.
|
data/lib/karafka/version.rb
CHANGED