racecar 2.0.0 → 2.10.0.beta2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +4 -4
  2. data/.github/dependabot.yml +17 -0
  3. data/.github/workflows/ci.yml +46 -0
  4. data/.github/workflows/publish.yml +12 -0
  5. data/.gitignore +1 -2
  6. data/CHANGELOG.md +83 -1
  7. data/Dockerfile +9 -0
  8. data/Gemfile +6 -0
  9. data/Gemfile.lock +72 -0
  10. data/README.md +303 -82
  11. data/Rakefile +5 -0
  12. data/docker-compose.yml +65 -0
  13. data/examples/batch_consumer.rb +4 -2
  14. data/examples/cat_consumer.rb +2 -0
  15. data/examples/producing_consumer.rb +2 -0
  16. data/exe/racecar +37 -14
  17. data/extra/datadog-dashboard.json +1 -0
  18. data/lib/ensure_hash_compact.rb +2 -0
  19. data/lib/generators/racecar/consumer_generator.rb +2 -0
  20. data/lib/generators/racecar/install_generator.rb +2 -0
  21. data/lib/racecar/cli.rb +26 -21
  22. data/lib/racecar/config.rb +80 -4
  23. data/lib/racecar/consumer.rb +51 -6
  24. data/lib/racecar/consumer_set.rb +113 -44
  25. data/lib/racecar/ctl.rb +31 -3
  26. data/lib/racecar/daemon.rb +4 -2
  27. data/lib/racecar/datadog.rb +83 -3
  28. data/lib/racecar/delivery_callback.rb +27 -0
  29. data/lib/racecar/erroneous_state_error.rb +34 -0
  30. data/lib/racecar/heroku.rb +49 -0
  31. data/lib/racecar/instrumenter.rb +4 -7
  32. data/lib/racecar/liveness_probe.rb +78 -0
  33. data/lib/racecar/message.rb +6 -1
  34. data/lib/racecar/message_delivery_error.rb +112 -0
  35. data/lib/racecar/null_instrumenter.rb +2 -0
  36. data/lib/racecar/parallel_runner.rb +110 -0
  37. data/lib/racecar/pause.rb +8 -4
  38. data/lib/racecar/producer.rb +139 -0
  39. data/lib/racecar/rails_config_file_loader.rb +7 -1
  40. data/lib/racecar/rebalance_listener.rb +58 -0
  41. data/lib/racecar/runner.rb +79 -37
  42. data/lib/racecar/version.rb +3 -1
  43. data/lib/racecar.rb +36 -8
  44. data/racecar.gemspec +7 -4
  45. metadata +47 -25
  46. data/.github/workflows/rspec.yml +0 -24
@@ -1,8 +1,10 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class BatchConsumer < Racecar::Consumer
2
4
  subscribes_to "messages", start_from_beginning: false
3
5
 
4
- def process_batch(batch)
5
- batch.messages.each do |message|
6
+ def process_batch(messages)
7
+ messages.each do |message|
6
8
  puts message.value
7
9
  end
8
10
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class CatConsumer < Racecar::Consumer
2
4
  subscribes_to "messages", start_from_beginning: false
3
5
 
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class ProducingConsumer < Racecar::Consumer
2
4
  subscribes_to "messages", start_from_beginning: false
3
5
 
data/exe/racecar CHANGED
@@ -3,19 +3,42 @@
3
3
  require "racecar"
4
4
  require "racecar/cli"
5
5
 
6
- begin
7
- Racecar::Cli.main(ARGV)
8
- rescue SignalException => e
9
- # We might receive SIGTERM before our signal handler is installed.
10
- if Signal.signame(e.signo) == "TERM"
11
- exit(0)
12
- else
13
- raise
6
+ module Racecar
7
+ class << self
8
+ def start(argv)
9
+ Cli.main(argv)
10
+ rescue SignalException => e
11
+ # We might receive SIGTERM before our signal handler is installed.
12
+ if Signal.signame(e.signo) == "TERM"
13
+ exit(0)
14
+ else
15
+ raise
16
+ end
17
+ rescue SystemExit
18
+ raise
19
+ rescue Exception => e
20
+ $stderr.puts "=> Crashed: #{exception_with_causes(e)}\n#{e.backtrace.join("\n")}"
21
+
22
+ Racecar.config.error_handler.call(e)
23
+
24
+ exit(1)
25
+ else
26
+ exit(0)
27
+ end
28
+
29
+ private
30
+
31
+ def exception_with_causes(e)
32
+ result = +"#{e.class}: #{e}"
33
+ if e.cause
34
+ result << "\n"
35
+ result << "--- Caused by: ---\n"
36
+ result << exception_with_causes(e.cause)
37
+ end
38
+ result
39
+ end
14
40
  end
15
- rescue
16
- # Exceptions are printed to STDERR and sent to the error handler
17
- # in `Racecar::Cli#run`, so we don't need to do anything here.
18
- exit(1)
19
- else
20
- exit(0)
21
41
  end
42
+
43
+ # Start your engines!
44
+ Racecar.start(ARGV)
@@ -0,0 +1 @@
1
+ {"title":"Racecar consumer groups","description":"Dashboard for monitoring [Racecar](https://github.com/zendesk/racecar) Kafka consumer groups.","widgets":[{"id":4916208698459109,"definition":{"title":"Single-message processing","reflow_type":"fixed","type":"group","layout_type":"ordered","widgets":[{"id":82605028,"definition":{"title":"95th percentile message processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.process_message.latency.95percentile{$group_id,$client,$topic,$partition,$env} by {topic,group_id}","style":{"palette":"dog_classic","line_type":"solid","line_width":"thin"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":2857871641649870,"definition":{"title":"Max message processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.process_message.latency.max{$group_id,$client,$topic,$partition,$env} by {topic,group_id}","style":{"palette":"dog_classic","line_type":"solid","line_width":"thin"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":88579656,"definition":{"title":"Median message processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.process_message.latency.median{$group_id,$client,$topic,$partition,$env} by {topic}","style":{"palette":"dog_classic","line_type":"solid","line_width":"thin"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":8,"y":0,"width":4,"height":2}}]}},{"id":4068194420543030,"definition":{"title":"Batch processing","reflow_type":"fixed","type":"group","layout_type":"ordered","widgets":[{"id":341686567,"definition":{"title":"95th percentile batch processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.process_batch.latency.95percentile{$group_id,$client,$topic,$partition,$env} by {topic,group_id}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":341687897,"definition":{"title":"Median batch processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.process_batch.latency.median{$group_id,$client,$topic,$partition,$env} by {topic}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":5352911818003929,"definition":{"title":"Max batch processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"max:racecar.consumer.process_batch.latency.max{$group_id,$client,$topic,$partition,$env} by {topic}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":1654098217056312,"definition":{"title":"Max message batch size","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"max:racecar.consumer.batch_size.max{$group_id,$client,$topic,$partition,$env} by {topic}","style":{"palette":"purple","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":2,"width":4,"height":2}},{"id":7718619791149134,"definition":{"title":"Average per-message latency in batch processing mode","show_legend":false,"legend_size":"0","legend_layout":"vertical","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"q":"max:racecar.consumer.process_batch.latency.avg{$group_id,$client,$topic,$partition,$env}/max:racecar.consumer.batch_size.avg{$group_id,$client,$topic,$partition,$env}","metadata":[{"expression":"max:racecar.consumer.process_batch.latency.avg{$env,$pod,$group_id,$client,$topic,$partition}/max:racecar.consumer.batch_size.avg{$env,$pod,$group_id,$client,$topic,$partition}","alias_name":"ms"}],"style":{"palette":"purple","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":4,"y":2,"width":4,"height":2}}]}},{"id":7110612496425151,"definition":{"title":"Throughput & Lag","reflow_type":"fixed","type":"group","layout_type":"ordered","widgets":[{"id":301212748,"definition":{"title":"Message lag changes","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"derivative(max:racecar.consumer.offset{$group_id,$client,$topic,$partition,$env} by {topic,partition,pod})","style":{"palette":"dog_classic","line_type":"solid","line_width":"thin"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":82604183,"definition":{"title":"Processing throughput by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.messages{$group_id,$client,$topic,$partition,$env} by {topic,group_id}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":5547724125706857,"definition":{"title":"Processing throughput by group","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.messages{$group_id,$client,$topic,$partition,$env} by {group_id}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":375397853,"definition":{"title":"Processing throughput by host","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.messages{$group_id,$client,$topic,$partition,$env} by {group_id,host}.as_rate()","style":{"palette":"dog_classic","line_type":"solid","line_width":"thin"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":2,"width":4,"height":2}},{"id":7820607170949322,"definition":{"title":"Messages consumed in timeframe","type":"query_value","requests":[{"q":"sum:racecar.consumer.messages{$group_id,$client,$topic,$partition,$env}.as_count()","aggregator":"sum"}],"autoscale":true,"precision":0},"layout":{"x":4,"y":2,"width":4,"height":2}},{"id":1428183857213882,"definition":{"title":"Time lag (end-to-end latency)","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.time_lag{$group_id,$client,$topic,$partition,$env} by {group_id,pod}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":8,"y":2,"width":4,"height":2}}]}},{"id":1487807434456879,"definition":{"title":"Processing Errors & Group Stability","reflow_type":"fixed","type":"group","layout_type":"ordered","widgets":[{"id":82605029,"definition":{"title":"Processing errors","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.process_batch.errors{$group_id,$client,$topic,$partition,$env} by {topic,pod,group_id,partition}.as_count()+sum:racecar.consumer.process_message.errors{$group_id,$client,$topic,$partition,$env} by {topic,pod,group_id,partition}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":83104736,"definition":{"title":"Processing error rate by topic (%)","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"(sum:racecar.consumer.process_message.errors{$group_id,$client,$topic,$partition,$env} by {topic}.as_count()/(sum:racecar.consumer.process_message.errors{$group_id,$client,$topic,$partition,$env} by {topic}.as_count()+sum:racecar.consumer.messages{$group_id,$client,$topic,$partition,$env} by {topic}.as_count()))*100","style":{"palette":"orange","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":6572534533091871,"definition":{"title":"Processing errors in timeframe","type":"query_value","requests":[{"q":"sum:racecar.consumer.process_batch.errors{$topic,$client,$group_id,$env}.as_count()+sum:racecar.consumer.process_message.errors{$topic,$client,$group_id,$env}.as_count()","aggregator":"sum","conditional_formats":[{"comparator":">","palette":"white_on_red","value":0},{"comparator":"<=","palette":"white_on_green","value":0}]}],"autoscale":true,"precision":0},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":302705923,"definition":{"title":"Pause duration","show_legend":false,"legend_size":"0","legend_layout":"vertical","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"q":"avg:racecar.consumer.pause.duration{$client,$group_id,$topic,$env} by {pod,group_id,topic,partition}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":0,"y":2,"width":4,"height":2}},{"id":235544854,"definition":{"title":"Group joins","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.join_group.count{$group_id,$client,$env} by {group_id,host}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":4,"y":2,"width":4,"height":2}},{"id":235544862,"definition":{"title":"Group leaves","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.leave_group.count{$group_id,$client,$env} by {group_id,host}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":8,"y":2,"width":4,"height":2}},{"id":235545167,"definition":{"title":"Group syncs","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.sync_group.count{$group_id,$client,$env} by {group_id,host}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":4,"width":4,"height":2}}]}},{"id":8013176155436939,"definition":{"title":"Producer & message delivery","reflow_type":"fixed","type":"group","layout_type":"ordered","widgets":[{"id":5948628389625057,"definition":{"title":"Message delivery latency (median)","title_size":"16","title_align":"left","show_legend":false,"type":"timeseries","requests":[{"q":"avg:racecar.producer.deliver.latency.median{$client,$env}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":3158040379950811,"definition":{"title":"Producer buffer size (max)","title_size":"16","title_align":"left","show_legend":false,"type":"timeseries","requests":[{"q":"max:racecar.producer.buffer.size.max{$client,$env} by {topic}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":6916375790222772,"definition":{"title":"Producer buffer size (avg) kp","title_size":"16","title_align":"left","show_legend":false,"legend_layout":"vertical","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"q":"avg:racecar.producer.buffer.size.avg{$client,$env} by {topic,host}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":3160735194874896,"definition":{"title":"Message size (95p)","title_size":"16","title_align":"left","show_legend":false,"legend_layout":"vertical","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"q":"avg:racecar.producer.produce.message_size.95percentile{$env} by {topic}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":0,"y":2,"width":4,"height":2}}]}}],"template_variables":[{"name":"env","default":"production","prefix":"env"},{"name":"group_id","default":"*","prefix":"group_id"},{"name":"client","default":"*","prefix":"client"},{"name":"topic","default":"*","prefix":"topic"},{"name":"partition","default":"*","prefix":"partition"}],"layout_type":"ordered","is_read_only":false,"notify_list":[],"reflow_type":"fixed","id":"ywc-z36-g29"}
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # only needed when ruby < 2.4 and not using active support
2
4
 
3
5
  unless {}.respond_to? :compact
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Racecar
2
4
  module Generators
3
5
  class ConsumerGenerator < Rails::Generators::NamedBase
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Racecar
2
4
  module Generators
3
5
  class InstallGenerator < Rails::Generators::Base
data/lib/racecar/cli.rb CHANGED
@@ -1,23 +1,25 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require "optparse"
2
4
  require "logger"
3
5
  require "fileutils"
4
6
  require "racecar/rails_config_file_loader"
5
7
  require "racecar/daemon"
8
+ require "racecar/liveness_probe"
6
9
 
7
10
  module Racecar
8
11
  class Cli
9
- def self.main(args)
10
- new(args).run
12
+ class << self
13
+ def main(args)
14
+ new(args).run
15
+ end
11
16
  end
12
17
 
13
18
  def initialize(args)
14
19
  @parser = build_parser
15
20
  @parser.parse!(args)
16
21
  @consumer_name = args.first or raise Racecar::Error, "no consumer specified"
17
- end
18
-
19
- def config
20
- Racecar.config
22
+ @runner = nil
21
23
  end
22
24
 
23
25
  def run
@@ -58,21 +60,29 @@ module Racecar
58
60
  $stderr.puts "=> Ctrl-C to shutdown consumer"
59
61
  end
60
62
 
61
- processor = consumer_class.new
62
-
63
- Racecar.run(processor)
64
- rescue => e
65
- $stderr.puts "=> Crashed: #{e.class}: #{e}\n#{e.backtrace.join("\n")}"
63
+ if config.liveness_probe_enabled
64
+ $stderr.puts "=> Liveness probe enabled"
65
+ config.install_liveness_probe
66
+ end
66
67
 
67
- config.error_handler.call(e)
68
+ processor = consumer_class.new
69
+ @runner = Racecar.runner(processor)
70
+ @runner.run
71
+ nil
72
+ end
68
73
 
69
- raise
74
+ def stop
75
+ @runner.stop
70
76
  end
71
77
 
72
78
  private
73
79
 
74
80
  attr_reader :consumer_name
75
81
 
82
+ def config
83
+ Racecar.config
84
+ end
85
+
76
86
  def daemonize!
77
87
  daemon = Daemon.new(File.expand_path(config.pidfile))
78
88
 
@@ -102,12 +112,7 @@ module Racecar
102
112
  opts.on("-r", "--require STRING", "Require a library before starting the consumer") do |lib|
103
113
  $LOAD_PATH.unshift(Dir.pwd) unless load_path_modified
104
114
  load_path_modified = true
105
- begin
106
- require lib
107
- rescue => e
108
- $stderr.puts "=> #{lib} failed to load: #{e.message}"
109
- exit
110
- end
115
+ require lib
111
116
  end
112
117
 
113
118
  opts.on("-l", "--log STRING", "Log to the specified file") do |logfile|
@@ -115,13 +120,13 @@ module Racecar
115
120
  end
116
121
 
117
122
  Racecar::Config.variables.each do |variable|
118
- opt_name = "--" << variable.name.to_s.gsub("_", "-")
123
+ opt_name = +"--#{variable.name.to_s.gsub('_', '-')}"
119
124
  opt_name << " #{variable.type.upcase}" unless variable.boolean?
120
125
 
121
126
  desc = variable.description || "N/A"
122
127
 
123
128
  if variable.default
124
- desc << " (default: #{variable.default.inspect})"
129
+ desc += " (default: #{variable.default.inspect})"
125
130
  end
126
131
 
127
132
  opts.on(opt_name, desc) do |value|
@@ -1,9 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "tmpdir"
4
+
1
5
  require "king_konf"
2
6
 
7
+ require "racecar/liveness_probe"
8
+ require "racecar/instrumenter"
9
+ require "racecar/rebalance_listener"
10
+
3
11
  module Racecar
4
12
  class Config < KingKonf::Config
5
13
  env_prefix :racecar
6
14
 
15
+ STATISTICS_DISABLED_VALUE = 0
16
+
7
17
  desc "A list of Kafka brokers in the cluster that you're consuming from"
8
18
  list :brokers, default: ["localhost:9092"]
9
19
 
@@ -19,6 +29,9 @@ module Racecar
19
29
  desc "The minimum number of messages in the local consumer queue"
20
30
  integer :min_message_queue_size, default: 2000
21
31
 
32
+ desc "Which partition assignment strategy to use, range, roundrobin or cooperative-sticky. -- https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md"
33
+ string :partition_assignment_strategy, default: "range,roundrobin"
34
+
22
35
  desc "Kafka consumer configuration options, separated with '=' -- https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md"
23
36
  list :consumer, default: []
24
37
 
@@ -52,9 +65,12 @@ module Racecar
52
65
  desc "How long to wait when trying to communicate with a Kafka broker"
53
66
  float :socket_timeout, default: 30
54
67
 
55
- desc "How long to allow the Kafka brokers to wait before returning messages"
68
+ desc "How long to allow the Kafka brokers to wait before returning messages (in seconds)"
56
69
  float :max_wait_time, default: 1
57
70
 
71
+ desc "How long to try to deliver a produced message before finally giving up (in seconds)"
72
+ float :message_timeout, default: 5*60
73
+
58
74
  desc "Maximum amount of data the broker shall return for a Fetch request"
59
75
  integer :max_bytes, default: 10485760
60
76
 
@@ -70,6 +86,9 @@ module Racecar
70
86
  desc "The log level for the Racecar logs"
71
87
  string :log_level, default: "info"
72
88
 
89
+ desc "The strategy used to determine which topic partition a message is written to when Racecar produces a value to Kafka; defaults to `consistent_random`"
90
+ symbol :partitioner, allowed_values: %i{consistent consistent_random murmur2 murmur2_random fnv1a fnv1a_random}, default: :consistent_random
91
+
73
92
  desc "Protocol used to communicate with brokers"
74
93
  symbol :security_protocol, allowed_values: %i{plaintext ssl sasl_plaintext sasl_ssl}
75
94
 
@@ -151,10 +170,33 @@ module Racecar
151
170
  desc "Whether to boot Rails when starting the consumer"
152
171
  boolean :without_rails, default: false
153
172
 
173
+ desc "How frequently librdkafka should report statistics to your application (in seconds). A statistics callback
174
+ must also be provided. This should be defined with a `statistics_callback` method on your processor. Stats
175
+ are disabled if this value is set to 0, or there is no callback defined. This is set by default to 1 second
176
+ for backward compatibility, however this can be quite memory intensive"
177
+ integer :statistics_interval, default: 1
178
+
179
+ desc "Whether to enable liveness probe behavior (touch the file)"
180
+ boolean :liveness_probe_enabled, default: false
181
+
182
+ desc "Path to a file Racecar will touch to show liveness"
183
+ string :liveness_probe_file_path, default: "#{Dir.tmpdir}/racecar-liveness"
184
+
185
+ desc "Used only by the liveness probe: Max time (in seconds) between liveness events before the process is considered not healthy"
186
+ integer :liveness_probe_max_interval, default: 5
187
+
154
188
  # The error handler must be set directly on the object.
155
189
  attr_reader :error_handler
156
190
 
157
- attr_accessor :subscriptions, :logger
191
+ attr_accessor :subscriptions, :logger, :parallel_workers
192
+
193
+ def statistics_interval_ms
194
+ if Rdkafka::Config.statistics_callback
195
+ statistics_interval * 1000
196
+ else
197
+ STATISTICS_DISABLED_VALUE
198
+ end
199
+ end
158
200
 
159
201
  def max_wait_time_ms
160
202
  max_wait_time * 1000
@@ -189,6 +231,7 @@ module Racecar
189
231
  end
190
232
 
191
233
  def load_consumer_class(consumer_class)
234
+ self.consumer_class = consumer_class
192
235
  self.group_id = consumer_class.group_id || self.group_id
193
236
 
194
237
  self.group_id ||= [
@@ -196,13 +239,16 @@ module Racecar
196
239
  group_id_prefix,
197
240
 
198
241
  # MyFunnyConsumer => my-funny-consumer
199
- consumer_class.name.gsub(/[a-z][A-Z]/) {|str| str[0] << "-" << str[1] }.downcase,
200
- ].compact.join("")
242
+ consumer_class.name.gsub(/[a-z][A-Z]/) { |str| "#{str[0]}-#{str[1]}" }.downcase,
243
+ ].compact.join
201
244
 
245
+ self.parallel_workers = consumer_class.parallel_workers
202
246
  self.subscriptions = consumer_class.subscriptions
203
247
  self.max_wait_time = consumer_class.max_wait_time || self.max_wait_time
248
+ self.fetch_messages = consumer_class.fetch_messages || self.fetch_messages
204
249
  self.pidfile ||= "#{group_id}.pid"
205
250
  end
251
+ attr_accessor :consumer_class
206
252
 
207
253
  def on_error(&handler)
208
254
  @error_handler = handler
@@ -224,11 +270,41 @@ module Racecar
224
270
  producer_config
225
271
  end
226
272
 
273
+ def instrumenter
274
+ @instrumenter ||= begin
275
+ default_payload = { client_id: client_id, group_id: group_id }
276
+
277
+ if defined?(ActiveSupport::Notifications)
278
+ # ActiveSupport needs `concurrent-ruby` but doesn't `require` it.
279
+ require 'concurrent/utility/monotonic_time'
280
+ Instrumenter.new(backend: ActiveSupport::Notifications, default_payload: default_payload)
281
+ else
282
+ logger.warn "ActiveSupport::Notifications not available, instrumentation is disabled"
283
+ NullInstrumenter
284
+ end
285
+ end
286
+ end
287
+ attr_writer :instrumenter
288
+
289
+ def install_liveness_probe
290
+ liveness_probe.tap(&:install)
291
+ end
292
+
293
+ def liveness_probe
294
+ require "active_support/notifications"
295
+ @liveness_probe ||= LivenessProbe.new(
296
+ ActiveSupport::Notifications,
297
+ liveness_probe_file_path,
298
+ liveness_probe_max_interval
299
+ )
300
+ end
301
+
227
302
  private
228
303
 
229
304
  def rdkafka_security_config
230
305
  {
231
306
  "security.protocol" => security_protocol,
307
+ "enable.ssl.certificate.verification" => ssl_verify_hostname,
232
308
  "ssl.ca.location" => ssl_ca_location,
233
309
  "ssl.crl.location" => ssl_crl_location,
234
310
  "ssl.keystore.location" => ssl_keystore_location,
@@ -1,3 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "racecar/message_delivery_error"
4
+
1
5
  module Racecar
2
6
  class Consumer
3
7
  Subscription = Struct.new(:topic, :start_from_beginning, :max_bytes_per_partition, :additional_config)
@@ -5,7 +9,7 @@ module Racecar
5
9
  class << self
6
10
  attr_accessor :max_wait_time
7
11
  attr_accessor :group_id
8
- attr_accessor :producer, :consumer
12
+ attr_accessor :producer, :consumer, :parallel_workers, :fetch_messages
9
13
 
10
14
  def subscriptions
11
15
  @subscriptions ||= []
@@ -23,29 +27,68 @@ module Racecar
23
27
  # @param additional_config [Hash] Configuration properties for consumer.
24
28
  # See https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
25
29
  # @return [nil]
26
- def subscribes_to(*topics, start_from_beginning: true, max_bytes_per_partition: 1048576, additional_config: {})
30
+ def subscribes_to(
31
+ *topics,
32
+ start_from_beginning: true,
33
+ max_bytes_per_partition: 1048576,
34
+ additional_config: {}
35
+ )
27
36
  topics.each do |topic|
28
37
  subscriptions << Subscription.new(topic, start_from_beginning, max_bytes_per_partition, additional_config)
29
38
  end
30
39
  end
40
+
41
+ # Rebalance hooks for subclasses to override
42
+ def on_partitions_assigned(rebalance_event); end
43
+ def on_partitions_revoked(rebalance_event); end
31
44
  end
32
45
 
33
- def configure(producer:, consumer:, instrumenter: NullInstrumenter)
46
+ def configure(producer:, consumer:, instrumenter: NullInstrumenter, config: Racecar.config)
34
47
  @producer = producer
48
+ @delivery_handles = []
49
+
35
50
  @consumer = consumer
51
+
36
52
  @instrumenter = instrumenter
53
+ @config = config
37
54
  end
38
55
 
39
56
  def teardown; end
40
57
 
41
- # Delivers messages that got produced.
58
+ # Blocks until all messages produced so far have been successfully published. If
59
+ # message delivery finally fails, a Racecar::MessageDeliveryError is raised. The
60
+ # delivery failed for the reason in the exception. The error can be broker side
61
+ # (e.g. downtime, configuration issue) or specific to the message being sent. The
62
+ # caller must handle the latter cases or run into head of line blocking.
42
63
  def deliver!
43
64
  @delivery_handles ||= []
44
65
  if @delivery_handles.any?
45
66
  instrumentation_payload = { delivered_message_count: @delivery_handles.size }
46
67
 
47
68
  @instrumenter.instrument('deliver_messages', instrumentation_payload) do
48
- @delivery_handles.each(&:wait)
69
+ @delivery_handles.each do |handle|
70
+ begin
71
+ # rdkafka-ruby checks every wait_timeout seconds if the message was
72
+ # successfully delivered, up to max_wait_timeout seconds before raising
73
+ # Rdkafka::AbstractHandle::WaitTimeoutError. librdkafka will (re)try to
74
+ # deliver all messages in the background, until "config.message_timeout"
75
+ # (message.timeout.ms) is exceeded. Phrased differently, rdkafka-ruby's
76
+ # WaitTimeoutError is just informative.
77
+ # The raising can be avoided if max_wait_timeout below is greater than
78
+ # config.message_timeout, but config is not available here (without
79
+ # changing the interface).
80
+ handle.wait(max_wait_timeout: 60, wait_timeout: 0.1)
81
+ rescue Rdkafka::AbstractHandle::WaitTimeoutError => e
82
+ partition = MessageDeliveryError.partition_from_delivery_handle(handle)
83
+ # ideally we could use the logger passed to the Runner, but it is not
84
+ # available here. The runner sets it for Rdkafka, though, so we can use
85
+ # that instead.
86
+ @config.logger.debug "Still trying to deliver message to (partition #{partition})... (will try up to Racecar.config.message_timeout)"
87
+ retry
88
+ rescue Rdkafka::RdkafkaError => e
89
+ raise MessageDeliveryError.new(e, handle)
90
+ end
91
+ end
49
92
  end
50
93
  end
51
94
  @delivery_handles.clear
@@ -54,13 +97,14 @@ module Racecar
54
97
  protected
55
98
 
56
99
  # https://github.com/appsignal/rdkafka-ruby#producing-messages
57
- def produce(payload, topic:, key:, partition_key: nil, headers: nil, create_time: nil)
100
+ def produce(payload, topic:, key: nil, partition: nil, partition_key: nil, headers: nil, create_time: nil)
58
101
  @delivery_handles ||= []
59
102
  message_size = payload.respond_to?(:bytesize) ? payload.bytesize : 0
60
103
  instrumentation_payload = {
61
104
  value: payload,
62
105
  headers: headers,
63
106
  key: key,
107
+ partition: partition,
64
108
  partition_key: partition_key,
65
109
  topic: topic,
66
110
  message_size: message_size,
@@ -73,6 +117,7 @@ module Racecar
73
117
  topic: topic,
74
118
  payload: payload,
75
119
  key: key,
120
+ partition: partition,
76
121
  partition_key: partition_key,
77
122
  timestamp: create_time,
78
123
  headers: headers,