racecar 2.0.0 → 2.10.0.beta2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46)
  1. checksums.yaml +4 -4
  2. data/.github/dependabot.yml +17 -0
  3. data/.github/workflows/ci.yml +46 -0
  4. data/.github/workflows/publish.yml +12 -0
  5. data/.gitignore +1 -2
  6. data/CHANGELOG.md +83 -1
  7. data/Dockerfile +9 -0
  8. data/Gemfile +6 -0
  9. data/Gemfile.lock +72 -0
  10. data/README.md +303 -82
  11. data/Rakefile +5 -0
  12. data/docker-compose.yml +65 -0
  13. data/examples/batch_consumer.rb +4 -2
  14. data/examples/cat_consumer.rb +2 -0
  15. data/examples/producing_consumer.rb +2 -0
  16. data/exe/racecar +37 -14
  17. data/extra/datadog-dashboard.json +1 -0
  18. data/lib/ensure_hash_compact.rb +2 -0
  19. data/lib/generators/racecar/consumer_generator.rb +2 -0
  20. data/lib/generators/racecar/install_generator.rb +2 -0
  21. data/lib/racecar/cli.rb +26 -21
  22. data/lib/racecar/config.rb +80 -4
  23. data/lib/racecar/consumer.rb +51 -6
  24. data/lib/racecar/consumer_set.rb +113 -44
  25. data/lib/racecar/ctl.rb +31 -3
  26. data/lib/racecar/daemon.rb +4 -2
  27. data/lib/racecar/datadog.rb +83 -3
  28. data/lib/racecar/delivery_callback.rb +27 -0
  29. data/lib/racecar/erroneous_state_error.rb +34 -0
  30. data/lib/racecar/heroku.rb +49 -0
  31. data/lib/racecar/instrumenter.rb +4 -7
  32. data/lib/racecar/liveness_probe.rb +78 -0
  33. data/lib/racecar/message.rb +6 -1
  34. data/lib/racecar/message_delivery_error.rb +112 -0
  35. data/lib/racecar/null_instrumenter.rb +2 -0
  36. data/lib/racecar/parallel_runner.rb +110 -0
  37. data/lib/racecar/pause.rb +8 -4
  38. data/lib/racecar/producer.rb +139 -0
  39. data/lib/racecar/rails_config_file_loader.rb +7 -1
  40. data/lib/racecar/rebalance_listener.rb +58 -0
  41. data/lib/racecar/runner.rb +79 -37
  42. data/lib/racecar/version.rb +3 -1
  43. data/lib/racecar.rb +36 -8
  44. data/racecar.gemspec +7 -4
  45. metadata +47 -25
  46. data/.github/workflows/rspec.yml +0 -24
data/examples/batch_consumer.rb CHANGED
@@ -1,8 +1,10 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class BatchConsumer < Racecar::Consumer
2
4
  subscribes_to "messages", start_from_beginning: false
3
5
 
4
- def process_batch(batch)
5
- batch.messages.each do |message|
6
+ def process_batch(messages)
7
+ messages.each do |message|
6
8
  puts message.value
7
9
  end
8
10
  end
data/examples/cat_consumer.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class CatConsumer < Racecar::Consumer
2
4
  subscribes_to "messages", start_from_beginning: false
3
5
 
data/examples/producing_consumer.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class ProducingConsumer < Racecar::Consumer
2
4
  subscribes_to "messages", start_from_beginning: false
3
5
 
data/exe/racecar CHANGED
@@ -3,19 +3,42 @@
3
3
  require "racecar"
4
4
  require "racecar/cli"
5
5
 
6
- begin
7
- Racecar::Cli.main(ARGV)
8
- rescue SignalException => e
9
- # We might receive SIGTERM before our signal handler is installed.
10
- if Signal.signame(e.signo) == "TERM"
11
- exit(0)
12
- else
13
- raise
6
+ module Racecar
7
+ class << self
8
+ def start(argv)
9
+ Cli.main(argv)
10
+ rescue SignalException => e
11
+ # We might receive SIGTERM before our signal handler is installed.
12
+ if Signal.signame(e.signo) == "TERM"
13
+ exit(0)
14
+ else
15
+ raise
16
+ end
17
+ rescue SystemExit
18
+ raise
19
+ rescue Exception => e
20
+ $stderr.puts "=> Crashed: #{exception_with_causes(e)}\n#{e.backtrace.join("\n")}"
21
+
22
+ Racecar.config.error_handler.call(e)
23
+
24
+ exit(1)
25
+ else
26
+ exit(0)
27
+ end
28
+
29
+ private
30
+
31
+ def exception_with_causes(e)
32
+ result = +"#{e.class}: #{e}"
33
+ if e.cause
34
+ result << "\n"
35
+ result << "--- Caused by: ---\n"
36
+ result << exception_with_causes(e.cause)
37
+ end
38
+ result
39
+ end
14
40
  end
15
- rescue
16
- # Exceptions are printed to STDERR and sent to the error handler
17
- # in `Racecar::Cli#run`, so we don't need to do anything here.
18
- exit(1)
19
- else
20
- exit(0)
21
41
  end
42
+
43
+ # Start your engines!
44
+ Racecar.start(ARGV)
@@ -0,0 +1 @@
1
+ {"title":"Racecar consumer groups","description":"Dashboard for monitoring [Racecar](https://github.com/zendesk/racecar) Kafka consumer groups.","widgets":[{"id":4916208698459109,"definition":{"title":"Single-message processing","reflow_type":"fixed","type":"group","layout_type":"ordered","widgets":[{"id":82605028,"definition":{"title":"95th percentile message processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.process_message.latency.95percentile{$group_id,$client,$topic,$partition,$env} by {topic,group_id}","style":{"palette":"dog_classic","line_type":"solid","line_width":"thin"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":2857871641649870,"definition":{"title":"Max message processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.process_message.latency.max{$group_id,$client,$topic,$partition,$env} by {topic,group_id}","style":{"palette":"dog_classic","line_type":"solid","line_width":"thin"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":88579656,"definition":{"title":"Median message processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.process_message.latency.median{$group_id,$client,$topic,$partition,$env} by {topic}","style":{"palette":"dog_classic","line_type":"solid","line_width":"thin"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":8,"y":0,"width":4,"height":2}}]}},{"id":4068194420543030,"definition":{"title":"Batch processing","reflow_type":"fixed","type":"group","layout_type":"ordered","widgets":[{"id":341686567,"definition":{"title":"95th 
percentile batch processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.process_batch.latency.95percentile{$group_id,$client,$topic,$partition,$env} by {topic,group_id}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":341687897,"definition":{"title":"Median batch processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.process_batch.latency.median{$group_id,$client,$topic,$partition,$env} by {topic}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":5352911818003929,"definition":{"title":"Max batch processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"max:racecar.consumer.process_batch.latency.max{$group_id,$client,$topic,$partition,$env} by {topic}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":1654098217056312,"definition":{"title":"Max message batch size","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"max:racecar.consumer.batch_size.max{$group_id,$client,$topic,$partition,$env} by {topic}","style":{"palette":"purple","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":2,"width":4,"height":2}},{"id":7718619791149134,"definition":{"title":"Average per-message latency in batch 
processing mode","show_legend":false,"legend_size":"0","legend_layout":"vertical","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"q":"max:racecar.consumer.process_batch.latency.avg{$group_id,$client,$topic,$partition,$env}/max:racecar.consumer.batch_size.avg{$group_id,$client,$topic,$partition,$env}","metadata":[{"expression":"max:racecar.consumer.process_batch.latency.avg{$env,$pod,$group_id,$client,$topic,$partition}/max:racecar.consumer.batch_size.avg{$env,$pod,$group_id,$client,$topic,$partition}","alias_name":"ms"}],"style":{"palette":"purple","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":4,"y":2,"width":4,"height":2}}]}},{"id":7110612496425151,"definition":{"title":"Throughput & Lag","reflow_type":"fixed","type":"group","layout_type":"ordered","widgets":[{"id":301212748,"definition":{"title":"Message lag changes","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"derivative(max:racecar.consumer.offset{$group_id,$client,$topic,$partition,$env} by {topic,partition,pod})","style":{"palette":"dog_classic","line_type":"solid","line_width":"thin"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":82604183,"definition":{"title":"Processing throughput by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.messages{$group_id,$client,$topic,$partition,$env} by {topic,group_id}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":5547724125706857,"definition":{"title":"Processing throughput by 
group","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.messages{$group_id,$client,$topic,$partition,$env} by {group_id}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":375397853,"definition":{"title":"Processing throughput by host","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.messages{$group_id,$client,$topic,$partition,$env} by {group_id,host}.as_rate()","style":{"palette":"dog_classic","line_type":"solid","line_width":"thin"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":2,"width":4,"height":2}},{"id":7820607170949322,"definition":{"title":"Messages consumed in timeframe","type":"query_value","requests":[{"q":"sum:racecar.consumer.messages{$group_id,$client,$topic,$partition,$env}.as_count()","aggregator":"sum"}],"autoscale":true,"precision":0},"layout":{"x":4,"y":2,"width":4,"height":2}},{"id":1428183857213882,"definition":{"title":"Time lag (end-to-end latency)","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.time_lag{$group_id,$client,$topic,$partition,$env} by {group_id,pod}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":8,"y":2,"width":4,"height":2}}]}},{"id":1487807434456879,"definition":{"title":"Processing Errors & Group Stability","reflow_type":"fixed","type":"group","layout_type":"ordered","widgets":[{"id":82605029,"definition":{"title":"Processing 
errors","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.process_batch.errors{$group_id,$client,$topic,$partition,$env} by {topic,pod,group_id,partition}.as_count()+sum:racecar.consumer.process_message.errors{$group_id,$client,$topic,$partition,$env} by {topic,pod,group_id,partition}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":83104736,"definition":{"title":"Processing error rate by topic (%)","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"(sum:racecar.consumer.process_message.errors{$group_id,$client,$topic,$partition,$env} by {topic}.as_count()/(sum:racecar.consumer.process_message.errors{$group_id,$client,$topic,$partition,$env} by {topic}.as_count()+sum:racecar.consumer.messages{$group_id,$client,$topic,$partition,$env} by {topic}.as_count()))*100","style":{"palette":"orange","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":6572534533091871,"definition":{"title":"Processing errors in timeframe","type":"query_value","requests":[{"q":"sum:racecar.consumer.process_batch.errors{$topic,$client,$group_id,$env}.as_count()+sum:racecar.consumer.process_message.errors{$topic,$client,$group_id,$env}.as_count()","aggregator":"sum","conditional_formats":[{"comparator":">","palette":"white_on_red","value":0},{"comparator":"<=","palette":"white_on_green","value":0}]}],"autoscale":true,"precision":0},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":302705923,"definition":{"title":"Pause 
duration","show_legend":false,"legend_size":"0","legend_layout":"vertical","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"q":"avg:racecar.consumer.pause.duration{$client,$group_id,$topic,$env} by {pod,group_id,topic,partition}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":0,"y":2,"width":4,"height":2}},{"id":235544854,"definition":{"title":"Group joins","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.join_group.count{$group_id,$client,$env} by {group_id,host}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":4,"y":2,"width":4,"height":2}},{"id":235544862,"definition":{"title":"Group leaves","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.leave_group.count{$group_id,$client,$env} by {group_id,host}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":8,"y":2,"width":4,"height":2}},{"id":235545167,"definition":{"title":"Group syncs","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.sync_group.count{$group_id,$client,$env} by {group_id,host}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":4,"width":4,"height":2}}]}},{"id":8013176155436939,"definition":{"title":"Producer & message 
delivery","reflow_type":"fixed","type":"group","layout_type":"ordered","widgets":[{"id":5948628389625057,"definition":{"title":"Message delivery latency (median)","title_size":"16","title_align":"left","show_legend":false,"type":"timeseries","requests":[{"q":"avg:racecar.producer.deliver.latency.median{$client,$env}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":3158040379950811,"definition":{"title":"Producer buffer size (max)","title_size":"16","title_align":"left","show_legend":false,"type":"timeseries","requests":[{"q":"max:racecar.producer.buffer.size.max{$client,$env} by {topic}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":6916375790222772,"definition":{"title":"Producer buffer size (avg) kp","title_size":"16","title_align":"left","show_legend":false,"legend_layout":"vertical","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"q":"avg:racecar.producer.buffer.size.avg{$client,$env} by {topic,host}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":3160735194874896,"definition":{"title":"Message size (95p)","title_size":"16","title_align":"left","show_legend":false,"legend_layout":"vertical","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"q":"avg:racecar.producer.produce.message_size.95percentile{$env} by 
{topic}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":0,"y":2,"width":4,"height":2}}]}}],"template_variables":[{"name":"env","default":"production","prefix":"env"},{"name":"group_id","default":"*","prefix":"group_id"},{"name":"client","default":"*","prefix":"client"},{"name":"topic","default":"*","prefix":"topic"},{"name":"partition","default":"*","prefix":"partition"}],"layout_type":"ordered","is_read_only":false,"notify_list":[],"reflow_type":"fixed","id":"ywc-z36-g29"}
data/lib/ensure_hash_compact.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # only needed when ruby < 2.4 and not using active support
2
4
 
3
5
  unless {}.respond_to? :compact
data/lib/generators/racecar/consumer_generator.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Racecar
2
4
  module Generators
3
5
  class ConsumerGenerator < Rails::Generators::NamedBase
data/lib/generators/racecar/install_generator.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Racecar
2
4
  module Generators
3
5
  class InstallGenerator < Rails::Generators::Base
data/lib/racecar/cli.rb CHANGED
@@ -1,23 +1,25 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require "optparse"
2
4
  require "logger"
3
5
  require "fileutils"
4
6
  require "racecar/rails_config_file_loader"
5
7
  require "racecar/daemon"
8
+ require "racecar/liveness_probe"
6
9
 
7
10
  module Racecar
8
11
  class Cli
9
- def self.main(args)
10
- new(args).run
12
+ class << self
13
+ def main(args)
14
+ new(args).run
15
+ end
11
16
  end
12
17
 
13
18
  def initialize(args)
14
19
  @parser = build_parser
15
20
  @parser.parse!(args)
16
21
  @consumer_name = args.first or raise Racecar::Error, "no consumer specified"
17
- end
18
-
19
- def config
20
- Racecar.config
22
+ @runner = nil
21
23
  end
22
24
 
23
25
  def run
@@ -58,21 +60,29 @@ module Racecar
58
60
  $stderr.puts "=> Ctrl-C to shutdown consumer"
59
61
  end
60
62
 
61
- processor = consumer_class.new
62
-
63
- Racecar.run(processor)
64
- rescue => e
65
- $stderr.puts "=> Crashed: #{e.class}: #{e}\n#{e.backtrace.join("\n")}"
63
+ if config.liveness_probe_enabled
64
+ $stderr.puts "=> Liveness probe enabled"
65
+ config.install_liveness_probe
66
+ end
66
67
 
67
- config.error_handler.call(e)
68
+ processor = consumer_class.new
69
+ @runner = Racecar.runner(processor)
70
+ @runner.run
71
+ nil
72
+ end
68
73
 
69
- raise
74
+ def stop
75
+ @runner.stop
70
76
  end
71
77
 
72
78
  private
73
79
 
74
80
  attr_reader :consumer_name
75
81
 
82
+ def config
83
+ Racecar.config
84
+ end
85
+
76
86
  def daemonize!
77
87
  daemon = Daemon.new(File.expand_path(config.pidfile))
78
88
 
@@ -102,12 +112,7 @@ module Racecar
102
112
  opts.on("-r", "--require STRING", "Require a library before starting the consumer") do |lib|
103
113
  $LOAD_PATH.unshift(Dir.pwd) unless load_path_modified
104
114
  load_path_modified = true
105
- begin
106
- require lib
107
- rescue => e
108
- $stderr.puts "=> #{lib} failed to load: #{e.message}"
109
- exit
110
- end
115
+ require lib
111
116
  end
112
117
 
113
118
  opts.on("-l", "--log STRING", "Log to the specified file") do |logfile|
@@ -115,13 +120,13 @@ module Racecar
115
120
  end
116
121
 
117
122
  Racecar::Config.variables.each do |variable|
118
- opt_name = "--" << variable.name.to_s.gsub("_", "-")
123
+ opt_name = +"--#{variable.name.to_s.gsub('_', '-')}"
119
124
  opt_name << " #{variable.type.upcase}" unless variable.boolean?
120
125
 
121
126
  desc = variable.description || "N/A"
122
127
 
123
128
  if variable.default
124
- desc << " (default: #{variable.default.inspect})"
129
+ desc += " (default: #{variable.default.inspect})"
125
130
  end
126
131
 
127
132
  opts.on(opt_name, desc) do |value|
data/lib/racecar/config.rb CHANGED
@@ -1,9 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "tmpdir"
4
+
1
5
  require "king_konf"
2
6
 
7
+ require "racecar/liveness_probe"
8
+ require "racecar/instrumenter"
9
+ require "racecar/rebalance_listener"
10
+
3
11
  module Racecar
4
12
  class Config < KingKonf::Config
5
13
  env_prefix :racecar
6
14
 
15
+ STATISTICS_DISABLED_VALUE = 0
16
+
7
17
  desc "A list of Kafka brokers in the cluster that you're consuming from"
8
18
  list :brokers, default: ["localhost:9092"]
9
19
 
@@ -19,6 +29,9 @@ module Racecar
19
29
  desc "The minimum number of messages in the local consumer queue"
20
30
  integer :min_message_queue_size, default: 2000
21
31
 
32
+ desc "Which partition assignment strategy to use, range, roundrobin or cooperative-sticky. -- https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md"
33
+ string :partition_assignment_strategy, default: "range,roundrobin"
34
+
22
35
  desc "Kafka consumer configuration options, separated with '=' -- https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md"
23
36
  list :consumer, default: []
24
37
 
@@ -52,9 +65,12 @@ module Racecar
52
65
  desc "How long to wait when trying to communicate with a Kafka broker"
53
66
  float :socket_timeout, default: 30
54
67
 
55
- desc "How long to allow the Kafka brokers to wait before returning messages"
68
+ desc "How long to allow the Kafka brokers to wait before returning messages (in seconds)"
56
69
  float :max_wait_time, default: 1
57
70
 
71
+ desc "How long to try to deliver a produced message before finally giving up (in seconds)"
72
+ float :message_timeout, default: 5*60
73
+
58
74
  desc "Maximum amount of data the broker shall return for a Fetch request"
59
75
  integer :max_bytes, default: 10485760
60
76
 
@@ -70,6 +86,9 @@ module Racecar
70
86
  desc "The log level for the Racecar logs"
71
87
  string :log_level, default: "info"
72
88
 
89
+ desc "The strategy used to determine which topic partition a message is written to when Racecar produces a value to Kafka; defaults to `consistent_random`"
90
+ symbol :partitioner, allowed_values: %i{consistent consistent_random murmur2 murmur2_random fnv1a fnv1a_random}, default: :consistent_random
91
+
73
92
  desc "Protocol used to communicate with brokers"
74
93
  symbol :security_protocol, allowed_values: %i{plaintext ssl sasl_plaintext sasl_ssl}
75
94
 
@@ -151,10 +170,33 @@ module Racecar
151
170
  desc "Whether to boot Rails when starting the consumer"
152
171
  boolean :without_rails, default: false
153
172
 
173
+ desc "How frequently librdkafka should report statistics to your application (in seconds). A statistics callback
174
+ must also be provided. This should be defined with a `statistics_callback` method on your processor. Stats
175
+ are disabled if this value is set to 0, or there is no callback defined. This is set by default to 1 second
176
+ for backward compatibility, however this can be quite memory intensive"
177
+ integer :statistics_interval, default: 1
178
+
179
+ desc "Whether to enable liveness probe behavior (touch the file)"
180
+ boolean :liveness_probe_enabled, default: false
181
+
182
+ desc "Path to a file Racecar will touch to show liveness"
183
+ string :liveness_probe_file_path, default: "#{Dir.tmpdir}/racecar-liveness"
184
+
185
+ desc "Used only by the liveness probe: Max time (in seconds) between liveness events before the process is considered not healthy"
186
+ integer :liveness_probe_max_interval, default: 5
187
+
154
188
  # The error handler must be set directly on the object.
155
189
  attr_reader :error_handler
156
190
 
157
- attr_accessor :subscriptions, :logger
191
+ attr_accessor :subscriptions, :logger, :parallel_workers
192
+
193
+ def statistics_interval_ms
194
+ if Rdkafka::Config.statistics_callback
195
+ statistics_interval * 1000
196
+ else
197
+ STATISTICS_DISABLED_VALUE
198
+ end
199
+ end
158
200
 
159
201
  def max_wait_time_ms
160
202
  max_wait_time * 1000
@@ -189,6 +231,7 @@ module Racecar
189
231
  end
190
232
 
191
233
  def load_consumer_class(consumer_class)
234
+ self.consumer_class = consumer_class
192
235
  self.group_id = consumer_class.group_id || self.group_id
193
236
 
194
237
  self.group_id ||= [
@@ -196,13 +239,16 @@ module Racecar
196
239
  group_id_prefix,
197
240
 
198
241
  # MyFunnyConsumer => my-funny-consumer
199
- consumer_class.name.gsub(/[a-z][A-Z]/) {|str| str[0] << "-" << str[1] }.downcase,
200
- ].compact.join("")
242
+ consumer_class.name.gsub(/[a-z][A-Z]/) { |str| "#{str[0]}-#{str[1]}" }.downcase,
243
+ ].compact.join
201
244
 
245
+ self.parallel_workers = consumer_class.parallel_workers
202
246
  self.subscriptions = consumer_class.subscriptions
203
247
  self.max_wait_time = consumer_class.max_wait_time || self.max_wait_time
248
+ self.fetch_messages = consumer_class.fetch_messages || self.fetch_messages
204
249
  self.pidfile ||= "#{group_id}.pid"
205
250
  end
251
+ attr_accessor :consumer_class
206
252
 
207
253
  def on_error(&handler)
208
254
  @error_handler = handler
@@ -224,11 +270,41 @@ module Racecar
224
270
  producer_config
225
271
  end
226
272
 
273
+ def instrumenter
274
+ @instrumenter ||= begin
275
+ default_payload = { client_id: client_id, group_id: group_id }
276
+
277
+ if defined?(ActiveSupport::Notifications)
278
+ # ActiveSupport needs `concurrent-ruby` but doesn't `require` it.
279
+ require 'concurrent/utility/monotonic_time'
280
+ Instrumenter.new(backend: ActiveSupport::Notifications, default_payload: default_payload)
281
+ else
282
+ logger.warn "ActiveSupport::Notifications not available, instrumentation is disabled"
283
+ NullInstrumenter
284
+ end
285
+ end
286
+ end
287
+ attr_writer :instrumenter
288
+
289
+ def install_liveness_probe
290
+ liveness_probe.tap(&:install)
291
+ end
292
+
293
+ def liveness_probe
294
+ require "active_support/notifications"
295
+ @liveness_probe ||= LivenessProbe.new(
296
+ ActiveSupport::Notifications,
297
+ liveness_probe_file_path,
298
+ liveness_probe_max_interval
299
+ )
300
+ end
301
+
227
302
  private
228
303
 
229
304
  def rdkafka_security_config
230
305
  {
231
306
  "security.protocol" => security_protocol,
307
+ "enable.ssl.certificate.verification" => ssl_verify_hostname,
232
308
  "ssl.ca.location" => ssl_ca_location,
233
309
  "ssl.crl.location" => ssl_crl_location,
234
310
  "ssl.keystore.location" => ssl_keystore_location,
data/lib/racecar/consumer.rb CHANGED
@@ -1,3 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "racecar/message_delivery_error"
4
+
1
5
  module Racecar
2
6
  class Consumer
3
7
  Subscription = Struct.new(:topic, :start_from_beginning, :max_bytes_per_partition, :additional_config)
@@ -5,7 +9,7 @@ module Racecar
5
9
  class << self
6
10
  attr_accessor :max_wait_time
7
11
  attr_accessor :group_id
8
- attr_accessor :producer, :consumer
12
+ attr_accessor :producer, :consumer, :parallel_workers, :fetch_messages
9
13
 
10
14
  def subscriptions
11
15
  @subscriptions ||= []
@@ -23,29 +27,68 @@ module Racecar
23
27
  # @param additional_config [Hash] Configuration properties for consumer.
24
28
  # See https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
25
29
  # @return [nil]
26
- def subscribes_to(*topics, start_from_beginning: true, max_bytes_per_partition: 1048576, additional_config: {})
30
+ def subscribes_to(
31
+ *topics,
32
+ start_from_beginning: true,
33
+ max_bytes_per_partition: 1048576,
34
+ additional_config: {}
35
+ )
27
36
  topics.each do |topic|
28
37
  subscriptions << Subscription.new(topic, start_from_beginning, max_bytes_per_partition, additional_config)
29
38
  end
30
39
  end
40
+
41
+ # Rebalance hooks for subclasses to override
42
+ def on_partitions_assigned(rebalance_event); end
43
+ def on_partitions_revoked(rebalance_event); end
31
44
  end
32
45
 
33
- def configure(producer:, consumer:, instrumenter: NullInstrumenter)
46
+ def configure(producer:, consumer:, instrumenter: NullInstrumenter, config: Racecar.config)
34
47
  @producer = producer
48
+ @delivery_handles = []
49
+
35
50
  @consumer = consumer
51
+
36
52
  @instrumenter = instrumenter
53
+ @config = config
37
54
  end
38
55
 
39
56
  def teardown; end
40
57
 
41
- # Delivers messages that got produced.
58
+ # Blocks until all messages produced so far have been successfully published. If
59
+ # message delivery finally fails, a Racecar::MessageDeliveryError is raised. The
60
+ # delivery failed for the reason in the exception. The error can be broker side
61
+ # (e.g. downtime, configuration issue) or specific to the message being sent. The
62
+ # caller must handle the latter cases or run into head of line blocking.
42
63
  def deliver!
43
64
  @delivery_handles ||= []
44
65
  if @delivery_handles.any?
45
66
  instrumentation_payload = { delivered_message_count: @delivery_handles.size }
46
67
 
47
68
  @instrumenter.instrument('deliver_messages', instrumentation_payload) do
48
- @delivery_handles.each(&:wait)
69
+ @delivery_handles.each do |handle|
70
+ begin
71
+ # rdkafka-ruby checks every wait_timeout seconds if the message was
72
+ # successfully delivered, up to max_wait_timeout seconds before raising
73
+ # Rdkafka::AbstractHandle::WaitTimeoutError. librdkafka will (re)try to
74
+ # deliver all messages in the background, until "config.message_timeout"
75
+ # (message.timeout.ms) is exceeded. Phrased differently, rdkafka-ruby's
76
+ # WaitTimeoutError is just informative.
77
+ # The raising can be avoided if max_wait_timeout below is greater than
78
+ # config.message_timeout, but config is not available here (without
79
+ # changing the interface).
80
+ handle.wait(max_wait_timeout: 60, wait_timeout: 0.1)
81
+ rescue Rdkafka::AbstractHandle::WaitTimeoutError => e
82
+ partition = MessageDeliveryError.partition_from_delivery_handle(handle)
83
+ # ideally we could use the logger passed to the Runner, but it is not
84
+ # available here. The runner sets it for Rdkafka, though, so we can use
85
+ # that instead.
86
+ @config.logger.debug "Still trying to deliver message to (partition #{partition})... (will try up to Racecar.config.message_timeout)"
87
+ retry
88
+ rescue Rdkafka::RdkafkaError => e
89
+ raise MessageDeliveryError.new(e, handle)
90
+ end
91
+ end
49
92
  end
50
93
  end
51
94
  @delivery_handles.clear
@@ -54,13 +97,14 @@ module Racecar
54
97
  protected
55
98
 
56
99
  # https://github.com/appsignal/rdkafka-ruby#producing-messages
57
- def produce(payload, topic:, key:, partition_key: nil, headers: nil, create_time: nil)
100
+ def produce(payload, topic:, key: nil, partition: nil, partition_key: nil, headers: nil, create_time: nil)
58
101
  @delivery_handles ||= []
59
102
  message_size = payload.respond_to?(:bytesize) ? payload.bytesize : 0
60
103
  instrumentation_payload = {
61
104
  value: payload,
62
105
  headers: headers,
63
106
  key: key,
107
+ partition: partition,
64
108
  partition_key: partition_key,
65
109
  topic: topic,
66
110
  message_size: message_size,
@@ -73,6 +117,7 @@ module Racecar
73
117
  topic: topic,
74
118
  payload: payload,
75
119
  key: key,
120
+ partition: partition,
76
121
  partition_key: partition_key,
77
122
  timestamp: create_time,
78
123
  headers: headers,