racecar 2.2.0 → 2.3.0.alpha1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e2c32338556d27bfcbae35df0758163a457d2c6e3f520141206c5a9c0124cc68
4
- data.tar.gz: 43d9c42d0d483c269b15d417ef59e2985da3ca3945d34d6772f8dd262ccbfaf6
3
+ metadata.gz: 539a8002f306c561358adb3f257d721dbe3764766ff7b14a52b4d7a7d2de4a22
4
+ data.tar.gz: ca482343833ac570fbadcc652cf3cd4e3691cba116093379c333c274137323c0
5
5
  SHA512:
6
- metadata.gz: 0467ac1cdefb6cad9870dd73b92f4a5a943b9f685ff3fc876b3f183d109ae3d29d7c2c7dffea8f31bca7c7b18565e5aba04d4865c94f2448a7228be175855a5b
7
- data.tar.gz: e4ab43eb180995af916d447b006438b4a48cb808b29aabec52b455e246541a083192d1b560a957fa6f3ab7d5412dd12ab74aac49acc0b606c3df87cec90b93b6
6
+ metadata.gz: 4ff0d1b0115b9aee1268e57613f348ebf654698d0aba021ecf0ac6b9d418ea9a91c23f14aa550b76e0cffe62673cd95444ca871c4dc2d9175d1bbf2970d2be74
7
+ data.tar.gz: eafa3506a43d5d79a12adcea2f56ecae66068a7ff077d7beba63630cfe6d94a84888b05e2de2da40fc53daa63fa8d0ee6593b97db3eb26ec119705dbb39f6153
@@ -1,9 +1,10 @@
1
1
  name: CI
2
2
 
3
3
  on:
4
+ pull_request:
5
+ branches: ["master"]
4
6
  push:
5
- branches:
6
- - '**'
7
+ branches: ["master"]
7
8
 
8
9
  jobs:
9
10
  unit-specs:
@@ -11,12 +12,12 @@ jobs:
11
12
 
12
13
  strategy:
13
14
  matrix:
14
- ruby-version: ["2.5", "2.6"]
15
+ ruby-version: ["2.5", "2.6", "3.0"]
15
16
 
16
17
  steps:
17
18
  - uses: zendesk/checkout@v2
18
19
  - name: Set up Ruby
19
- uses: zendesk/setup-ruby@v1.58.0
20
+ uses: zendesk/setup-ruby@v1.64.1
20
21
  with:
21
22
  ruby-version: ${{ matrix.ruby-version }}
22
23
  bundler-cache: true
@@ -25,37 +26,16 @@ jobs:
25
26
 
26
27
  integration-specs:
27
28
  runs-on: ubuntu-latest
28
-
29
- services:
30
- zookeeper:
31
- image: confluentinc/cp-zookeeper
32
- ports:
33
- - 2181:2181
34
- env:
35
- ZOOKEEPER_CLIENT_PORT: 2181
36
-
37
- kafka:
38
- image: confluentinc/cp-kafka
39
- ports:
40
- - 9092:9092
41
- - 29092:29092
42
- options: --health-cmd "kafka-topics --list --bootstrap-server=localhost:9092" --health-interval 10s --health-timeout 5s --health-retries 5
43
- env:
44
- KAFKA_BROKER_ID: 1
45
- KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
46
- KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092
47
- KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
48
- KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT
49
- KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
50
-
51
29
  steps:
52
30
  - uses: zendesk/checkout@v2
53
31
  - name: Set up Ruby
54
- uses: zendesk/setup-ruby@v1.58.0
32
+ uses: zendesk/setup-ruby@v1.64.1
55
33
  with:
56
34
  ruby-version: 2.7
57
35
  bundler-cache: true
36
+ - name: Bring up docker-compose stack
37
+ run: docker-compose up -d
58
38
  - name: Build and test with RSpec
59
39
  env:
60
40
  RACECAR_BROKERS: localhost:9092
61
- run: bundle exec rspec --format documentation --require spec_helper --color spec/integration/*_spec.rb
41
+ run: timeout --kill-after 180 150 bundle exec rspec --format documentation --require spec_helper --color spec/integration/*_spec.rb
data/CHANGELOG.md CHANGED
@@ -2,6 +2,14 @@
2
2
 
3
3
  ## Unreleased
4
4
 
5
+ * [Racecar::Consumer] When messages fail to deliver, an extended error with hints is now raised: instead of `Rdkafka::RdkafkaError` you'll get a `Racecar::MessageDeliveryError`. If you have set a `Racecar.config.error_handler`, it might need to be updated. ([#219](https://github.com/zendesk/racecar/pull/219))
6
+ * [Racecar::Consumer] When message delivery times out, Racecar will reset the producer in an attempt to fix some of the potential causes for this error. ([#219](https://github.com/zendesk/racecar/pull/219))
7
+ * Validate the `process` and `process_batch` method signature on consumer classes when initializing (#236)
8
+ * Add Ruby 3.0 compatibility (#237)
9
+ * Introduce parallel runner, which forks a number of independent consumers, allowing partitions to be processed in parallel. ([#222](https://github.com/zendesk/racecar/pull/222))
10
+ * [Racecar::Runner] Ensure producer is closed, whether it closes or errors. ([#222](https://github.com/zendesk/racecar/pull/222))
11
+ * Configure `statistics_interval` directly in the config. Disable statistics when no callback is defined ([#232](https://github.com/zendesk/racecar/pull/232))
12
+
5
13
  ## racecar v2.2.0
6
14
 
7
15
  * [Racecar::ConsumerSet] **breaking change** `Racecar::ConsumerSet`'s functions `poll` and `batch_poll` expect the max wait values to be given in milliseconds. The defaults were using `config.max_wait_time`, which is in seconds. If you do not directly use `Racecar::ConsumerSet`, or always call its `poll` and `batch_poll` functions by specifying the max wait time (the first argument), then this breaking change does not affect you. ([#214](https://github.com/zendesk/racecar/pull/214))
data/Dockerfile ADDED
@@ -0,0 +1,9 @@
1
+ FROM circleci/ruby:2.7.2
2
+
3
+ RUN sudo apt-get update
4
+ RUN sudo apt-get install docker
5
+
6
+ WORKDIR /app
7
+ COPY . .
8
+
9
+ RUN bundle install
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- racecar (2.1.1)
4
+ racecar (2.2.0)
5
5
  king_konf (~> 1.0.0)
6
6
  rdkafka (~> 0.8.0)
7
7
 
@@ -18,7 +18,7 @@ GEM
18
18
  concurrent-ruby (1.1.7)
19
19
  diff-lcs (1.4.4)
20
20
  dogstatsd-ruby (4.8.2)
21
- ffi (1.13.1)
21
+ ffi (1.14.2)
22
22
  i18n (1.8.5)
23
23
  concurrent-ruby (~> 1.0)
24
24
  king_konf (1.0.0)
@@ -37,15 +37,15 @@ GEM
37
37
  rspec-core (~> 3.10.0)
38
38
  rspec-expectations (~> 3.10.0)
39
39
  rspec-mocks (~> 3.10.0)
40
- rspec-core (3.10.0)
40
+ rspec-core (3.10.1)
41
41
  rspec-support (~> 3.10.0)
42
- rspec-expectations (3.10.0)
42
+ rspec-expectations (3.10.1)
43
43
  diff-lcs (>= 1.2.0, < 2.0)
44
44
  rspec-support (~> 3.10.0)
45
- rspec-mocks (3.10.0)
45
+ rspec-mocks (3.10.2)
46
46
  diff-lcs (>= 1.2.0, < 2.0)
47
47
  rspec-support (~> 3.10.0)
48
- rspec-support (3.10.0)
48
+ rspec-support (3.10.2)
49
49
  thread_safe (0.3.6)
50
50
  timecop (0.9.2)
51
51
  tzinfo (1.2.8)
data/README.md CHANGED
@@ -83,6 +83,30 @@ Now run your consumer with `bundle exec racecar TapDanceConsumer`.
83
83
 
84
84
  Note: if you're not using Rails, you'll have to add the file yourself. No-one will judge you for copy-pasting it.
85
85
 
86
+ #### Running consumers in parallel (experimental)
87
+
88
+ Warning - limited battle testing in production environments; use at your own risk!
89
+
90
+ If you want to process different partitions in parallel, and don't want to deploy a number of instances matching the total partitions of the topic, you can specify the number of workers to spin up - that number of processes will be forked, and each will register its own consumer in the group. Some things to note:
91
+ - This would make no difference on a single-partition topic - only one consumer would ever be assigned a partition. A couple of example configurations to process all partitions in parallel (we'll assume a 15 partition topic):
92
+ - Parallel workers set to 3, 5 separate instances / replicas running in your container orchestrator
93
+ - Parallel workers set to 5, 3 separate instances / replicas running in your container orchestrator
94
+ - Since we're forking new processes, the memory demands are a little higher
95
+ - From some initial testing, running 5 parallel workers requires no more than double the memory of running a Racecar consumer without parallelism.
96
+
97
+ The number of parallel workers is configured per consumer class; you may only want to take advantage of this for busier consumers:
98
+ ```ruby
99
+ class ParallelProcessingConsumer < Racecar::Consumer
100
+ subscribes_to "some-topic"
101
+
102
+ self.parallel_workers = 5
103
+
104
+ def process(message)
105
+ ...
106
+ end
107
+ end
108
+ ```
109
+
86
110
  #### Initializing consumers
87
111
 
88
112
  You can optionally add an `initialize` method if you need to do any set-up work before processing messages, e.g.
@@ -266,6 +290,8 @@ All timeouts are defined in number of seconds.
266
290
  * `pause_with_exponential_backoff` – Set to `true` if you want to double the `pause_timeout` on each consecutive failure of a particular partition.
267
291
  * `socket_timeout` – How long to wait when trying to communicate with a Kafka broker. Default is 30 seconds.
268
292
  * `max_wait_time` – How long to allow the Kafka brokers to wait before returning messages. A higher number means larger batches, at the cost of higher latency. Default is 1 second.
293
+ * `message_timeout` – How long to try to deliver a produced message before finally giving up. Default is 5 minutes. Transient errors are automatically retried. If a message delivery fails, the current read message batch is retried.
294
+ * `statistics_interval` – How frequently librdkafka should publish statistics about its consumers and producers; you must also add a `statistics_callback` method to your processor, otherwise the stats are disabled. The default is 1 second, however this can be quite memory hungry, so you may want to tune this and monitor.
269
295
 
270
296
  #### Memory & network usage
271
297
 
@@ -319,6 +345,8 @@ Racecar supports [Datadog](https://www.datadoghq.com/) monitoring integration. I
319
345
  * `datadog_namespace` – The namespace to use for Datadog metrics.
320
346
  * `datadog_tags` – Tags that should always be set on Datadog metrics.
321
347
 
348
+ Furthermore, there's a [standard Datadog dashboard configuration file](https://raw.githubusercontent.com/zendesk/racecar/master/extra/datadog-dashboard.json) that you can import to get started with a Racecar dashboard for all of your consumers.
349
+
322
350
  #### Consumers Without Rails ####
323
351
 
324
352
  By default, if Rails is detected, it will be automatically started when the consumer is started. There are cases where you might not want or need Rails. You can pass the `--without-rails` option when starting the consumer and Rails won't be started.
@@ -492,6 +520,16 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
492
520
 
493
521
  The integration tests run against a Kafka instance that is not automatically started from within `rspec`. You can set one up using the provided `docker-compose.yml` by running `docker-compose up`.
494
522
 
523
+ ### Running RSpec within Docker
524
+
525
+ There can be behavioural inconsistencies between running the specs on your machine, and in the CI pipeline. Due to this, there is now a Dockerfile included in the project, which is based on the CircleCI ruby 2.7.2 image. This could easily be extended with more Dockerfiles to cover different Ruby versions if desired. In order to run the specs via Docker:
526
+
527
+ - Uncomment the `tests` service from the docker-compose.yml
528
+ - Bring up the stack with `docker-compose up -d`
529
+ - Execute the entire suite with `docker-compose run --rm tests rspec`
530
+ - Execute a single spec or directory with `docker-compose run --rm tests rspec spec/integration/consumer_spec.rb`
531
+
532
+ Please note - your code directory is mounted as a volume, so you can make code changes without needing to rebuild
495
533
 
496
534
  ## Contributing
497
535
 
data/docker-compose.yml CHANGED
@@ -1,19 +1,19 @@
1
- version: '2'
1
+ version: '2.1'
2
+
2
3
  services:
3
4
  zookeeper:
4
5
  image: confluentinc/cp-zookeeper:5.5.1
5
- hostname: zookeeper
6
- container_name: zookeeper
7
6
  ports:
8
7
  - "2181:2181"
9
8
  environment:
10
9
  ZOOKEEPER_CLIENT_PORT: 2181
11
10
  ZOOKEEPER_TICK_TIME: 2000
11
+ KAFKA_OPTS: "-Dzookeeper.4lw.commands.whitelist=*"
12
+ healthcheck:
13
+ test: echo ruok | nc 127.0.0.1 2181 | grep imok
12
14
 
13
15
  broker:
14
16
  image: confluentinc/cp-kafka:5.5.1
15
- hostname: broker
16
- container_name: broker
17
17
  depends_on:
18
18
  - zookeeper
19
19
  ports:
@@ -30,3 +30,36 @@ services:
30
30
  KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1
31
31
  KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0
32
32
  KAFKA_JMX_PORT: 9101
33
+ KAFKA_DELETE_TOPIC_ENABLE: 'true'
34
+ healthcheck:
35
+ test: nc -z 127.0.0.1 9092
36
+
37
+ wait-for-healthy-services:
38
+ image: alpine
39
+ depends_on:
40
+ broker:
41
+ condition: service_healthy
42
+ zookeeper:
43
+ condition: service_healthy
44
+
45
+
46
+ # If you want to run the tests locally with Docker, uncomment the tests service below.
47
+ # The behaviour, especially of the integration tests, can differ somewhat compared
48
+ # to running it on your machine.
49
+
50
+ # tests:
51
+ # build:
52
+ # context: .
53
+ # depends_on:
54
+ # wait-for-healthy-services:
55
+ # condition: service_started
56
+ # environment:
57
+ # RACECAR_BROKERS: broker:29092
58
+ # DOCKER_SUDO: 'true'
59
+ # # When bringing up the stack, we just let the container exit. For running the
60
+ # # specs, we'll use commands like `docker-compose run tests rspec`
61
+ # command: ["echo", "ready"]
62
+ # volumes:
63
+ # # The line below allows us to run docker commands from the container itself
64
+ # - "/var/run/docker.sock:/var/run/docker.sock"
65
+ # - .:/app
@@ -0,0 +1 @@
1
+ {"title":"Racecar consumer groups","description":"Dashboard for monitoring [Racecar](https://github.com/zendesk/racecar) Kafka consumer groups.","widgets":[{"id":4916208698459109,"definition":{"title":"Single-message processing","reflow_type":"fixed","type":"group","layout_type":"ordered","widgets":[{"id":82605028,"definition":{"title":"95th percentile message processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.process_message.latency.95percentile{$group_id,$client,$topic,$partition,$env} by {topic,group_id}","style":{"palette":"dog_classic","line_type":"solid","line_width":"thin"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":2857871641649870,"definition":{"title":"Max message processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.process_message.latency.max{$group_id,$client,$topic,$partition,$env} by {topic,group_id}","style":{"palette":"dog_classic","line_type":"solid","line_width":"thin"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":88579656,"definition":{"title":"Median message processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.process_message.latency.median{$group_id,$client,$topic,$partition,$env} by {topic}","style":{"palette":"dog_classic","line_type":"solid","line_width":"thin"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":8,"y":0,"width":4,"height":2}}]}},{"id":4068194420543030,"definition":{"title":"Batch processing","reflow_type":"fixed","type":"group","layout_type":"ordered","widgets":[{"id":341686567,"definition":{"title":"95th 
percentile batch processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.process_batch.latency.95percentile{$group_id,$client,$topic,$partition,$env} by {topic,group_id}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":341687897,"definition":{"title":"Median batch processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.process_batch.latency.median{$group_id,$client,$topic,$partition,$env} by {topic}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":5352911818003929,"definition":{"title":"Max batch processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"max:racecar.consumer.process_batch.latency.max{$group_id,$client,$topic,$partition,$env} by {topic}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":1654098217056312,"definition":{"title":"Max message batch size","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"max:racecar.consumer.batch_size.max{$group_id,$client,$topic,$partition,$env} by {topic}","style":{"palette":"purple","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":2,"width":4,"height":2}},{"id":7718619791149134,"definition":{"title":"Average per-message latency in batch 
processing mode","show_legend":false,"legend_size":"0","legend_layout":"vertical","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"q":"max:racecar.consumer.process_batch.latency.avg{$group_id,$client,$topic,$partition,$env}/max:racecar.consumer.batch_size.avg{$group_id,$client,$topic,$partition,$env}","metadata":[{"expression":"max:racecar.consumer.process_batch.latency.avg{$env,$pod,$group_id,$client,$topic,$partition}/max:racecar.consumer.batch_size.avg{$env,$pod,$group_id,$client,$topic,$partition}","alias_name":"ms"}],"style":{"palette":"purple","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":4,"y":2,"width":4,"height":2}}]}},{"id":7110612496425151,"definition":{"title":"Throughput & Lag","reflow_type":"fixed","type":"group","layout_type":"ordered","widgets":[{"id":301212748,"definition":{"title":"Message lag changes","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"derivative(max:racecar.consumer.offset{$group_id,$client,$topic,$partition,$env} by {topic,partition,pod})","style":{"palette":"dog_classic","line_type":"solid","line_width":"thin"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":82604183,"definition":{"title":"Processing throughput by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.messages{$group_id,$client,$topic,$partition,$env} by {topic,group_id}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":5547724125706857,"definition":{"title":"Processing throughput by 
group","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.messages{$group_id,$client,$topic,$partition,$env} by {group_id}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":375397853,"definition":{"title":"Processing throughput by host","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.messages{$group_id,$client,$topic,$partition,$env} by {group_id,host}.as_rate()","style":{"palette":"dog_classic","line_type":"solid","line_width":"thin"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":2,"width":4,"height":2}},{"id":7820607170949322,"definition":{"title":"Messages consumed in timeframe","type":"query_value","requests":[{"q":"sum:racecar.consumer.messages{$group_id,$client,$topic,$partition,$env}.as_count()","aggregator":"sum"}],"autoscale":true,"precision":0},"layout":{"x":4,"y":2,"width":4,"height":2}},{"id":1428183857213882,"definition":{"title":"Time lag (end-to-end latency)","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.time_lag{$group_id,$client,$topic,$partition,$env} by {group_id,pod}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":8,"y":2,"width":4,"height":2}}]}},{"id":1487807434456879,"definition":{"title":"Processing Errors & Group Stability","reflow_type":"fixed","type":"group","layout_type":"ordered","widgets":[{"id":82605029,"definition":{"title":"Processing 
errors","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.process_batch.errors{$group_id,$client,$topic,$partition,$env} by {topic,pod,group_id,partition}.as_count()+sum:racecar.consumer.process_message.errors{$group_id,$client,$topic,$partition,$env} by {topic,pod,group_id,partition}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":83104736,"definition":{"title":"Processing error rate by topic (%)","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"(sum:racecar.consumer.process_message.errors{$group_id,$client,$topic,$partition,$env} by {topic}.as_count()/(sum:racecar.consumer.process_message.errors{$group_id,$client,$topic,$partition,$env} by {topic}.as_count()+sum:racecar.consumer.messages{$group_id,$client,$topic,$partition,$env} by {topic}.as_count()))*100","style":{"palette":"orange","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":6572534533091871,"definition":{"title":"Processing errors in timeframe","type":"query_value","requests":[{"q":"sum:racecar.consumer.process_batch.errors{$topic,$client,$group_id,$env}.as_count()+sum:racecar.consumer.process_message.errors{$topic,$client,$group_id,$env}.as_count()","aggregator":"sum","conditional_formats":[{"comparator":">","palette":"white_on_red","value":0},{"comparator":"<=","palette":"white_on_green","value":0}]}],"autoscale":true,"precision":0},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":302705923,"definition":{"title":"Pause 
duration","show_legend":false,"legend_size":"0","legend_layout":"vertical","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"q":"avg:racecar.consumer.pause.duration{$client,$group_id,$topic,$env} by {pod,group_id,topic,partition}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":0,"y":2,"width":4,"height":2}},{"id":235544854,"definition":{"title":"Group joins","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.join_group.count{$group_id,$client,$env} by {group_id,host}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":4,"y":2,"width":4,"height":2}},{"id":235544862,"definition":{"title":"Group leaves","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.leave_group.count{$group_id,$client,$env} by {group_id,host}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":8,"y":2,"width":4,"height":2}},{"id":235545167,"definition":{"title":"Group syncs","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.sync_group.count{$group_id,$client,$env} by {group_id,host}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":4,"width":4,"height":2}}]}},{"id":8013176155436939,"definition":{"title":"Producer & message 
delivery","reflow_type":"fixed","type":"group","layout_type":"ordered","widgets":[{"id":5948628389625057,"definition":{"title":"Message delivery latency (median)","title_size":"16","title_align":"left","show_legend":false,"type":"timeseries","requests":[{"q":"avg:racecar.producer.deliver.latency.median{$client,$env}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":3158040379950811,"definition":{"title":"Producer buffer size (max)","title_size":"16","title_align":"left","show_legend":false,"type":"timeseries","requests":[{"q":"max:racecar.producer.buffer.size.max{$client,$env} by {topic}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":6916375790222772,"definition":{"title":"Producer buffer size (avg) kp","title_size":"16","title_align":"left","show_legend":false,"legend_layout":"vertical","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"q":"avg:racecar.producer.buffer.size.avg{$client,$env} by {topic,host}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":3160735194874896,"definition":{"title":"Message size (95p)","title_size":"16","title_align":"left","show_legend":false,"legend_layout":"vertical","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"q":"avg:racecar.producer.produce.message_size.95percentile{$env} by 
{topic}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":0,"y":2,"width":4,"height":2}}]}}],"template_variables":[{"name":"env","default":"production","prefix":"env"},{"name":"group_id","default":"*","prefix":"group_id"},{"name":"client","default":"*","prefix":"client"},{"name":"topic","default":"*","prefix":"topic"},{"name":"partition","default":"*","prefix":"partition"}],"layout_type":"ordered","is_read_only":false,"notify_list":[],"reflow_type":"fixed","id":"ywc-z36-g29"}
data/lib/racecar.rb CHANGED
@@ -7,6 +7,7 @@ require "racecar/null_instrumenter"
7
7
  require "racecar/consumer"
8
8
  require "racecar/consumer_set"
9
9
  require "racecar/runner"
10
+ require "racecar/parallel_runner"
10
11
  require "racecar/config"
11
12
  require "racecar/version"
12
13
  require "ensure_hash_compact"
@@ -51,6 +52,12 @@ module Racecar
51
52
  end
52
53
 
53
54
  def self.run(processor)
54
- Runner.new(processor, config: config, logger: logger, instrumenter: instrumenter).run
55
+ runner = Runner.new(processor, config: config, logger: logger, instrumenter: instrumenter)
56
+
57
+ if config.parallel_workers && config.parallel_workers > 1
58
+ ParallelRunner.new(runner: runner, config: config, logger: logger).run
59
+ else
60
+ runner.run
61
+ end
55
62
  end
56
63
  end
data/lib/racecar/cli.rb CHANGED
@@ -59,8 +59,8 @@ module Racecar
59
59
  end
60
60
 
61
61
  processor = consumer_class.new
62
-
63
62
  Racecar.run(processor)
63
+ nil
64
64
  end
65
65
 
66
66
  private
@@ -6,6 +6,8 @@ module Racecar
6
6
  class Config < KingKonf::Config
7
7
  env_prefix :racecar
8
8
 
9
+ STATISTICS_DISABLED_VALUE = 0
10
+
9
11
  desc "A list of Kafka brokers in the cluster that you're consuming from"
10
12
  list :brokers, default: ["localhost:9092"]
11
13
 
@@ -57,6 +59,9 @@ module Racecar
57
59
  desc "How long to allow the Kafka brokers to wait before returning messages (in seconds)"
58
60
  float :max_wait_time, default: 1
59
61
 
62
+ desc "How long to try to deliver a produced message before finally giving up (in seconds)"
63
+ float :message_timeout, default: 5*60
64
+
60
65
  desc "Maximum amount of data the broker shall return for a Fetch request"
61
66
  integer :max_bytes, default: 10485760
62
67
 
@@ -153,10 +158,24 @@ module Racecar
153
158
  desc "Whether to boot Rails when starting the consumer"
154
159
  boolean :without_rails, default: false
155
160
 
161
+ desc "How frequently librdkafka should report statistics to your application (in seconds). A statistics callback
162
+ must also be provided. This should be defined with a `statistics_callback` method on your processor. Stats
163
+ are disabled if this value is set to 0, or there is no callback defined. This is set by default to 1 second
164
+ for backward compatibility, however this can be quite memory intensive"
165
+ integer :statistics_interval, default: 1
166
+
156
167
  # The error handler must be set directly on the object.
157
168
  attr_reader :error_handler
158
169
 
159
- attr_accessor :subscriptions, :logger
170
+ attr_accessor :subscriptions, :logger, :parallel_workers
171
+
172
+ def statistics_interval_ms
173
+ if Rdkafka::Config.statistics_callback
174
+ statistics_interval * 1000
175
+ else
176
+ STATISTICS_DISABLED_VALUE
177
+ end
178
+ end
160
179
 
161
180
  def max_wait_time_ms
162
181
  max_wait_time * 1000
@@ -201,6 +220,7 @@ module Racecar
201
220
  consumer_class.name.gsub(/[a-z][A-Z]/) { |str| "#{str[0]}-#{str[1]}" }.downcase,
202
221
  ].compact.join
203
222
 
223
+ self.parallel_workers = consumer_class.parallel_workers
204
224
  self.subscriptions = consumer_class.subscriptions
205
225
  self.max_wait_time = consumer_class.max_wait_time || self.max_wait_time
206
226
  self.pidfile ||= "#{group_id}.pid"
@@ -231,6 +251,7 @@ module Racecar
231
251
  def rdkafka_security_config
232
252
  {
233
253
  "security.protocol" => security_protocol,
254
+ "enable.ssl.certificate.verification" => ssl_verify_hostname,
234
255
  "ssl.ca.location" => ssl_ca_location,
235
256
  "ssl.crl.location" => ssl_crl_location,
236
257
  "ssl.keystore.location" => ssl_keystore_location,
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "racecar/message_delivery_error"
4
+
3
5
  module Racecar
4
6
  class Consumer
5
7
  Subscription = Struct.new(:topic, :start_from_beginning, :max_bytes_per_partition, :additional_config)
@@ -7,7 +9,7 @@ module Racecar
7
9
  class << self
8
10
  attr_accessor :max_wait_time
9
11
  attr_accessor :group_id
10
- attr_accessor :producer, :consumer
12
+ attr_accessor :producer, :consumer, :parallel_workers
11
13
 
12
14
  def subscriptions
13
15
  @subscriptions ||= []
@@ -25,29 +27,62 @@ module Racecar
25
27
  # @param additional_config [Hash] Configuration properties for consumer.
26
28
  # See https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
27
29
  # @return [nil]
28
- def subscribes_to(*topics, start_from_beginning: true, max_bytes_per_partition: 1048576, additional_config: {})
30
+ def subscribes_to(
31
+ *topics,
32
+ start_from_beginning: true,
33
+ max_bytes_per_partition: 1048576,
34
+ additional_config: {}
35
+ )
29
36
  topics.each do |topic|
30
37
  subscriptions << Subscription.new(topic, start_from_beginning, max_bytes_per_partition, additional_config)
31
38
  end
32
39
  end
33
40
  end
34
41
 
35
- def configure(producer:, consumer:, instrumenter: NullInstrumenter)
42
+ def configure(producer:, consumer:, instrumenter: NullInstrumenter, config: Racecar.config)
36
43
  @producer = producer
44
+ @delivery_handles = []
45
+
37
46
  @consumer = consumer
47
+
38
48
  @instrumenter = instrumenter
49
+ @config = config
39
50
  end
40
51
 
41
52
  def teardown; end
42
53
 
43
- # Delivers messages that got produced.
54
+ # Blocks until all messages produced so far have been successfully published. If
55
+ # message delivery finally fails, a Racecar::MessageDeliveryError is raised. The
56
+ # delivery failed for the reason in the exception. The error can be broker side
57
+ # (e.g. downtime, configuration issue) or specific to the message being sent. The
58
+ # caller must handle the latter cases or run into head of line blocking.
44
59
  def deliver!
45
60
  @delivery_handles ||= []
46
61
  if @delivery_handles.any?
47
62
  instrumentation_payload = { delivered_message_count: @delivery_handles.size }
48
63
 
49
64
  @instrumenter.instrument('deliver_messages', instrumentation_payload) do
50
- @delivery_handles.each(&:wait)
65
+ @delivery_handles.each do |handle|
66
+ # rdkafka-ruby checks every wait_timeout seconds if the message was
67
+ # successfully delivered, up to max_wait_timeout seconds before raising
68
+ # Rdkafka::AbstractHandle::WaitTimeoutError. librdkafka will (re)try to
69
+ # deliver all messages in the background, until "config.message_timeout"
70
+ # (message.timeout.ms) is exceeded. Phrased differently, rdkafka-ruby's
71
+ # WaitTimeoutError is just informative.
72
+ # The raising can be avoided if max_wait_timeout below is greater than
73
+ # config.message_timeout, but config is not available here (without
74
+ # changing the interface).
75
+ handle.wait(max_wait_timeout: 60, wait_timeout: 0.1)
76
+ rescue Rdkafka::AbstractHandle::WaitTimeoutError => e
77
+ partition = MessageDeliveryError.partition_from_delivery_handle(handle)
78
+ # ideally we could use the logger passed to the Runner, but it is not
79
+ # available here. The runner sets it for Rdkafka, though, so we can use
80
+ # that instead.
81
+ @config.logger.debug "Still trying to deliver message to (partition #{partition})... (will try up to Racecar.config.message_timeout)"
82
+ retry
83
+ rescue Rdkafka::RdkafkaError => e
84
+ raise MessageDeliveryError.new(e, handle)
85
+ end
51
86
  end
52
87
  end
53
88
  @delivery_handles.clear
@@ -224,7 +224,7 @@ module Racecar
224
224
  "queued.min.messages" => @config.min_message_queue_size,
225
225
  "session.timeout.ms" => @config.session_timeout * 1000,
226
226
  "socket.timeout.ms" => @config.socket_timeout * 1000,
227
- "statistics.interval.ms" => 1000, # 1s is the highest granularity offered
227
+ "statistics.interval.ms" => @config.statistics_interval_ms
228
228
  }
229
229
  config.merge! @config.rdkafka_consumer
230
230
  config.merge! subscription.additional_config
data/lib/racecar/ctl.rb CHANGED
@@ -3,6 +3,7 @@
3
3
  require "optparse"
4
4
  require "racecar/rails_config_file_loader"
5
5
  require "racecar/daemon"
6
+ require "racecar/message_delivery_error"
6
7
 
7
8
  module Racecar
8
9
  class Ctl
@@ -96,12 +97,17 @@ module Racecar
96
97
  Racecar.config.validate!
97
98
 
98
99
  producer = Rdkafka::Config.new({
99
- "bootstrap.servers": Racecar.config.brokers.join(","),
100
- "client.id": Racecar.config.client_id,
100
+ "bootstrap.servers": Racecar.config.brokers.join(","),
101
+ "client.id": Racecar.config.client_id,
102
+ "message.timeout.ms": Racecar.config.message_timeout * 1000,
101
103
  }.merge(Racecar.config.rdkafka_producer)).producer
102
104
 
103
105
  handle = producer.produce(payload: message.value, key: message.key, topic: message.topic)
104
- handle.wait(max_wait_timeout: 5)
106
+ begin
107
+ handle.wait(max_wait_timeout: Racecar.config.message_timeout)
108
+ rescue Rdkafka::RdkafkaError => e
109
+ raise MessageDeliveryError.new(e, handle)
110
+ end
105
111
 
106
112
  $stderr.puts "=> Delivered message to Kafka cluster"
107
113
  end
@@ -75,8 +75,8 @@ module Racecar
75
75
  private
76
76
 
77
77
  %w[increment histogram count timing gauge].each do |type|
78
- define_method(type) do |*args|
79
- emit(type, *args)
78
+ define_method(type) do |*args, **kwargs|
79
+ emit(type, *args, **kwargs)
80
80
  end
81
81
  end
82
82
 
@@ -0,0 +1,112 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Racecar
4
+ # MessageDeliveryError wraps an Rdkafka error and tries to give
5
+ # specific hints on how to debug or resolve the error within the
6
+ # Racecar context.
7
+ class MessageDeliveryError < StandardError
8
+ # partition_from_delivery_handle takes an rdkafka delivery handle
9
+ # and returns a human readable version of the partition. It handles
10
+ # the case where the partition is unknown.
11
+ def self.partition_from_delivery_handle(delivery_handle)
12
+ partition = delivery_handle&.create_result&.partition
13
+ # -1 is rdkafka-ruby's default value, which gets eventually set by librdkafka
14
+ return "not yet known" if partition.nil? || partition == -1
15
+ partition.to_s
16
+ end
17
+
18
+ def initialize(rdkafka_error, delivery_handle)
19
+ raise rdkafka_error unless rdkafka_error.is_a?(Rdkafka::RdkafkaError)
20
+
21
+ @rdkafka_error = rdkafka_error
22
+ @delivery_handle = delivery_handle
23
+ end
24
+
25
+ attr_reader :rdkafka_error
26
+
27
+ def code
28
+ @rdkafka_error.code
29
+ end
30
+
31
+ def to_s
32
+ msg = <<~EOM
33
+ Message delivery finally failed:
34
+ #{@rdkafka_error.to_s}
35
+
36
+ #{explain}
37
+ EOM
38
+ end
39
+
40
+ private
41
+
42
+ def explain
43
+ case @rdkafka_error.code
44
+ when :msg_timed_out # -192
45
+ <<~EOM
46
+ Could not deliver message within Racecar.config.message_timeout.
47
+
48
+ This can happen for various reasons, but most commonly because the connection to the broker is interrupted or there is no leader available. Check the broker's logs or the network for more insight.
49
+
50
+ Upstream documentation:
51
+ https://github.com/edenhill/librdkafka/blob/master/INTRODUCTION.md#error-local-time-out
52
+ EOM
53
+
54
+ when :msg_size_too_large # 10
55
+ <<~EOM
56
+ Could not deliver message, since it is bigger than either the broker's or Racecar's maximum message size.
57
+
58
+ The broker's config option on the topic is called "max.message.bytes" and the broker wide default is "message.max.bytes". The client's is "message.max.bytes". Take extra care to distinguish this from similarly named properties for receiving/consuming messages (i.e. Racecar.config.max_bytes is NOT related).
59
+
60
+ Racecar's limit is currently not configurable and uses librdkafka's default of 1 MB (10⁶ bytes). As of writing, librdkafka will send at least one message regardless of this limit. It is therefore very likely you're hitting the broker's limit and not Racecar's/librdkafka's.
61
+
62
+ Upstream documentation:
63
+ broker per topic: https://docs.confluent.io/platform/current/installation/configuration/topic-configs.html#topicconfigs_max.message.bytes
64
+ broker default: https://docs.confluent.io/platform/current/installation/configuration/broker-configs.html#brokerconfigs_message.max.bytes
65
+ client: https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
66
+ EOM
67
+
68
+ when :unknown_topic_or_part # 3
69
+ partition = self.class.partition_from_delivery_handle(@delivery_handle)
70
+
71
+ <<~EOM
72
+ Could not deliver message, since the targeted topic or partition (#{partition}) does not exist.
73
+
74
+ Check that there are no typos, or that the broker's "auto.create.topics.enable" is enabled. For freshly created topics with auto create enabled, this may appear in the beginning (race condition on creation and publishing).
75
+
76
+ Upstream documentation:
77
+ broker setting: https://docs.confluent.io/platform/current/installation/configuration/broker-configs.html#brokerconfigs_auto.create.topics.enable
78
+ client: https://github.com/edenhill/librdkafka/blob/master/INTRODUCTION.md#topic-metadata-propagation-for-newly-created-topics
79
+ https://github.com/edenhill/librdkafka/blob/master/INTRODUCTION.md#topic-auto-creation
80
+ EOM
81
+
82
+ when :record_list_too_large # 18
83
+ <<~EOM
84
+ Tried to deliver more messages in a batch than the broker's segment size.
85
+
86
+ Either increase the broker's "log.segment.bytes", or decrease any of the client's related settings "batch.num.messages", "batch.size" or "message.max.bytes". None of these are configurable through Racecar yet, as the defaults should be sufficient and sane.
87
+
88
+ Upstream documentation:
89
+ broker: https://docs.confluent.io/platform/current/installation/configuration/broker-configs.html#brokerconfigs_log.segment.bytes
90
+ client: https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
91
+ EOM
92
+
93
+ when :topic_authorization_failed # 29
94
+ <<~EOM
95
+ Failed to deliver message because of insufficient authorization to write into the topic.
96
+
97
+ Double check that it is not a race condition on topic creation. If it isn't, verify the ACLs are correct.
98
+
99
+ Upstream documentation:
100
+ https://github.com/edenhill/librdkafka/blob/master/INTRODUCTION.md#unknown-or-unauthorized-topics
101
+ EOM
102
+
103
+ else
104
+ <<~EOM
105
+ No specific information is available for this error. Consider adding it to Racecar. You can find generally helpful information in the upstream documentation:
106
+ https://github.com/edenhill/librdkafka/blob/master/INTRODUCTION.md
107
+ https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
108
+ EOM
109
+ end
110
+ end
111
+ end
112
+ end
@@ -0,0 +1,106 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Racecar
4
+ class ParallelRunner
5
+ Worker = Struct.new(:pid, :parent_reader)
6
+
7
+ SHUTDOWN_SIGNALS = ["INT", "QUIT", "TERM"]
8
+
9
+ def initialize(runner:, config:, logger:)
10
+ @runner = runner
11
+ @config = config
12
+ @logger = logger
13
+ end
14
+
15
+ def worker_pids
16
+ workers.map(&:pid)
17
+ end
18
+
19
+ def running?
20
+ @running
21
+ end
22
+
23
+ def run
24
+ logger.info "=> Running with #{config.parallel_workers} parallel workers"
25
+
26
+ self.workers = config.parallel_workers.times.map do
27
+ run_worker.tap { |w| logger.info "=> Forked new Racecar consumer with process id #{w.pid}" }
28
+ end
29
+
30
+ # Print the consumer config to STDERR on USR1.
31
+ trap("USR1") { $stderr.puts config.inspect }
32
+
33
+ SHUTDOWN_SIGNALS.each { |signal| trap(signal) { terminate_workers } }
34
+
35
+ @running = true
36
+
37
+ wait_for_exit
38
+ end
39
+
40
+ private
41
+
42
+ attr_accessor :workers
43
+ attr_reader :runner, :config, :logger
44
+
45
+ def run_worker
46
+ parent_reader, child_writer = IO.pipe
47
+
48
+ pid = fork do
49
+ begin
50
+ parent_reader.close
51
+
52
+ runner.run
53
+ rescue Exception => e
54
+ # Allow the parent process to re-raise the exception after shutdown
55
+ child_writer.binmode
56
+ child_writer.write(Marshal.dump(e))
57
+ ensure
58
+ child_writer.close
59
+ end
60
+ end
61
+
62
+ child_writer.close
63
+
64
+ Worker.new(pid, parent_reader)
65
+ end
66
+
67
+ def terminate_workers
68
+ return if @terminating
69
+
70
+ @terminating = true
71
+ $stderr.puts "=> Terminating workers"
72
+
73
+ Process.kill("TERM", *workers.map(&:pid))
74
+ end
75
+
76
+ def wait_for_exit
77
+ # The call to IO.select blocks until one or more of our readers are ready for reading,
78
+ # which could be for one of three reasons:
79
+ #
80
+ # - An exception is raised in the child process, in which case we should initiate
81
+ # a shutdown;
82
+ #
83
+ # - A graceful shutdown was already initiated, and the pipe writer has been closed, in
84
+ # which case there is nothing more to do.
85
+ #
86
+ # - One of the child processes was killed somehow. If this turns out to be too strict
87
+ # (i.e. closing down all the workers), we could revisit and look at restarting dead
88
+ # workers.
89
+ #
90
+ ready_readers = IO.select(workers.map(&:parent_reader)).first
91
+
92
+ first_read = ready_readers.first.read
93
+
94
+ terminate_workers
95
+
96
+ workers.map(&:pid).each do |pid|
97
+ logger.debug "=> Waiting for worker with pid #{pid} to exit"
98
+ Process.waitpid(pid)
99
+ logger.debug "=> Worker with pid #{pid} shutdown"
100
+ end
101
+
102
+ exception_found = !first_read.empty?
103
+ raise Marshal.load(first_read) if exception_found
104
+ end
105
+ end
106
+ end
@@ -3,6 +3,7 @@
3
3
  require "rdkafka"
4
4
  require "racecar/pause"
5
5
  require "racecar/message"
6
+ require "racecar/message_delivery_error"
6
7
 
7
8
  module Racecar
8
9
  class Runner
@@ -53,6 +54,7 @@ module Racecar
53
54
  producer: producer,
54
55
  consumer: consumer,
55
56
  instrumenter: @instrumenter,
57
+ config: @config,
56
58
  )
57
59
 
58
60
  instrumentation_payload = {
@@ -79,12 +81,17 @@ module Racecar
79
81
  end
80
82
 
81
83
  logger.info "Gracefully shutting down"
82
- processor.deliver!
83
- processor.teardown
84
- consumer.commit
85
- @instrumenter.instrument('leave_group') do
86
- consumer.close
84
+ begin
85
+ processor.deliver!
86
+ processor.teardown
87
+ consumer.commit
88
+ ensure
89
+ @instrumenter.instrument('leave_group') do
90
+ consumer.close
91
+ end
87
92
  end
93
+ ensure
94
+ producer.close
88
95
  end
89
96
 
90
97
  def stop
@@ -98,10 +105,20 @@ module Racecar
98
105
  def process_method
99
106
  @process_method ||= begin
100
107
  case
101
- when processor.respond_to?(:process_batch) then :batch
102
- when processor.respond_to?(:process) then :single
108
+ when processor.respond_to?(:process_batch)
109
+ if processor.method(:process_batch).arity != 1
110
+ raise Racecar::Error, "Invalid method signature for `process_batch`. The method must take exactly 1 argument."
111
+ end
112
+
113
+ :batch
114
+ when processor.respond_to?(:process)
115
+ if processor.method(:process).arity != 1
116
+ raise Racecar::Error, "Invalid method signature for `process`. The method must take exactly 1 argument."
117
+ end
118
+
119
+ :single
103
120
  else
104
- raise NotImplementedError, "Consumer class must implement process or process_batch method"
121
+ raise NotImplementedError, "Consumer class `#{processor.class}` must implement a `process` or `process_batch` method"
105
122
  end
106
123
  end
107
124
  end
@@ -128,7 +145,8 @@ module Racecar
128
145
  producer_config = {
129
146
  "bootstrap.servers" => config.brokers.join(","),
130
147
  "client.id" => config.client_id,
131
- "statistics.interval.ms" => 1000,
148
+ "statistics.interval.ms" => config.statistics_interval_ms,
149
+ "message.timeout.ms" => config.message_timeout * 1000,
132
150
  }
133
151
  producer_config["compression.codec"] = config.producer_compression_codec.to_s unless config.producer_compression_codec.nil?
134
152
  producer_config.merge!(config.rdkafka_producer)
@@ -176,6 +194,7 @@ module Racecar
176
194
  consumer.store_offset(message)
177
195
  end
178
196
  rescue => e
197
+ instrumentation_payload[:unrecoverable_delivery_error] = reset_producer_on_unrecoverable_delivery_errors(e)
179
198
  instrumentation_payload[:retries_count] = pause.pauses_count
180
199
  config.error_handler.call(e, instrumentation_payload)
181
200
  raise e
@@ -206,6 +225,7 @@ module Racecar
206
225
  processor.deliver!
207
226
  consumer.store_offset(messages.last)
208
227
  rescue => e
228
+ instrumentation_payload[:unrecoverable_delivery_error] = reset_producer_on_unrecoverable_delivery_errors(e)
209
229
  instrumentation_payload[:retries_count] = pause.pauses_count
210
230
  config.error_handler.call(e, instrumentation_payload)
211
231
  raise e
@@ -214,6 +234,28 @@ module Racecar
214
234
  end
215
235
  end
216
236
 
237
+ # librdkafka will continue to try to deliver already queued messages, even if ruby-rdkafka
238
+ # raised before that. This method detects any unrecoverable errors and resets the producer
239
+ # as a last ditch effort.
240
+ # The function returns true if there were unrecoverable errors, or false otherwise.
241
+ def reset_producer_on_unrecoverable_delivery_errors(error)
242
+ return false unless error.is_a?(Racecar::MessageDeliveryError)
243
+ return false unless error.code == :msg_timed_out # -192
244
+
245
+ logger.error error.to_s
246
+ logger.error "Racecar will reset the producer to force a new broker connection."
247
+ @producer.close
248
+ @producer = nil
249
+ processor.configure(
250
+ producer: producer,
251
+ consumer: consumer,
252
+ instrumenter: @instrumenter,
253
+ config: @config,
254
+ )
255
+
256
+ true
257
+ end
258
+
217
259
  def with_pause(topic, partition, offsets)
218
260
  pause = pauses[topic][partition]
219
261
  return yield pause if config.pause_timeout == 0
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Racecar
4
- VERSION = "2.2.0"
4
+ VERSION = "2.3.0.alpha1"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: racecar
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.2.0
4
+ version: 2.3.0.alpha1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Daniel Schierbeck
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2021-01-29 00:00:00.000000000 Z
12
+ date: 2021-03-29 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: king_konf
@@ -170,6 +170,7 @@ files:
170
170
  - ".gitignore"
171
171
  - ".rspec"
172
172
  - CHANGELOG.md
173
+ - Dockerfile
173
174
  - Gemfile
174
175
  - Gemfile.lock
175
176
  - LICENSE.txt
@@ -184,6 +185,7 @@ files:
184
185
  - examples/producing_consumer.rb
185
186
  - exe/racecar
186
187
  - exe/racecarctl
188
+ - extra/datadog-dashboard.json
187
189
  - lib/ensure_hash_compact.rb
188
190
  - lib/generators/racecar/consumer_generator.rb
189
191
  - lib/generators/racecar/install_generator.rb
@@ -199,7 +201,9 @@ files:
199
201
  - lib/racecar/datadog.rb
200
202
  - lib/racecar/instrumenter.rb
201
203
  - lib/racecar/message.rb
204
+ - lib/racecar/message_delivery_error.rb
202
205
  - lib/racecar/null_instrumenter.rb
206
+ - lib/racecar/parallel_runner.rb
203
207
  - lib/racecar/pause.rb
204
208
  - lib/racecar/rails_config_file_loader.rb
205
209
  - lib/racecar/runner.rb
@@ -220,9 +224,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
220
224
  version: '0'
221
225
  required_rubygems_version: !ruby/object:Gem::Requirement
222
226
  requirements:
223
- - - ">="
227
+ - - ">"
224
228
  - !ruby/object:Gem::Version
225
- version: '0'
229
+ version: 1.3.1
226
230
  requirements: []
227
231
  rubygems_version: 3.1.2
228
232
  signing_key: