RubyGems - racecar - Versions diffs - 2.2.0 → 2.3.0.alpha1 - Mend

racecar 2.2.0 → 2.3.0.alpha1

Files changed (20) hide show

checksums.yaml +4 -4
data/.github/workflows/ci.yml +9 -29
data/CHANGELOG.md +8 -0
data/Dockerfile +9 -0
data/Gemfile.lock +6 -6
data/README.md +38 -0
data/docker-compose.yml +38 -5
data/extra/datadog-dashboard.json +1 -0
data/lib/racecar.rb +8 -1
data/lib/racecar/cli.rb +1 -1
data/lib/racecar/config.rb +22 -1
data/lib/racecar/consumer.rb +40 -5
data/lib/racecar/consumer_set.rb +1 -1
data/lib/racecar/ctl.rb +9 -3
data/lib/racecar/datadog.rb +2 -2
data/lib/racecar/message_delivery_error.rb +112 -0
data/lib/racecar/parallel_runner.rb +106 -0
data/lib/racecar/runner.rb +51 -9
data/lib/racecar/version.rb +1 -1
metadata +8 -4

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: e2c32338556d27bfcbae35df0758163a457d2c6e3f520141206c5a9c0124cc68
-  data.tar.gz: 43d9c42d0d483c269b15d417ef59e2985da3ca3945d34d6772f8dd262ccbfaf6
+  metadata.gz: 539a8002f306c561358adb3f257d721dbe3764766ff7b14a52b4d7a7d2de4a22
+  data.tar.gz: ca482343833ac570fbadcc652cf3cd4e3691cba116093379c333c274137323c0
 SHA512:
-  metadata.gz: 0467ac1cdefb6cad9870dd73b92f4a5a943b9f685ff3fc876b3f183d109ae3d29d7c2c7dffea8f31bca7c7b18565e5aba04d4865c94f2448a7228be175855a5b
-  data.tar.gz: e4ab43eb180995af916d447b006438b4a48cb808b29aabec52b455e246541a083192d1b560a957fa6f3ab7d5412dd12ab74aac49acc0b606c3df87cec90b93b6
+  metadata.gz: 4ff0d1b0115b9aee1268e57613f348ebf654698d0aba021ecf0ac6b9d418ea9a91c23f14aa550b76e0cffe62673cd95444ca871c4dc2d9175d1bbf2970d2be74
+  data.tar.gz: eafa3506a43d5d79a12adcea2f56ecae66068a7ff077d7beba63630cfe6d94a84888b05e2de2da40fc53daa63fa8d0ee6593b97db3eb26ec119705dbb39f6153

data/.github/workflows/ci.yml CHANGED Viewed

@@ -1,9 +1,10 @@
 name: CI
 on:
+  pull_request:
+    branches: ["master"]
   push:
-    branches:
-      - '**'
+    branches: ["master"]
 jobs:
   unit-specs:
@@ -11,12 +12,12 @@ jobs:
     strategy:
       matrix:
-        ruby-version: ["2.5", "2.6"]
+        ruby-version: ["2.5", "2.6", "3.0"]
     steps:
     - uses: zendesk/checkout@v2
     - name: Set up Ruby
-      uses: zendesk/setup-ruby@v1.58.0
+      uses: zendesk/setup-ruby@v1.64.1
       with:
         ruby-version: ${{ matrix.ruby-version }}
         bundler-cache: true
@@ -25,37 +26,16 @@ jobs:
   integration-specs:
     runs-on: ubuntu-latest
-    services:
-      zookeeper:
-        image: confluentinc/cp-zookeeper
-        ports:
-          - 2181:2181
-        env:
-          ZOOKEEPER_CLIENT_PORT: 2181
-      kafka:
-        image: confluentinc/cp-kafka
-        ports:
-          - 9092:9092
-          - 29092:29092
-        options: --health-cmd "kafka-topics --list --bootstrap-server=localhost:9092" --health-interval 10s --health-timeout 5s --health-retries 5
-        env:
-          KAFKA_BROKER_ID: 1
-          KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
-          KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092
-          KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
-          KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT
-          KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
     steps:
     - uses: zendesk/checkout@v2
     - name: Set up Ruby
-      uses: zendesk/setup-ruby@v1.58.0
+      uses: zendesk/setup-ruby@v1.64.1
       with:
         ruby-version: 2.7
         bundler-cache: true
+    - name: Bring up docker-compose stack
+      run: docker-compose up -d
     - name: Build and test with RSpec
       env:
         RACECAR_BROKERS: localhost:9092
-      run: bundle exec rspec --format documentation --require spec_helper --color spec/integration/*_spec.rb
+      run: timeout --kill-after 180 150 bundle exec rspec --format documentation --require spec_helper --color spec/integration/*_spec.rb

data/CHANGELOG.md CHANGED Viewed

@@ -2,6 +2,14 @@
 ## Unreleased
+* [Racecar::Consumer] When messages fail to deliver, an extended error with hints is now raised. Instead of `Rdkafka::RdkafkaError` you'll get a `Racecar::MessageDeliveryError`. ([#219](https://github.com/zendesk/racecar/pull/219)). If you have set a `Racecar.config.error_handler`, it might need to be updated.
+* [Racecar::Consumer] When message delivery times out, Racecar will reset the producer in an attempt to fix some of the potential causes for this error. ([#219](https://github.com/zendesk/racecar/pull/219))
+* Validate the `process` and `process_batch` method signature on consumer classes when initializing (#236)
+* Add Ruby 3.0 compatibility (#237)
+* Introduce parallel runner, which forks a number of independent consumers, allowing partitions to be processed in parallel. ([#222](https://github.com/zendesk/racecar/pull/222))
+* [Racecar::Runner] Ensure producer is closed, whether it closes or errors. ([#222](https://github.com/zendesk/racecar/pull/222))
+* Configure `statistics_interval` directly in the config. Disable statistics when no callback is defined ([#232](https://github.com/zendesk/racecar/pull/232))
 ## racecar v2.2.0
 * [Racecar::ConsumerSet] **breaking change** `Racecar::ConsumerSet`'s functions `poll` and `batch_poll` expect the max wait values to be given in milliseconds. The defaults were using `config.max_wait_time`, which is in seconds. If you do not directly use `Racecar::ConsumerSet`, or always call its `poll` and `batch_poll` functions by specifying the max wait time (the first argument), then this breaking change does not affect you. ([#214](https://github.com/zendesk/racecar/pull/214))

data/Dockerfile ADDED Viewed

@@ -0,0 +1,9 @@
+FROM circleci/ruby:2.7.2
+RUN sudo apt-get update
+RUN sudo apt-get install docker
+WORKDIR /app
+COPY . .
+RUN bundle install

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    racecar (2.1.1)
+    racecar (2.2.0)
       king_konf (~> 1.0.0)
       rdkafka (~> 0.8.0)
@@ -18,7 +18,7 @@ GEM
     concurrent-ruby (1.1.7)
     diff-lcs (1.4.4)
     dogstatsd-ruby (4.8.2)
-    ffi (1.13.1)
+    ffi (1.14.2)
     i18n (1.8.5)
       concurrent-ruby (~> 1.0)
     king_konf (1.0.0)
@@ -37,15 +37,15 @@ GEM
       rspec-core (~> 3.10.0)
       rspec-expectations (~> 3.10.0)
       rspec-mocks (~> 3.10.0)
-    rspec-core (3.10.0)
+    rspec-core (3.10.1)
       rspec-support (~> 3.10.0)
-    rspec-expectations (3.10.0)
+    rspec-expectations (3.10.1)
       diff-lcs (>= 1.2.0, < 2.0)
       rspec-support (~> 3.10.0)
-    rspec-mocks (3.10.0)
+    rspec-mocks (3.10.2)
       diff-lcs (>= 1.2.0, < 2.0)
       rspec-support (~> 3.10.0)
-    rspec-support (3.10.0)
+    rspec-support (3.10.2)
     thread_safe (0.3.6)
     timecop (0.9.2)
     tzinfo (1.2.8)

data/README.md CHANGED Viewed

@@ -83,6 +83,30 @@ Now run your consumer with `bundle exec racecar TapDanceConsumer`.
 Note: if you're not using Rails, you'll have to add the file yourself. No-one will judge you for copy-pasting it.
+#### Running consumers in parallel (experimental)
+Warning - limited battle testing in production environments; use at your own risk!
+If you want to process different partitions in parallel, and don't want to deploy a number of instances matching the total partitions of the topic, you can specify the number of workers to spin up - that number of processes will be forked, and each will register its own consumer in the group. Some things to note:
+- This would make no difference on a single partitioned topic - only one consumer would ever be assigned a partition. A couple of example configurations to process all partitions in parallel (we'll assume a 15 partition topic):
+  - Parallel workers set to 3, 5 separate instances / replicas running in your container orchestrator
+  - Parallel workers set to 5, 3 separate instances / replicas running in your container orchestrator
+- Since we're forking new processes, the memory demands are a little higher
+  - From some initial testing, running 5 parallel workers requires no more than double the memory of running a Racecar consumer without parallelism.
+The number of parallel workers is configured per consumer class; you may only want to take advantage of this for busier consumers:
+```ruby
+class ParallelProcessingConsumer < Racecar::Consumer
+  subscribes_to "some-topic"
+  self.parallel_workers = 5
+  def process(message)
+    ...
+  end
+end
+```
 #### Initializing consumers
 You can optionally add an `initialize` method if you need to do any set-up work before processing messages, e.g.
@@ -266,6 +290,8 @@ All timeouts are defined in number of seconds.
 * `pause_with_exponential_backoff` – Set to `true` if you want to double the `pause_timeout` on each consecutive failure of a particular partition.
 * `socket_timeout` – How long to wait when trying to communicate with a Kafka broker. Default is 30 seconds.
 * `max_wait_time` – How long to allow the Kafka brokers to wait before returning messages. A higher number means larger batches, at the cost of higher latency. Default is 1 second.
+* `message_timeout` – How long to try to deliver a produced message before finally giving up. Default is 5 minutes. Transient errors are automatically retried. If a message delivery fails, the current read message batch is retried.
+* `statistics_interval` – How frequently librdkafka should publish statistics about its consumers and producers; you must also add a `statistics_callback` method to your processor, otherwise the stats are disabled. The default is 1 second, however this can be quite memory hungry, so you may want to tune this and monitor.
 #### Memory & network usage
@@ -319,6 +345,8 @@ Racecar supports [Datadog](https://www.datadoghq.com/) monitoring integration. I
 * `datadog_namespace` – The namespace to use for Datadog metrics.
 * `datadog_tags` – Tags that should always be set on Datadog metrics.
+Furthermore, there's a [standard Datadog dashboard configuration file](https://raw.githubusercontent.com/zendesk/racecar/master/extra/datadog-dashboard.json) that you can import to get started with a Racecar dashboard for all of your consumers.
 #### Consumers Without Rails ####
 By default, if Rails is detected, it will be automatically started when the consumer is started. There are cases where you might not want or need Rails. You can pass the `--without-rails` option when starting the consumer and Rails won't be started.
@@ -492,6 +520,16 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
 The integration tests run against a Kafka instance that is not automatically started from within `rspec`. You can set one up using the provided `docker-compose.yml` by running `docker-compose up`.
+### Running RSpec within Docker
+There can be behavioural inconsistencies between running the specs on your machine, and in the CI pipeline. Due to this, there is now a Dockerfile included in the project, which is based on the CircleCI ruby 2.7.2 image. This could easily be extended with more Dockerfiles to cover different Ruby versions if desired. In order to run the specs via Docker:
+- Uncomment the `tests` service from the docker-compose.yml
+- Bring up the stack with `docker-compose up -d`
+- Execute the entire suite with `docker-compose run --rm tests rspec`
+- Execute a single spec or directory with `docker-compose run --rm tests rspec spec/integration/consumer_spec.rb`
+Please note - your code directory is mounted as a volume, so you can make code changes without needing to rebuild.
 ## Contributing

data/docker-compose.yml CHANGED Viewed

@@ -1,19 +1,19 @@
-version: '2'
+version: '2.1'
 services:
   zookeeper:
     image: confluentinc/cp-zookeeper:5.5.1
-    hostname: zookeeper
-    container_name: zookeeper
     ports:
       - "2181:2181"
     environment:
       ZOOKEEPER_CLIENT_PORT: 2181
       ZOOKEEPER_TICK_TIME: 2000
+      KAFKA_OPTS: "-Dzookeeper.4lw.commands.whitelist=*"
+    healthcheck:
+      test: echo ruok | nc 127.0.0.1 2181 | grep imok
   broker:
     image: confluentinc/cp-kafka:5.5.1
-    hostname: broker
-    container_name: broker
     depends_on:
       - zookeeper
     ports:
@@ -30,3 +30,36 @@ services:
       KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1
       KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0
       KAFKA_JMX_PORT: 9101
+      KAFKA_DELETE_TOPIC_ENABLE: 'true'
+    healthcheck:
+      test: nc -z 127.0.0.1 9092
+  wait-for-healthy-services:
+    image: alpine
+    depends_on:
+      broker:
+        condition: service_healthy
+      zookeeper:
+        condition: service_healthy
+  # If you want to run the tests locally with Docker, uncomment the tests service.
+  # The behaviour, especially of the integration tests, can differ somewhat compared
+  # to running it on your machine.
+  # tests:
+  #   build:
+  #     context: .
+  #   depends_on:
+  #     wait-for-healthy-services:
+  #       condition: service_started
+  #   environment:
+  #     RACECAR_BROKERS: broker:29092
+  #     DOCKER_SUDO: 'true'
+  #   # When bringing up the stack, we just let the container exit. For running the
+  #   # specs, we'll use commands like `docker-compose run tests rspec`
+  #   command: ["echo", "ready"]
+  #   volumes:
+  #     # The line below allows us to run docker commands from the container itself
+  #     - "/var/run/docker.sock:/var/run/docker.sock"
+  #     - .:/app

data/extra/datadog-dashboard.json ADDED Viewed

@@ -0,0 +1 @@

+ {"title":"Racecar consumer groups","description":"Dashboard for monitoring [Racecar](https://github.com/zendesk/racecar) Kafka consumer groups.","widgets":[{"id":4916208698459109,"definition":{"title":"Single-message processing","reflow_type":"fixed","type":"group","layout_type":"ordered","widgets":[{"id":82605028,"definition":{"title":"95th percentile message processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.process_message.latency.95percentile{$group_id,$client,$topic,$partition,$env} by {topic,group_id}","style":{"palette":"dog_classic","line_type":"solid","line_width":"thin"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":2857871641649870,"definition":{"title":"Max message processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.process_message.latency.max{$group_id,$client,$topic,$partition,$env} by {topic,group_id}","style":{"palette":"dog_classic","line_type":"solid","line_width":"thin"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":88579656,"definition":{"title":"Median message processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.process_message.latency.median{$group_id,$client,$topic,$partition,$env} by {topic}","style":{"palette":"dog_classic","line_type":"solid","line_width":"thin"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":8,"y":0,"width":4,"height":2}}]}},{"id":4068194420543030,"definition":{"title":"Batch processing","reflow_type":"fixed","type":"group","layout_type":"ordered","widgets":[{"id":341686567,"definition":{"title":"95th 
percentile batch processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.process_batch.latency.95percentile{$group_id,$client,$topic,$partition,$env} by {topic,group_id}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":341687897,"definition":{"title":"Median batch processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.process_batch.latency.median{$group_id,$client,$topic,$partition,$env} by {topic}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":5352911818003929,"definition":{"title":"Max batch processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"max:racecar.consumer.process_batch.latency.max{$group_id,$client,$topic,$partition,$env} by {topic}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":1654098217056312,"definition":{"title":"Max message batch size","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"max:racecar.consumer.batch_size.max{$group_id,$client,$topic,$partition,$env} by {topic}","style":{"palette":"purple","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":2,"width":4,"height":2}},{"id":7718619791149134,"definition":{"title":"Average per-message latency in batch 
processing mode","show_legend":false,"legend_size":"0","legend_layout":"vertical","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"q":"max:racecar.consumer.process_batch.latency.avg{$group_id,$client,$topic,$partition,$env}/max:racecar.consumer.batch_size.avg{$group_id,$client,$topic,$partition,$env}","metadata":[{"expression":"max:racecar.consumer.process_batch.latency.avg{$env,$pod,$group_id,$client,$topic,$partition}/max:racecar.consumer.batch_size.avg{$env,$pod,$group_id,$client,$topic,$partition}","alias_name":"ms"}],"style":{"palette":"purple","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":4,"y":2,"width":4,"height":2}}]}},{"id":7110612496425151,"definition":{"title":"Throughput & Lag","reflow_type":"fixed","type":"group","layout_type":"ordered","widgets":[{"id":301212748,"definition":{"title":"Message lag changes","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"derivative(max:racecar.consumer.offset{$group_id,$client,$topic,$partition,$env} by {topic,partition,pod})","style":{"palette":"dog_classic","line_type":"solid","line_width":"thin"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":82604183,"definition":{"title":"Processing throughput by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.messages{$group_id,$client,$topic,$partition,$env} by {topic,group_id}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":5547724125706857,"definition":{"title":"Processing throughput by 
group","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.messages{$group_id,$client,$topic,$partition,$env} by {group_id}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":375397853,"definition":{"title":"Processing throughput by host","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.messages{$group_id,$client,$topic,$partition,$env} by {group_id,host}.as_rate()","style":{"palette":"dog_classic","line_type":"solid","line_width":"thin"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":2,"width":4,"height":2}},{"id":7820607170949322,"definition":{"title":"Messages consumed in timeframe","type":"query_value","requests":[{"q":"sum:racecar.consumer.messages{$group_id,$client,$topic,$partition,$env}.as_count()","aggregator":"sum"}],"autoscale":true,"precision":0},"layout":{"x":4,"y":2,"width":4,"height":2}},{"id":1428183857213882,"definition":{"title":"Time lag (end-to-end latency)","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.time_lag{$group_id,$client,$topic,$partition,$env} by {group_id,pod}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":8,"y":2,"width":4,"height":2}}]}},{"id":1487807434456879,"definition":{"title":"Processing Errors & Group Stability","reflow_type":"fixed","type":"group","layout_type":"ordered","widgets":[{"id":82605029,"definition":{"title":"Processing 
errors","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.process_batch.errors{$group_id,$client,$topic,$partition,$env} by {topic,pod,group_id,partition}.as_count()+sum:racecar.consumer.process_message.errors{$group_id,$client,$topic,$partition,$env} by {topic,pod,group_id,partition}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":83104736,"definition":{"title":"Processing error rate by topic (%)","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"(sum:racecar.consumer.process_message.errors{$group_id,$client,$topic,$partition,$env} by {topic}.as_count()/(sum:racecar.consumer.process_message.errors{$group_id,$client,$topic,$partition,$env} by {topic}.as_count()+sum:racecar.consumer.messages{$group_id,$client,$topic,$partition,$env} by {topic}.as_count()))*100","style":{"palette":"orange","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":6572534533091871,"definition":{"title":"Processing errors in timeframe","type":"query_value","requests":[{"q":"sum:racecar.consumer.process_batch.errors{$topic,$client,$group_id,$env}.as_count()+sum:racecar.consumer.process_message.errors{$topic,$client,$group_id,$env}.as_count()","aggregator":"sum","conditional_formats":[{"comparator":">","palette":"white_on_red","value":0},{"comparator":"<=","palette":"white_on_green","value":0}]}],"autoscale":true,"precision":0},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":302705923,"definition":{"title":"Pause 
duration","show_legend":false,"legend_size":"0","legend_layout":"vertical","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"q":"avg:racecar.consumer.pause.duration{$client,$group_id,$topic,$env} by {pod,group_id,topic,partition}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":0,"y":2,"width":4,"height":2}},{"id":235544854,"definition":{"title":"Group joins","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.join_group.count{$group_id,$client,$env} by {group_id,host}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":4,"y":2,"width":4,"height":2}},{"id":235544862,"definition":{"title":"Group leaves","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.leave_group.count{$group_id,$client,$env} by {group_id,host}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":8,"y":2,"width":4,"height":2}},{"id":235545167,"definition":{"title":"Group syncs","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.sync_group.count{$group_id,$client,$env} by {group_id,host}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":4,"width":4,"height":2}}]}},{"id":8013176155436939,"definition":{"title":"Producer & message 
delivery","reflow_type":"fixed","type":"group","layout_type":"ordered","widgets":[{"id":5948628389625057,"definition":{"title":"Message delivery latency (median)","title_size":"16","title_align":"left","show_legend":false,"type":"timeseries","requests":[{"q":"avg:racecar.producer.deliver.latency.median{$client,$env}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":3158040379950811,"definition":{"title":"Producer buffer size (max)","title_size":"16","title_align":"left","show_legend":false,"type":"timeseries","requests":[{"q":"max:racecar.producer.buffer.size.max{$client,$env} by {topic}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":6916375790222772,"definition":{"title":"Producer buffer size (avg) kp","title_size":"16","title_align":"left","show_legend":false,"legend_layout":"vertical","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"q":"avg:racecar.producer.buffer.size.avg{$client,$env} by {topic,host}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":3160735194874896,"definition":{"title":"Message size (95p)","title_size":"16","title_align":"left","show_legend":false,"legend_layout":"vertical","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"q":"avg:racecar.producer.produce.message_size.95percentile{$env} by 
{topic}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":0,"y":2,"width":4,"height":2}}]}}],"template_variables":[{"name":"env","default":"production","prefix":"env"},{"name":"group_id","default":"*","prefix":"group_id"},{"name":"client","default":"*","prefix":"client"},{"name":"topic","default":"*","prefix":"topic"},{"name":"partition","default":"*","prefix":"partition"}],"layout_type":"ordered","is_read_only":false,"notify_list":[],"reflow_type":"fixed","id":"ywc-z36-g29"}

data/lib/racecar.rb CHANGED Viewed

@@ -7,6 +7,7 @@ require "racecar/null_instrumenter"
 require "racecar/consumer"
 require "racecar/consumer_set"
 require "racecar/runner"
+require "racecar/parallel_runner"
 require "racecar/config"
 require "racecar/version"
 require "ensure_hash_compact"
@@ -51,6 +52,12 @@ module Racecar
   end
   def self.run(processor)
-    Runner.new(processor, config: config, logger: logger, instrumenter: instrumenter).run
+    runner = Runner.new(processor, config: config, logger: logger, instrumenter: instrumenter)
+    if config.parallel_workers && config.parallel_workers > 1
+      ParallelRunner.new(runner: runner, config: config, logger: logger).run
+    else
+      runner.run
+    end
   end
 end

data/lib/racecar/cli.rb CHANGED Viewed

@@ -59,8 +59,8 @@ module Racecar
       end
       processor = consumer_class.new
       Racecar.run(processor)
+      nil
     end
     private

data/lib/racecar/config.rb CHANGED Viewed

@@ -6,6 +6,8 @@ module Racecar
   class Config < KingKonf::Config
     env_prefix :racecar
+    STATISTICS_DISABLED_VALUE = 0
     desc "A list of Kafka brokers in the cluster that you're consuming from"
     list :brokers, default: ["localhost:9092"]
@@ -57,6 +59,9 @@ module Racecar
     desc "How long to allow the Kafka brokers to wait before returning messages (in seconds)"
     float :max_wait_time, default: 1
+    desc "How long to try to deliver a produced message before finally giving up (in seconds)"
+    float :message_timeout, default: 5*60
     desc "Maximum amount of data the broker shall return for a Fetch request"
     integer :max_bytes, default: 10485760
@@ -153,10 +158,24 @@ module Racecar
     desc "Whether to boot Rails when starting the consumer"
     boolean :without_rails, default: false
+    desc "How frequently librdkafka should report statistics to your application (in seconds). A statistics callback
+          must also be provided. This should be defined with a `statistics_callback` method on your processor. Stats
+          are disabled if this value is set to 0, or there is no callback defined. This is set by default to 1 second
+          for backward compatibility, however this can be quite memory intensive"
+    integer :statistics_interval, default: 1
     # The error handler must be set directly on the object.
     attr_reader :error_handler
-    attr_accessor :subscriptions, :logger
+    attr_accessor :subscriptions, :logger, :parallel_workers
+    def statistics_interval_ms
+      if Rdkafka::Config.statistics_callback
+        statistics_interval * 1000
+      else
+        STATISTICS_DISABLED_VALUE
+      end
+    end
     def max_wait_time_ms
       max_wait_time * 1000
@@ -201,6 +220,7 @@ module Racecar
         consumer_class.name.gsub(/[a-z][A-Z]/) { |str| "#{str[0]}-#{str[1]}" }.downcase,
       ].compact.join
+      self.parallel_workers = consumer_class.parallel_workers
       self.subscriptions = consumer_class.subscriptions
       self.max_wait_time = consumer_class.max_wait_time || self.max_wait_time
       self.pidfile ||= "#{group_id}.pid"
@@ -231,6 +251,7 @@ module Racecar
     def rdkafka_security_config
       {
         "security.protocol" => security_protocol,
+        "enable.ssl.certificate.verification" => ssl_verify_hostname,
         "ssl.ca.location" => ssl_ca_location,
         "ssl.crl.location" => ssl_crl_location,
         "ssl.keystore.location" => ssl_keystore_location,

data/lib/racecar/consumer.rb CHANGED Viewed

@@ -1,5 +1,7 @@
 # frozen_string_literal: true
+require "racecar/message_delivery_error"
 module Racecar
   class Consumer
     Subscription = Struct.new(:topic, :start_from_beginning, :max_bytes_per_partition, :additional_config)
@@ -7,7 +9,7 @@ module Racecar
     class << self
       attr_accessor :max_wait_time
       attr_accessor :group_id
-      attr_accessor :producer, :consumer
+      attr_accessor :producer, :consumer, :parallel_workers
       def subscriptions
         @subscriptions ||= []
@@ -25,29 +27,62 @@ module Racecar
       # @param additional_config [Hash] Configuration properties for consumer.
       #   See https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
       # @return [nil]
-      def subscribes_to(*topics, start_from_beginning: true, max_bytes_per_partition: 1048576, additional_config: {})
+      def subscribes_to(
+        *topics,
+        start_from_beginning: true,
+        max_bytes_per_partition: 1048576,
+        additional_config: {}
+      )
         topics.each do |topic|
           subscriptions << Subscription.new(topic, start_from_beginning, max_bytes_per_partition, additional_config)
         end
       end
     end
-    def configure(producer:, consumer:, instrumenter: NullInstrumenter)
+    def configure(producer:, consumer:, instrumenter: NullInstrumenter, config: Racecar.config)
       @producer = producer
+      @delivery_handles = []
       @consumer = consumer
       @instrumenter = instrumenter
+      @config = config
     end
     def teardown; end
-    # Delivers messages that got produced.
+    # Blocks until all messages produced so far have been successfully published. If
+    # message delivery finally fails, a Racecar::MessageDeliveryError is raised. The
+    # delivery failed for the reason in the exception. The error can be broker side
+    # (e.g. downtime, configuration issue) or specific to the message being sent. The
+    # caller must handle the latter cases or run into head of line blocking.
     def deliver!
       @delivery_handles ||= []
       if @delivery_handles.any?
         instrumentation_payload = { delivered_message_count: @delivery_handles.size }
         @instrumenter.instrument('deliver_messages', instrumentation_payload) do
-          @delivery_handles.each(&:wait)
+          @delivery_handles.each do |handle|
+            # rdkafka-ruby checks every wait_timeout seconds if the message was
+            # successfully delivered, up to max_wait_timeout seconds before raising
+            # Rdkafka::AbstractHandle::WaitTimeoutError. librdkafka will (re)try to
+            # deliver all messages in the background, until "config.message_timeout"
+            # (message.timeout.ms) is exceeded. Phrased differently, rdkafka-ruby's
+            # WaitTimeoutError is just informative.
+            # The raising can be avoided if max_wait_timeout below is greater than
+            # config.message_timeout, but config is not available here (without
+            # changing the interface).
+            handle.wait(max_wait_timeout: 60, wait_timeout: 0.1)
+          rescue Rdkafka::AbstractHandle::WaitTimeoutError => e
+            partition = MessageDeliveryError.partition_from_delivery_handle(handle)
+            # The logger passed to the Runner is not available here, so log
+            # through the logger of the config handed to #configure instead.
+            @config.logger.debug "Still trying to deliver message to (partition #{partition})... (will try up to Racecar.config.message_timeout)"
+            retry
+          rescue Rdkafka::RdkafkaError => e
+            raise MessageDeliveryError.new(e, handle)
+          end
         end
       end
       @delivery_handles.clear

data/lib/racecar/consumer_set.rb CHANGED Viewed

@@ -224,7 +224,7 @@ module Racecar
         "queued.min.messages"     => @config.min_message_queue_size,
         "session.timeout.ms"      => @config.session_timeout * 1000,
         "socket.timeout.ms"       => @config.socket_timeout * 1000,
-        "statistics.interval.ms"  => 1000, # 1s is the highest granularity offered
+        "statistics.interval.ms"  => @config.statistics_interval_ms
       }
       config.merge! @config.rdkafka_consumer
       config.merge! subscription.additional_config

data/lib/racecar/ctl.rb CHANGED Viewed

@@ -3,6 +3,7 @@
 require "optparse"
 require "racecar/rails_config_file_loader"
 require "racecar/daemon"
+require "racecar/message_delivery_error"
 module Racecar
   class Ctl
@@ -96,12 +97,17 @@ module Racecar
       Racecar.config.validate!
       producer = Rdkafka::Config.new({
-        "bootstrap.servers": Racecar.config.brokers.join(","),
-        "client.id":         Racecar.config.client_id,
+        "bootstrap.servers":  Racecar.config.brokers.join(","),
+        "client.id":          Racecar.config.client_id,
+        "message.timeout.ms": Racecar.config.message_timeout * 1000,
       }.merge(Racecar.config.rdkafka_producer)).producer
       handle = producer.produce(payload: message.value, key: message.key, topic: message.topic)
-      handle.wait(max_wait_timeout: 5)
+      begin
+        handle.wait(max_wait_timeout: Racecar.config.message_timeout)
+      rescue Rdkafka::RdkafkaError => e
+        raise MessageDeliveryError.new(e, handle)
+      end
       $stderr.puts "=> Delivered message to Kafka cluster"
     end

data/lib/racecar/datadog.rb CHANGED Viewed

@@ -75,8 +75,8 @@ module Racecar
       private
       %w[increment histogram count timing gauge].each do |type|
-        define_method(type) do |*args|
-          emit(type, *args)
+        define_method(type) do |*args, **kwargs|
+          emit(type, *args, **kwargs)
         end
       end

data/lib/racecar/message_delivery_error.rb ADDED Viewed

@@ -0,0 +1,112 @@
+# frozen_string_literal: true
+module Racecar
+  # MessageDeliveryError wraps an Rdkafka error and tries to give
+  # specific hints on how to debug or resolve the error within the
+  # Racecar context.
+  class MessageDeliveryError < StandardError
+    # partition_from_delivery_handle takes an rdkafka delivery handle
+    # and returns a human readable version of the partition. It handles
+    # the case where the partition is unknown.
+    def self.partition_from_delivery_handle(delivery_handle)
+      partition = delivery_handle&.create_result&.partition
+      # -1 is rdkafka-ruby's default value, which gets eventually set by librdkafka
+      return "no yet known" if partition.nil? || partition == -1
+      partition.to_s
+    end
+    def initialize(rdkafka_error, delivery_handle)
+      raise rdkafka_error unless rdkafka_error.is_a?(Rdkafka::RdkafkaError)
+      @rdkafka_error = rdkafka_error
+      @delivery_handle = delivery_handle
+    end
+    attr_reader :rdkafka_error
+    def code
+      @rdkafka_error.code
+    end
+    def to_s
+      msg = <<~EOM
+        Message delivery finally failed:
+        #{@rdkafka_error.to_s}
+        #{explain}
+      EOM
+    end
+    private
+    def explain
+      case @rdkafka_error.code
+      when :msg_timed_out # -192
+        <<~EOM
+          Could not deliver message within Racecar.config.message_timeout.
+          This can happen for various reasons, but most commonly because the connection to the broker is interrupted or there is no leader available. Check the broker's logs or the network for more insight.
+          Upstream documentation:
+          https://github.com/edenhill/librdkafka/blob/master/INTRODUCTION.md#error-local-time-out
+        EOM
+      when :msg_size_too_large # 10
+        <<~EOM
+          Could not deliver message, since it is bigger than either the broker's or Racecar's maximum message size.
+          The broker's config option on the topic is called "max.message.bytes" and the broker wide default is "message.max.bytes". The client's is "message.max.bytes". Take extra care to distinguish this from similarly named properties for receiving/consuming messages (i.e. Racecar.config.max_bytes is NOT related).
+          Racecar's limit is currently not configurable and uses librdkafka's default of 1 MB (10³ bytes). As of writing, librdkafka will send at least one message regardless of this limit. It is therefore very likely you're hitting the broker's limit and not Racecar's/librdkafka's.
+          Upstream documentation:
+          broker per topic: https://docs.confluent.io/platform/current/installation/configuration/topic-configs.html#topicconfigs_max.message.bytes
+          broker default:   https://docs.confluent.io/platform/current/installation/configuration/broker-configs.html#brokerconfigs_message.max.bytes
+          client:           https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
+        EOM
+      when :unknown_topic_or_part # 3
+        partition = self.class.partition_from_delivery_handle(@delivery_handle)
+        <<~EOM
+          Could not deliver message, since the targeted topic or partition (#{partition}) does not exist.
+          Check that there are no typos, or that the broker's "auto.create.topics.enable" is enabled. For freshly created topics with auto create enabled, this may appear in the beginning (race condition on creation and publishing).
+          Upstream documentation:
+          broker setting: https://docs.confluent.io/platform/current/installation/configuration/broker-configs.html#brokerconfigs_auto.create.topics.enable
+          client:         https://github.com/edenhill/librdkafka/blob/master/INTRODUCTION.md#topic-metadata-propagation-for-newly-created-topics
+                          https://github.com/edenhill/librdkafka/blob/master/INTRODUCTION.md#topic-auto-creation
+        EOM
+      when :record_list_too_large # 18
+        <<~EOM
+          Tried to deliver more messages in a batch than the broker's segment size.
+          Either increase the broker's "log.segment.bytes", or decrease any of the client's related settings "batch.num.messages", "batch.size" or "message.max.bytes". None of these are configurable through Racecar yet, as the defaults should be sufficient and sane.
+          Upstream documentation:
+          broker: https://docs.confluent.io/platform/current/installation/configuration/broker-configs.html#brokerconfigs_log.segment.bytes
+          client: https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
+        EOM
+      when :topic_authorization_failed # 29
+        <<~EOM
+          Failed to deliver message because of insufficient authorization to write into the topic.
+          Double check that it is not a race condition on topic creation. If it isn't, verify the ACLs are correct.
+          Upstream documentation:
+          https://github.com/edenhill/librdkafka/blob/master/INTRODUCTION.md#unknown-or-unauthorized-topics
+        EOM
+      else
+        <<~EOM
+          No specific information is available for this error. Consider adding it to Racecar. You can find generally helpful information in the upstream documentation:
+          https://github.com/edenhill/librdkafka/blob/master/INTRODUCTION.md
+          https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
+        EOM
+      end
+    end
+  end
+end

data/lib/racecar/parallel_runner.rb ADDED Viewed

@@ -0,0 +1,106 @@
+# frozen_string_literal: true
+module Racecar
+  class ParallelRunner
+    Worker = Struct.new(:pid, :parent_reader)
+    SHUTDOWN_SIGNALS = ["INT", "QUIT", "TERM"]
+    def initialize(runner:, config:, logger:)
+      @runner = runner
+      @config = config
+      @logger = logger
+    end
+    def worker_pids
+      workers.map(&:pid)
+    end
+    def running?
+      @running
+    end
+    def run
+      logger.info "=> Running with #{config.parallel_workers} parallel workers"
+      self.workers = config.parallel_workers.times.map do
+        run_worker.tap { |w| logger.info "=> Forked new Racecar consumer with process id #{w.pid}" }
+      end
+      # Print the consumer config to STDERR on USR1.
+      trap("USR1") { $stderr.puts config.inspect }
+      SHUTDOWN_SIGNALS.each { |signal| trap(signal) { terminate_workers } }
+      @running = true
+      wait_for_exit
+    end
+    private
+    attr_accessor :workers
+    attr_reader :runner, :config, :logger
+    def run_worker
+      parent_reader, child_writer = IO.pipe
+      pid = fork do
+        begin
+          parent_reader.close
+          runner.run
+        rescue Exception => e
+          # Allow the parent process to re-raise the exception after shutdown
+          child_writer.binmode
+          child_writer.write(Marshal.dump(e))
+        ensure
+          child_writer.close
+        end
+      end
+      child_writer.close
+      Worker.new(pid, parent_reader)
+    end
+    def terminate_workers
+      return if @terminating
+      @terminating = true
+      $stderr.puts "=> Terminating workers"
+      Process.kill("TERM", *workers.map(&:pid))
+    end
+    def wait_for_exit
+      # The call to IO.select blocks until one or more of our readers are ready for reading,
+      # which could be for one of two reasons:
+      #
+      # - An exception is raised in the child process, in which case we should initiate
+      #   a shutdown;
+      #
+      # - A graceful shutdown was already initiated, and the pipe writer has been closed, in
+      #   which case there is nothing more to do.
+      #
+      # - One of the child processes was killed somehow. If this turns out to be too strict
+      #   (i.e. closing down all the workers, we could revisit and look at restarting dead
+      #   workers.
+      #
+      ready_readers = IO.select(workers.map(&:parent_reader)).first
+      first_read = ready_readers.first.read
+      terminate_workers
+      workers.map(&:pid).each do |pid|
+        logger.debug "=> Waiting for worker with pid #{pid} to exit"
+        Process.waitpid(pid)
+        logger.debug "=> Worker with pid #{pid} shutdown"
+      end
+      exception_found = !first_read.empty?
+      raise Marshal.load(first_read) if exception_found
+    end
+  end
+end

data/lib/racecar/runner.rb CHANGED Viewed

@@ -3,6 +3,7 @@
 require "rdkafka"
 require "racecar/pause"
 require "racecar/message"
+require "racecar/message_delivery_error"
 module Racecar
   class Runner
@@ -53,6 +54,7 @@ module Racecar
         producer:     producer,
         consumer:     consumer,
         instrumenter: @instrumenter,
+        config:       @config,
       )
       instrumentation_payload = {
@@ -79,12 +81,17 @@ module Racecar
       end
       logger.info "Gracefully shutting down"
-      processor.deliver!
-      processor.teardown
-      consumer.commit
-      @instrumenter.instrument('leave_group') do
-        consumer.close
+      begin
+        processor.deliver!
+        processor.teardown
+        consumer.commit
+      ensure
+        @instrumenter.instrument('leave_group') do
+          consumer.close
+        end
       end
+    ensure
+      producer.close
     end
     def stop
@@ -98,10 +105,20 @@ module Racecar
     def process_method
       @process_method ||= begin
         case
-        when processor.respond_to?(:process_batch) then :batch
-        when processor.respond_to?(:process) then :single
+        when processor.respond_to?(:process_batch)
+          if processor.method(:process_batch).arity != 1
+            raise Racecar::Error, "Invalid method signature for `process_batch`. The method must take exactly 1 argument."
+          end
+          :batch
+        when processor.respond_to?(:process)
+          if processor.method(:process).arity != 1
+            raise Racecar::Error, "Invalid method signature for `process`. The method must take exactly 1 argument."
+          end
+          :single
         else
-          raise NotImplementedError, "Consumer class must implement process or process_batch method"
+          raise NotImplementedError, "Consumer class `#{processor.class}` must implement a `process` or `process_batch` method"
         end
       end
     end
@@ -128,7 +145,8 @@ module Racecar
       producer_config = {
         "bootstrap.servers"      => config.brokers.join(","),
         "client.id"              => config.client_id,
-        "statistics.interval.ms" => 1000,
+        "statistics.interval.ms" => config.statistics_interval_ms,
+        "message.timeout.ms"     => config.message_timeout * 1000,
       }
       producer_config["compression.codec"] = config.producer_compression_codec.to_s unless config.producer_compression_codec.nil?
       producer_config.merge!(config.rdkafka_producer)
@@ -176,6 +194,7 @@ module Racecar
             consumer.store_offset(message)
           end
         rescue => e
+          instrumentation_payload[:unrecoverable_delivery_error] = reset_producer_on_unrecoverable_delivery_errors(e)
           instrumentation_payload[:retries_count] = pause.pauses_count
           config.error_handler.call(e, instrumentation_payload)
           raise e
@@ -206,6 +225,7 @@ module Racecar
             processor.deliver!
             consumer.store_offset(messages.last)
           rescue => e
+            instrumentation_payload[:unrecoverable_delivery_error] = reset_producer_on_unrecoverable_delivery_errors(e)
             instrumentation_payload[:retries_count] = pause.pauses_count
             config.error_handler.call(e, instrumentation_payload)
             raise e
@@ -214,6 +234,28 @@ module Racecar
       end
     end
+    # librdkafka will continue to try to deliver already queued messages, even if ruby-rdkafka
+    # raised before that. This method detects any unrecoverable errors and resets the producer
+    # as a last ditch effort.
+    # The function returns true if there were unrecoverable errors, or false otherwise.
+    def reset_producer_on_unrecoverable_delivery_errors(error)
+      return false unless error.is_a?(Racecar::MessageDeliveryError)
+      return false unless error.code == :msg_timed_out # -192
+      logger.error error.to_s
+      logger.error "Racecar will reset the producer to force a new broker connection."
+      @producer.close
+      @producer = nil
+      processor.configure(
+        producer:     producer,
+        consumer:     consumer,
+        instrumenter: @instrumenter,
+        config:       @config,
+      )
+      true
+    end
     def with_pause(topic, partition, offsets)
       pause = pauses[topic][partition]
       return yield pause if config.pause_timeout == 0

data/lib/racecar/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Racecar
-  VERSION = "2.2.0"
+  VERSION = "2.3.0.alpha1"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: racecar
 version: !ruby/object:Gem::Version
-  version: 2.2.0
+  version: 2.3.0.alpha1
 platform: ruby
 authors:
 - Daniel Schierbeck
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2021-01-29 00:00:00.000000000 Z
+date: 2021-03-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: king_konf
@@ -170,6 +170,7 @@ files:
 - ".gitignore"
 - ".rspec"
 - CHANGELOG.md
+- Dockerfile
 - Gemfile
 - Gemfile.lock
 - LICENSE.txt
@@ -184,6 +185,7 @@ files:
 - examples/producing_consumer.rb
 - exe/racecar
 - exe/racecarctl
+- extra/datadog-dashboard.json
 - lib/ensure_hash_compact.rb
 - lib/generators/racecar/consumer_generator.rb
 - lib/generators/racecar/install_generator.rb
@@ -199,7 +201,9 @@ files:
 - lib/racecar/datadog.rb
 - lib/racecar/instrumenter.rb
 - lib/racecar/message.rb
+- lib/racecar/message_delivery_error.rb
 - lib/racecar/null_instrumenter.rb
+- lib/racecar/parallel_runner.rb
 - lib/racecar/pause.rb
 - lib/racecar/rails_config_file_loader.rb
 - lib/racecar/runner.rb
@@ -220,9 +224,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ">="
+  - - ">"
     - !ruby/object:Gem::Version
-      version: '0'
+      version: 1.3.1
 requirements: []
 rubygems_version: 3.1.2
 signing_key: