racecar 2.2.0 → 2.3.0.alpha1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ci.yml +9 -29
- data/CHANGELOG.md +8 -0
- data/Dockerfile +9 -0
- data/Gemfile.lock +6 -6
- data/README.md +38 -0
- data/docker-compose.yml +38 -5
- data/extra/datadog-dashboard.json +1 -0
- data/lib/racecar.rb +8 -1
- data/lib/racecar/cli.rb +1 -1
- data/lib/racecar/config.rb +22 -1
- data/lib/racecar/consumer.rb +40 -5
- data/lib/racecar/consumer_set.rb +1 -1
- data/lib/racecar/ctl.rb +9 -3
- data/lib/racecar/datadog.rb +2 -2
- data/lib/racecar/message_delivery_error.rb +112 -0
- data/lib/racecar/parallel_runner.rb +106 -0
- data/lib/racecar/runner.rb +51 -9
- data/lib/racecar/version.rb +1 -1
- metadata +8 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 539a8002f306c561358adb3f257d721dbe3764766ff7b14a52b4d7a7d2de4a22
|
4
|
+
data.tar.gz: ca482343833ac570fbadcc652cf3cd4e3691cba116093379c333c274137323c0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4ff0d1b0115b9aee1268e57613f348ebf654698d0aba021ecf0ac6b9d418ea9a91c23f14aa550b76e0cffe62673cd95444ca871c4dc2d9175d1bbf2970d2be74
|
7
|
+
data.tar.gz: eafa3506a43d5d79a12adcea2f56ecae66068a7ff077d7beba63630cfe6d94a84888b05e2de2da40fc53daa63fa8d0ee6593b97db3eb26ec119705dbb39f6153
|
data/.github/workflows/ci.yml
CHANGED
@@ -1,9 +1,10 @@
|
|
1
1
|
name: CI
|
2
2
|
|
3
3
|
on:
|
4
|
+
pull_request:
|
5
|
+
branches: ["master"]
|
4
6
|
push:
|
5
|
-
branches:
|
6
|
-
- '**'
|
7
|
+
branches: ["master"]
|
7
8
|
|
8
9
|
jobs:
|
9
10
|
unit-specs:
|
@@ -11,12 +12,12 @@ jobs:
|
|
11
12
|
|
12
13
|
strategy:
|
13
14
|
matrix:
|
14
|
-
ruby-version: ["2.5", "2.6"]
|
15
|
+
ruby-version: ["2.5", "2.6", "3.0"]
|
15
16
|
|
16
17
|
steps:
|
17
18
|
- uses: zendesk/checkout@v2
|
18
19
|
- name: Set up Ruby
|
19
|
-
uses: zendesk/setup-ruby@v1.
|
20
|
+
uses: zendesk/setup-ruby@v1.64.1
|
20
21
|
with:
|
21
22
|
ruby-version: ${{ matrix.ruby-version }}
|
22
23
|
bundler-cache: true
|
@@ -25,37 +26,16 @@ jobs:
|
|
25
26
|
|
26
27
|
integration-specs:
|
27
28
|
runs-on: ubuntu-latest
|
28
|
-
|
29
|
-
services:
|
30
|
-
zookeeper:
|
31
|
-
image: confluentinc/cp-zookeeper
|
32
|
-
ports:
|
33
|
-
- 2181:2181
|
34
|
-
env:
|
35
|
-
ZOOKEEPER_CLIENT_PORT: 2181
|
36
|
-
|
37
|
-
kafka:
|
38
|
-
image: confluentinc/cp-kafka
|
39
|
-
ports:
|
40
|
-
- 9092:9092
|
41
|
-
- 29092:29092
|
42
|
-
options: --health-cmd "kafka-topics --list --bootstrap-server=localhost:9092" --health-interval 10s --health-timeout 5s --health-retries 5
|
43
|
-
env:
|
44
|
-
KAFKA_BROKER_ID: 1
|
45
|
-
KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
|
46
|
-
KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092
|
47
|
-
KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
|
48
|
-
KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT
|
49
|
-
KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
|
50
|
-
|
51
29
|
steps:
|
52
30
|
- uses: zendesk/checkout@v2
|
53
31
|
- name: Set up Ruby
|
54
|
-
uses: zendesk/setup-ruby@v1.
|
32
|
+
uses: zendesk/setup-ruby@v1.64.1
|
55
33
|
with:
|
56
34
|
ruby-version: 2.7
|
57
35
|
bundler-cache: true
|
36
|
+
- name: Bring up docker-compose stack
|
37
|
+
run: docker-compose up -d
|
58
38
|
- name: Build and test with RSpec
|
59
39
|
env:
|
60
40
|
RACECAR_BROKERS: localhost:9092
|
61
|
-
run: bundle exec rspec --format documentation --require spec_helper --color spec/integration/*_spec.rb
|
41
|
+
run: timeout --kill-after 180 150 bundle exec rspec --format documentation --require spec_helper --color spec/integration/*_spec.rb
|
data/CHANGELOG.md
CHANGED
@@ -2,6 +2,14 @@
|
|
2
2
|
|
3
3
|
## Unreleased
|
4
4
|
|
5
|
+
* [Racecar::Consumer] When messages fail to deliver, an extended error with hints is now raised. Instead of `Rdkafka::RdkafkaError` you'll get a `Racecar::MessageDeliveryError`. ([#219](https://github.com/zendesk/racecar/pull/219)). If you have set a `Racecar.config.error_handler`, it might need to be updated.
|
6
|
+
* [Racecar::Consumer] When message delivery times out, Racecar will reset the producer in an attempt to fix some of the potential causes for this error. ([#219](https://github.com/zendesk/racecar/pull/219))
|
7
|
+
* Validate the `process` and `process_batch` method signature on consumer classes when initializing (#236)
|
8
|
+
* Add Ruby 3.0 compatibility (#237)
|
9
|
+
* Introduce parallel runner, which forks a number of independent consumers, allowing partitions to be processed in parallel. ([#222](https://github.com/zendesk/racecar/pull/222))
|
10
|
+
* [Racecar::Runner] Ensure producer is closed, whether it closes or errors. ([#222](https://github.com/zendesk/racecar/pull/222))
|
11
|
+
* Configure `statistics_interval` directly in the config. Disable statistics when no callback is defined ([#232](https://github.com/zendesk/racecar/pull/232))
|
12
|
+
|
5
13
|
## racecar v2.2.0
|
6
14
|
|
7
15
|
* [Racecar::ConsumerSet] **breaking change** `Racecar::ConsumerSet`'s functions `poll` and `batch_poll` expect the max wait values to be given in milliseconds. The defaults were using `config.max_wait_time`, which is in seconds. If you do not directly use `Racecar::ConsumerSet`, or always call its `poll` and `batch_poll` functions by specifying the max wait time (the first argument), then this breaking change does not affect you. ([#214](https://github.com/zendesk/racecar/pull/214))
|
data/Dockerfile
ADDED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
racecar (2.
|
4
|
+
racecar (2.2.0)
|
5
5
|
king_konf (~> 1.0.0)
|
6
6
|
rdkafka (~> 0.8.0)
|
7
7
|
|
@@ -18,7 +18,7 @@ GEM
|
|
18
18
|
concurrent-ruby (1.1.7)
|
19
19
|
diff-lcs (1.4.4)
|
20
20
|
dogstatsd-ruby (4.8.2)
|
21
|
-
ffi (1.
|
21
|
+
ffi (1.14.2)
|
22
22
|
i18n (1.8.5)
|
23
23
|
concurrent-ruby (~> 1.0)
|
24
24
|
king_konf (1.0.0)
|
@@ -37,15 +37,15 @@ GEM
|
|
37
37
|
rspec-core (~> 3.10.0)
|
38
38
|
rspec-expectations (~> 3.10.0)
|
39
39
|
rspec-mocks (~> 3.10.0)
|
40
|
-
rspec-core (3.10.
|
40
|
+
rspec-core (3.10.1)
|
41
41
|
rspec-support (~> 3.10.0)
|
42
|
-
rspec-expectations (3.10.
|
42
|
+
rspec-expectations (3.10.1)
|
43
43
|
diff-lcs (>= 1.2.0, < 2.0)
|
44
44
|
rspec-support (~> 3.10.0)
|
45
|
-
rspec-mocks (3.10.
|
45
|
+
rspec-mocks (3.10.2)
|
46
46
|
diff-lcs (>= 1.2.0, < 2.0)
|
47
47
|
rspec-support (~> 3.10.0)
|
48
|
-
rspec-support (3.10.
|
48
|
+
rspec-support (3.10.2)
|
49
49
|
thread_safe (0.3.6)
|
50
50
|
timecop (0.9.2)
|
51
51
|
tzinfo (1.2.8)
|
data/README.md
CHANGED
@@ -83,6 +83,30 @@ Now run your consumer with `bundle exec racecar TapDanceConsumer`.
|
|
83
83
|
|
84
84
|
Note: if you're not using Rails, you'll have to add the file yourself. No-one will judge you for copy-pasting it.
|
85
85
|
|
86
|
+
#### Running consumers in parallel (experimental)
|
87
|
+
|
88
|
+
Warning - limited battle testing in production environments; use at your own risk!
|
89
|
+
|
90
|
+
If you want to process different partitions in parallel, and don't want to deploy a number of instances matching the total partitions of the topic, you can specify the number of workers to spin up - that number of processes will be forked, and each will register its own consumer in the group. Some things to note:
|
91
|
+
- This would make no difference on a single partitioned topic - only one consumer would ever be assigned a partition. A couple of example configurations to process all partitions in parallel (we'll assume a 15 partition topic):
|
92
|
+
- Parallel workers set to 3, 5 separate instances / replicas running in your container orchestrator
|
93
|
+
- Parallel workers set to 5, 3 separate instances / replicas running in your container orchestrator
|
94
|
+
- Since we're forking new processes, the memory demands are a little higher
|
95
|
+
- From some initial testing, running 5 parallel workers requires no more than double the memory of running a Racecar consumer without parallelism.
|
96
|
+
|
97
|
+
The number of parallel workers is configured per consumer class; you may only want to take advantage of this for busier consumers:
|
98
|
+
```ruby
|
99
|
+
class ParallelProcessingConsumer < Racecar::Consumer
|
100
|
+
subscribes_to "some-topic"
|
101
|
+
|
102
|
+
self.parallel_workers = 5
|
103
|
+
|
104
|
+
def process(message)
|
105
|
+
...
|
106
|
+
end
|
107
|
+
end
|
108
|
+
```
|
109
|
+
|
86
110
|
#### Initializing consumers
|
87
111
|
|
88
112
|
You can optionally add an `initialize` method if you need to do any set-up work before processing messages, e.g.
|
@@ -266,6 +290,8 @@ All timeouts are defined in number of seconds.
|
|
266
290
|
* `pause_with_exponential_backoff` – Set to `true` if you want to double the `pause_timeout` on each consecutive failure of a particular partition.
|
267
291
|
* `socket_timeout` – How long to wait when trying to communicate with a Kafka broker. Default is 30 seconds.
|
268
292
|
* `max_wait_time` – How long to allow the Kafka brokers to wait before returning messages. A higher number means larger batches, at the cost of higher latency. Default is 1 second.
|
293
|
+
* `message_timeout` – How long to try to deliver a produced message before finally giving up. Default is 5 minutes. Transient errors are automatically retried. If a message delivery fails, the current read message batch is retried.
|
294
|
+
* `statistics_interval` – How frequently librdkafka should publish statistics about its consumers and producers; you must also add a `statistics_callback` method to your processor, otherwise the stats are disabled. The default is 1 second, however this can be quite memory hungry, so you may want to tune this and monitor.
|
269
295
|
|
270
296
|
#### Memory & network usage
|
271
297
|
|
@@ -319,6 +345,8 @@ Racecar supports [Datadog](https://www.datadoghq.com/) monitoring integration. I
|
|
319
345
|
* `datadog_namespace` – The namespace to use for Datadog metrics.
|
320
346
|
* `datadog_tags` – Tags that should always be set on Datadog metrics.
|
321
347
|
|
348
|
+
Furthermore, there's a [standard Datadog dashboard configuration file](https://raw.githubusercontent.com/zendesk/racecar/master/extra/datadog-dashboard.json) that you can import to get started with a Racecar dashboard for all of your consumers.
|
349
|
+
|
322
350
|
#### Consumers Without Rails ####
|
323
351
|
|
324
352
|
By default, if Rails is detected, it will be automatically started when the consumer is started. There are cases where you might not want or need Rails. You can pass the `--without-rails` option when starting the consumer and Rails won't be started.
|
@@ -492,6 +520,16 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
|
|
492
520
|
|
493
521
|
The integration tests run against a Kafka instance that is not automatically started from within `rspec`. You can set one up using the provided `docker-compose.yml` by running `docker-compose up`.
|
494
522
|
|
523
|
+
### Running RSpec within Docker
|
524
|
+
|
525
|
+
There can be behavioural inconsistencies between running the specs on your machine, and in the CI pipeline. Due to this, there is now a Dockerfile included in the project, which is based on the CircleCI ruby 2.7.2 image. This could easily be extended with more Dockerfiles to cover different Ruby versions if desired. In order to run the specs via Docker:
|
526
|
+
|
527
|
+
- Uncomment the `tests` service from the docker-compose.yml
|
528
|
+
- Bring up the stack with `docker-compose up -d`
|
529
|
+
- Execute the entire suite with `docker-compose run --rm tests rspec`
|
530
|
+
- Execute a single spec or directory with `docker-compose run --rm tests rspec spec/integration/consumer_spec.rb`
|
531
|
+
|
532
|
+
Please note - your code directory is mounted as a volume, so you can make code changes without needing to rebuild the image.
|
495
533
|
|
496
534
|
## Contributing
|
497
535
|
|
data/docker-compose.yml
CHANGED
@@ -1,19 +1,19 @@
|
|
1
|
-
version: '2'
|
1
|
+
version: '2.1'
|
2
|
+
|
2
3
|
services:
|
3
4
|
zookeeper:
|
4
5
|
image: confluentinc/cp-zookeeper:5.5.1
|
5
|
-
hostname: zookeeper
|
6
|
-
container_name: zookeeper
|
7
6
|
ports:
|
8
7
|
- "2181:2181"
|
9
8
|
environment:
|
10
9
|
ZOOKEEPER_CLIENT_PORT: 2181
|
11
10
|
ZOOKEEPER_TICK_TIME: 2000
|
11
|
+
KAFKA_OPTS: "-Dzookeeper.4lw.commands.whitelist=*"
|
12
|
+
healthcheck:
|
13
|
+
test: echo ruok | nc 127.0.0.1 2181 | grep imok
|
12
14
|
|
13
15
|
broker:
|
14
16
|
image: confluentinc/cp-kafka:5.5.1
|
15
|
-
hostname: broker
|
16
|
-
container_name: broker
|
17
17
|
depends_on:
|
18
18
|
- zookeeper
|
19
19
|
ports:
|
@@ -30,3 +30,36 @@ services:
|
|
30
30
|
KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1
|
31
31
|
KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0
|
32
32
|
KAFKA_JMX_PORT: 9101
|
33
|
+
KAFKA_DELETE_TOPIC_ENABLE: 'true'
|
34
|
+
healthcheck:
|
35
|
+
test: nc -z 127.0.0.1 9092
|
36
|
+
|
37
|
+
wait-for-healthy-services:
|
38
|
+
image: alpine
|
39
|
+
depends_on:
|
40
|
+
broker:
|
41
|
+
condition: service_healthy
|
42
|
+
zookeeper:
|
43
|
+
condition: service_healthy
|
44
|
+
|
45
|
+
|
46
|
+
# If you want to run the tests locally with Docker, uncomment the tests service.
|
47
|
+
# The behaviour, especially of the integration tests, can differ somewhat compared
|
48
|
+
# to running it on your machine.
|
49
|
+
|
50
|
+
# tests:
|
51
|
+
# build:
|
52
|
+
# context: .
|
53
|
+
# depends_on:
|
54
|
+
# wait-for-healthy-services:
|
55
|
+
# condition: service_started
|
56
|
+
# environment:
|
57
|
+
# RACECAR_BROKERS: broker:29092
|
58
|
+
# DOCKER_SUDO: 'true'
|
59
|
+
# # When bringing up the stack, we just let the container exit. For running the
|
60
|
+
# # specs, we'll use commands like `docker-compose run tests rspec`
|
61
|
+
# command: ["echo", "ready"]
|
62
|
+
# volumes:
|
63
|
+
# # The line below allows us to run docker commands from the container itself
|
64
|
+
# - "/var/run/docker.sock:/var/run/docker.sock"
|
65
|
+
# - .:/app
|
@@ -0,0 +1 @@
|
|
1
|
+
{"title":"Racecar consumer groups","description":"Dashboard for monitoring [Racecar](https://github.com/zendesk/racecar) Kafka consumer groups.","widgets":[{"id":4916208698459109,"definition":{"title":"Single-message processing","reflow_type":"fixed","type":"group","layout_type":"ordered","widgets":[{"id":82605028,"definition":{"title":"95th percentile message processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.process_message.latency.95percentile{$group_id,$client,$topic,$partition,$env} by {topic,group_id}","style":{"palette":"dog_classic","line_type":"solid","line_width":"thin"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":2857871641649870,"definition":{"title":"Max message processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.process_message.latency.max{$group_id,$client,$topic,$partition,$env} by {topic,group_id}","style":{"palette":"dog_classic","line_type":"solid","line_width":"thin"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":88579656,"definition":{"title":"Median message processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.process_message.latency.median{$group_id,$client,$topic,$partition,$env} by {topic}","style":{"palette":"dog_classic","line_type":"solid","line_width":"thin"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":8,"y":0,"width":4,"height":2}}]}},{"id":4068194420543030,"definition":{"title":"Batch processing","reflow_type":"fixed","type":"group","layout_type":"ordered","widgets":[{"id":341686567,"definition":{"title":"95th 
percentile batch processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.process_batch.latency.95percentile{$group_id,$client,$topic,$partition,$env} by {topic,group_id}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":341687897,"definition":{"title":"Median batch processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.process_batch.latency.median{$group_id,$client,$topic,$partition,$env} by {topic}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":5352911818003929,"definition":{"title":"Max batch processing latency by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"max:racecar.consumer.process_batch.latency.max{$group_id,$client,$topic,$partition,$env} by {topic}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":1654098217056312,"definition":{"title":"Max message batch size","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"max:racecar.consumer.batch_size.max{$group_id,$client,$topic,$partition,$env} by {topic}","style":{"palette":"purple","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":2,"width":4,"height":2}},{"id":7718619791149134,"definition":{"title":"Average per-message latency in batch 
processing mode","show_legend":false,"legend_size":"0","legend_layout":"vertical","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"q":"max:racecar.consumer.process_batch.latency.avg{$group_id,$client,$topic,$partition,$env}/max:racecar.consumer.batch_size.avg{$group_id,$client,$topic,$partition,$env}","metadata":[{"expression":"max:racecar.consumer.process_batch.latency.avg{$env,$pod,$group_id,$client,$topic,$partition}/max:racecar.consumer.batch_size.avg{$env,$pod,$group_id,$client,$topic,$partition}","alias_name":"ms"}],"style":{"palette":"purple","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":4,"y":2,"width":4,"height":2}}]}},{"id":7110612496425151,"definition":{"title":"Throughput & Lag","reflow_type":"fixed","type":"group","layout_type":"ordered","widgets":[{"id":301212748,"definition":{"title":"Message lag changes","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"derivative(max:racecar.consumer.offset{$group_id,$client,$topic,$partition,$env} by {topic,partition,pod})","style":{"palette":"dog_classic","line_type":"solid","line_width":"thin"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":82604183,"definition":{"title":"Processing throughput by topic","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.messages{$group_id,$client,$topic,$partition,$env} by {topic,group_id}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":5547724125706857,"definition":{"title":"Processing throughput by 
group","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.messages{$group_id,$client,$topic,$partition,$env} by {group_id}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":375397853,"definition":{"title":"Processing throughput by host","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.messages{$group_id,$client,$topic,$partition,$env} by {group_id,host}.as_rate()","style":{"palette":"dog_classic","line_type":"solid","line_width":"thin"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":2,"width":4,"height":2}},{"id":7820607170949322,"definition":{"title":"Messages consumed in timeframe","type":"query_value","requests":[{"q":"sum:racecar.consumer.messages{$group_id,$client,$topic,$partition,$env}.as_count()","aggregator":"sum"}],"autoscale":true,"precision":0},"layout":{"x":4,"y":2,"width":4,"height":2}},{"id":1428183857213882,"definition":{"title":"Time lag (end-to-end latency)","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"avg:racecar.consumer.time_lag{$group_id,$client,$topic,$partition,$env} by {group_id,pod}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":8,"y":2,"width":4,"height":2}}]}},{"id":1487807434456879,"definition":{"title":"Processing Errors & Group Stability","reflow_type":"fixed","type":"group","layout_type":"ordered","widgets":[{"id":82605029,"definition":{"title":"Processing 
errors","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.process_batch.errors{$group_id,$client,$topic,$partition,$env} by {topic,pod,group_id,partition}.as_count()+sum:racecar.consumer.process_message.errors{$group_id,$client,$topic,$partition,$env} by {topic,pod,group_id,partition}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":83104736,"definition":{"title":"Processing error rate by topic (%)","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"(sum:racecar.consumer.process_message.errors{$group_id,$client,$topic,$partition,$env} by {topic}.as_count()/(sum:racecar.consumer.process_message.errors{$group_id,$client,$topic,$partition,$env} by {topic}.as_count()+sum:racecar.consumer.messages{$group_id,$client,$topic,$partition,$env} by {topic}.as_count()))*100","style":{"palette":"orange","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":6572534533091871,"definition":{"title":"Processing errors in timeframe","type":"query_value","requests":[{"q":"sum:racecar.consumer.process_batch.errors{$topic,$client,$group_id,$env}.as_count()+sum:racecar.consumer.process_message.errors{$topic,$client,$group_id,$env}.as_count()","aggregator":"sum","conditional_formats":[{"comparator":">","palette":"white_on_red","value":0},{"comparator":"<=","palette":"white_on_green","value":0}]}],"autoscale":true,"precision":0},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":302705923,"definition":{"title":"Pause 
duration","show_legend":false,"legend_size":"0","legend_layout":"vertical","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"q":"avg:racecar.consumer.pause.duration{$client,$group_id,$topic,$env} by {pod,group_id,topic,partition}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":0,"y":2,"width":4,"height":2}},{"id":235544854,"definition":{"title":"Group joins","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.join_group.count{$group_id,$client,$env} by {group_id,host}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":4,"y":2,"width":4,"height":2}},{"id":235544862,"definition":{"title":"Group leaves","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.leave_group.count{$group_id,$client,$env} by {group_id,host}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":8,"y":2,"width":4,"height":2}},{"id":235545167,"definition":{"title":"Group syncs","show_legend":false,"legend_size":"0","type":"timeseries","requests":[{"q":"sum:racecar.consumer.sync_group.count{$group_id,$client,$env} by {group_id,host}.as_count()","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"}},"layout":{"x":0,"y":4,"width":4,"height":2}}]}},{"id":8013176155436939,"definition":{"title":"Producer & message 
delivery","reflow_type":"fixed","type":"group","layout_type":"ordered","widgets":[{"id":5948628389625057,"definition":{"title":"Message delivery latency (median)","title_size":"16","title_align":"left","show_legend":false,"type":"timeseries","requests":[{"q":"avg:racecar.producer.deliver.latency.median{$client,$env}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":3158040379950811,"definition":{"title":"Producer buffer size (max)","title_size":"16","title_align":"left","show_legend":false,"type":"timeseries","requests":[{"q":"max:racecar.producer.buffer.size.max{$client,$env} by {topic}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":6916375790222772,"definition":{"title":"Producer buffer size (avg) kp","title_size":"16","title_align":"left","show_legend":false,"legend_layout":"vertical","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"q":"avg:racecar.producer.buffer.size.avg{$client,$env} by {topic,host}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":8,"y":0,"width":4,"height":2}},{"id":3160735194874896,"definition":{"title":"Message size (95p)","title_size":"16","title_align":"left","show_legend":false,"legend_layout":"vertical","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"q":"avg:racecar.producer.produce.message_size.95percentile{$env} by 
{topic}","style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","label":"","include_zero":true,"min":"auto","max":"auto"},"markers":[]},"layout":{"x":0,"y":2,"width":4,"height":2}}]}}],"template_variables":[{"name":"env","default":"production","prefix":"env"},{"name":"group_id","default":"*","prefix":"group_id"},{"name":"client","default":"*","prefix":"client"},{"name":"topic","default":"*","prefix":"topic"},{"name":"partition","default":"*","prefix":"partition"}],"layout_type":"ordered","is_read_only":false,"notify_list":[],"reflow_type":"fixed","id":"ywc-z36-g29"}
|
data/lib/racecar.rb
CHANGED
@@ -7,6 +7,7 @@ require "racecar/null_instrumenter"
|
|
7
7
|
require "racecar/consumer"
|
8
8
|
require "racecar/consumer_set"
|
9
9
|
require "racecar/runner"
|
10
|
+
require "racecar/parallel_runner"
|
10
11
|
require "racecar/config"
|
11
12
|
require "racecar/version"
|
12
13
|
require "ensure_hash_compact"
|
@@ -51,6 +52,12 @@ module Racecar
|
|
51
52
|
end
|
52
53
|
|
53
54
|
def self.run(processor)
|
54
|
-
Runner.new(processor, config: config, logger: logger, instrumenter: instrumenter)
|
55
|
+
runner = Runner.new(processor, config: config, logger: logger, instrumenter: instrumenter)
|
56
|
+
|
57
|
+
if config.parallel_workers && config.parallel_workers > 1
|
58
|
+
ParallelRunner.new(runner: runner, config: config, logger: logger).run
|
59
|
+
else
|
60
|
+
runner.run
|
61
|
+
end
|
55
62
|
end
|
56
63
|
end
|
data/lib/racecar/cli.rb
CHANGED
data/lib/racecar/config.rb
CHANGED
@@ -6,6 +6,8 @@ module Racecar
|
|
6
6
|
class Config < KingKonf::Config
|
7
7
|
env_prefix :racecar
|
8
8
|
|
9
|
+
STATISTICS_DISABLED_VALUE = 0
|
10
|
+
|
9
11
|
desc "A list of Kafka brokers in the cluster that you're consuming from"
|
10
12
|
list :brokers, default: ["localhost:9092"]
|
11
13
|
|
@@ -57,6 +59,9 @@ module Racecar
|
|
57
59
|
desc "How long to allow the Kafka brokers to wait before returning messages (in seconds)"
|
58
60
|
float :max_wait_time, default: 1
|
59
61
|
|
62
|
+
desc "How long to try to deliver a produced message before finally giving up (in seconds)"
|
63
|
+
float :message_timeout, default: 5*60
|
64
|
+
|
60
65
|
desc "Maximum amount of data the broker shall return for a Fetch request"
|
61
66
|
integer :max_bytes, default: 10485760
|
62
67
|
|
@@ -153,10 +158,24 @@ module Racecar
|
|
153
158
|
desc "Whether to boot Rails when starting the consumer"
|
154
159
|
boolean :without_rails, default: false
|
155
160
|
|
161
|
+
desc "How frequently librdkafka should report statistics to your application (in seconds). A statistics callback
|
162
|
+
must also be provided. This should be defined with a `statistics_callback` method on your processor. Stats
|
163
|
+
are disabled if this value is set to 0, or there is no callback defined. This is set by default to 1 second
|
164
|
+
for backward compatibility, however this can be quite memory intensive"
|
165
|
+
integer :statistics_interval, default: 1
|
166
|
+
|
156
167
|
# The error handler must be set directly on the object.
|
157
168
|
attr_reader :error_handler
|
158
169
|
|
159
|
-
attr_accessor :subscriptions, :logger
|
170
|
+
attr_accessor :subscriptions, :logger, :parallel_workers
|
171
|
+
|
172
|
+
def statistics_interval_ms
|
173
|
+
if Rdkafka::Config.statistics_callback
|
174
|
+
statistics_interval * 1000
|
175
|
+
else
|
176
|
+
STATISTICS_DISABLED_VALUE
|
177
|
+
end
|
178
|
+
end
|
160
179
|
|
161
180
|
def max_wait_time_ms
|
162
181
|
max_wait_time * 1000
|
@@ -201,6 +220,7 @@ module Racecar
|
|
201
220
|
consumer_class.name.gsub(/[a-z][A-Z]/) { |str| "#{str[0]}-#{str[1]}" }.downcase,
|
202
221
|
].compact.join
|
203
222
|
|
223
|
+
self.parallel_workers = consumer_class.parallel_workers
|
204
224
|
self.subscriptions = consumer_class.subscriptions
|
205
225
|
self.max_wait_time = consumer_class.max_wait_time || self.max_wait_time
|
206
226
|
self.pidfile ||= "#{group_id}.pid"
|
@@ -231,6 +251,7 @@ module Racecar
|
|
231
251
|
def rdkafka_security_config
|
232
252
|
{
|
233
253
|
"security.protocol" => security_protocol,
|
254
|
+
"enable.ssl.certificate.verification" => ssl_verify_hostname,
|
234
255
|
"ssl.ca.location" => ssl_ca_location,
|
235
256
|
"ssl.crl.location" => ssl_crl_location,
|
236
257
|
"ssl.keystore.location" => ssl_keystore_location,
|
data/lib/racecar/consumer.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require "racecar/message_delivery_error"
|
4
|
+
|
3
5
|
module Racecar
|
4
6
|
class Consumer
|
5
7
|
Subscription = Struct.new(:topic, :start_from_beginning, :max_bytes_per_partition, :additional_config)
|
@@ -7,7 +9,7 @@ module Racecar
|
|
7
9
|
class << self
|
8
10
|
attr_accessor :max_wait_time
|
9
11
|
attr_accessor :group_id
|
10
|
-
attr_accessor :producer, :consumer
|
12
|
+
attr_accessor :producer, :consumer, :parallel_workers
|
11
13
|
|
12
14
|
def subscriptions
|
13
15
|
@subscriptions ||= []
|
@@ -25,29 +27,62 @@ module Racecar
|
|
25
27
|
# @param additional_config [Hash] Configuration properties for consumer.
|
26
28
|
# See https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
|
27
29
|
# @return [nil]
|
28
|
-
def subscribes_to(
|
30
|
+
def subscribes_to(
|
31
|
+
*topics,
|
32
|
+
start_from_beginning: true,
|
33
|
+
max_bytes_per_partition: 1048576,
|
34
|
+
additional_config: {}
|
35
|
+
)
|
29
36
|
topics.each do |topic|
|
30
37
|
subscriptions << Subscription.new(topic, start_from_beginning, max_bytes_per_partition, additional_config)
|
31
38
|
end
|
32
39
|
end
|
33
40
|
end
|
34
41
|
|
35
|
-
def configure(producer:, consumer:, instrumenter: NullInstrumenter)
|
42
|
+
def configure(producer:, consumer:, instrumenter: NullInstrumenter, config: Racecar.config)
|
36
43
|
@producer = producer
|
44
|
+
@delivery_handles = []
|
45
|
+
|
37
46
|
@consumer = consumer
|
47
|
+
|
38
48
|
@instrumenter = instrumenter
|
49
|
+
@config = config
|
39
50
|
end
|
40
51
|
|
41
52
|
def teardown; end
|
42
53
|
|
43
|
-
#
|
54
|
+
# Blocks until all messages produced so far have been successfully published. If
|
55
|
+
# message delivery finally fails, a Racecar::MessageDeliveryError is raised. The
|
56
|
+
# delivery failed for the reason in the exception. The error can be broker side
|
57
|
+
# (e.g. downtime, configuration issue) or specific to the message being sent. The
|
58
|
+
# caller must handle the latter cases or run into head of line blocking.
|
44
59
|
def deliver!
|
45
60
|
@delivery_handles ||= []
|
46
61
|
if @delivery_handles.any?
|
47
62
|
instrumentation_payload = { delivered_message_count: @delivery_handles.size }
|
48
63
|
|
49
64
|
@instrumenter.instrument('deliver_messages', instrumentation_payload) do
|
50
|
-
@delivery_handles.each
|
65
|
+
@delivery_handles.each do |handle|
|
66
|
+
# rdkafka-ruby checks every wait_timeout seconds if the message was
|
67
|
+
# successfully delivered, up to max_wait_timeout seconds before raising
|
68
|
+
# Rdkafka::AbstractHandle::WaitTimeoutError. librdkafka will (re)try to
|
69
|
+
# deliver all messages in the background, until "config.message_timeout"
|
70
|
+
# (message.timeout.ms) is exceeded. Phrased differently, rdkafka-ruby's
|
71
|
+
# WaitTimeoutError is just informative.
|
72
|
+
# The raising can be avoided if max_wait_timeout below is greater than
|
73
|
+
# config.message_timeout, but config is not available here (without
|
74
|
+
# changing the interface).
|
75
|
+
handle.wait(max_wait_timeout: 60, wait_timeout: 0.1)
|
76
|
+
rescue Rdkafka::AbstractHandle::WaitTimeoutError => e
|
77
|
+
partition = MessageDeliveryError.partition_from_delivery_handle(handle)
|
78
|
+
# ideally we could use the logger passed to the Runner, but it is not
|
79
|
+
# available here. The config handed to `configure` exposes a logger,
|
80
|
+
# though, so we can use that instead.
|
81
|
+
@config.logger.debug "Still trying to deliver message to (partition #{partition})... (will try up to Racecar.config.message_timeout)"
|
82
|
+
retry
|
83
|
+
rescue Rdkafka::RdkafkaError => e
|
84
|
+
raise MessageDeliveryError.new(e, handle)
|
85
|
+
end
|
51
86
|
end
|
52
87
|
end
|
53
88
|
@delivery_handles.clear
|
data/lib/racecar/consumer_set.rb
CHANGED
@@ -224,7 +224,7 @@ module Racecar
|
|
224
224
|
"queued.min.messages" => @config.min_message_queue_size,
|
225
225
|
"session.timeout.ms" => @config.session_timeout * 1000,
|
226
226
|
"socket.timeout.ms" => @config.socket_timeout * 1000,
|
227
|
-
"statistics.interval.ms" =>
|
227
|
+
"statistics.interval.ms" => @config.statistics_interval_ms
|
228
228
|
}
|
229
229
|
config.merge! @config.rdkafka_consumer
|
230
230
|
config.merge! subscription.additional_config
|
data/lib/racecar/ctl.rb
CHANGED
@@ -3,6 +3,7 @@
|
|
3
3
|
require "optparse"
|
4
4
|
require "racecar/rails_config_file_loader"
|
5
5
|
require "racecar/daemon"
|
6
|
+
require "racecar/message_delivery_error"
|
6
7
|
|
7
8
|
module Racecar
|
8
9
|
class Ctl
|
@@ -96,12 +97,17 @@ module Racecar
|
|
96
97
|
Racecar.config.validate!
|
97
98
|
|
98
99
|
producer = Rdkafka::Config.new({
|
99
|
-
"bootstrap.servers":
|
100
|
-
"client.id":
|
100
|
+
"bootstrap.servers": Racecar.config.brokers.join(","),
|
101
|
+
"client.id": Racecar.config.client_id,
|
102
|
+
"message.timeout.ms": Racecar.config.message_timeout * 1000,
|
101
103
|
}.merge(Racecar.config.rdkafka_producer)).producer
|
102
104
|
|
103
105
|
handle = producer.produce(payload: message.value, key: message.key, topic: message.topic)
|
104
|
-
|
106
|
+
begin
|
107
|
+
handle.wait(max_wait_timeout: Racecar.config.message_timeout)
|
108
|
+
rescue Rdkafka::RdkafkaError => e
|
109
|
+
raise MessageDeliveryError.new(e, handle)
|
110
|
+
end
|
105
111
|
|
106
112
|
$stderr.puts "=> Delivered message to Kafka cluster"
|
107
113
|
end
|
data/lib/racecar/datadog.rb
CHANGED
@@ -0,0 +1,112 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Racecar
|
4
|
+
# MessageDeliveryError wraps an Rdkafka error and tries to give
|
5
|
+
# specific hints on how to debug or resolve the error within the
|
6
|
+
# Racecar context.
|
7
|
+
class MessageDeliveryError < StandardError
|
8
|
+
# partition_from_delivery_handle takes an rdkafka delivery handle
|
9
|
+
# and returns a human readable version of the partition. It handles
|
10
|
+
# the case where the partition is unknown.
|
11
|
+
def self.partition_from_delivery_handle(delivery_handle)
|
12
|
+
partition = delivery_handle&.create_result&.partition
|
13
|
+
# -1 is rdkafka-ruby's default value, which gets eventually set by librdkafka
|
14
|
+
return "no yet known" if partition.nil? || partition == -1
|
15
|
+
partition.to_s
|
16
|
+
end
|
17
|
+
|
18
|
+
def initialize(rdkafka_error, delivery_handle)
|
19
|
+
raise rdkafka_error unless rdkafka_error.is_a?(Rdkafka::RdkafkaError)
|
20
|
+
|
21
|
+
@rdkafka_error = rdkafka_error
|
22
|
+
@delivery_handle = delivery_handle
|
23
|
+
end
|
24
|
+
|
25
|
+
attr_reader :rdkafka_error
|
26
|
+
|
27
|
+
def code
|
28
|
+
@rdkafka_error.code
|
29
|
+
end
|
30
|
+
|
31
|
+
def to_s
|
32
|
+
msg = <<~EOM
|
33
|
+
Message delivery finally failed:
|
34
|
+
#{@rdkafka_error.to_s}
|
35
|
+
|
36
|
+
#{explain}
|
37
|
+
EOM
|
38
|
+
end
|
39
|
+
|
40
|
+
private
|
41
|
+
|
42
|
+
def explain
|
43
|
+
case @rdkafka_error.code
|
44
|
+
when :msg_timed_out # -192
|
45
|
+
<<~EOM
|
46
|
+
Could not deliver message within Racecar.config.message_timeout.
|
47
|
+
|
48
|
+
This can happen for various reasons, but most commonly because the connection to the broker is interrupted or there is no leader available. Check the broker's logs or the network for more insight.
|
49
|
+
|
50
|
+
Upstream documentation:
|
51
|
+
https://github.com/edenhill/librdkafka/blob/master/INTRODUCTION.md#error-local-time-out
|
52
|
+
EOM
|
53
|
+
|
54
|
+
when :msg_size_too_large # 10
|
55
|
+
<<~EOM
|
56
|
+
Could not deliver message, since it is bigger than either the broker's or Racecar's maximum message size.
|
57
|
+
|
58
|
+
The broker's config option on the topic is called "max.message.bytes" and the broker wide default is "message.max.bytes". The client's is "message.max.bytes". Take extra care to distinguish this from similarly named properties for receiving/consuming messages (i.e. Racecar.config.max_bytes is NOT related).
|
59
|
+
|
60
|
+
Racecar's limit is currently not configurable and uses librdkafka's default of 1 MB (10³ bytes). As of writing, librdkafka will send at least one message regardless of this limit. It is therefore very likely you're hitting the broker's limit and not Racecar's/librdkafka's.
|
61
|
+
|
62
|
+
Upstream documentation:
|
63
|
+
broker per topic: https://docs.confluent.io/platform/current/installation/configuration/topic-configs.html#topicconfigs_max.message.bytes
|
64
|
+
broker default: https://docs.confluent.io/platform/current/installation/configuration/broker-configs.html#brokerconfigs_message.max.bytes
|
65
|
+
client: https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
|
66
|
+
EOM
|
67
|
+
|
68
|
+
when :unknown_topic_or_part # 3
|
69
|
+
partition = self.class.partition_from_delivery_handle(@delivery_handle)
|
70
|
+
|
71
|
+
<<~EOM
|
72
|
+
Could not deliver message, since the targeted topic or partition (#{partition}) does not exist.
|
73
|
+
|
74
|
+
Check that there are no typos, or that the broker's "auto.create.topics.enable" is enabled. For freshly created topics with auto create enabled, this may appear in the beginning (race condition on creation and publishing).
|
75
|
+
|
76
|
+
Upstream documentation:
|
77
|
+
broker setting: https://docs.confluent.io/platform/current/installation/configuration/broker-configs.html#brokerconfigs_auto.create.topics.enable
|
78
|
+
client: https://github.com/edenhill/librdkafka/blob/master/INTRODUCTION.md#topic-metadata-propagation-for-newly-created-topics
|
79
|
+
https://github.com/edenhill/librdkafka/blob/master/INTRODUCTION.md#topic-auto-creation
|
80
|
+
EOM
|
81
|
+
|
82
|
+
when :record_list_too_large # 18
|
83
|
+
<<~EOM
|
84
|
+
Tried to deliver more messages in a batch than the broker's segment size.
|
85
|
+
|
86
|
+
Either increase the broker's "log.segment.bytes", or decrease any of the client's related settings "batch.num.messages", "batch.size" or "message.max.bytes". None of these are configurable through Racecar yet, as the defaults should be sufficient and sane.
|
87
|
+
|
88
|
+
Upstream documentation:
|
89
|
+
broker: https://docs.confluent.io/platform/current/installation/configuration/broker-configs.html#brokerconfigs_log.segment.bytes
|
90
|
+
client: https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
|
91
|
+
EOM
|
92
|
+
|
93
|
+
when :topic_authorization_failed # 29
|
94
|
+
<<~EOM
|
95
|
+
Failed to deliver message because of insufficient authorization to write into the topic.
|
96
|
+
|
97
|
+
Double check that it is not a race condition on topic creation. If it isn't, verify the ACLs are correct.
|
98
|
+
|
99
|
+
Upstream documentation:
|
100
|
+
https://github.com/edenhill/librdkafka/blob/master/INTRODUCTION.md#unknown-or-unauthorized-topics
|
101
|
+
EOM
|
102
|
+
|
103
|
+
else
|
104
|
+
<<~EOM
|
105
|
+
No specific information is available for this error. Consider adding it to Racecar. You can find generally helpful information in the upstream documentation:
|
106
|
+
https://github.com/edenhill/librdkafka/blob/master/INTRODUCTION.md
|
107
|
+
https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
|
108
|
+
EOM
|
109
|
+
end
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
@@ -0,0 +1,106 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Racecar
|
4
|
+
class ParallelRunner
|
5
|
+
Worker = Struct.new(:pid, :parent_reader)
|
6
|
+
|
7
|
+
SHUTDOWN_SIGNALS = ["INT", "QUIT", "TERM"]
|
8
|
+
|
9
|
+
def initialize(runner:, config:, logger:)
|
10
|
+
@runner = runner
|
11
|
+
@config = config
|
12
|
+
@logger = logger
|
13
|
+
end
|
14
|
+
|
15
|
+
def worker_pids
|
16
|
+
workers.map(&:pid)
|
17
|
+
end
|
18
|
+
|
19
|
+
def running?
|
20
|
+
@running
|
21
|
+
end
|
22
|
+
|
23
|
+
def run
|
24
|
+
logger.info "=> Running with #{config.parallel_workers} parallel workers"
|
25
|
+
|
26
|
+
self.workers = config.parallel_workers.times.map do
|
27
|
+
run_worker.tap { |w| logger.info "=> Forked new Racecar consumer with process id #{w.pid}" }
|
28
|
+
end
|
29
|
+
|
30
|
+
# Print the consumer config to STDERR on USR1.
|
31
|
+
trap("USR1") { $stderr.puts config.inspect }
|
32
|
+
|
33
|
+
SHUTDOWN_SIGNALS.each { |signal| trap(signal) { terminate_workers } }
|
34
|
+
|
35
|
+
@running = true
|
36
|
+
|
37
|
+
wait_for_exit
|
38
|
+
end
|
39
|
+
|
40
|
+
private
|
41
|
+
|
42
|
+
attr_accessor :workers
|
43
|
+
attr_reader :runner, :config, :logger
|
44
|
+
|
45
|
+
def run_worker
|
46
|
+
parent_reader, child_writer = IO.pipe
|
47
|
+
|
48
|
+
pid = fork do
|
49
|
+
begin
|
50
|
+
parent_reader.close
|
51
|
+
|
52
|
+
runner.run
|
53
|
+
rescue Exception => e
|
54
|
+
# Allow the parent process to re-raise the exception after shutdown
|
55
|
+
child_writer.binmode
|
56
|
+
child_writer.write(Marshal.dump(e))
|
57
|
+
ensure
|
58
|
+
child_writer.close
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
child_writer.close
|
63
|
+
|
64
|
+
Worker.new(pid, parent_reader)
|
65
|
+
end
|
66
|
+
|
67
|
+
def terminate_workers
|
68
|
+
return if @terminating
|
69
|
+
|
70
|
+
@terminating = true
|
71
|
+
$stderr.puts "=> Terminating workers"
|
72
|
+
|
73
|
+
Process.kill("TERM", *workers.map(&:pid))
|
74
|
+
end
|
75
|
+
|
76
|
+
def wait_for_exit
|
77
|
+
# The call to IO.select blocks until one or more of our readers are ready for reading,
|
78
|
+
# which could be for one of three reasons:
|
79
|
+
#
|
80
|
+
# - An exception is raised in the child process, in which case we should initiate
|
81
|
+
# a shutdown;
|
82
|
+
#
|
83
|
+
# - A graceful shutdown was already initiated, and the pipe writer has been closed, in
|
84
|
+
# which case there is nothing more to do.
|
85
|
+
#
|
86
|
+
# - One of the child processes was killed somehow. If this turns out to be too strict
|
87
|
+
# (i.e. closing down all the workers), we could revisit and look at restarting dead
|
88
|
+
# workers.
|
89
|
+
#
|
90
|
+
ready_readers = IO.select(workers.map(&:parent_reader)).first
|
91
|
+
|
92
|
+
first_read = ready_readers.first.read
|
93
|
+
|
94
|
+
terminate_workers
|
95
|
+
|
96
|
+
workers.map(&:pid).each do |pid|
|
97
|
+
logger.debug "=> Waiting for worker with pid #{pid} to exit"
|
98
|
+
Process.waitpid(pid)
|
99
|
+
logger.debug "=> Worker with pid #{pid} shutdown"
|
100
|
+
end
|
101
|
+
|
102
|
+
exception_found = !first_read.empty?
|
103
|
+
raise Marshal.load(first_read) if exception_found
|
104
|
+
end
|
105
|
+
end
|
106
|
+
end
|
data/lib/racecar/runner.rb
CHANGED
@@ -3,6 +3,7 @@
|
|
3
3
|
require "rdkafka"
|
4
4
|
require "racecar/pause"
|
5
5
|
require "racecar/message"
|
6
|
+
require "racecar/message_delivery_error"
|
6
7
|
|
7
8
|
module Racecar
|
8
9
|
class Runner
|
@@ -53,6 +54,7 @@ module Racecar
|
|
53
54
|
producer: producer,
|
54
55
|
consumer: consumer,
|
55
56
|
instrumenter: @instrumenter,
|
57
|
+
config: @config,
|
56
58
|
)
|
57
59
|
|
58
60
|
instrumentation_payload = {
|
@@ -79,12 +81,17 @@ module Racecar
|
|
79
81
|
end
|
80
82
|
|
81
83
|
logger.info "Gracefully shutting down"
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
84
|
+
begin
|
85
|
+
processor.deliver!
|
86
|
+
processor.teardown
|
87
|
+
consumer.commit
|
88
|
+
ensure
|
89
|
+
@instrumenter.instrument('leave_group') do
|
90
|
+
consumer.close
|
91
|
+
end
|
87
92
|
end
|
93
|
+
ensure
|
94
|
+
producer.close
|
88
95
|
end
|
89
96
|
|
90
97
|
def stop
|
@@ -98,10 +105,20 @@ module Racecar
|
|
98
105
|
def process_method
|
99
106
|
@process_method ||= begin
|
100
107
|
case
|
101
|
-
when processor.respond_to?(:process_batch)
|
102
|
-
|
108
|
+
when processor.respond_to?(:process_batch)
|
109
|
+
if processor.method(:process_batch).arity != 1
|
110
|
+
raise Racecar::Error, "Invalid method signature for `process_batch`. The method must take exactly 1 argument."
|
111
|
+
end
|
112
|
+
|
113
|
+
:batch
|
114
|
+
when processor.respond_to?(:process)
|
115
|
+
if processor.method(:process).arity != 1
|
116
|
+
raise Racecar::Error, "Invalid method signature for `process`. The method must take exactly 1 argument."
|
117
|
+
end
|
118
|
+
|
119
|
+
:single
|
103
120
|
else
|
104
|
-
raise NotImplementedError, "Consumer class must implement process or process_batch method"
|
121
|
+
raise NotImplementedError, "Consumer class `#{processor.class}` must implement a `process` or `process_batch` method"
|
105
122
|
end
|
106
123
|
end
|
107
124
|
end
|
@@ -128,7 +145,8 @@ module Racecar
|
|
128
145
|
producer_config = {
|
129
146
|
"bootstrap.servers" => config.brokers.join(","),
|
130
147
|
"client.id" => config.client_id,
|
131
|
-
"statistics.interval.ms" =>
|
148
|
+
"statistics.interval.ms" => config.statistics_interval_ms,
|
149
|
+
"message.timeout.ms" => config.message_timeout * 1000,
|
132
150
|
}
|
133
151
|
producer_config["compression.codec"] = config.producer_compression_codec.to_s unless config.producer_compression_codec.nil?
|
134
152
|
producer_config.merge!(config.rdkafka_producer)
|
@@ -176,6 +194,7 @@ module Racecar
|
|
176
194
|
consumer.store_offset(message)
|
177
195
|
end
|
178
196
|
rescue => e
|
197
|
+
instrumentation_payload[:unrecoverable_delivery_error] = reset_producer_on_unrecoverable_delivery_errors(e)
|
179
198
|
instrumentation_payload[:retries_count] = pause.pauses_count
|
180
199
|
config.error_handler.call(e, instrumentation_payload)
|
181
200
|
raise e
|
@@ -206,6 +225,7 @@ module Racecar
|
|
206
225
|
processor.deliver!
|
207
226
|
consumer.store_offset(messages.last)
|
208
227
|
rescue => e
|
228
|
+
instrumentation_payload[:unrecoverable_delivery_error] = reset_producer_on_unrecoverable_delivery_errors(e)
|
209
229
|
instrumentation_payload[:retries_count] = pause.pauses_count
|
210
230
|
config.error_handler.call(e, instrumentation_payload)
|
211
231
|
raise e
|
@@ -214,6 +234,28 @@ module Racecar
|
|
214
234
|
end
|
215
235
|
end
|
216
236
|
|
237
|
+
# librdkafka will continue to try to deliver already queued messages, even if ruby-rdkafka
|
238
|
+
# raised before that. This method detects any unrecoverable errors and resets the producer
|
239
|
+
# as a last ditch effort.
|
240
|
+
# The function returns true if there were unrecoverable errors, or false otherwise.
|
241
|
+
def reset_producer_on_unrecoverable_delivery_errors(error)
|
242
|
+
return false unless error.is_a?(Racecar::MessageDeliveryError)
|
243
|
+
return false unless error.code == :msg_timed_out # -192
|
244
|
+
|
245
|
+
logger.error error.to_s
|
246
|
+
logger.error "Racecar will reset the producer to force a new broker connection."
|
247
|
+
@producer.close
|
248
|
+
@producer = nil
|
249
|
+
processor.configure(
|
250
|
+
producer: producer,
|
251
|
+
consumer: consumer,
|
252
|
+
instrumenter: @instrumenter,
|
253
|
+
config: @config,
|
254
|
+
)
|
255
|
+
|
256
|
+
true
|
257
|
+
end
|
258
|
+
|
217
259
|
def with_pause(topic, partition, offsets)
|
218
260
|
pause = pauses[topic][partition]
|
219
261
|
return yield pause if config.pause_timeout == 0
|
data/lib/racecar/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: racecar
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: 2.3.0.alpha1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Daniel Schierbeck
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2021-
|
12
|
+
date: 2021-03-29 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: king_konf
|
@@ -170,6 +170,7 @@ files:
|
|
170
170
|
- ".gitignore"
|
171
171
|
- ".rspec"
|
172
172
|
- CHANGELOG.md
|
173
|
+
- Dockerfile
|
173
174
|
- Gemfile
|
174
175
|
- Gemfile.lock
|
175
176
|
- LICENSE.txt
|
@@ -184,6 +185,7 @@ files:
|
|
184
185
|
- examples/producing_consumer.rb
|
185
186
|
- exe/racecar
|
186
187
|
- exe/racecarctl
|
188
|
+
- extra/datadog-dashboard.json
|
187
189
|
- lib/ensure_hash_compact.rb
|
188
190
|
- lib/generators/racecar/consumer_generator.rb
|
189
191
|
- lib/generators/racecar/install_generator.rb
|
@@ -199,7 +201,9 @@ files:
|
|
199
201
|
- lib/racecar/datadog.rb
|
200
202
|
- lib/racecar/instrumenter.rb
|
201
203
|
- lib/racecar/message.rb
|
204
|
+
- lib/racecar/message_delivery_error.rb
|
202
205
|
- lib/racecar/null_instrumenter.rb
|
206
|
+
- lib/racecar/parallel_runner.rb
|
203
207
|
- lib/racecar/pause.rb
|
204
208
|
- lib/racecar/rails_config_file_loader.rb
|
205
209
|
- lib/racecar/runner.rb
|
@@ -220,9 +224,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
220
224
|
version: '0'
|
221
225
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
222
226
|
requirements:
|
223
|
-
- - "
|
227
|
+
- - ">"
|
224
228
|
- !ruby/object:Gem::Version
|
225
|
-
version:
|
229
|
+
version: 1.3.1
|
226
230
|
requirements: []
|
227
231
|
rubygems_version: 3.1.2
|
228
232
|
signing_key:
|