@techwavedev/agi-agent-kit 1.1.7 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of @techwavedev/agi-agent-kit might be problematic. Click here for more details.
- package/CHANGELOG.md +82 -1
- package/README.md +190 -12
- package/bin/init.js +30 -2
- package/package.json +6 -3
- package/templates/base/AGENTS.md +54 -23
- package/templates/base/README.md +325 -0
- package/templates/base/directives/memory_integration.md +95 -0
- package/templates/base/execution/memory_manager.py +309 -0
- package/templates/base/execution/session_boot.py +218 -0
- package/templates/base/execution/session_init.py +320 -0
- package/templates/base/skill-creator/SKILL_skillcreator.md +23 -36
- package/templates/base/skill-creator/scripts/init_skill.py +18 -135
- package/templates/skills/ec/README.md +31 -0
- package/templates/skills/ec/aws/SKILL.md +1020 -0
- package/templates/skills/ec/aws/defaults.yaml +13 -0
- package/templates/skills/ec/aws/references/common_patterns.md +80 -0
- package/templates/skills/ec/aws/references/mcp_servers.md +98 -0
- package/templates/skills/ec/aws-terraform/SKILL.md +349 -0
- package/templates/skills/ec/aws-terraform/references/best_practices.md +394 -0
- package/templates/skills/ec/aws-terraform/references/checkov_reference.md +337 -0
- package/templates/skills/ec/aws-terraform/scripts/configure_mcp.py +150 -0
- package/templates/skills/ec/confluent-kafka/SKILL.md +655 -0
- package/templates/skills/ec/confluent-kafka/references/ansible_playbooks.md +792 -0
- package/templates/skills/ec/confluent-kafka/references/ec_deployment.md +579 -0
- package/templates/skills/ec/confluent-kafka/references/kraft_migration.md +490 -0
- package/templates/skills/ec/confluent-kafka/references/troubleshooting.md +778 -0
- package/templates/skills/ec/confluent-kafka/references/upgrade_7x_to_8x.md +488 -0
- package/templates/skills/ec/confluent-kafka/scripts/kafka_health_check.py +435 -0
- package/templates/skills/ec/confluent-kafka/scripts/upgrade_preflight.py +568 -0
- package/templates/skills/ec/confluent-kafka/scripts/validate_config.py +455 -0
- package/templates/skills/ec/consul/SKILL.md +427 -0
- package/templates/skills/ec/consul/references/acl_setup.md +168 -0
- package/templates/skills/ec/consul/references/ha_config.md +196 -0
- package/templates/skills/ec/consul/references/troubleshooting.md +267 -0
- package/templates/skills/ec/consul/references/upgrades.md +213 -0
- package/templates/skills/ec/consul/scripts/consul_health_report.py +530 -0
- package/templates/skills/ec/consul/scripts/consul_status.py +264 -0
- package/templates/skills/ec/consul/scripts/generate_values.py +170 -0
- package/templates/skills/ec/documentation/SKILL.md +351 -0
- package/templates/skills/ec/documentation/references/best_practices.md +201 -0
- package/templates/skills/ec/documentation/scripts/analyze_code.py +307 -0
- package/templates/skills/ec/documentation/scripts/detect_changes.py +460 -0
- package/templates/skills/ec/documentation/scripts/generate_changelog.py +312 -0
- package/templates/skills/ec/documentation/scripts/sync_docs.py +272 -0
- package/templates/skills/ec/documentation/scripts/update_skill_docs.py +366 -0
- package/templates/skills/ec/gitlab/SKILL.md +529 -0
- package/templates/skills/ec/gitlab/references/agent_installation.md +416 -0
- package/templates/skills/ec/gitlab/references/api_reference.md +508 -0
- package/templates/skills/ec/gitlab/references/gitops_flux.md +465 -0
- package/templates/skills/ec/gitlab/references/troubleshooting.md +518 -0
- package/templates/skills/ec/gitlab/scripts/generate_agent_values.py +329 -0
- package/templates/skills/ec/gitlab/scripts/gitlab_agent_status.py +414 -0
- package/templates/skills/ec/jira/SKILL.md +484 -0
- package/templates/skills/ec/jira/references/jql_reference.md +148 -0
- package/templates/skills/ec/jira/scripts/add_comment.py +91 -0
- package/templates/skills/ec/jira/scripts/bulk_log_work.py +124 -0
- package/templates/skills/ec/jira/scripts/create_ticket.py +162 -0
- package/templates/skills/ec/jira/scripts/get_ticket.py +191 -0
- package/templates/skills/ec/jira/scripts/jira_client.py +383 -0
- package/templates/skills/ec/jira/scripts/log_work.py +154 -0
- package/templates/skills/ec/jira/scripts/search_tickets.py +104 -0
- package/templates/skills/ec/jira/scripts/update_comment.py +67 -0
- package/templates/skills/ec/jira/scripts/update_ticket.py +161 -0
- package/templates/skills/ec/karpenter/SKILL.md +301 -0
- package/templates/skills/ec/karpenter/references/ec2nodeclasses.md +421 -0
- package/templates/skills/ec/karpenter/references/migration.md +396 -0
- package/templates/skills/ec/karpenter/references/nodepools.md +400 -0
- package/templates/skills/ec/karpenter/references/troubleshooting.md +359 -0
- package/templates/skills/ec/karpenter/scripts/generate_ec2nodeclass.py +187 -0
- package/templates/skills/ec/karpenter/scripts/generate_nodepool.py +245 -0
- package/templates/skills/ec/karpenter/scripts/karpenter_status.py +359 -0
- package/templates/skills/ec/opensearch/SKILL.md +720 -0
- package/templates/skills/ec/opensearch/references/ml_neural_search.md +576 -0
- package/templates/skills/ec/opensearch/references/operator.md +532 -0
- package/templates/skills/ec/opensearch/references/query_dsl.md +532 -0
- package/templates/skills/ec/opensearch/scripts/configure_mcp.py +148 -0
- package/templates/skills/ec/victoriametrics/SKILL.md +598 -0
- package/templates/skills/ec/victoriametrics/references/kubernetes.md +531 -0
- package/templates/skills/ec/victoriametrics/references/prometheus_migration.md +333 -0
- package/templates/skills/ec/victoriametrics/references/troubleshooting.md +442 -0
- package/templates/skills/knowledge/SKILLS_CATALOG.md +274 -4
- package/templates/skills/knowledge/intelligent-routing/SKILL.md +237 -164
- package/templates/skills/knowledge/parallel-agents/SKILL.md +345 -73
- package/templates/skills/knowledge/plugin-discovery/SKILL.md +582 -0
- package/templates/skills/knowledge/plugin-discovery/scripts/platform_setup.py +1083 -0
- package/templates/skills/knowledge/design-md/README.md +0 -34
- package/templates/skills/knowledge/design-md/SKILL.md +0 -193
- package/templates/skills/knowledge/design-md/examples/DESIGN.md +0 -154
- package/templates/skills/knowledge/notebooklm-mcp/SKILL.md +0 -71
- package/templates/skills/knowledge/notebooklm-mcp/assets/example_asset.txt +0 -24
- package/templates/skills/knowledge/notebooklm-mcp/references/api_reference.md +0 -34
- package/templates/skills/knowledge/notebooklm-mcp/scripts/example.py +0 -19
- package/templates/skills/knowledge/react-components/README.md +0 -36
- package/templates/skills/knowledge/react-components/SKILL.md +0 -53
- package/templates/skills/knowledge/react-components/examples/gold-standard-card.tsx +0 -80
- package/templates/skills/knowledge/react-components/package-lock.json +0 -231
- package/templates/skills/knowledge/react-components/package.json +0 -16
- package/templates/skills/knowledge/react-components/resources/architecture-checklist.md +0 -15
- package/templates/skills/knowledge/react-components/resources/component-template.tsx +0 -37
- package/templates/skills/knowledge/react-components/resources/stitch-api-reference.md +0 -14
- package/templates/skills/knowledge/react-components/resources/style-guide.json +0 -27
- package/templates/skills/knowledge/react-components/scripts/fetch-stitch.sh +0 -30
- package/templates/skills/knowledge/react-components/scripts/validate.js +0 -68
- package/templates/skills/knowledge/self-update/SKILL.md +0 -60
- package/templates/skills/knowledge/self-update/scripts/update_kit.py +0 -103
- package/templates/skills/knowledge/stitch-loop/README.md +0 -54
- package/templates/skills/knowledge/stitch-loop/SKILL.md +0 -235
- package/templates/skills/knowledge/stitch-loop/examples/SITE.md +0 -73
- package/templates/skills/knowledge/stitch-loop/examples/next-prompt.md +0 -25
- package/templates/skills/knowledge/stitch-loop/resources/baton-schema.md +0 -61
- package/templates/skills/knowledge/stitch-loop/resources/site-template.md +0 -104
|
@@ -0,0 +1,778 @@
|
|
|
1
|
+
# Confluent Kafka Troubleshooting Guide
|
|
2
|
+
|
|
3
|
+
Comprehensive troubleshooting guide for common Confluent Kafka issues in tarball/Ansible deployments.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## ⚠️ EC Environment Path Mappings
|
|
8
|
+
|
|
9
|
+
> **This guide uses standard Confluent paths in examples.** For EC deployments, substitute paths as follows:
|
|
10
|
+
|
|
11
|
+
| Standard Path | EC Path |
|
|
12
|
+
| --------------------------- | -------------------------------------------------------- |
|
|
13
|
+
| `/opt/confluent/` | `{{ base_path }}/opt/confluent-{{ confluent_version }}/` |
|
|
14
|
+
| `/var/kafka-logs/` | `{{ base_path }}/opt/data` (broker) |
|
|
15
|
+
| `/var/kafka-controller/` | `{{ base_path }}/opt/data/controller` |
|
|
16
|
+
| `/var/log/confluent/kafka/` | `{{ base_path }}/logs/` |
|
|
17
|
+
| `/var/ssl/kafka/` | `{{ base_path }}/opt/ssl/` |
|
|
18
|
+
| `localhost:9092` | `$BOOTSTRAP` (use SSL port 9443) |
|
|
19
|
+
| `systemctl` | `systemctl --user` |
|
|
20
|
+
| `kafka:kafka` | `{{ kafka_user }}:{{ kafka_group }}` |
|
|
21
|
+
|
|
22
|
+
**EC Quick Setup:**
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
export KAFKA_HOME={{ base_path }}/opt/confluent-{{ confluent_version }}
|
|
26
|
+
export BOOTSTRAP={{ broker_host_1 }}:{{ broker_port }}
|
|
27
|
+
export LOG_DIR={{ base_path }}/logs
|
|
28
|
+
export DATA_DIR={{ base_path }}/opt/data
|
|
29
|
+
export SSL_DIR={{ base_path }}/opt/ssl
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
See **[ec_deployment.md](ec_deployment.md)** for complete EC paths and configuration.
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## Table of Contents
|
|
37
|
+
|
|
38
|
+
1. [Broker Issues](#broker-issues)
|
|
39
|
+
2. [KRaft Controller Issues](#kraft-controller-issues)
|
|
40
|
+
3. [Replication Problems](#replication-problems)
|
|
41
|
+
4. [Consumer Issues](#consumer-issues)
|
|
42
|
+
5. [Producer Issues](#producer-issues)
|
|
43
|
+
6. [Schema Registry Issues](#schema-registry-issues)
|
|
44
|
+
7. [Kafka Connect Issues](#kafka-connect-issues)
|
|
45
|
+
8. [Performance Issues](#performance-issues)
|
|
46
|
+
9. [Security Issues](#security-issues)
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
## Broker Issues
|
|
51
|
+
|
|
52
|
+
### Broker Won't Start
|
|
53
|
+
|
|
54
|
+
**Symptoms:**
|
|
55
|
+
|
|
56
|
+
- `systemctl status confluent-server` shows failed
|
|
57
|
+
- Broker process exits immediately after startup
|
|
58
|
+
|
|
59
|
+
**Diagnosis:**
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
# Check service status
|
|
63
|
+
systemctl status confluent-server
|
|
64
|
+
|
|
65
|
+
# View recent logs
|
|
66
|
+
journalctl -u confluent-server -n 100
|
|
67
|
+
|
|
68
|
+
# Check Kafka server logs
|
|
69
|
+
tail -500 /var/log/confluent/kafka/server.log | grep -i "error\|exception\|fatal"
|
|
70
|
+
|
|
71
|
+
# Check for port conflicts
|
|
72
|
+
netstat -tlnp | grep -E "9092|9093"
|
|
73
|
+
|
|
74
|
+
# Verify disk space
|
|
75
|
+
df -h /var/kafka-logs
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
**Common Causes & Solutions:**
|
|
79
|
+
|
|
80
|
+
| Cause | Solution |
|
|
81
|
+
| ----------------------- | --------------------------------------- |
|
|
82
|
+
| Port already in use | Kill conflicting process or change port |
|
|
83
|
+
| Corrupted log files | Remove corrupted segment (last resort) |
|
|
84
|
+
| Insufficient disk space | Free disk space or expand volume |
|
|
85
|
+
| Invalid configuration | Fix syntax errors in server.properties |
|
|
86
|
+
| Missing directories | Create log.dirs with proper permissions |
|
|
87
|
+
| Java not found | Set JAVA_HOME in systemd unit |
|
|
88
|
+
|
|
89
|
+
**Fix: Permission Issues**
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
# Fix ownership
|
|
93
|
+
chown -R kafka:kafka /var/kafka-logs
|
|
94
|
+
chown -R kafka:kafka /opt/confluent
|
|
95
|
+
|
|
96
|
+
# Fix permissions
|
|
97
|
+
chmod 750 /var/kafka-logs
|
|
98
|
+
chmod 640 /opt/confluent/etc/kafka/server.properties
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
**Fix: Corrupted Segment Recovery**
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
# CAUTION: Data loss possible - only as last resort
|
|
105
|
+
# Identify corrupted segment from logs
|
|
106
|
+
cat /var/log/confluent/kafka/server.log | grep "Corrupted"
|
|
107
|
+
|
|
108
|
+
# Move corrupted partition (will be re-replicated)
|
|
109
|
+
mv /var/kafka-logs/<topic>-<partition> /backup/corrupted/
|
|
110
|
+
|
|
111
|
+
# Restart broker - partition will sync from replicas
|
|
112
|
+
systemctl restart confluent-server
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
---
|
|
116
|
+
|
|
117
|
+
### Broker Crash Loop
|
|
118
|
+
|
|
119
|
+
**Symptoms:**
|
|
120
|
+
|
|
121
|
+
- Broker starts then crashes within seconds
|
|
122
|
+
- Repeated restart attempts
|
|
123
|
+
|
|
124
|
+
**Diagnosis:**
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
# Check for OOM killer
|
|
128
|
+
dmesg | grep -i "killed process" | tail -10
|
|
129
|
+
|
|
130
|
+
# Check heap dump
|
|
131
|
+
ls -la /opt/confluent/*.hprof
|
|
132
|
+
|
|
133
|
+
# Review GC logs
|
|
134
|
+
grep "GC pause" /var/log/confluent/kafka/kafka-gc.log | tail -20
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
**Solutions:**
|
|
138
|
+
|
|
139
|
+
```bash
|
|
140
|
+
# Increase heap size (edit systemd unit)
|
|
141
|
+
# /etc/systemd/system/confluent-server.service
|
|
142
|
+
Environment="KAFKA_HEAP_OPTS=-Xms8g -Xmx8g"
|
|
143
|
+
|
|
144
|
+
# Reduce replica fetch size if OOM during catchup
|
|
145
|
+
# In server.properties:
|
|
146
|
+
replica.fetch.max.bytes=524288
|
|
147
|
+
|
|
148
|
+
# Reload and restart
|
|
149
|
+
systemctl daemon-reload
|
|
150
|
+
systemctl restart confluent-server
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
---
|
|
154
|
+
|
|
155
|
+
## KRaft Controller Issues
|
|
156
|
+
|
|
157
|
+
### Controller Quorum Not Forming
|
|
158
|
+
|
|
159
|
+
**Symptoms:**
|
|
160
|
+
|
|
161
|
+
- Controllers can't elect a leader
|
|
162
|
+
- `kafka-metadata` command hangs
|
|
163
|
+
|
|
164
|
+
**Diagnosis:**
|
|
165
|
+
|
|
166
|
+
```bash
|
|
167
|
+
# Check controller logs
|
|
168
|
+
tail -200 /var/log/confluent/kafka/controller.log | grep -i "voter\|quorum\|elect"
|
|
169
|
+
|
|
170
|
+
# Verify network connectivity between controllers
|
|
171
|
+
for port in 9093; do
|
|
172
|
+
nc -zv controller-01 $port
|
|
173
|
+
nc -zv controller-02 $port
|
|
174
|
+
nc -zv controller-03 $port
|
|
175
|
+
done
|
|
176
|
+
|
|
177
|
+
# Check controller.quorum.voters consistency
|
|
178
|
+
grep controller.quorum.voters /opt/confluent/etc/kafka/kraft/controller.properties
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
**Common Causes:**
|
|
182
|
+
|
|
183
|
+
| Cause | Solution |
|
|
184
|
+
| ----------------------- | -------------------------------------------------------- |
|
|
185
|
+
| Mismatched voter config | Ensure identical `controller.quorum.voters` on all nodes |
|
|
186
|
+
| Network/firewall issues | Open port 9093 between controllers |
|
|
187
|
+
| Different cluster IDs | Re-format storage with same cluster ID |
|
|
188
|
+
| Time skew | Sync NTP across all nodes |
|
|
189
|
+
|
|
190
|
+
**Fix: Rebuild Controller Quorum**
|
|
191
|
+
|
|
192
|
+
```bash
|
|
193
|
+
# CAUTION: Only if all controllers failed
|
|
194
|
+
# 1. Stop all controllers
|
|
195
|
+
for host in controller-01 controller-02 controller-03; do
|
|
196
|
+
ssh $host "systemctl stop confluent-server"
|
|
197
|
+
done
|
|
198
|
+
|
|
199
|
+
# 2. Backup existing data
|
|
200
|
+
tar -czvf /backup/controller-data-$(date +%Y%m%d).tar.gz /var/kafka-controller/
|
|
201
|
+
|
|
202
|
+
# 3. Clear and re-format with same cluster ID
|
|
203
|
+
CLUSTER_ID=$(cat /backup/cluster-id.txt)
|
|
204
|
+
/opt/confluent/bin/kafka-storage format -t $CLUSTER_ID -c /opt/confluent/etc/kafka/kraft/controller.properties --force
|
|
205
|
+
|
|
206
|
+
# 4. Start controllers one at a time
|
|
207
|
+
ssh controller-01 "systemctl start confluent-server"
|
|
208
|
+
sleep 30
|
|
209
|
+
ssh controller-02 "systemctl start confluent-server"
|
|
210
|
+
sleep 30
|
|
211
|
+
ssh controller-03 "systemctl start confluent-server"
|
|
212
|
+
|
|
213
|
+
# 5. Verify quorum
|
|
214
|
+
/opt/confluent/bin/kafka-metadata --snapshot /var/kafka-controller/__cluster_metadata-0/00000000000000000000.log --command quorum
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
---
|
|
218
|
+
|
|
219
|
+
### Broker Not Registering with KRaft
|
|
220
|
+
|
|
221
|
+
**Symptoms:**
|
|
222
|
+
|
|
223
|
+
- Broker starts but doesn't appear in `kafka-metadata broker` output
|
|
224
|
+
- "Cannot connect to controller" errors
|
|
225
|
+
|
|
226
|
+
**Diagnosis:**
|
|
227
|
+
|
|
228
|
+
```bash
|
|
229
|
+
# Check broker logs for controller connection
|
|
230
|
+
grep -i "controller" /var/log/confluent/kafka/server.log | tail -50
|
|
231
|
+
|
|
232
|
+
# Verify broker config matches controller quorum
|
|
233
|
+
grep controller.quorum.voters /opt/confluent/etc/kafka/server.properties
|
|
234
|
+
|
|
235
|
+
# Test connectivity to controller port
|
|
236
|
+
nc -zv controller-01 9093
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
**Fix:**
|
|
240
|
+
|
|
241
|
+
```bash
|
|
242
|
+
# Ensure these match between broker and controllers:
|
|
243
|
+
# - controller.quorum.voters
|
|
244
|
+
# - controller.listener.names
|
|
245
|
+
# - Security settings (SASL/SSL)
|
|
246
|
+
|
|
247
|
+
# Verify listener security map includes CONTROLLER
|
|
248
|
+
grep listener.security.protocol.map /opt/confluent/etc/kafka/server.properties
|
|
249
|
+
# Should include: CONTROLLER:PLAINTEXT (or SASL_SSL, etc.)
|
|
250
|
+
|
|
251
|
+
# Restart broker
|
|
252
|
+
systemctl restart confluent-server
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
---
|
|
256
|
+
|
|
257
|
+
## Replication Problems
|
|
258
|
+
|
|
259
|
+
### Under-Replicated Partitions (URP)
|
|
260
|
+
|
|
261
|
+
**Symptoms:**
|
|
262
|
+
|
|
263
|
+
- `kafka-topics --describe --under-replicated-partitions` shows results
|
|
264
|
+
- Alerts from monitoring
|
|
265
|
+
|
|
266
|
+
**Diagnosis:**
|
|
267
|
+
|
|
268
|
+
```bash
|
|
269
|
+
# List URPs
|
|
270
|
+
/opt/confluent/bin/kafka-topics --bootstrap-server localhost:9092 \
|
|
271
|
+
--describe --under-replicated-partitions
|
|
272
|
+
|
|
273
|
+
# Check which brokers are affected
|
|
274
|
+
/opt/confluent/bin/kafka-topics --bootstrap-server localhost:9092 \
|
|
275
|
+
--describe --under-replicated-partitions | awk '{print $4}' | sort | uniq -c
|
|
276
|
+
|
|
277
|
+
# Check replica lag on affected broker
|
|
278
|
+
grep "Replica lag" /var/log/confluent/kafka/server.log | tail -20
|
|
279
|
+
|
|
280
|
+
# Check network between replicas
|
|
281
|
+
iperf3 -c broker-02 -p 5201
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
**Common Causes & Solutions:**
|
|
285
|
+
|
|
286
|
+
| Cause | Solution |
|
|
287
|
+
| ------------------ | ---------------------------------------- |
|
|
288
|
+
| Slow disk I/O | Check disk latency, use SSDs |
|
|
289
|
+
| Network congestion | Check bandwidth, increase socket buffers |
|
|
290
|
+
| Follower too slow | Increase `replica.lag.time.max.ms` |
|
|
291
|
+
| Broker overloaded | Redistribute partitions, add brokers |
|
|
292
|
+
| GC pauses | Tune JVM, increase heap |
|
|
293
|
+
|
|
294
|
+
**Fix: Increase Replica Lag Tolerance**
|
|
295
|
+
|
|
296
|
+
```properties
|
|
297
|
+
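# In server.properties (put the trailing "# ..." notes on their own lines; .properties files do not support inline comments)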
|
|
298
|
+
replica.lag.time.max.ms=45000 # Default 30000
|
|
299
|
+
num.replica.fetchers=8 # Default 1
|
|
300
|
+
replica.fetch.max.bytes=10485760 # 10MB
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
---
|
|
304
|
+
|
|
305
|
+
### ISR Shrinking Repeatedly
|
|
306
|
+
|
|
307
|
+
**Symptoms:**
|
|
308
|
+
|
|
309
|
+
- ISR changes frequently in logs
|
|
310
|
+
- Followers constantly falling out and rejoining
|
|
311
|
+
|
|
312
|
+
**Diagnosis:**
|
|
313
|
+
|
|
314
|
+
```bash
|
|
315
|
+
# Show recent ISR changes (use grep -c "ISR" to count them)
|
|
316
|
+
grep "ISR" /var/log/confluent/kafka/server.log | tail -100
|
|
317
|
+
|
|
318
|
+
# Check broker request latency
|
|
319
|
+
grep "RequestHandlerAvgIdlePercent" /var/log/confluent/kafka/server.log
|
|
320
|
+
|
|
321
|
+
# Monitor network latency
|
|
322
|
+
ping -c 100 broker-02 | tail -5
|
|
323
|
+
```
|
|
324
|
+
|
|
325
|
+
**Fix: Tune Replication**
|
|
326
|
+
|
|
327
|
+
```properties
|
|
328
|
+
# In server.properties - increase tolerance
|
|
329
|
+
replica.lag.time.max.ms=60000
|
|
330
|
+
replica.socket.receive.buffer.bytes=1048576
|
|
331
|
+
replica.socket.timeout.ms=60000
|
|
332
|
+
|
|
333
|
+
# Increase replica fetcher threads
|
|
334
|
+
num.replica.fetchers=4
|
|
335
|
+
```
|
|
336
|
+
|
|
337
|
+
---
|
|
338
|
+
|
|
339
|
+
## Consumer Issues
|
|
340
|
+
|
|
341
|
+
### High Consumer Lag
|
|
342
|
+
|
|
343
|
+
**Symptoms:**
|
|
344
|
+
|
|
345
|
+
- Consumer group shows large lag
|
|
346
|
+
- Messages processing delayed
|
|
347
|
+
|
|
348
|
+
**Diagnosis:**
|
|
349
|
+
|
|
350
|
+
```bash
|
|
351
|
+
# Check consumer group lag
|
|
352
|
+
/opt/confluent/bin/kafka-consumer-groups --bootstrap-server localhost:9092 \
|
|
353
|
+
--group my-group --describe
|
|
354
|
+
|
|
355
|
+
# Check if consumers are connected
|
|
356
|
+
/opt/confluent/bin/kafka-consumer-groups --bootstrap-server localhost:9092 \
|
|
357
|
+
--group my-group --describe --members
|
|
358
|
+
|
|
359
|
+
# Check consumer assignment
|
|
360
|
+
/opt/confluent/bin/kafka-consumer-groups --bootstrap-server localhost:9092 \
|
|
361
|
+
--group my-group --describe --members --verbose
|
|
362
|
+
```
|
|
363
|
+
|
|
364
|
+
**Solutions:**
|
|
365
|
+
|
|
366
|
+
| Cause | Solution |
|
|
367
|
+
| -------------------------------- | ------------------------------------------------------- |
|
|
368
|
+
| Too few consumers | Scale consumer instances |
|
|
369
|
+
| Slow processing | Optimize consumer logic, async processing |
|
|
370
|
+
| Too many partitions per consumer | Increase consumers or reduce partitions |
|
|
371
|
+
| Large messages | Increase `fetch.max.bytes`, `max.partition.fetch.bytes` |
|
|
372
|
+
| Network bottleneck | Increase fetch size, check bandwidth |
|
|
373
|
+
|
|
374
|
+
---
|
|
375
|
+
|
|
376
|
+
### Consumer Group Rebalancing Constantly
|
|
377
|
+
|
|
378
|
+
**Symptoms:**
|
|
379
|
+
|
|
380
|
+
- Frequent "JoinGroup" requests in logs
|
|
381
|
+
- Processing stops during rebalances
|
|
382
|
+
|
|
383
|
+
**Diagnosis:**
|
|
384
|
+
|
|
385
|
+
```bash
|
|
386
|
+
# Check for rebalance triggers
|
|
387
|
+
grep -i "rebalance\|join" /var/log/myapp/consumer.log | tail -50
|
|
388
|
+
|
|
389
|
+
# Check session timeout settings
|
|
390
|
+
# Consumer config should have reasonable timeouts
|
|
391
|
+
```
|
|
392
|
+
|
|
393
|
+
**Fix: Tune Consumer Config**
|
|
394
|
+
|
|
395
|
+
```properties
|
|
396
|
+
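# In consumer configuration (put the trailing "# ..." notes on their own lines; .properties files do not support inline comments)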
|
|
397
|
+
session.timeout.ms=45000 # Default 45000 since Kafka 3.0 (10000 in earlier versions)
|
|
398
|
+
heartbeat.interval.ms=15000 # Should be no more than 1/3 of session.timeout.ms
|
|
399
|
+
max.poll.interval.ms=600000 # Increase if processing is slow
|
|
400
|
+
max.poll.records=100 # Reduce if processing is slow
|
|
401
|
+
```
|
|
402
|
+
|
|
403
|
+
---
|
|
404
|
+
|
|
405
|
+
## Producer Issues
|
|
406
|
+
|
|
407
|
+
### Producer Timeout Errors
|
|
408
|
+
|
|
409
|
+
**Symptoms:**
|
|
410
|
+
|
|
411
|
+
- `TimeoutException` in producer logs
|
|
412
|
+
- Messages not delivered
|
|
413
|
+
|
|
414
|
+
**Diagnosis:**
|
|
415
|
+
|
|
416
|
+
```bash
|
|
417
|
+
# Check broker availability
|
|
418
|
+
/opt/confluent/bin/kafka-broker-api-versions --bootstrap-server localhost:9092
|
|
419
|
+
|
|
420
|
+
# Check producer metrics if using JMX
|
|
421
|
+
# record-send-rate, request-latency-avg
|
|
422
|
+
|
|
423
|
+
# Verify topic exists
|
|
424
|
+
/opt/confluent/bin/kafka-topics --bootstrap-server localhost:9092 \
|
|
425
|
+
--topic my-topic --describe
|
|
426
|
+
```
|
|
427
|
+
|
|
428
|
+
**Fix: Tune Producer Config**
|
|
429
|
+
|
|
430
|
+
```properties
|
|
431
|
+
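# In producer configuration (put the trailing "# ..." notes on their own lines; .properties files do not support inline comments)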
|
|
432
|
+
request.timeout.ms=60000 # Default 30000
|
|
433
|
+
delivery.timeout.ms=180000 # Default 120000
|
|
434
|
+
retries=10 # Default is Integer.MAX_VALUE (2147483647) since Kafka 2.1
|
|
435
|
+
retry.backoff.ms=500 # Delay between retries
|
|
436
|
+
```
|
|
437
|
+
|
|
438
|
+
---
|
|
439
|
+
|
|
440
|
+
### Producer Not Acknowledging (acks=all Slow)
|
|
441
|
+
|
|
442
|
+
**Symptoms:**
|
|
443
|
+
|
|
444
|
+
- Slow produce latency with `acks=all`
|
|
445
|
+
- Works fine with `acks=1`
|
|
446
|
+
|
|
447
|
+
**Diagnosis:**
|
|
448
|
+
|
|
449
|
+
```bash
|
|
450
|
+
# Check min.insync.replicas
|
|
451
|
+
/opt/confluent/bin/kafka-topics --bootstrap-server localhost:9092 \
|
|
452
|
+
--topic my-topic --describe
|
|
453
|
+
|
|
454
|
+
# Check for URPs on the topic
|
|
455
|
+
/opt/confluent/bin/kafka-topics --bootstrap-server localhost:9092 \
|
|
456
|
+
--topic my-topic --describe --under-replicated-partitions
|
|
457
|
+
```
|
|
458
|
+
|
|
459
|
+
**Solutions:**
|
|
460
|
+
|
|
461
|
+
- Ensure all replicas are in ISR
|
|
462
|
+
- Check network latency between brokers
|
|
463
|
+
- Reduce `min.insync.replicas` (trades durability for performance)
|
|
464
|
+
- Use compression to reduce replication traffic
|
|
465
|
+
|
|
466
|
+
---
|
|
467
|
+
|
|
468
|
+
## Schema Registry Issues
|
|
469
|
+
|
|
470
|
+
### Schema Registry 409 Conflict
|
|
471
|
+
|
|
472
|
+
**Symptoms:**
|
|
473
|
+
|
|
474
|
+
- `io.confluent.kafka.schemaregistry.client.rest.exceptions.RestClientException: Schema being registered is incompatible with an earlier schema`
|
|
475
|
+
|
|
476
|
+
**Diagnosis:**
|
|
477
|
+
|
|
478
|
+
```bash
|
|
479
|
+
# Check current compatibility mode
|
|
480
|
+
curl http://localhost:8081/config | jq
|
|
481
|
+
|
|
482
|
+
# Check specific subject compatibility
|
|
483
|
+
curl http://localhost:8081/config/my-topic-value | jq
|
|
484
|
+
|
|
485
|
+
# Get existing schema
|
|
486
|
+
curl http://localhost:8081/subjects/my-topic-value/versions/latest | jq
|
|
487
|
+
```
|
|
488
|
+
|
|
489
|
+
**Solutions:**
|
|
490
|
+
|
|
491
|
+
```bash
|
|
492
|
+
# Option 1: Change compatibility mode (if acceptable)
|
|
493
|
+
curl -X PUT -H "Content-Type: application/vnd.schemaregistry.v1+json" \
|
|
494
|
+
--data '{"compatibility": "NONE"}' \
|
|
495
|
+
http://localhost:8081/config/my-topic-value
|
|
496
|
+
|
|
497
|
+
# Option 2: Register as new subject
|
|
498
|
+
# Use a different subject name
|
|
499
|
+
|
|
500
|
+
# Option 3: Delete old versions (CAUTION - impacts consumers)
|
|
501
|
+
curl -X DELETE http://localhost:8081/subjects/my-topic-value/versions/1
|
|
502
|
+
```
|
|
503
|
+
|
|
504
|
+
---
|
|
505
|
+
|
|
506
|
+
### Schema Registry Leader Election Failure
|
|
507
|
+
|
|
508
|
+
**Symptoms:**
|
|
509
|
+
|
|
510
|
+
- Schema Registry returns 500 errors
|
|
511
|
+
- "Not the leader" errors
|
|
512
|
+
|
|
513
|
+
**Diagnosis:**
|
|
514
|
+
|
|
515
|
+
```bash
|
|
516
|
+
# Check SR logs
|
|
517
|
+
tail -100 /var/log/confluent/schema-registry/schema-registry.log
|
|
518
|
+
|
|
519
|
+
# Check _schemas topic
|
|
520
|
+
/opt/confluent/bin/kafka-topics --bootstrap-server localhost:9092 \
|
|
521
|
+
--topic _schemas --describe
|
|
522
|
+
|
|
523
|
+
# Check consumer group
|
|
524
|
+
/opt/confluent/bin/kafka-consumer-groups --bootstrap-server localhost:9092 \
|
|
525
|
+
--group schema-registry --describe
|
|
526
|
+
```
|
|
527
|
+
|
|
528
|
+
**Fix:**
|
|
529
|
+
|
|
530
|
+
```bash
|
|
531
|
+
# Restart SR instances to trigger re-election
|
|
532
|
+
systemctl restart confluent-schema-registry
|
|
533
|
+
|
|
534
|
+
# If _schemas topic is corrupt, may need to recreate (DESTROYS ALL SCHEMAS)
|
|
535
|
+
```
|
|
536
|
+
|
|
537
|
+
---
|
|
538
|
+
|
|
539
|
+
## Kafka Connect Issues
|
|
540
|
+
|
|
541
|
+
### Connector Task Failed
|
|
542
|
+
|
|
543
|
+
**Symptoms:**
|
|
544
|
+
|
|
545
|
+
- Connector status shows FAILED
|
|
546
|
+
- Connector tasks not processing
|
|
547
|
+
|
|
548
|
+
**Diagnosis:**
|
|
549
|
+
|
|
550
|
+
```bash
|
|
551
|
+
# Check connector status
|
|
552
|
+
curl -s http://localhost:8083/connectors/my-connector/status | jq
|
|
553
|
+
|
|
554
|
+
# Get task details
|
|
555
|
+
curl -s http://localhost:8083/connectors/my-connector/tasks/0/status | jq
|
|
556
|
+
|
|
557
|
+
# View Connect worker logs
|
|
558
|
+
tail -200 /var/log/confluent/kafka-connect/connect.log | grep -i "error\|exception"
|
|
559
|
+
```
|
|
560
|
+
|
|
561
|
+
**Fix: Restart Failed Task**
|
|
562
|
+
|
|
563
|
+
```bash
|
|
564
|
+
# Restart specific task
|
|
565
|
+
curl -X POST http://localhost:8083/connectors/my-connector/tasks/0/restart
|
|
566
|
+
|
|
567
|
+
# Or restart entire connector
|
|
568
|
+
curl -X POST http://localhost:8083/connectors/my-connector/restart
|
|
569
|
+
|
|
570
|
+
# Update connector config if needed
|
|
571
|
+
curl -X PUT -H "Content-Type: application/json" \
|
|
572
|
+
--data @updated-config.json \
|
|
573
|
+
http://localhost:8083/connectors/my-connector/config
|
|
574
|
+
```
|
|
575
|
+
|
|
576
|
+
---
|
|
577
|
+
|
|
578
|
+
### Connector Not Starting
|
|
579
|
+
|
|
580
|
+
**Symptoms:**
|
|
581
|
+
|
|
582
|
+
- POST to create connector returns 201 but connector never starts
|
|
583
|
+
- Connector shows UNASSIGNED
|
|
584
|
+
|
|
585
|
+
**Diagnosis:**
|
|
586
|
+
|
|
587
|
+
```bash
|
|
588
|
+
# Check that the Connect worker is up (root endpoint returns version and Kafka cluster ID)
|
|
589
|
+
curl -s http://localhost:8083/ | jq
|
|
590
|
+
|
|
591
|
+
# Check that the connector class appears among the installed plugins (missing class = plugin path/classpath issue)
|
|
592
|
+
curl -s http://localhost:8083/connector-plugins | jq | grep -i "class"
|
|
593
|
+
|
|
594
|
+
# Verify connector JAR is in plugin path
|
|
595
|
+
ls -la /opt/confluent/share/java/kafka-connect-jdbc/
|
|
596
|
+
```
|
|
597
|
+
|
|
598
|
+
---
|
|
599
|
+
|
|
600
|
+
## Performance Issues
|
|
601
|
+
|
|
602
|
+
### High Disk I/O
|
|
603
|
+
|
|
604
|
+
**Diagnosis:**
|
|
605
|
+
|
|
606
|
+
```bash
|
|
607
|
+
# Check disk I/O
|
|
608
|
+
iostat -x 5
|
|
609
|
+
|
|
610
|
+
# Check Kafka log flush
|
|
611
|
+
grep "LogFlushRateAndTimeMs" /var/log/confluent/kafka/server.log
|
|
612
|
+
|
|
613
|
+
# Check log compaction
|
|
614
|
+
grep "cleaner" /var/log/confluent/kafka/server.log
|
|
615
|
+
```
|
|
616
|
+
|
|
617
|
+
**Tuning:**
|
|
618
|
+
|
|
619
|
+
```properties
|
|
620
|
+
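# In server.properties (put the trailing "# ..." notes on their own lines; .properties files do not support inline comments)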
|
|
621
|
+
log.flush.interval.messages=50000 # Reduces fsync frequency only if a lower value is currently set (the default never forces a flush)
|
|
622
|
+
log.flush.interval.ms=10000
|
|
623
|
+
log.cleaner.io.buffer.load.factor=0.9
|
|
624
|
+
log.cleaner.threads=2
|
|
625
|
+
```
|
|
626
|
+
|
|
627
|
+
---
|
|
628
|
+
|
|
629
|
+
### High Network Utilization
|
|
630
|
+
|
|
631
|
+
**Diagnosis:**
|
|
632
|
+
|
|
633
|
+
```bash
|
|
634
|
+
# Check network stats
|
|
635
|
+
sar -n DEV 5
|
|
636
|
+
|
|
637
|
+
# Check broker network threads
|
|
638
|
+
grep "NetworkProcessorAvgIdlePercent" /var/log/confluent/kafka/server.log
|
|
639
|
+
|
|
640
|
+
# Monitor with JMX
|
|
641
|
+
# kafka.network:type=SocketServer,name=NetworkProcessorAvgIdlePercent
|
|
642
|
+
```
|
|
643
|
+
|
|
644
|
+
**Tuning:**
|
|
645
|
+
|
|
646
|
+
```properties
|
|
647
|
+
# In server.properties
|
|
648
|
+
num.network.threads=8
|
|
649
|
+
socket.send.buffer.bytes=1048576
|
|
650
|
+
socket.receive.buffer.bytes=1048576
|
|
651
|
+
|
|
652
|
+
# Enable compression
|
|
653
|
+
compression.type=lz4
|
|
654
|
+
```
|
|
655
|
+
|
|
656
|
+
---
|
|
657
|
+
|
|
658
|
+
## Security Issues
|
|
659
|
+
|
|
660
|
+
### ACL Permission Denied
|
|
661
|
+
|
|
662
|
+
**Symptoms:**
|
|
663
|
+
|
|
664
|
+
- `ClusterAuthorizationException` or `TopicAuthorizationException`
|
|
665
|
+
- Clients can't produce or consume
|
|
666
|
+
|
|
667
|
+
**Diagnosis:**
|
|
668
|
+
|
|
669
|
+
```bash
|
|
670
|
+
# List all ACLs
|
|
671
|
+
/opt/confluent/bin/kafka-acls --bootstrap-server localhost:9092 --list
|
|
672
|
+
|
|
673
|
+
# Check ACLs for specific topic
|
|
674
|
+
/opt/confluent/bin/kafka-acls --bootstrap-server localhost:9092 \
|
|
675
|
+
--topic my-topic --list
|
|
676
|
+
|
|
677
|
+
# Check ACLs for specific principal
|
|
678
|
+
/opt/confluent/bin/kafka-acls --bootstrap-server localhost:9092 \
|
|
679
|
+
--principal User:myuser --list
|
|
680
|
+
```
|
|
681
|
+
|
|
682
|
+
**Fix: Add Required ACLs**
|
|
683
|
+
|
|
684
|
+
```bash
|
|
685
|
+
# Allow producer
|
|
686
|
+
/opt/confluent/bin/kafka-acls --bootstrap-server localhost:9092 \
|
|
687
|
+
--add --allow-principal User:myuser \
|
|
688
|
+
--operation Write --topic my-topic
|
|
689
|
+
|
|
690
|
+
# Allow consumer
|
|
691
|
+
/opt/confluent/bin/kafka-acls --bootstrap-server localhost:9092 \
|
|
692
|
+
--add --allow-principal User:myuser \
|
|
693
|
+
--operation Read --topic my-topic \
|
|
694
|
+
--group my-consumer-group
|
|
695
|
+
```
|
|
696
|
+
|
|
697
|
+
---
|
|
698
|
+
|
|
699
|
+
### SSL Handshake Failure
|
|
700
|
+
|
|
701
|
+
**Symptoms:**
|
|
702
|
+
|
|
703
|
+
- `SslAuthenticationException: SSL handshake failed`
|
|
704
|
+
- Clients can't connect over SSL
|
|
705
|
+
|
|
706
|
+
**Diagnosis:**
|
|
707
|
+
|
|
708
|
+
```bash
|
|
709
|
+
# Test SSL connection
|
|
710
|
+
openssl s_client -connect localhost:9093 -tls1_2
|
|
711
|
+
|
|
712
|
+
# Verify keystore
|
|
713
|
+
keytool -list -keystore /var/ssl/kafka/kafka.keystore.jks
|
|
714
|
+
|
|
715
|
+
# Check certificate expiry
|
|
716
|
+
keytool -list -keystore /var/ssl/kafka/kafka.keystore.jks -v | grep "Valid"
|
|
717
|
+
|
|
718
|
+
# Verify truststore
|
|
719
|
+
keytool -list -keystore /var/ssl/kafka/kafka.truststore.jks
|
|
720
|
+
```
|
|
721
|
+
|
|
722
|
+
**Common Causes:**
|
|
723
|
+
|
|
724
|
+
| Cause | Solution |
|
|
725
|
+
| ---------------------- | ------------------------------------------------ |
|
|
726
|
+
| Certificate expired | Generate new certificates |
|
|
727
|
+
| Wrong CA in truststore | Import correct CA certificate |
|
|
728
|
+
| Hostname mismatch | Use correct SAN (disable hostname verification only for testing) |
|
|
729
|
+
| TLS version mismatch | Align TLS versions between client and server |
|
|
730
|
+
|
|
731
|
+
---
|
|
732
|
+
|
|
733
|
+
## General Debugging Tools
|
|
734
|
+
|
|
735
|
+
### Useful Log Locations
|
|
736
|
+
|
|
737
|
+
```
|
|
738
|
+
/var/log/confluent/kafka/server.log # Broker logs
|
|
739
|
+
/var/log/confluent/kafka/controller.log # KRaft controller logs
|
|
740
|
+
/var/log/confluent/kafka/kafka-gc.log # GC logs
|
|
741
|
+
/var/log/confluent/schema-registry/ # Schema Registry logs
|
|
742
|
+
/var/log/confluent/kafka-connect/ # Connect worker logs
|
|
743
|
+
/var/log/confluent/control-center/ # Control Center logs
|
|
744
|
+
```
|
|
745
|
+
|
|
746
|
+
### JMX Monitoring
|
|
747
|
+
|
|
748
|
+
```bash
|
|
749
|
+
# Enable JMX (add to KAFKA_OPTS)
|
|
750
|
+
export KAFKA_OPTS="-Dcom.sun.management.jmxremote \
|
|
751
|
+
-Dcom.sun.management.jmxremote.port=9999 \
|
|
752
|
+
-Dcom.sun.management.jmxremote.authenticate=false \
|
|
753
|
+
-Dcom.sun.management.jmxremote.ssl=false"
|
|
754
|
+
|
|
755
|
+
# Query JMX metrics
|
|
756
|
+
java -jar jmxterm.jar -l localhost:9999
|
|
757
|
+
> beans kafka.*:*
|
|
758
|
+
> get -b kafka.server:type=ReplicaManager,name=UnderReplicatedPartitions Value
|
|
759
|
+
```
|
|
760
|
+
|
|
761
|
+
### Thread Dump
|
|
762
|
+
|
|
763
|
+
```bash
|
|
764
|
+
# Get Kafka broker thread dump
|
|
765
|
+
jstack $(pgrep -f kafka.Kafka) > /tmp/kafka-threads-$(date +%Y%m%d-%H%M%S).txt
|
|
766
|
+
|
|
767
|
+
# Analyze for deadlocks
|
|
768
|
+
jstack $(pgrep -f kafka.Kafka) | grep -A 50 "deadlock"
|
|
769
|
+
```
|
|
770
|
+
|
|
771
|
+
### Heap Dump
|
|
772
|
+
|
|
773
|
+
```bash
|
|
774
|
+
# Trigger heap dump
|
|
775
|
+
jmap -dump:format=b,file=/tmp/kafka-heap.hprof $(pgrep -f kafka.Kafka)
|
|
776
|
+
|
|
777
|
+
# Analyze with Eclipse MAT or VisualVM (jhat was removed in JDK 9)
|
|
778
|
+
```
|