@closeup1202/klag 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +114 -0
- package/dist/cli/index.js +738 -0
- package/package.json +52 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 closeup1202
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# klag
|
|
2
|
+
|
|
3
|
+
> Know **why** your Kafka consumer lag is growing — in 5 seconds from the terminal
|
|
4
|
+
|
|
5
|
+
[![npm version](https://img.shields.io/npm/v/%40closeup1202%2Fklag.svg)](https://www.npmjs.com/package/@closeup1202/klag)
|
|
6
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
7
|
+
|
|
8
|
+
## Compared to existing tools
|
|
9
|
+
|
|
10
|
+
| | Burrow | Kafka UI | **klag** |
|
|
11
|
+
|--|--------|----------|---------------|
|
|
12
|
+
| Lag measurement | ✅ | ✅ | ✅ |
|
|
13
|
+
| Root cause detection | ❌ | ❌ | ✅ |
|
|
14
|
+
| CLI (npx) | ❌ | ❌ | ✅ |
|
|
15
|
+
| Requires separate server | ✅ | ✅ | ❌ |
|
|
16
|
+
|
|
17
|
+
## Run without installation
|
|
18
|
+
```bash
|
|
19
|
+
npx @closeup1202/klag --broker localhost:9092 --group my-service
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Output example
|
|
23
|
+
```
|
|
24
|
+
⚡ klag v0.1.0
|
|
25
|
+
|
|
26
|
+
🔍 Consumer Group: my-service
|
|
27
|
+
Broker: localhost:9092
|
|
28
|
+
Collected At: 2026-03-26 17:27:27 (Asia/Seoul)
|
|
29
|
+
|
|
30
|
+
Group Status : ⚠️ WARNING Total Lag : 1,234
|
|
31
|
+
|
|
32
|
+
┌────────┬───────────┬──────────────────┬────────────────┬───────┬─────────┬──────────────┬──────────────┐
|
|
33
|
+
│ Topic │ Partition │ Committed Offset │ Log-End Offset │ Lag │ Status │ Produce Rate │ Consume Rate │
|
|
34
|
+
├────────┼───────────┼──────────────────┼────────────────┼───────┼─────────┼──────────────┼──────────────┤
|
|
35
|
+
│ orders │ 0 │ 8,796 │ 10,000 │ 1,204 │ 🔴 HIGH │ 40.0 msg/s │ 0.0 msg/s │
|
|
36
|
+
│ orders │ 1 │ 9,988 │ 10,000 │ 12 │ 🟢 OK │ 0.0 msg/s │ 0.0 msg/s │
|
|
37
|
+
│ orders │ 2 │ 9,982 │ 10,000 │ 18 │ 🟢 OK │ 0.0 msg/s │ 0.0 msg/s │
|
|
38
|
+
└────────┴───────────┴──────────────────┴────────────────┴───────┴─────────┴──────────────┴──────────────┘
|
|
39
|
+
|
|
40
|
+
🔎 Root Cause Analysis
|
|
41
|
+
[PRODUCER_BURST] orders
|
|
42
|
+
→ produce rate 40.0 msg/s vs consume rate 0.0 msg/s (∞x difference)
|
|
43
|
+
→ Suggestion: consider increasing consumer instances or partition count
|
|
44
|
+
|
|
45
|
+
[HOT_PARTITION] orders
|
|
46
|
+
→ partition-0 holds 98% of lag (1,204 / 1,234) — skewed to 1 of 3 partitions
|
|
47
|
+
→ Suggestion: review partition key distribution strategy or consider increasing partition count
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Installation
|
|
51
|
+
```bash
|
|
52
|
+
npm install -g @closeup1202/klag
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Usage
|
|
56
|
+
```bash
|
|
57
|
+
# Basic usage
|
|
58
|
+
klag --broker localhost:9092 --group my-service
|
|
59
|
+
|
|
60
|
+
# Fast mode without rate sampling
|
|
61
|
+
klag --broker localhost:9092 --group my-service --no-rate
|
|
62
|
+
|
|
63
|
+
# Watch mode (auto-refresh every N seconds)
|
|
64
|
+
klag --broker localhost:9092 --group my-service --watch
|
|
65
|
+
klag --broker localhost:9092 --group my-service --watch --interval 3000
|
|
66
|
+
|
|
67
|
+
# JSON output (CI/pipeline integration)
|
|
68
|
+
klag --broker localhost:9092 --group my-service --json
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Options
|
|
72
|
+
|
|
73
|
+
| Option | Description | Default |
|
|
74
|
+
|--------|-------------|---------|
|
|
75
|
+
| `-b, --broker <host:port>` | Kafka broker address | `localhost:9092` |
|
|
76
|
+
| `-g, --group <groupId>` | Consumer group ID | (required) |
|
|
77
|
+
| `-i, --interval <ms>` | Rate sampling interval | `5000` |
|
|
78
|
+
| `-t, --timeout <ms>` | Connection timeout | `5000` |
|
|
79
|
+
| `-w, --watch` | Watch mode | `false` |
|
|
80
|
+
| `--no-rate` | Skip rate sampling | `false` |
|
|
81
|
+
| `--json` | JSON output | `false` |
|
|
82
|
+
|
|
83
|
+
## Detectable root causes
|
|
84
|
+
|
|
85
|
+
### `[HOT_PARTITION]`
|
|
86
|
+
When 80% or more of total lag is concentrated on a single partition.
|
|
87
|
+
Occurs when producer key distribution is uneven (key skew).
|
|
88
|
+
|
|
89
|
+
### `[PRODUCER_BURST]`
|
|
90
|
+
When produce rate is at least 2x the consume rate (and the consumer is still running).
|
|
91
|
+
Occurs when traffic spikes and the consumer cannot keep up.
|
|
92
|
+
|
|
93
|
+
### `[SLOW_CONSUMER]`
|
|
94
|
+
When the produce rate is active but the consume rate has dropped to near zero.
|
|
95
|
+
Occurs when the consumer process has stalled, crashed, or is blocked (e.g., long GC pause).
|
|
96
|
+
|
|
97
|
+
### `[REBALANCING]`
|
|
98
|
+
When the consumer group is in `PreparingRebalance` or `CompletingRebalance` state.
|
|
99
|
+
All consumption pauses during rebalancing, which can cause a temporary lag spike.
|
|
100
|
+
|
|
101
|
+
## Requirements
|
|
102
|
+
|
|
103
|
+
- Node.js >= 18
|
|
104
|
+
- Kafka >= 2.0
|
|
105
|
+
|
|
106
|
+
## Roadmap
|
|
107
|
+
|
|
108
|
+
- [x] v0.1.0 — lag collection, hot partition, producer burst, slow consumer, rebalancing detection, watch mode with lag trend (▲▼)
|
|
109
|
+
- [ ] v0.2.0 — multi-group monitoring
|
|
110
|
+
- [ ] v0.3.0 — Slack alerts, Prometheus export
|
|
111
|
+
|
|
112
|
+
## License
|
|
113
|
+
|
|
114
|
+
MIT © [closeup1202](https://github.com/closeup1202/klag)
|
|
@@ -0,0 +1,738 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// src/cli/index.ts
|
|
4
|
+
import chalk3 from "chalk";
|
|
5
|
+
import { Command } from "commander";
|
|
6
|
+
|
|
7
|
+
// src/analyzer/burstDetector.ts
|
|
8
|
+
var BURST_RATIO_THRESHOLD = 2;
var MIN_PRODUCE_RATE = 1;
var STALLED_CONSUME_RATE = 0.1;
// Flags topics where producers outpace a still-active consumer by 2x or more.
// Returns one PRODUCER_BURST finding per affected topic (empty array when
// there is no rate data or no topic qualifies).
function detectProducerBurst(snapshot, rateSnapshot) {
  if (rateSnapshot.partitions.length === 0) return [];
  // Aggregate per-partition rates into per-topic totals.
  const totalsByTopic = new Map();
  for (const { topic, produceRate, consumeRate } of rateSnapshot.partitions) {
    const acc = totalsByTopic.get(topic) ?? { totalProduce: 0, totalConsume: 0 };
    acc.totalProduce += produceRate;
    acc.totalConsume += consumeRate;
    totalsByTopic.set(topic, acc);
  }
  const findings = [];
  for (const [topic, { totalProduce, totalConsume }] of totalsByTopic) {
    // Skip idle producers, stalled consumers (that is the SLOW_CONSUMER case),
    // and topics whose produce/consume ratio is below the burst threshold.
    const isBurst =
      totalProduce >= MIN_PRODUCE_RATE &&
      totalConsume >= STALLED_CONSUME_RATE &&
      totalProduce / totalConsume >= BURST_RATIO_THRESHOLD;
    if (!isBurst) continue;
    const topicLag = snapshot.partitions
      .filter((p) => p.topic === topic)
      .reduce((sum, p) => sum + p.lag, 0n);
    if (topicLag === 0n) continue; // no backlog — the burst is being absorbed
    findings.push({
      type: "PRODUCER_BURST",
      topic,
      description: `produce rate ${totalProduce.toFixed(1)} msg/s vs consume rate ${totalConsume.toFixed(1)} msg/s (${(totalProduce / totalConsume).toFixed(1)}x difference) \u2014 consumer is falling behind ingestion rate`,
      suggestion: "Consider increasing consumer instances or partition count"
    });
  }
  return findings;
}
|
|
41
|
+
|
|
42
|
+
// src/analyzer/hotPartitionDetector.ts
|
|
43
|
+
var HOT_PARTITION_THRESHOLD = 0.8;
var MIN_TOPIC_LAG = 10n;
// Flags topics where a single partition carries >= 80% of the topic's total
// lag (key skew). Topics with <= 1 partition or negligible lag are ignored.
function detectHotPartition(snapshot) {
  if (snapshot.partitions.length === 0) return [];
  // Bucket partitions by topic.
  const byTopic = new Map();
  for (const p of snapshot.partitions) {
    const bucket = byTopic.get(p.topic);
    if (bucket) {
      bucket.push(p);
    } else {
      byTopic.set(p.topic, [p]);
    }
  }
  const findings = [];
  for (const [topic, parts] of byTopic) {
    if (parts.length <= 1) continue; // skew is meaningless with one partition
    const topicTotalLag = parts.reduce((sum, p) => sum + p.lag, 0n);
    if (topicTotalLag === 0n || topicTotalLag < MIN_TOPIC_LAG) continue;
    // Lagging partitions sorted by their share of the topic's lag, largest first.
    const details = parts
      .filter((p) => p.lag > 0n)
      .map((p) => ({
        partition: p.partition,
        lag: p.lag,
        ratio: Number(p.lag) / Number(topicTotalLag)
      }))
      .sort((a, b) => b.ratio - a.ratio);
    const hottest = details[0];
    if (!hottest || hottest.ratio < HOT_PARTITION_THRESHOLD) continue;
    findings.push({
      type: "HOT_PARTITION",
      topic,
      description: `partition-${hottest.partition} holds ${Math.round(hottest.ratio * 100)}% of lag (${hottest.lag.toLocaleString()} / ${topicTotalLag.toLocaleString()}) \u2014 1 of ${parts.length} partitions is skewed`,
      suggestion: "Consider reviewing the partition key distribution strategy or increasing the partition count",
      details
    });
  }
  return findings;
}
|
|
78
|
+
|
|
79
|
+
// src/analyzer/rebalancingDetector.ts
|
|
80
|
+
var REBALANCING_STATES = ["PreparingRebalance", "CompletingRebalance"];
// Reports a single group-wide root cause when the consumer group is in the
// middle of a rebalance and there is outstanding lag; otherwise null.
function detectRebalancing(snapshot) {
  const { groupState, totalLag } = snapshot;
  const isRebalancing = REBALANCING_STATES.includes(groupState) && totalLag !== 0n;
  if (!isRebalancing) return null;
  const suggestion =
    groupState === "PreparingRebalance"
      ? "A new consumer joined or left the group. Lag may spike temporarily \u2014 monitor if it recovers after rebalancing completes"
      : "Rebalancing is completing. Lag should recover shortly once partition assignment is finalized";
  return {
    type: "REBALANCING",
    topic: "*", // applies to every topic consumed by the group
    description: `consumer group is currently in ${groupState} state \u2014 all consumption is paused during rebalancing`,
    suggestion
  };
}
|
|
93
|
+
|
|
94
|
+
// src/analyzer/slowConsumerDetector.ts
|
|
95
|
+
var MIN_PRODUCE_RATE2 = 1;
var STALLED_CONSUME_RATE2 = 0.1;
// Flags topics where producers are active but consumption has effectively
// stopped (consume rate below the stall threshold) while lag exists.
function detectSlowConsumer(snapshot, rateSnapshot) {
  if (rateSnapshot.partitions.length === 0) return [];
  // Aggregate per-partition rates into per-topic totals.
  const totalsByTopic = new Map();
  for (const { topic, produceRate, consumeRate } of rateSnapshot.partitions) {
    const acc = totalsByTopic.get(topic) ?? { totalProduce: 0, totalConsume: 0 };
    acc.totalProduce += produceRate;
    acc.totalConsume += consumeRate;
    totalsByTopic.set(topic, acc);
  }
  const findings = [];
  for (const [topic, { totalProduce, totalConsume }] of totalsByTopic) {
    // Stalled = real production going on, but virtually zero consumption.
    const stalled =
      totalProduce >= MIN_PRODUCE_RATE2 && totalConsume < STALLED_CONSUME_RATE2;
    if (!stalled) continue;
    const topicLag = snapshot.partitions
      .filter((p) => p.topic === topic)
      .reduce((sum, p) => sum + p.lag, 0n);
    if (topicLag === 0n) continue;
    findings.push({
      type: "SLOW_CONSUMER",
      topic,
      description: `consumer has stalled \u2014 produce rate ${totalProduce.toFixed(1)} msg/s but consume rate is near 0 \u2014 messages are accumulating with no consumption`,
      suggestion: "Check if consumer process is alive, look for errors in consumer logs, or check for long GC pauses"
    });
  }
  return findings;
}
|
|
126
|
+
|
|
127
|
+
// src/analyzer/index.ts
|
|
128
|
+
// Runs every root-cause detector against the lag snapshot. Rate-based
// detectors (burst / slow consumer) only run when a rate snapshot exists.
// Result order: rebalancing, producer burst, slow consumer, hot partition.
function analyze(snapshot, rateSnapshot) {
  const rebalancing = detectRebalancing(snapshot);
  const rateFindings = rateSnapshot
    ? [
        ...detectProducerBurst(snapshot, rateSnapshot),
        ...detectSlowConsumer(snapshot, rateSnapshot)
      ]
    : [];
  return [
    ...(rebalancing ? [rebalancing] : []),
    ...rateFindings,
    ...detectHotPartition(snapshot)
  ];
}
|
|
139
|
+
|
|
140
|
+
// src/collector/lagCollector.ts
|
|
141
|
+
import { AssignerProtocol, Kafka, logLevel } from "kafkajs";
|
|
142
|
+
// Collects a point-in-time lag snapshot for one consumer group: for every
// topic-partition the group touches, committed offset vs log-end offset and
// the resulting lag (BigInt). Opens and closes its own admin connection on
// every call. Throws when the group is missing or Dead, or on broker errors.
async function collectLag(options) {
  const kafka = new Kafka({
    clientId: "klag",
    brokers: [options.broker],
    logLevel: logLevel.NOTHING,
    // Hide kafkajs internal logs in CLI
    requestTimeout: options.timeoutMs ?? 5e3,
    connectionTimeout: options.timeoutMs ?? 3e3,
    retry: {
      retries: 1
      // Added — only 1 retry (default is 5)
    }
  });
  const admin = kafka.admin();
  try {
    await admin.connect();
    // Fail fast when the group doesn't exist or is Dead.
    const groupDescription = await admin.describeGroups([options.groupId]);
    const group = groupDescription.groups[0];
    if (!group) {
      throw new Error(`Consumer group "${options.groupId}" not found`);
    }
    if (group.state === "Dead") {
      throw new Error(`Consumer group "${options.groupId}" is in Dead state`);
    }
    // topic -> Set<partition>, seeded from live members' decoded assignments.
    // Decode failures are swallowed on purpose: custom assignor protocols
    // can't be parsed, and committed offsets below still cover those topics.
    const topicPartitionMap = /* @__PURE__ */ new Map();
    for (const member of group.members) {
      if (!member.memberAssignment) continue;
      try {
        const decoded = AssignerProtocol.MemberAssignment.decode(
          member.memberAssignment
        );
        for (const [topic, partitions2] of Object.entries(decoded?.assignment)) {
          if (!topicPartitionMap.has(topic)) {
            topicPartitionMap.set(topic, /* @__PURE__ */ new Set());
          }
          for (const p of partitions2) {
            topicPartitionMap.get(topic)?.add(p);
          }
        }
      } catch {
      }
    }
    // When no assignments were decodable (e.g. an empty group), omit `topics`
    // so the broker returns every topic the group has committed offsets for.
    const topicNames = [...topicPartitionMap.keys()];
    const committedOffsets = await admin.fetchOffsets({
      groupId: options.groupId,
      topics: topicNames.length > 0 ? topicNames : void 0
    });
    // Merge in partitions that only appear in the commit log.
    for (const topicOffset of committedOffsets) {
      if (!topicPartitionMap.has(topicOffset.topic)) {
        topicPartitionMap.set(topicOffset.topic, /* @__PURE__ */ new Set());
      }
      for (const p of topicOffset.partitions) {
        topicPartitionMap.get(topicOffset.topic)?.add(p.partition);
      }
    }
    // Log-end (latest) offsets per topic, fetched in parallel.
    const topicList = [...topicPartitionMap.keys()];
    const logEndEntries = await Promise.all(
      topicList.map(async (topic) => {
        const offsets = await admin.fetchTopicOffsets(topic);
        const partitionMap = /* @__PURE__ */ new Map();
        for (const p of offsets) {
          partitionMap.set(p.partition, BigInt(p.offset));
        }
        return [topic, partitionMap];
      })
    );
    const logEndOffsetMap = new Map(logEndEntries);
    // Committed offsets as BigInt; "-1" means "nothing committed yet" -> 0n.
    const committedOffsetMap = /* @__PURE__ */ new Map();
    for (const topicOffset of committedOffsets) {
      const partitionMap = /* @__PURE__ */ new Map();
      for (const p of topicOffset.partitions) {
        const offset = p.offset === "-1" ? 0n : BigInt(p.offset);
        partitionMap.set(p.partition, offset);
      }
      committedOffsetMap.set(topicOffset.topic, partitionMap);
    }
    // Join the two offset maps into per-partition lag rows; lag is clamped
    // to 0n so a committed offset ahead of log-end never goes negative.
    const partitions = [];
    for (const [topic, partitionSet] of topicPartitionMap) {
      const logEndMap = logEndOffsetMap.get(topic) ?? /* @__PURE__ */ new Map();
      const commitMap = committedOffsetMap.get(topic) ?? /* @__PURE__ */ new Map();
      for (const partition of partitionSet) {
        const logEndOffset = logEndMap.get(partition) ?? 0n;
        const committedOffset = commitMap.get(partition) ?? 0n;
        const lag = logEndOffset > committedOffset ? logEndOffset - committedOffset : 0n;
        partitions.push({
          topic,
          partition,
          logEndOffset,
          committedOffset,
          lag
        });
      }
    }
    // Stable display order: topic name, then partition number.
    partitions.sort(
      (a, b) => a.topic.localeCompare(b.topic) || a.partition - b.partition
    );
    const totalLag = partitions.reduce((sum, p) => sum + p.lag, 0n);
    return {
      groupId: options.groupId,
      broker: options.broker,
      collectedAt: /* @__PURE__ */ new Date(),
      partitions,
      totalLag,
      groupState: group.state
    };
  } finally {
    await admin.disconnect();
  }
}
|
|
251
|
+
|
|
252
|
+
// src/collector/rateCollector.ts
|
|
253
|
+
import { Kafka as Kafka2, logLevel as logLevel2 } from "kafkajs";
|
|
254
|
+
// Samples produce/consume rates (msg/s) per partition by taking two offset
// snapshots `intervalMs` apart and dividing the deltas by the elapsed time.
// `knownTopics` (from a prior lag snapshot) narrows the fetch when provided.
// Opens and closes its own admin connection; blocks for ~intervalMs.
async function collectRate(options, knownTopics) {
  const intervalMs = options.intervalMs ?? 5e3;
  const intervalSec = intervalMs / 1e3;
  const kafka = new Kafka2({
    clientId: "klag-rate",
    brokers: [options.broker],
    logLevel: logLevel2.NOTHING,
    requestTimeout: options.timeoutMs ?? 5e3,
    connectionTimeout: options.timeoutMs ?? 3e3,
    retry: {
      retries: 1
      // Added — only 1 retry (default is 5)
    }
  });
  const admin = kafka.admin();
  try {
    await admin.connect();
    // First committed-offset snapshot; without knownTopics the broker
    // returns every topic the group has commits for.
    const committedRaw = await admin.fetchOffsets(
      knownTopics && knownTopics.length > 0 ? { groupId: options.groupId, topics: knownTopics } : { groupId: options.groupId }
    );
    const topics = committedRaw.map((t) => t.topic);
    if (topics.length === 0) {
      return { intervalMs, partitions: [] };
    }
    // Snapshot #1 (log-end + committed), wait, snapshot #2.
    const logEnd1 = await fetchLogEndOffsets(admin, topics);
    const committed1 = buildCommittedMap(committedRaw);
    await sleep(intervalMs);
    const committedRaw2 = await admin.fetchOffsets({
      groupId: options.groupId,
      topics
    });
    const logEnd2 = await fetchLogEndOffsets(admin, topics);
    const committed2 = buildCommittedMap(committedRaw2);
    const partitions = [];
    for (const topic of topics) {
      const end1 = logEnd1.get(topic) ?? /* @__PURE__ */ new Map();
      const end2 = logEnd2.get(topic) ?? /* @__PURE__ */ new Map();
      const com1 = committed1.get(topic) ?? /* @__PURE__ */ new Map();
      const com2 = committed2.get(topic) ?? /* @__PURE__ */ new Map();
      // Union of partitions seen in either log-end snapshot.
      const allPartitions = /* @__PURE__ */ new Set([...end1.keys(), ...end2.keys()]);
      for (const partition of allPartitions) {
        const logEndDiff = (end2.get(partition) ?? 0n) - (end1.get(partition) ?? 0n);
        const committedDiff = (com2.get(partition) ?? 0n) - (com1.get(partition) ?? 0n);
        // Math.max clamps negative deltas (e.g. offset resets) to 0.
        partitions.push({
          topic,
          partition,
          produceRate: Math.max(0, Number(logEndDiff) / intervalSec),
          consumeRate: Math.max(0, Number(committedDiff) / intervalSec)
        });
      }
    }
    // Stable display order: topic name, then partition number.
    partitions.sort(
      (a, b) => a.topic.localeCompare(b.topic) || a.partition - b.partition
    );
    return { intervalMs, partitions };
  } finally {
    await admin.disconnect();
  }
}
|
|
313
|
+
// Fetches each topic's log-end offsets in parallel and returns a nested map:
// topic -> (partition -> BigInt offset).
async function fetchLogEndOffsets(admin, topics) {
  const perTopic = topics.map(async (topic) => {
    const offsets = await admin.fetchTopicOffsets(topic);
    const partitionMap = new Map(
      offsets.map((p) => [p.partition, BigInt(p.offset)])
    );
    return [topic, partitionMap];
  });
  return new Map(await Promise.all(perTopic));
}
|
|
326
|
+
// Converts admin.fetchOffsets output into topic -> (partition -> BigInt
// offset). The sentinel "-1" (no committed offset yet) is mapped to 0n.
function buildCommittedMap(raw) {
  const result = new Map();
  for (const { topic, partitions } of raw) {
    const partitionMap = new Map(
      partitions.map((p) => [p.partition, p.offset === "-1" ? 0n : BigInt(p.offset)])
    );
    result.set(topic, partitionMap);
  }
  return result;
}
|
|
337
|
+
// Resolves after `ms` milliseconds.
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
|
340
|
+
|
|
341
|
+
// src/reporter/tableReporter.ts
|
|
342
|
+
import chalk from "chalk";
|
|
343
|
+
import Table from "cli-table3";
|
|
344
|
+
|
|
345
|
+
// src/types/index.ts
|
|
346
|
+
var VERSION = "0.1.0";
|
|
347
|
+
// Buckets a BigInt lag value into a severity level:
// < 100 -> OK, 100..999 -> WARN, >= 1000 -> HIGH.
function classifyLag(lag) {
  if (lag >= 1000n) return "HIGH";
  if (lag >= 100n) return "WARN";
  return "OK";
}
|
|
352
|
+
|
|
353
|
+
// src/reporter/tableReporter.ts
|
|
354
|
+
var LEVEL_ICON = {
|
|
355
|
+
OK: chalk.green("\u{1F7E2} OK "),
|
|
356
|
+
WARN: chalk.yellow("\u{1F7E1} WARN"),
|
|
357
|
+
HIGH: chalk.red("\u{1F534} HIGH")
|
|
358
|
+
};
|
|
359
|
+
// Renders a BigInt offset/lag with locale-aware thousands separators
// (e.g. 1234n -> "1,234" in an en locale).
function formatLag(lag) {
  return lag.toLocaleString();
}
|
|
362
|
+
// Renders a msg/s rate with one decimal place for the table's rate columns.
// Rates below the 0.1 msg/s sampling-noise floor are clamped to "0.0 msg/s"
// — previously this returned a bare "0", which broke the column format and
// contradicted the documented output ("0.0 msg/s" for idle partitions).
function formatRate(rate) {
  return rate < 0.1 ? "0.0 msg/s" : `${rate.toFixed(1)} msg/s`;
}
|
|
365
|
+
// Renders the lag delta since the previous watch tick:
// undefined -> gray "-", 0n -> gray "=", growing -> red ▲ +N, shrinking ->
// green ▼ -N (the minus sign comes from toLocaleString on the negative diff).
function formatTrend(lagDiff) {
  if (lagDiff === void 0) return chalk.gray(" - ");
  if (lagDiff === 0n) return chalk.gray(" = ");
  return lagDiff > 0n
    ? chalk.red(`\u25B2 +${lagDiff.toLocaleString()}`)
    : chalk.green(`\u25BC ${lagDiff.toLocaleString()}`);
}
|
|
371
|
+
// Maps total lag to a colored group-status banner via classifyLag.
function groupStatus(totalLag) {
  switch (classifyLag(totalLag)) {
    case "OK":
      return chalk.green("\u2705 OK");
    case "WARN":
      return chalk.yellow("\u26A0\uFE0F WARNING");
    default:
      return chalk.red("\u{1F6A8} CRITICAL");
  }
}
|
|
377
|
+
// Renders the full CLI report to stdout: banner (suppressed in watch mode,
// which prints its own header), group summary, per-partition lag table, and
// any root-cause findings. Optional columns: Trend (watch mode only) and
// Produce/Consume Rate (only when a non-empty rate snapshot was sampled).
function printLagTable(snapshot, rcaResults = [], rateSnapshot, watchMode = false) {
  const { groupId, broker, collectedAt, partitions, totalLag } = snapshot;
  if (!watchMode) {
    console.log("");
    console.log(chalk.bold.cyan("\u26A1 klag") + chalk.gray(` v${VERSION}`));
    console.log("");
  }
  console.log(chalk.bold("\u{1F50D} Consumer Group: ") + chalk.white(groupId));
  console.log(chalk.bold(" Broker: ") + chalk.white(broker));
  // "sv-SE" yields an ISO-like "YYYY-MM-DD HH:mm:ss" rendering in the
  // user's local time zone.
  const tz = Intl.DateTimeFormat().resolvedOptions().timeZone;
  const localTime = collectedAt.toLocaleString("sv-SE", {
    timeZone: tz,
    year: "numeric",
    month: "2-digit",
    day: "2-digit",
    hour: "2-digit",
    minute: "2-digit",
    second: "2-digit",
    hour12: false
  }).replace("T", " ");
  console.log(
    chalk.bold(" Collected At: ") + chalk.gray(`${localTime} (${tz})`)
  );
  console.log("");
  const status = groupStatus(totalLag);
  const totalStr = chalk.bold(formatLag(totalLag));
  console.log(` Group Status : ${status} Total Lag : ${totalStr}`);
  console.log("");
  const hasRate = !!rateSnapshot && rateSnapshot.partitions.length > 0;
  const hasTrend = watchMode;
  // "topic-partition" -> rate entry, for O(1) lookup per table row.
  const rateMap = /* @__PURE__ */ new Map();
  if (hasRate && rateSnapshot) {
    for (const r of rateSnapshot.partitions) {
      rateMap.set(`${r.topic}-${r.partition}`, r);
    }
  }
  // Header and column alignments must stay in sync: optional Trend column
  // sits before Status, optional rate columns after it.
  const head = [
    chalk.bold("Topic"),
    chalk.bold("Partition"),
    chalk.bold("Committed Offset"),
    chalk.bold("Log-End Offset"),
    chalk.bold("Lag"),
    ...hasTrend ? [chalk.bold("Trend")] : [],
    chalk.bold("Status"),
    ...hasRate ? [chalk.bold("Produce Rate"), chalk.bold("Consume Rate")] : []
  ];
  const table = new Table({
    head,
    colAligns: [
      "left",
      "right",
      "right",
      "right",
      "right",
      ...hasTrend ? ["right"] : [],
      "center",
      ...hasRate ? ["right", "right"] : []
    ],
    style: { head: [], border: ["grey"] }
  });
  // Only print the topic name on its first row so partitions of the same
  // topic read as one visual group.
  let lastTopic = "";
  for (const p of partitions) {
    const level = classifyLag(p.lag);
    // Lag cell colored by severity.
    const lagStr = level === "HIGH" ? chalk.red(formatLag(p.lag)) : level === "WARN" ? chalk.yellow(formatLag(p.lag)) : chalk.green(formatLag(p.lag));
    const rateEntry = rateMap.get(`${p.topic}-${p.partition}`);
    const rateColumns = hasRate ? [
      chalk.yellow(formatRate(rateEntry?.produceRate ?? 0)),
      chalk.cyan(formatRate(rateEntry?.consumeRate ?? 0))
    ] : [];
    const topicDisplay = p.topic !== lastTopic ? p.topic : "";
    lastTopic = p.topic;
    table.push([
      topicDisplay,
      String(p.partition),
      formatLag(p.committedOffset),
      formatLag(p.logEndOffset),
      lagStr,
      ...hasTrend ? [formatTrend(p.lagDiff)] : [],
      LEVEL_ICON[level],
      ...rateColumns
    ]);
  }
  console.log(table.toString());
  console.log("");
  if (rcaResults.length === 0) return;
  console.log(chalk.bold("\u{1F50E} Root Cause Analysis"));
  console.log("");
  for (const rca of rcaResults) {
    const typeLabel = `${chalk.bold.yellow(` [${rca.type}]`)} ${chalk.white(rca.topic)}`;
    console.log(typeLabel);
    console.log(chalk.gray(` \u2192 ${rca.description}`));
    console.log(chalk.cyan(` \u2192 Suggestion: ${rca.suggestion}`));
    console.log("");
  }
}
|
|
472
|
+
|
|
473
|
+
// src/cli/validators.ts
|
|
474
|
+
import { InvalidArgumentError } from "commander";
|
|
475
|
+
// commander argParser for --interval: accepts an integer number of
// milliseconds, minimum 1000. Throws InvalidArgumentError otherwise.
function parseInterval(value) {
  const ms = parseInt(value, 10);
  const valid = !Number.isNaN(ms) && ms >= 1e3;
  if (!valid) {
    throw new InvalidArgumentError("--interval must be a number >= 1000ms.");
  }
  return ms;
}
|
|
482
|
+
// commander argParser for --broker: validates a "host:port" string with a
// port in 1..65535 and returns the original value unchanged.
function parseBroker(value) {
  const match = /^[^:]+:(\d+)$/.exec(value);
  if (!match) {
    throw new InvalidArgumentError(
      "--broker format is invalid. Example: localhost:9092"
    );
  }
  const port = parseInt(match[1], 10);
  const portInRange = port >= 1 && port <= 65535;
  if (!portInRange) {
    throw new InvalidArgumentError(
      "--broker port must be between 1 and 65535."
    );
  }
  return value;
}
|
|
497
|
+
// commander argParser for --timeout: accepts an integer number of
// milliseconds, minimum 1000. Throws InvalidArgumentError otherwise.
function parseTimeout(value) {
  const ms = parseInt(value, 10);
  const valid = !Number.isNaN(ms) && ms >= 1e3;
  if (!valid) {
    throw new InvalidArgumentError("--timeout must be a number >= 1000ms.");
  }
  return ms;
}
|
|
504
|
+
|
|
505
|
+
// src/cli/watcher.ts
|
|
506
|
+
import chalk2 from "chalk";
|
|
507
|
+
var MAX_RETRIES = 3;
|
|
508
|
+
// Full terminal reset (ESC c): clears the screen and scrollback before a
// watch-mode redraw.
function clearScreen() {
  process.stdout.write("\x1Bc");
}
|
|
511
|
+
// Prints the watch-mode banner (version, refresh interval, exit hint) and a
// "Last updated" timestamp in the user's local time zone.
function printWatchHeader(intervalMs, updatedAt) {
  const intervalSec = intervalMs / 1e3;
  const tz = Intl.DateTimeFormat().resolvedOptions().timeZone;
  // "sv-SE" yields a 24h "HH:mm:ss" rendering.
  const timeStr = updatedAt.toLocaleString("sv-SE", {
    timeZone: tz,
    hour: "2-digit",
    minute: "2-digit",
    second: "2-digit",
    hour12: false
  });
  console.log(
    chalk2.bold.cyan("\u26A1 klag") + chalk2.gray(` v${VERSION}`) + " \u2502 " + chalk2.yellow("watch mode") + " \u2502 " + chalk2.gray(`${intervalSec}s refresh`) + " \u2502 " + chalk2.gray("Ctrl+C to exit")
  );
  console.log(chalk2.gray(` Last updated: ${timeStr} (${tz})`));
}
|
|
526
|
+
// Clears the screen and shows a transient watch-mode error with the current
// retry count out of MAX_RETRIES and the seconds until the next attempt.
function printWatchError(message, retryCount, retryIn) {
  clearScreen();
  console.log(
    chalk2.bold.cyan("\u26A1 klag") + chalk2.gray(` v${VERSION}`) + " \u2502 " + chalk2.yellow("watch mode") + " \u2502 " + chalk2.gray("Ctrl+C to exit")
  );
  console.log("");
  console.error(chalk2.red(` \u274C Error: ${message}`));
  console.log(
    chalk2.yellow(` Retrying ${retryCount}/${MAX_RETRIES}... in ${retryIn}s`)
  );
  console.log("");
}
|
|
538
|
+
// Clears the screen and shows the final error after all watch-mode retries
// are exhausted; the caller is expected to exit afterwards.
function printWatchFatal(message) {
  clearScreen();
  console.log(
    chalk2.bold.cyan("\u26A1 klag") + chalk2.gray(` v${VERSION}`) + " \u2502 " + chalk2.yellow("watch mode")
  );
  console.log("");
  console.error(chalk2.red(` \u274C Error: ${message}`));
  console.error(
    chalk2.red(` All ${MAX_RETRIES} retries failed \u2014 exiting watch mode`)
  );
  console.log("");
}
|
|
550
|
+
// Annotates each partition of `current` with lagDiff = lag - previous lag;
// lagDiff stays undefined for partitions absent from the previous snapshot.
// Returns a new snapshot object; inputs are not mutated.
function applyDiff(current, previous) {
  const previousLag = new Map(
    previous.partitions.map((p) => [`${p.topic}-${p.partition}`, p.lag])
  );
  const partitions = current.partitions.map((p) => {
    const before = previousLag.get(`${p.topic}-${p.partition}`);
    return { ...p, lagDiff: before === void 0 ? void 0 : p.lag - before };
  });
  return { ...current, partitions };
}
|
|
562
|
+
// One watch-mode cycle: collect lag -> optionally sample rates (blocks for
// the sampling interval) -> analyze -> clear screen and render. Returns the
// raw snapshot so the next cycle can diff against it for trend arrows.
async function runOnce(options, noRate, previous) {
  const snapshot = await collectLag(options);
  let rateSnapshot;
  if (!noRate) {
    // Restrict rate sampling to the topics the lag snapshot actually found.
    const topics = [...new Set(snapshot.partitions.map((p) => p.topic))];
    const waitSec = (options.intervalMs ?? 5e3) / 1e3;
    process.stdout.write(
      chalk2.gray(` Sampling rates... (waiting ${waitSec}s) `)
    );
    rateSnapshot = await collectRate(options, topics);
    // Erase the "Sampling rates..." progress line.
    process.stdout.write(`\r${" ".repeat(50)}\r`);
  }
  const rcaResults = analyze(snapshot, rateSnapshot);
  // Trend column needs lag deltas relative to the previous tick.
  const snapshotWithDiff = previous ? applyDiff(snapshot, previous) : snapshot;
  clearScreen();
  printWatchHeader(options.intervalMs ?? 5e3, snapshot.collectedAt);
  printLagTable(snapshotWithDiff, rcaResults, rateSnapshot, true);
  return snapshot;
}
|
|
581
|
+
// Show a single-line "next refresh" countdown on stdout and resolve after
// roughly `seconds` seconds (one setTimeout tick per second). The line is
// erased with carriage returns before resolving.
function printCountdown(seconds) {
  return new Promise((resolve) => {
    let remaining = seconds;
    const tick = () => {
      process.stdout.write(
        `\r${chalk2.gray(` [\u25CF] Next refresh in ${remaining}s...`)} `
      );
      // `<= 0` rather than `=== 0`: a negative or fractional input would
      // otherwise step past zero and leave the countdown spinning forever.
      if (remaining <= 0) {
        process.stdout.write(`\r${" ".repeat(40)}\r`); // blank out the line
        resolve();
        return;
      }
      remaining--;
      setTimeout(tick, 1e3);
    };
    tick();
  });
}
|
|
599
|
+
// Translate low-level kafkajs/network error text into a short
// human-readable message for the watch-mode error banner. Connection
// failures are checked before group-lookup failures; anything else is
// passed through unchanged.
function getFriendlyMessage(err, broker) {
  const message = err instanceof Error ? err.message : String(err);
  const connectionHints = ["ECONNREFUSED", "ETIMEDOUT", "Connection error"];
  if (connectionHints.some((hint) => message.includes(hint))) {
    return `Cannot connect to broker (${broker})`;
  }
  const missingGroupHints = ["Dead state", "not found"];
  if (missingGroupHints.some((hint) => message.includes(hint))) {
    return `Consumer group not found`;
  }
  return message;
}
|
|
609
|
+
// Watch-mode driver: refresh the lag view every interval until Ctrl+C,
// retrying transient failures up to MAX_RETRIES consecutive times before
// exiting with a fatal screen. Never returns normally (infinite loop).
async function startWatch(options, noRate) {
  // Graceful exit path — the loop below runs until the process dies.
  process.on("SIGINT", () => {
    console.log(chalk2.gray("\n\n Watch mode exited\n"));
    process.exit(0);
  });
  const intervalMs = options.intervalMs ?? 5e3;
  const waitSec = Math.ceil(intervalMs / 1e3);
  process.stdout.write(chalk2.gray(" Connecting to broker..."));
  let errorCount = 0; // consecutive-failure counter; reset on any success
  let previousSnapshot; // last successful snapshot, used for lag diffs
  while (true) {
    try {
      previousSnapshot = await runOnce(options, noRate, previousSnapshot);
      errorCount = 0;
      if (noRate) {
        // With rate sampling enabled, runOnce itself appears to wait
        // ~intervalMs inside collectRate (see its "Sampling rates...
        // waiting" status), so the explicit countdown is only needed
        // when sampling is skipped — NOTE(review): confirm collectRate
        // actually blocks for the interval.
        await printCountdown(waitSec);
      }
    } catch (err) {
      errorCount++;
      const message = getFriendlyMessage(err, options.broker);
      if (errorCount >= MAX_RETRIES) {
        // Too many consecutive failures — show the fatal screen and quit.
        printWatchFatal(message);
        process.exit(1);
      }
      printWatchError(message, errorCount, waitSec);
      await printCountdown(waitSec);
    }
  }
}
|
|
638
|
+
|
|
639
|
+
// src/cli/index.ts
// CLI entry point: declare the option surface with commander, then either
// hand off to watch mode or run a single collect -> analyze -> print pass.
process.removeAllListeners("warning");
var program = new Command();
program.name("klag").description("Kafka consumer lag root cause analyzer").version(VERSION).requiredOption(
  "-b, --broker <host:port>",
  "Kafka broker address",
  parseBroker,
  "localhost:9092"
).requiredOption("-g, --group <groupId>", "Consumer group ID").option(
  "-i, --interval <ms>",
  "Rate sampling interval in ms",
  parseInterval,
  5e3
).option("-w, --watch", "Watch mode \u2014 refresh every interval").option("-t, --timeout <ms>", "Connection timeout in ms", parseTimeout, 5e3).option(
  "--no-rate",
  "Skip rate sampling (faster, no PRODUCER_BURST detection)"
).option("--json", "Output raw JSON instead of table").action(async (options) => {
  try {
    // Normalize CLI flags into the options object the collectors expect.
    const kafkaOptions = {
      broker: options.broker,
      groupId: options.group,
      intervalMs: options.interval,
      timeoutMs: options.timeout
    };
    // Watch mode loops until SIGINT or a fatal error; nothing after it runs.
    if (options.watch) {
      await startWatch(kafkaOptions, options.rate === false);
      return;
    }
    process.stdout.write(chalk3.gray(" Connecting to broker..."));
    const snapshot = await collectLag(kafkaOptions);
    // Erase the "Connecting..." status line.
    process.stdout.write(`\r${" ".repeat(50)}\r`);
    let rateSnapshot;
    // commander's --no-rate flag surfaces as options.rate === false.
    if (options.rate !== false) {
      const topics = [...new Set(snapshot.partitions.map((p) => p.topic))];
      const waitSec = (kafkaOptions.intervalMs ?? 5e3) / 1e3;
      process.stdout.write(
        chalk3.gray(` Sampling rates... (waiting ${waitSec}s) `)
      );
      rateSnapshot = await collectRate(kafkaOptions, topics);
      process.stdout.write(`\r${" ".repeat(50)}\r`);
    }
    const rcaResults = analyze(snapshot, rateSnapshot);
    if (options.json) {
      // Lag/offset counters are stringified for JSON output — presumably
      // BigInt, which JSON.stringify would otherwise throw on; confirm
      // against the collector.
      const serializable = {
        ...snapshot,
        totalLag: snapshot.totalLag.toString(),
        partitions: snapshot.partitions.map((p) => ({
          ...p,
          lag: p.lag.toString(),
          logEndOffset: p.logEndOffset.toString(),
          committedOffset: p.committedOffset.toString()
        })),
        rate: rateSnapshot,
        rca: rcaResults
      };
      console.log(JSON.stringify(serializable, null, 2));
    } else {
      printLagTable(snapshot, rcaResults, rateSnapshot);
    }
    process.exit(0);
  } catch (err) {
    // Clear any pending status line before printing error guidance.
    process.stdout.write(`\r${" ".repeat(50)}\r`);
    const message = err instanceof Error ? err.message : String(err);
    // Connection-level failures: print troubleshooting steps and exit 1.
    if (message.includes("ECONNREFUSED") || message.includes("ETIMEDOUT") || message.includes("Connection error") || message.includes("connect ECONNREFUSED")) {
      console.error(chalk3.red(`
 \u274C Cannot connect to broker
`));
      console.error(chalk3.yellow(" Check the following:"));
      console.error(chalk3.gray(` \u2022 Is Kafka running: docker ps`));
      console.error(chalk3.gray(` \u2022 Broker address: ${options.broker}`));
      console.error(
        chalk3.gray(
          ` \u2022 Port accessibility: nc -zv ${options.broker.split(":")[0]} ${options.broker.split(":")[1]}`
        )
      );
      console.error("");
      process.exit(1);
    }
    // Missing consumer group: suggest how to list existing groups.
    if (message.includes("not found") || message.includes("Dead state")) {
      console.error(chalk3.red(`
 \u274C Consumer group not found
`));
      console.error(chalk3.yellow(" Check the following:"));
      console.error(chalk3.gray(` \u2022 Group ID: ${options.group}`));
      console.error(chalk3.gray(` \u2022 List existing groups:`));
      console.error(
        chalk3.gray(
          ` kafka-consumer-groups.sh --bootstrap-server ${options.broker} --list`
        )
      );
      console.error("");
      process.exit(1);
    }
    // Anything unrecognized: print the raw message and exit 1.
    console.error(chalk3.red(`
 \u274C Error: ${message}
`));
    process.exit(1);
  }
});
program.parse();
|
package/package.json
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@closeup1202/klag",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Kafka consumer lag root cause analyzer",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"bin": {
|
|
7
|
+
"klag": "dist/cli/index.js"
|
|
8
|
+
},
|
|
9
|
+
"files": [
|
|
10
|
+
"dist"
|
|
11
|
+
],
|
|
12
|
+
"engines": {
|
|
13
|
+
"node": ">=18.0.0"
|
|
14
|
+
},
|
|
15
|
+
"scripts": {
|
|
16
|
+
"build": "tsup",
|
|
17
|
+
"dev": "tsup --watch",
|
|
18
|
+
"test": "vitest run",
|
|
19
|
+
"test:watch": "vitest",
|
|
20
|
+
"lint": "biome check ./src",
|
|
21
|
+
"format": "biome format --write ./src",
|
|
22
|
+
"prepublishOnly": "npm run build && npm run test"
|
|
23
|
+
},
|
|
24
|
+
"keywords": [
|
|
25
|
+
"kafka",
|
|
26
|
+
"consumer-lag",
|
|
27
|
+
"cli",
|
|
28
|
+
"debugging",
|
|
29
|
+
"monitoring",
|
|
30
|
+
"kafkajs"
|
|
31
|
+
],
|
|
32
|
+
"license": "MIT",
|
|
33
|
+
"repository": {
|
|
34
|
+
"type": "git",
|
|
35
|
+
"url": "git+https://github.com/closeup1202/klag.git"
|
|
36
|
+
},
|
|
37
|
+
"homepage": "https://github.com/closeup1202/klag#readme",
|
|
38
|
+
"dependencies": {
|
|
39
|
+
"chalk": "^5.6.2",
|
|
40
|
+
"cli-table3": "^0.6.5",
|
|
41
|
+
"commander": "^14.0.3",
|
|
42
|
+
"kafkajs": "^2.2.4"
|
|
43
|
+
},
|
|
44
|
+
"devDependencies": {
|
|
45
|
+
"@biomejs/biome": "^2.4.8",
|
|
46
|
+
"@types/node": "^25.5.0",
|
|
47
|
+
"tsup": "^8.5.1",
|
|
48
|
+
"tsx": "^4.21.0",
|
|
49
|
+
"typescript": "^6.0.2",
|
|
50
|
+
"vitest": "^4.1.1"
|
|
51
|
+
}
|
|
52
|
+
}
|