@checkstack/healthcheck-tcp-backend 0.3.10 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +84 -0
- package/package.json +5 -5
- package/src/banner-collector.ts +16 -4
- package/src/strategy.ts +22 -2
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,89 @@
|
|
|
1
1
|
# @checkstack/healthcheck-tcp-backend
|
|
2
2
|
|
|
3
|
+
## 0.4.0
|
|
4
|
+
|
|
5
|
+
### Minor Changes
|
|
6
|
+
|
|
7
|
+
- 8cad340: Add a finer per-run transport timing breakdown to health checks.
|
|
8
|
+
|
|
9
|
+
Each run now records an optional structured `metadata.timings` (DNS, connect,
|
|
10
|
+
TLS, wait/time-to-first-byte, transfer, and a `processing` catch-all for
|
|
11
|
+
non-HTTP operation time). The run-detail view renders the phases it has, in
|
|
12
|
+
transport order, and falls back to the previous Connection + Processing split
|
|
13
|
+
for older runs that lack the finer data.
|
|
14
|
+
|
|
15
|
+
For HTTP the request is issued verbatim through `fetch` (original URL, headers,
|
|
16
|
+
and body), so request behavior is identical to a plain `fetch`. The timing is
|
|
17
|
+
measured around it: `fetch` resolves at the response headers, so wait
|
|
18
|
+
(time-to-first-byte) and transfer (body) are measured exactly on the request,
|
|
19
|
+
DNS is timed at the resolve step, and connect/TLS come from a short-lived,
|
|
20
|
+
best-effort raw `net`/`tls` probe to the same already-validated IP (the request
|
|
21
|
+
socket exposes no connect/handshake events on the Bun runtime). The probe is
|
|
22
|
+
timing-only and never fails the check. The probe validates the TLS certificate
|
|
23
|
+
(against the original hostname via SNI) like the real request does - it does not
|
|
24
|
+
disable certificate validation; an unverifiable cert simply yields no TLS-phase
|
|
25
|
+
timing rather than aborting. Other transports surface the connect and operation
|
|
26
|
+
times they already measure.
|
|
27
|
+
|
|
28
|
+
The SSRF guard now validates the resolved host (rejecting cloud-metadata /
|
|
29
|
+
link-local and operator-denied ranges) as a pre-flight check and no longer pins
|
|
30
|
+
the request to the resolved IP. Pinning rewrote the URL to the IP literal and
|
|
31
|
+
moved the host to the `Host` header, which breaks HTTP/2 origins (their
|
|
32
|
+
authority comes from the URL-derived `:authority` pseudo-header, not `Host`) - that is why real
|
|
33
|
+
hosts such as `google.com` started answering 404/429 instead of 200. The
|
|
34
|
+
pre-flight validation keeps blocking static metadata/link-local targets and
|
|
35
|
+
direct denied IP literals; the only thing dropped is DNS-rebind TOCTOU
|
|
36
|
+
protection (a narrow window that pinning closed at the cost of breaking
|
|
37
|
+
legitimate HTTP/2 requests).
|
|
38
|
+
|
|
39
|
+
The run-detail "slowest" badge no longer collides with the timing bar, and a
|
|
40
|
+
genuinely sub-millisecond phase reads as "<1 ms" instead of a bare "0 ms".
|
|
41
|
+
|
|
42
|
+
### Patch Changes
|
|
43
|
+
|
|
44
|
+
- 8cad340: Retune anomaly-detection defaults across every health-check strategy and the
|
|
45
|
+
hardware collector for a low-noise, problem-focused out-of-the-box experience.
|
|
46
|
+
|
|
47
|
+
The detection engine already learns a per-metric baseline, debounces with a
|
|
48
|
+
confirmation window, and applies practical-significance floors. This pass tunes
|
|
49
|
+
the per-metric **defaults** so a fresh install alerts only on genuine,
|
|
50
|
+
statistically-significant, problem-mapping deviations instead of flooding on
|
|
51
|
+
every metric that wiggles. 264 metrics were reviewed:
|
|
52
|
+
|
|
53
|
+
- **Default-disabled** the high-noise and un-baselineable classes that were
|
|
54
|
+
alerting for no good reason: raw identifiers and counts (status codes, error
|
|
55
|
+
and row counts, build counts, player and executor counts), config echoes and
|
|
56
|
+
near-constants (probe packet counts, CPU core count, total/swap memory),
|
|
57
|
+
payload-size and other run-to-run-volatile values, and deterministic values
|
|
58
|
+
like certificate days-remaining (governed by the check's own static-threshold
|
|
59
|
+
health logic, not statistics). These stay chartable and can be re-enabled per
|
|
60
|
+
field.
|
|
61
|
+
- **Hardened** the signals that should alert - latency/response/execution time
|
|
62
|
+
and availability/success/saturation percentages - with confirmation windows
|
|
63
|
+
and absolute + relative floors so brief spikes and sub-threshold jitter no
|
|
64
|
+
longer flap, and preferred percentage metrics over their absolute twins.
|
|
65
|
+
|
|
66
|
+
No detection-engine or schema changes; only per-metric `x-anomaly-*` defaults.
|
|
67
|
+
Users who had opted into any now-disabled metric keep their explicit override.
|
|
68
|
+
|
|
69
|
+
- Updated dependencies [8cad340]
|
|
70
|
+
- Updated dependencies [8cad340]
|
|
71
|
+
- Updated dependencies [8cad340]
|
|
72
|
+
- Updated dependencies [8cad340]
|
|
73
|
+
- Updated dependencies [8cad340]
|
|
74
|
+
- Updated dependencies [8cad340]
|
|
75
|
+
- Updated dependencies [8cad340]
|
|
76
|
+
- @checkstack/backend-api@0.25.0
|
|
77
|
+
- @checkstack/healthcheck-common@1.8.0
|
|
78
|
+
- @checkstack/common@0.17.0
|
|
79
|
+
|
|
80
|
+
## 0.3.11
|
|
81
|
+
|
|
82
|
+
### Patch Changes
|
|
83
|
+
|
|
84
|
+
- Updated dependencies [2ec8f64]
|
|
85
|
+
- @checkstack/backend-api@0.24.1
|
|
86
|
+
|
|
3
87
|
## 0.3.10
|
|
4
88
|
|
|
5
89
|
### Patch Changes
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@checkstack/healthcheck-tcp-backend",
|
|
3
|
-
"version": "0.3.10",
|
|
3
|
+
"version": "0.4.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"main": "src/index.ts",
|
|
6
6
|
"checkstack": {
|
|
@@ -14,15 +14,15 @@
|
|
|
14
14
|
"pack": "bunx @checkstack/scripts plugin-pack"
|
|
15
15
|
},
|
|
16
16
|
"dependencies": {
|
|
17
|
-
"@checkstack/backend-api": "0.
|
|
18
|
-
"@checkstack/common": "0.
|
|
19
|
-
"@checkstack/healthcheck-common": "1.
|
|
17
|
+
"@checkstack/backend-api": "0.25.0",
|
|
18
|
+
"@checkstack/common": "0.17.0",
|
|
19
|
+
"@checkstack/healthcheck-common": "1.8.0"
|
|
20
20
|
},
|
|
21
21
|
"devDependencies": {
|
|
22
22
|
"@types/bun": "^1.0.0",
|
|
23
23
|
"typescript": "^5.0.0",
|
|
24
24
|
"@checkstack/tsconfig": "0.0.7",
|
|
25
|
-
"@checkstack/scripts": "0.6.
|
|
25
|
+
"@checkstack/scripts": "0.6.3"
|
|
26
26
|
},
|
|
27
27
|
"description": "Checkstack healthcheck-tcp-backend plugin",
|
|
28
28
|
"author": {
|
package/src/banner-collector.ts
CHANGED
|
@@ -47,8 +47,12 @@ const bannerResultSchema = healthResultSchema({
|
|
|
47
47
|
hasBanner: healthResultBoolean({
|
|
48
48
|
"x-chart-type": "boolean",
|
|
49
49
|
"x-chart-label": "Has Banner",
|
|
50
|
-
|
|
51
|
-
|
|
50
|
+
// Whether a server emits a banner is protocol/configuration dependent and
|
|
51
|
+
// can legitimately flip run-to-run (timing, quiet protocols, partial
|
|
52
|
+
// reads). It does not map to a real availability problem on its own, so a
|
|
53
|
+
// dominance flip here is an alert-fatigue source. Charting stays available;
|
|
54
|
+
// alerting is off by default.
|
|
55
|
+
"x-anomaly-enabled": false,
|
|
52
56
|
}),
|
|
53
57
|
readTimeMs: healthResultNumber({
|
|
54
58
|
"x-chart-type": "line",
|
|
@@ -73,13 +77,21 @@ const bannerAggregatedFields = {
|
|
|
73
77
|
"x-chart-unit": "ms",
|
|
74
78
|
"x-anomaly-enabled": true,
|
|
75
79
|
"x-anomaly-direction": "lower-is-better",
|
|
80
|
+
// Latency aggregate: widen the band and require practical-significance
|
|
81
|
+
// floors so fast banner reads do not alert on small jitter.
|
|
82
|
+
"x-anomaly-sensitivity": 2,
|
|
83
|
+
"x-anomaly-confirmation-window": 3,
|
|
84
|
+
"x-anomaly-min-absolute-delta": 50,
|
|
85
|
+
"x-anomaly-min-relative-delta": 0.5,
|
|
76
86
|
}),
|
|
77
87
|
bannerRate: aggregatedRate({
|
|
78
88
|
"x-chart-type": "gauge",
|
|
79
89
|
"x-chart-label": "Banner Rate",
|
|
80
90
|
"x-chart-unit": "%",
|
|
81
|
-
|
|
82
|
-
|
|
91
|
+
// Banner presence is protocol/configuration dependent and varies legitimately
|
|
92
|
+
// run-to-run; its rate is not a real health signal and would alert on benign
|
|
93
|
+
// fluctuation. Charting stays available; alerting is off by default.
|
|
94
|
+
"x-anomaly-enabled": false,
|
|
83
95
|
}),
|
|
84
96
|
};
|
|
85
97
|
|
package/src/strategy.ts
CHANGED
|
@@ -11,6 +11,7 @@ import {
|
|
|
11
11
|
mergeCounter,
|
|
12
12
|
z,
|
|
13
13
|
type ConnectedClient,
|
|
14
|
+
type TransportTimings,
|
|
14
15
|
type InferAggregatedResult,
|
|
15
16
|
baseStrategyConfigSchema,
|
|
16
17
|
} from "@checkstack/backend-api";
|
|
@@ -107,6 +108,12 @@ const tcpAggregatedFields = {
|
|
|
107
108
|
"x-chart-unit": "ms",
|
|
108
109
|
"x-anomaly-enabled": true,
|
|
109
110
|
"x-anomaly-direction": "lower-is-better",
|
|
111
|
+
// Latency aggregate: widen the band and require practical-significance
|
|
112
|
+
// floors so fast endpoints do not alert on small jitter.
|
|
113
|
+
"x-anomaly-sensitivity": 2,
|
|
114
|
+
"x-anomaly-confirmation-window": 3,
|
|
115
|
+
"x-anomaly-min-absolute-delta": 50,
|
|
116
|
+
"x-anomaly-min-relative-delta": 0.5,
|
|
110
117
|
}),
|
|
111
118
|
successRate: aggregatedRate({
|
|
112
119
|
"x-chart-type": "gauge",
|
|
@@ -114,12 +121,19 @@ const tcpAggregatedFields = {
|
|
|
114
121
|
"x-chart-unit": "%",
|
|
115
122
|
"x-anomaly-enabled": true,
|
|
116
123
|
"x-anomaly-direction": "higher-is-better",
|
|
124
|
+
// Availability is the primary, real signal. Debounce so a single
|
|
125
|
+
// transient failed bucket does not alert.
|
|
126
|
+
"x-anomaly-confirmation-window": 3,
|
|
117
127
|
}),
|
|
118
128
|
errorCount: aggregatedCounter({
|
|
119
129
|
"x-chart-type": "counter",
|
|
120
130
|
"x-chart-label": "Errors",
|
|
121
|
-
|
|
122
|
-
|
|
131
|
+
// Raw per-bucket error count scales with how many runs land in a bucket,
|
|
132
|
+
// so it has no stable baseline and is fully redundant with successRate
|
|
133
|
+
// (which already captures the same failures as a rate). Charting stays
|
|
134
|
+
// available; alerting is owned by successRate to avoid duplicate, noisy
|
|
135
|
+
// alerts on the same failures.
|
|
136
|
+
"x-anomaly-enabled": false,
|
|
123
137
|
}),
|
|
124
138
|
};
|
|
125
139
|
|
|
@@ -278,10 +292,15 @@ export class TcpHealthCheckStrategy implements HealthCheckStrategy<
|
|
|
278
292
|
const validatedConfig = this.config.validate(config);
|
|
279
293
|
const socket = this.socketFactory();
|
|
280
294
|
|
|
295
|
+
const connectStart = performance.now();
|
|
281
296
|
await socket.connect({
|
|
282
297
|
host: validatedConfig.host,
|
|
283
298
|
port: validatedConfig.port,
|
|
284
299
|
});
|
|
300
|
+
// The only meaningful sub-phase for a raw TCP probe is the connect itself.
|
|
301
|
+
const timings: TransportTimings = {
|
|
302
|
+
connectMs: Math.max(0, Math.round(performance.now() - connectStart)),
|
|
303
|
+
};
|
|
285
304
|
|
|
286
305
|
const client: TcpTransportClient = {
|
|
287
306
|
async exec(request: TcpConnectRequest): Promise<TcpConnectResult> {
|
|
@@ -295,6 +314,7 @@ export class TcpHealthCheckStrategy implements HealthCheckStrategy<
|
|
|
295
314
|
|
|
296
315
|
return {
|
|
297
316
|
client,
|
|
317
|
+
timings,
|
|
298
318
|
close: () => socket.close(),
|
|
299
319
|
};
|
|
300
320
|
}
|