@checkstack/healthcheck-tcp-backend 0.3.10 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,89 @@
1
1
  # @checkstack/healthcheck-tcp-backend
2
2
 
3
+ ## 0.4.0
4
+
5
+ ### Minor Changes
6
+
7
+ - 8cad340: Add a finer per-run transport timing breakdown to health checks.
8
+
9
+ Each run now records an optional, structured `metadata.timings` object (DNS, connect,
10
+ TLS, wait/time-to-first-byte, transfer, and a `processing` catch-all for
11
+ non-HTTP operation time). The run-detail view renders the phases it has, in
12
+ transport order, and falls back to the previous Connection + Processing split
13
+ for older runs that lack the finer data.
14
+
15
+ For HTTP the request is issued verbatim through `fetch` (original URL, headers,
16
+ and body), so request behavior is identical to a plain `fetch`. The timing is
17
+ measured around it: `fetch` resolves at the response headers, so wait
18
+ (time-to-first-byte) and transfer (body) are measured exactly on the request,
19
+ DNS is timed at the resolve step, and connect/TLS come from a short-lived,
20
+ best-effort raw `net`/`tls` probe to the same already-validated IP (the request
21
+ socket exposes no connect/handshake events on the Bun runtime). The probe is
22
+ timing-only and never fails the check. The probe validates the TLS certificate
23
+ (against the original hostname via SNI) like the real request does - it does not
24
+ disable certificate validation; an unverifiable cert simply yields no TLS-phase
25
+ timing rather than aborting. Other transports surface the connect and operation
26
+ times they already measure.
27
+
28
+ The SSRF guard now validates the resolved host (rejecting cloud-metadata /
29
+ link-local and operator-denied ranges) as a pre-flight check and no longer pins
30
+ the request to the resolved IP. Pinning rewrote the URL to the IP literal and
31
+ moved the host to the `Host` header, which breaks HTTP/2 origins (their
32
+ authority comes from the `:authority` pseudo-header derived from the URL, not `Host`) - that is why real
33
+ hosts such as `google.com` started answering 404/429 instead of 200. The
34
+ pre-flight validation keeps blocking static metadata/link-local targets and
35
+ direct denied IP literals; the only thing dropped is DNS-rebind TOCTOU
36
+ protection (a narrow window that pinning closed at the cost of breaking
37
+ legitimate HTTP/2 requests).
38
+
39
+ The run-detail "slowest" badge no longer collides with the timing bar, and a
40
+ genuinely sub-millisecond phase reads as "<1 ms" instead of a bare "0 ms".
41
+
42
+ ### Patch Changes
43
+
44
+ - 8cad340: Retune anomaly-detection defaults across every health-check strategy and the
45
+ hardware collector for a low-noise, problem-focused out-of-the-box experience.
46
+
47
+ The detection engine already learns a per-metric baseline, debounces with a
48
+ confirmation window, and applies practical-significance floors. This pass tunes
49
+ the per-metric **defaults** so a fresh install alerts only on genuine,
50
+ statistically-significant, problem-mapping deviations instead of flooding on
51
+ every metric that wiggles. 264 metrics were reviewed:
52
+
53
+ - **Default-disabled** the high-noise and un-baselineable classes that were
54
+ alerting for no good reason: raw identifiers and counts (status codes, error
55
+ and row counts, build counts, player and executor counts), config echoes and
56
+ near-constants (probe packet counts, CPU core count, total/swap memory),
57
+ payload-size and other run-to-run-volatile values, and deterministic values
58
+ like certificate days-remaining (governed by the check's own static-threshold
59
+ health logic, not statistics). These stay chartable and can be re-enabled per
60
+ field.
61
+ - **Hardened** the signals that should alert - latency/response/execution time
62
+ and availability/success/saturation percentages - with confirmation windows
63
+ and absolute + relative floors so brief spikes and sub-threshold jitter no
64
+ longer flap, and preferred percentage metrics over their absolute twins.
65
+
66
+ No detection-engine or schema changes; only per-metric `x-anomaly-*` defaults.
67
+ Users who had opted into any now-disabled metric keep their explicit override.
68
+
69
+ - Updated dependencies [8cad340]
70
+ - Updated dependencies [8cad340]
71
+ - Updated dependencies [8cad340]
72
+ - Updated dependencies [8cad340]
73
+ - Updated dependencies [8cad340]
74
+ - Updated dependencies [8cad340]
75
+ - Updated dependencies [8cad340]
76
+ - @checkstack/backend-api@0.25.0
77
+ - @checkstack/healthcheck-common@1.8.0
78
+ - @checkstack/common@0.17.0
79
+
80
+ ## 0.3.11
81
+
82
+ ### Patch Changes
83
+
84
+ - Updated dependencies [2ec8f64]
85
+ - @checkstack/backend-api@0.24.1
86
+
3
87
  ## 0.3.10
4
88
 
5
89
  ### Patch Changes
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@checkstack/healthcheck-tcp-backend",
3
- "version": "0.3.10",
3
+ "version": "0.4.0",
4
4
  "type": "module",
5
5
  "main": "src/index.ts",
6
6
  "checkstack": {
@@ -14,15 +14,15 @@
14
14
  "pack": "bunx @checkstack/scripts plugin-pack"
15
15
  },
16
16
  "dependencies": {
17
- "@checkstack/backend-api": "0.24.0",
18
- "@checkstack/common": "0.16.0",
19
- "@checkstack/healthcheck-common": "1.7.1"
17
+ "@checkstack/backend-api": "0.25.0",
18
+ "@checkstack/common": "0.17.0",
19
+ "@checkstack/healthcheck-common": "1.8.0"
20
20
  },
21
21
  "devDependencies": {
22
22
  "@types/bun": "^1.0.0",
23
23
  "typescript": "^5.0.0",
24
24
  "@checkstack/tsconfig": "0.0.7",
25
- "@checkstack/scripts": "0.6.2"
25
+ "@checkstack/scripts": "0.6.3"
26
26
  },
27
27
  "description": "Checkstack healthcheck-tcp-backend plugin",
28
28
  "author": {
@@ -47,8 +47,12 @@ const bannerResultSchema = healthResultSchema({
47
47
  hasBanner: healthResultBoolean({
48
48
  "x-chart-type": "boolean",
49
49
  "x-chart-label": "Has Banner",
50
- "x-anomaly-enabled": true,
51
- "x-anomaly-direction": "dominance",
50
+ // Whether a server emits a banner is protocol/configuration dependent and
51
+ // can legitimately flip run-to-run (timing, quiet protocols, partial
52
+ // reads). It does not map to a real availability problem on its own, so a
53
+ // dominance flip here is an alert-fatigue source. Charting stays available;
54
+ // alerting is off by default.
55
+ "x-anomaly-enabled": false,
52
56
  }),
53
57
  readTimeMs: healthResultNumber({
54
58
  "x-chart-type": "line",
@@ -73,13 +77,21 @@ const bannerAggregatedFields = {
73
77
  "x-chart-unit": "ms",
74
78
  "x-anomaly-enabled": true,
75
79
  "x-anomaly-direction": "lower-is-better",
80
+ // Latency aggregate: widen the band and require practical-significance
81
+ // floors so fast banner reads do not alert on small jitter.
82
+ "x-anomaly-sensitivity": 2,
83
+ "x-anomaly-confirmation-window": 3,
84
+ "x-anomaly-min-absolute-delta": 50,
85
+ "x-anomaly-min-relative-delta": 0.5,
76
86
  }),
77
87
  bannerRate: aggregatedRate({
78
88
  "x-chart-type": "gauge",
79
89
  "x-chart-label": "Banner Rate",
80
90
  "x-chart-unit": "%",
81
- "x-anomaly-enabled": true,
82
- "x-anomaly-direction": "higher-is-better",
91
+ // Banner presence is protocol/configuration dependent and varies legitimately
92
+ // run-to-run; its rate is not a real health signal and would alert on benign
93
+ // fluctuation. Charting stays available; alerting is off by default.
94
+ "x-anomaly-enabled": false,
83
95
  }),
84
96
  };
85
97
 
package/src/strategy.ts CHANGED
@@ -11,6 +11,7 @@ import {
11
11
  mergeCounter,
12
12
  z,
13
13
  type ConnectedClient,
14
+ type TransportTimings,
14
15
  type InferAggregatedResult,
15
16
  baseStrategyConfigSchema,
16
17
  } from "@checkstack/backend-api";
@@ -107,6 +108,12 @@ const tcpAggregatedFields = {
107
108
  "x-chart-unit": "ms",
108
109
  "x-anomaly-enabled": true,
109
110
  "x-anomaly-direction": "lower-is-better",
111
+ // Latency aggregate: widen the band and require practical-significance
112
+ // floors so fast endpoints do not alert on small jitter.
113
+ "x-anomaly-sensitivity": 2,
114
+ "x-anomaly-confirmation-window": 3,
115
+ "x-anomaly-min-absolute-delta": 50,
116
+ "x-anomaly-min-relative-delta": 0.5,
110
117
  }),
111
118
  successRate: aggregatedRate({
112
119
  "x-chart-type": "gauge",
@@ -114,12 +121,19 @@ const tcpAggregatedFields = {
114
121
  "x-chart-unit": "%",
115
122
  "x-anomaly-enabled": true,
116
123
  "x-anomaly-direction": "higher-is-better",
124
+ // Availability is the primary, real signal. Debounce so a single
125
+ // transient failed bucket does not alert.
126
+ "x-anomaly-confirmation-window": 3,
117
127
  }),
118
128
  errorCount: aggregatedCounter({
119
129
  "x-chart-type": "counter",
120
130
  "x-chart-label": "Errors",
121
- "x-anomaly-enabled": true,
122
- "x-anomaly-direction": "lower-is-better",
131
+ // Raw per-bucket error count scales with how many runs land in a bucket,
132
+ // so it has no stable baseline and is fully redundant with successRate
133
+ // (which already captures the same failures as a rate). Charting stays
134
+ // available; alerting is owned by successRate to avoid duplicate, noisy
135
+ // alerts on the same failures.
136
+ "x-anomaly-enabled": false,
123
137
  }),
124
138
  };
125
139
 
@@ -278,10 +292,15 @@ export class TcpHealthCheckStrategy implements HealthCheckStrategy<
278
292
  const validatedConfig = this.config.validate(config);
279
293
  const socket = this.socketFactory();
280
294
 
295
+ const connectStart = performance.now();
281
296
  await socket.connect({
282
297
  host: validatedConfig.host,
283
298
  port: validatedConfig.port,
284
299
  });
300
+ // The only meaningful sub-phase for a raw TCP probe is the connect itself.
301
+ const timings: TransportTimings = {
302
+ connectMs: Math.max(0, Math.round(performance.now() - connectStart)),
303
+ };
285
304
 
286
305
  const client: TcpTransportClient = {
287
306
  async exec(request: TcpConnectRequest): Promise<TcpConnectResult> {
@@ -295,6 +314,7 @@ export class TcpHealthCheckStrategy implements HealthCheckStrategy<
295
314
 
296
315
  return {
297
316
  client,
317
+ timings,
298
318
  close: () => socket.close(),
299
319
  };
300
320
  }