@checkstack/healthcheck-backend 1.4.0 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +303 -0
- package/drizzle/0018_abnormal_preak.sql +10 -0
- package/drizzle/meta/0018_snapshot.json +600 -0
- package/drizzle/meta/_journal.json +7 -0
- package/package.json +26 -21
- package/src/ai/assertion-validation.test.ts +117 -0
- package/src/ai/assertion-validation.ts +147 -0
- package/src/ai/healthcheck-capabilities.test.ts +158 -0
- package/src/ai/healthcheck-capabilities.ts +217 -0
- package/src/ai/healthcheck-delete.test.ts +81 -0
- package/src/ai/healthcheck-delete.ts +81 -0
- package/src/ai/healthcheck-projection.test.ts +36 -0
- package/src/ai/healthcheck-propose.test.ts +268 -0
- package/src/ai/healthcheck-propose.ts +290 -0
- package/src/ai/healthcheck-script-tools.test.ts +93 -0
- package/src/ai/healthcheck-script-tools.ts +179 -0
- package/src/ai/healthcheck-update.test.ts +123 -0
- package/src/ai/healthcheck-update.ts +123 -0
- package/src/ai/notify-subscribers.test.ts +109 -0
- package/src/ai/notify-subscribers.ts +176 -0
- package/src/ai/register-ai-tools.test.ts +41 -0
- package/src/ai/register-ai-tools.ts +53 -0
- package/src/ai/shell-env-table.test.ts +47 -0
- package/src/automations.test.ts +2 -1
- package/src/automations.ts +9 -1
- package/src/collector-script-test.test.ts +53 -1
- package/src/collector-script-test.ts +59 -7
- package/src/effective-environments.test.ts +93 -0
- package/src/effective-environments.ts +64 -0
- package/src/health-entity-id.ts +57 -0
- package/src/health-entity.test.ts +405 -31
- package/src/health-entity.ts +99 -43
- package/src/health-state.ts +41 -4
- package/src/healthcheck-gitops-kinds.test.ts +95 -0
- package/src/healthcheck-gitops-kinds.ts +56 -13
- package/src/index.ts +33 -0
- package/src/migration-chain-contract.test.ts +57 -0
- package/src/queue-executor.test.ts +814 -0
- package/src/queue-executor.ts +342 -50
- package/src/realtime-aggregation.test.ts +30 -0
- package/src/realtime-aggregation.ts +16 -0
- package/src/retention-job.ts +167 -93
- package/src/retention-rollup.test.ts +118 -0
- package/src/router.test.ts +120 -1
- package/src/router.ts +20 -0
- package/src/schema.ts +44 -6
- package/src/service.ts +199 -43
- package/src/state-evaluator.test.ts +50 -5
- package/src/state-evaluator.ts +9 -2
- package/src/state-transitions.test.ts +104 -0
- package/src/state-transitions.ts +39 -1
- package/src/validate-configuration.test.ts +205 -0
- package/src/validate-configuration.ts +159 -0
- package/tsconfig.json +9 -0
package/src/health-entity.ts
CHANGED
|
@@ -23,7 +23,7 @@
|
|
|
23
23
|
*/
|
|
24
24
|
import { z } from "zod";
|
|
25
25
|
import { HealthCheckStatusSchema } from "@checkstack/healthcheck-common";
|
|
26
|
-
import {
|
|
26
|
+
import type { AdvisoryLockService } from "@checkstack/backend-api";
|
|
27
27
|
import type {
|
|
28
28
|
EntityChangeDeriver,
|
|
29
29
|
EntityChangePayloadMapper,
|
|
@@ -31,13 +31,9 @@ import type {
|
|
|
31
31
|
EntityRead,
|
|
32
32
|
} from "@checkstack/automation-backend";
|
|
33
33
|
import type { HealthCheckService } from "./service";
|
|
34
|
-
import
|
|
35
|
-
// Re-export the change type through automation-backend's barrel (it
|
|
36
|
-
// re-exports it from automation-common) so this domain needs no extra dep.
|
|
34
|
+
import { parseHealthEntityId } from "./health-entity-id";
|
|
37
35
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
/** Entity kind id for the per-system aggregated health. */
|
|
36
|
+
/** Entity kind id for the aggregated health (system rollup + per-environment). */
|
|
41
37
|
export const HEALTH_ENTITY_KIND = "health";
|
|
42
38
|
|
|
43
39
|
/**
|
|
@@ -126,15 +122,23 @@ function readNumber(
|
|
|
126
122
|
* Restores the keys operators read (`trigger.payload.systemId`,
|
|
127
123
|
* `.previousStatus`, …) that the generic change shape omits.
|
|
128
124
|
*
|
|
129
|
-
*
|
|
130
|
-
*
|
|
131
|
-
*
|
|
132
|
-
*
|
|
133
|
-
*
|
|
125
|
+
* The entity id is now env-qualified (Phase 3b): `payload.systemId` is ALWAYS
|
|
126
|
+
* the systemId portion (so existing automations reading `trigger.payload.systemId`
|
|
127
|
+
* are unaffected — the rollup carries the bare systemId), and the NEW optional
|
|
128
|
+
* `payload.environmentId` is the env portion — present only for a per-environment
|
|
129
|
+
* change, absent (undefined) for the system rollup. `previousStatus` is
|
|
130
|
+
* `prev.status` and `newStatus` is `next.status`; `healthyChecks` / `totalChecks`
|
|
131
|
+
* come from `next`; `timestamp` is the change's `occurredAt`. `systemName` is not
|
|
132
|
+
* derivable from a health change (it lives in the catalog) and is OPTIONAL on the
|
|
133
|
+
* schemas, so it is omitted.
|
|
134
134
|
*/
|
|
135
135
|
export const healthChangeToPayload: EntityChangePayloadMapper = (changed) => {
|
|
136
|
+
const { systemId, environmentId } = parseHealthEntityId(changed.id);
|
|
136
137
|
return {
|
|
137
|
-
systemId
|
|
138
|
+
systemId,
|
|
139
|
+
// Present only for a per-env change; omitted for the rollup so the field
|
|
140
|
+
// is `undefined` (the optional schema accepts both).
|
|
141
|
+
...(environmentId === null ? {} : { environmentId }),
|
|
138
142
|
previousStatus: readStatus(changed.prev) ?? undefined,
|
|
139
143
|
newStatus: readStatus(changed.next) ?? undefined,
|
|
140
144
|
healthyChecks: readNumber(changed.next, "healthyChecks") ?? 0,
|
|
@@ -157,6 +161,12 @@ export const healthChangeToPayload: EntityChangePayloadMapper = (changed) => {
|
|
|
157
161
|
*/
|
|
158
162
|
export interface HealthChangeClassification {
|
|
159
163
|
systemId: string;
|
|
164
|
+
/**
|
|
165
|
+
* The environment portion of the entity id (Phase 3b). `null` for the
|
|
166
|
+
* system rollup change; the env id for a per-environment change. Cross-plugin
|
|
167
|
+
* consumers that only care about the system (SLO / dependency) can ignore it.
|
|
168
|
+
*/
|
|
169
|
+
environmentId: string | null;
|
|
160
170
|
previousStatus: string | null;
|
|
161
171
|
newStatus: string | null;
|
|
162
172
|
degraded: boolean;
|
|
@@ -168,6 +178,7 @@ export function classifyHealthChange(changed: {
|
|
|
168
178
|
prev: Record<string, unknown> | null;
|
|
169
179
|
next: Record<string, unknown> | null;
|
|
170
180
|
}): HealthChangeClassification {
|
|
181
|
+
const { systemId, environmentId } = parseHealthEntityId(changed.id);
|
|
171
182
|
const previousStatus = readStatus(changed.prev);
|
|
172
183
|
const newStatus = readStatus(changed.next);
|
|
173
184
|
const bothPresent = previousStatus !== null && newStatus !== null;
|
|
@@ -176,7 +187,8 @@ export function classifyHealthChange(changed: {
|
|
|
176
187
|
const recovered =
|
|
177
188
|
bothPresent && newStatus === "healthy" && previousStatus !== "healthy";
|
|
178
189
|
return {
|
|
179
|
-
systemId
|
|
190
|
+
systemId,
|
|
191
|
+
environmentId,
|
|
180
192
|
previousStatus,
|
|
181
193
|
newStatus,
|
|
182
194
|
degraded,
|
|
@@ -214,9 +226,17 @@ export function classifyHealthChange(changed: {
|
|
|
214
226
|
export async function computeHealthEntityState(args: {
|
|
215
227
|
service: HealthCheckService;
|
|
216
228
|
systemId: string;
|
|
229
|
+
/**
|
|
230
|
+
* Environment to compute the view for (Phase 3b). `undefined` = the SYSTEM
|
|
231
|
+
* ROLLUP (worst status across all environments + env-less runs — the
|
|
232
|
+
* all-runs aggregate, §7.4.2). `null` = the env-less slice. A string = that
|
|
233
|
+
* environment's per-env view. The existence gate (`checkStatuses.length`) is
|
|
234
|
+
* env-independent, so a per-env view and the rollup agree on totalChecks.
|
|
235
|
+
*/
|
|
236
|
+
environmentId?: string | null;
|
|
217
237
|
}): Promise<HealthEntityState | undefined> {
|
|
218
|
-
const { service, systemId } = args;
|
|
219
|
-
const overview = await service.getSystemHealthStatus(systemId);
|
|
238
|
+
const { service, systemId, environmentId } = args;
|
|
239
|
+
const overview = await service.getSystemHealthStatus(systemId, environmentId);
|
|
220
240
|
// No enabled check associations ⇒ no health entity for this system.
|
|
221
241
|
if (overview.checkStatuses.length === 0) return undefined;
|
|
222
242
|
return {
|
|
@@ -229,10 +249,16 @@ export async function computeHealthEntityState(args: {
|
|
|
229
249
|
|
|
230
250
|
/**
|
|
231
251
|
* Build the PLUGIN-BACKED + COMPUTED `read` accessor for the `health` entity.
|
|
232
|
-
*
|
|
233
|
-
*
|
|
234
|
-
*
|
|
235
|
-
*
|
|
252
|
+
*
|
|
253
|
+
* Env-aware id parsing (Phase 3b, §7.4.2): each incoming id is parsed via
|
|
254
|
+
* {@link parseHealthEntityId}. A BARE `"<systemId>"` resolves the SYSTEM
|
|
255
|
+
* ROLLUP; a `"<systemId>::<environmentId>"` resolves that environment's
|
|
256
|
+
* per-env view. The result is keyed by the ORIGINAL id, so the reactive
|
|
257
|
+
* engine, `getMany`, and scope enrichment all see the right view for the id
|
|
258
|
+
* they asked for. Systems with no enabled check associations are omitted
|
|
259
|
+
* (existence gate). No framework `entity_state` storage — compute-on-read from
|
|
260
|
+
* the durable, env-keyed `health_check_runs`, so a read returns the same answer
|
|
261
|
+
* on every pod (state-and-scale).
|
|
236
262
|
*/
|
|
237
263
|
export function createHealthEntityRead(deps: {
|
|
238
264
|
service: HealthCheckService;
|
|
@@ -242,9 +268,20 @@ export function createHealthEntityRead(deps: {
|
|
|
242
268
|
if (ids.length === 0) return {};
|
|
243
269
|
const out: Record<string, HealthEntityState> = {};
|
|
244
270
|
await Promise.all(
|
|
245
|
-
ids.map(async (
|
|
246
|
-
const
|
|
247
|
-
|
|
271
|
+
ids.map(async (id) => {
|
|
272
|
+
const { systemId, environmentId } = parseHealthEntityId(id);
|
|
273
|
+
const state = await computeHealthEntityState({
|
|
274
|
+
service,
|
|
275
|
+
systemId,
|
|
276
|
+
// A bare `<systemId>` id is the ROLLUP: `parseHealthEntityId`
|
|
277
|
+
// returns `environmentId: null` for it (so the payload mapper can
|
|
278
|
+
// tell "rollup → omit environmentId"), but the rollup must read ALL
|
|
279
|
+
// runs — `undefined` — NOT the env-less slice (`null`, which filters
|
|
280
|
+
// to `env_id IS NULL`). Reserve `null` for an explicit env-less
|
|
281
|
+
// read; map the rollup's null to undefined here.
|
|
282
|
+
environmentId: environmentId === null ? undefined : environmentId,
|
|
283
|
+
});
|
|
284
|
+
if (state) out[id] = state;
|
|
248
285
|
}),
|
|
249
286
|
);
|
|
250
287
|
return out;
|
|
@@ -298,19 +335,28 @@ export function createHealthEntityRead(deps: {
|
|
|
298
335
|
*/
|
|
299
336
|
export async function writeHealthEntity(args: {
|
|
300
337
|
handle: EntityHandle<HealthEntityState> | undefined;
|
|
301
|
-
|
|
338
|
+
/**
|
|
339
|
+
* The `health` entity id to mutate (Phase 3b): the env-qualified
|
|
340
|
+
* `"<systemId>::<environmentId>"` for a per-env write, or the bare
|
|
341
|
+
* `"<systemId>"` for the env-less / system-rollup write. This is the id the
|
|
342
|
+
* framework diffs/emits, so it drives both the per-env and rollup
|
|
343
|
+
* `ENTITY_CHANGED`.
|
|
344
|
+
*/
|
|
345
|
+
entityId: string;
|
|
302
346
|
apply: () => Promise<HealthEntityState>;
|
|
303
347
|
onError?: (error: unknown) => void;
|
|
304
348
|
/**
|
|
305
|
-
* Optional per-`
|
|
349
|
+
* Optional per-`entityId` critical section wrapping the snapshot-prev +
|
|
306
350
|
* apply + diff + emit. The executor supplies a transaction-scoped advisory
|
|
307
|
-
* lock (`withXactLock`, key `health:<
|
|
308
|
-
* of one system can't double-emit a
|
|
309
|
-
*
|
|
351
|
+
* lock (`withXactLock`, key `health:<entityId>`) so concurrent evaluations
|
|
352
|
+
* of one (system, environment) — or of the rollup — can't double-emit a
|
|
353
|
+
* single logical transition, and per-env + rollup writes serialize against
|
|
354
|
+
* their OWN keys (distinct envs / the rollup don't block each other).
|
|
355
|
+
* Identity by default (no serialization) for the unbound-handle / test paths.
|
|
310
356
|
*/
|
|
311
357
|
serialize?: <T>(fn: () => Promise<T>) => Promise<T>;
|
|
312
358
|
}): Promise<HealthEntityState> {
|
|
313
|
-
const { handle,
|
|
359
|
+
const { handle, entityId, apply, onError, serialize } = args;
|
|
314
360
|
if (!handle) {
|
|
315
361
|
// No reactivity bound — run the durable write directly.
|
|
316
362
|
return apply();
|
|
@@ -323,7 +369,7 @@ export async function writeHealthEntity(args: {
|
|
|
323
369
|
// call, and we wrap that whole call so two concurrent evals serialize.
|
|
324
370
|
return await run(() =>
|
|
325
371
|
handle.mutate({
|
|
326
|
-
id:
|
|
372
|
+
id: entityId,
|
|
327
373
|
apply: async () => {
|
|
328
374
|
durableState = await apply();
|
|
329
375
|
return durableState;
|
|
@@ -340,19 +386,26 @@ export async function writeHealthEntity(args: {
|
|
|
340
386
|
}
|
|
341
387
|
}
|
|
342
388
|
|
|
343
|
-
/**
|
|
344
|
-
|
|
345
|
-
|
|
389
|
+
/**
|
|
390
|
+
* Advisory-lock key namespace for the per-entity health critical section. The
|
|
391
|
+
* argument is the FULL `health` entity id (Phase 3b): the bare `"<systemId>"`
|
|
392
|
+
* for the rollup or `"<systemId>::<environmentId>"` for a per-env write. Two
|
|
393
|
+
* different envs (or an env vs the rollup) get DIFFERENT keys, so they
|
|
394
|
+
* serialize independently and never block each other.
|
|
395
|
+
*/
|
|
396
|
+
export function healthEntityLockKey(entityId: string): string {
|
|
397
|
+
return `health:${entityId}`;
|
|
346
398
|
}
|
|
347
399
|
|
|
348
400
|
/**
|
|
349
|
-
* Build the per-`
|
|
401
|
+
* Build the per-`entityId` serializer for {@link writeHealthEntity} backed by
|
|
350
402
|
* a transaction-scoped advisory lock (`withXactLock`, key
|
|
351
|
-
* `health:<
|
|
352
|
-
*
|
|
403
|
+
* `health:<entityId>`). The returned function blocks until it holds the
|
|
404
|
+
* entity's lock, runs `fn` (the whole snapshot-prev + apply + diff + emit), and
|
|
353
405
|
* auto-releases the lock at COMMIT/ROLLBACK. Two concurrent evaluations of one
|
|
354
|
-
* system
|
|
355
|
-
*
|
|
406
|
+
* (system, environment) — or of the rollup — therefore serialize, while
|
|
407
|
+
* distinct envs proceed in parallel. Exactly one logical transition per entity
|
|
408
|
+
* emits exactly one `ENTITY_CHANGED` + one transition row.
|
|
356
409
|
*
|
|
357
410
|
* `fn` does its own durable writes on the outer pool; the lock only gates
|
|
358
411
|
* ENTRY to the critical section, so its connection affinity is irrelevant —
|
|
@@ -360,10 +413,13 @@ export function healthSystemLockKey(systemId: string): string {
|
|
|
360
413
|
* commits.
|
|
361
414
|
*/
|
|
362
415
|
export function createHealthEntitySerializer(deps: {
|
|
363
|
-
|
|
364
|
-
}): (
|
|
365
|
-
const {
|
|
366
|
-
return (
|
|
416
|
+
advisoryLock: AdvisoryLockService;
|
|
417
|
+
}): (entityId: string) => <T>(fn: () => Promise<T>) => Promise<T> {
|
|
418
|
+
const { advisoryLock } = deps;
|
|
419
|
+
return (entityId) =>
|
|
367
420
|
<T>(fn: () => Promise<T>) =>
|
|
368
|
-
withXactLock({
|
|
421
|
+
advisoryLock.withXactLock({
|
|
422
|
+
key: healthEntityLockKey(entityId),
|
|
423
|
+
fn: () => fn(),
|
|
424
|
+
});
|
|
369
425
|
}
|
package/src/health-state.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { and, desc, eq, gte } from "drizzle-orm";
|
|
1
|
+
import { and, desc, eq, gte, isNull } from "drizzle-orm";
|
|
2
2
|
import type { HealthCheckStatus } from "@checkstack/healthcheck-common";
|
|
3
3
|
import type { Logger, SafeDatabase } from "@checkstack/backend-api";
|
|
4
4
|
import type { InferClient } from "@checkstack/common";
|
|
@@ -122,15 +122,28 @@ export async function findLatestRun({
|
|
|
122
122
|
db,
|
|
123
123
|
systemId,
|
|
124
124
|
configurationId,
|
|
125
|
+
environmentId,
|
|
125
126
|
}: {
|
|
126
127
|
db: Db;
|
|
127
128
|
systemId: string;
|
|
128
129
|
configurationId?: string;
|
|
130
|
+
/**
|
|
131
|
+
* Environment to scope the run lookup to (Phase 3b). `undefined` = any
|
|
132
|
+
* environment (rollup). `null` = env-less runs only. A string = that env.
|
|
133
|
+
*/
|
|
134
|
+
environmentId?: string | null;
|
|
129
135
|
}): Promise<{ latencyMs?: number; lastRunAt?: Date }> {
|
|
130
136
|
const conditions = [eq(healthCheckRuns.systemId, systemId)];
|
|
131
137
|
if (configurationId) {
|
|
132
138
|
conditions.push(eq(healthCheckRuns.configurationId, configurationId));
|
|
133
139
|
}
|
|
140
|
+
if (environmentId !== undefined) {
|
|
141
|
+
conditions.push(
|
|
142
|
+
environmentId === null
|
|
143
|
+
? isNull(healthCheckRuns.environmentId)
|
|
144
|
+
: eq(healthCheckRuns.environmentId, environmentId),
|
|
145
|
+
);
|
|
146
|
+
}
|
|
134
147
|
|
|
135
148
|
const [row] = await db
|
|
136
149
|
.select({
|
|
@@ -161,12 +174,19 @@ export async function computeWindowedMetrics({
|
|
|
161
174
|
db,
|
|
162
175
|
systemId,
|
|
163
176
|
configurationId,
|
|
177
|
+
environmentId,
|
|
164
178
|
now = new Date(),
|
|
165
179
|
windowHours = DEFAULT_METRICS_WINDOW_HOURS,
|
|
166
180
|
}: {
|
|
167
181
|
db: Db;
|
|
168
182
|
systemId: string;
|
|
169
183
|
configurationId?: string;
|
|
184
|
+
/**
|
|
185
|
+
* Environment to scope the windowed metrics to (Phase 3b). `undefined` =
|
|
186
|
+
* any environment (rollup). `null` = env-less aggregates only. A string =
|
|
187
|
+
* that environment's aggregate buckets only.
|
|
188
|
+
*/
|
|
189
|
+
environmentId?: string | null;
|
|
170
190
|
now?: Date;
|
|
171
191
|
windowHours?: number;
|
|
172
192
|
}): Promise<{
|
|
@@ -185,6 +205,13 @@ export async function computeWindowedMetrics({
|
|
|
185
205
|
eq(healthCheckAggregates.configurationId, configurationId),
|
|
186
206
|
);
|
|
187
207
|
}
|
|
208
|
+
if (environmentId !== undefined) {
|
|
209
|
+
conditions.push(
|
|
210
|
+
environmentId === null
|
|
211
|
+
? isNull(healthCheckAggregates.environmentId)
|
|
212
|
+
: eq(healthCheckAggregates.environmentId, environmentId),
|
|
213
|
+
);
|
|
214
|
+
}
|
|
188
215
|
|
|
189
216
|
const buckets = await db
|
|
190
217
|
.select({
|
|
@@ -284,6 +311,7 @@ export async function computeHealthState({
|
|
|
284
311
|
db,
|
|
285
312
|
systemId,
|
|
286
313
|
configurationId,
|
|
314
|
+
environmentId,
|
|
287
315
|
resolveStatus,
|
|
288
316
|
maintenanceClient,
|
|
289
317
|
logger,
|
|
@@ -293,6 +321,14 @@ export async function computeHealthState({
|
|
|
293
321
|
db: Db;
|
|
294
322
|
systemId: string;
|
|
295
323
|
configurationId?: string;
|
|
324
|
+
/**
|
|
325
|
+
* Environment to scope EVERY durable read to (Phase 3b). `undefined` = the
|
|
326
|
+
* system rollup (all environments + env-less). `null` = the env-less slice.
|
|
327
|
+
* A string = that environment. `inStatusSince`, latest run, windowed
|
|
328
|
+
* metrics, and the transition count all narrow to this env so a per-env
|
|
329
|
+
* health snapshot reflects only that environment's runs/transitions.
|
|
330
|
+
*/
|
|
331
|
+
environmentId?: string | null;
|
|
296
332
|
/** Returns the aggregate status for the system (per-check when scoped). */
|
|
297
333
|
resolveStatus: () => Promise<HealthCheckStatus>;
|
|
298
334
|
maintenanceClient?: MaintenanceClient;
|
|
@@ -305,14 +341,15 @@ export async function computeHealthState({
|
|
|
305
341
|
|
|
306
342
|
const [inStatusSince, latest, windowed, inMaintenance, transitionsInWindow] =
|
|
307
343
|
await Promise.all([
|
|
308
|
-
findInStatusSince({ db, systemId, status }),
|
|
309
|
-
findLatestRun({ db, systemId, configurationId }),
|
|
310
|
-
computeWindowedMetrics({ db, systemId, configurationId, now }),
|
|
344
|
+
findInStatusSince({ db, systemId, status, environmentId }),
|
|
345
|
+
findLatestRun({ db, systemId, configurationId, environmentId }),
|
|
346
|
+
computeWindowedMetrics({ db, systemId, configurationId, environmentId, now }),
|
|
311
347
|
resolveInMaintenance({ maintenanceClient, systemId, logger }),
|
|
312
348
|
countStateTransitionsInWindow({
|
|
313
349
|
db,
|
|
314
350
|
systemId,
|
|
315
351
|
windowMinutes: transitionWindowMinutes,
|
|
352
|
+
environmentId,
|
|
316
353
|
now,
|
|
317
354
|
}),
|
|
318
355
|
]);
|
|
@@ -415,6 +415,101 @@ describe("Healthcheck GitOps Kind: Healthcheck", () => {
|
|
|
415
415
|
).rejects.toThrow(/config validation failed/);
|
|
416
416
|
});
|
|
417
417
|
|
|
418
|
+
it("migrates an OLD-shape authored config forward and stores the migrated value", async () => {
|
|
419
|
+
// A strategy at version 2 whose v1->v2 migration drops a removed
|
|
420
|
+
// `legacyMode` key. Authored gitops YAML still in the v1 shape (carrying
|
|
421
|
+
// `legacyMode`) must be migrated forward and applied, not rejected.
|
|
422
|
+
const v2Schema = z.object({ host: z.string() });
|
|
423
|
+
const versionedStrategy = {
|
|
424
|
+
id: "postgres",
|
|
425
|
+
displayName: "PostgreSQL",
|
|
426
|
+
description: "test",
|
|
427
|
+
config: new Versioned({
|
|
428
|
+
version: 2,
|
|
429
|
+
schema: v2Schema,
|
|
430
|
+
migrations: [
|
|
431
|
+
{
|
|
432
|
+
fromVersion: 1,
|
|
433
|
+
toVersion: 2,
|
|
434
|
+
description: "Drop removed legacyMode key",
|
|
435
|
+
migrate: ({ legacyMode: _legacyMode, ...rest }: Record<string, unknown>) =>
|
|
436
|
+
rest,
|
|
437
|
+
},
|
|
438
|
+
],
|
|
439
|
+
}),
|
|
440
|
+
};
|
|
441
|
+
mockHCRegistry.getStrategiesWithMeta = () =>
|
|
442
|
+
[
|
|
443
|
+
{ strategy: versionedStrategy, ownerPluginId: "mock", qualifiedId: "postgres" },
|
|
444
|
+
] as any;
|
|
445
|
+
|
|
446
|
+
const kind = buildKind();
|
|
447
|
+
|
|
448
|
+
const result = await kind.reconcile({
|
|
449
|
+
entity: {
|
|
450
|
+
apiVersion: CHECKSTACK_API_VERSION,
|
|
451
|
+
kind: "Healthcheck",
|
|
452
|
+
metadata: { name: "legacy-check" },
|
|
453
|
+
spec: {
|
|
454
|
+
strategy: "postgres",
|
|
455
|
+
intervalSeconds: 30,
|
|
456
|
+
// Old v1 shape: carries the now-removed `legacyMode`.
|
|
457
|
+
config: { host: "db.legacy", legacyMode: true },
|
|
458
|
+
},
|
|
459
|
+
},
|
|
460
|
+
context: mockContext,
|
|
461
|
+
});
|
|
462
|
+
|
|
463
|
+
expect(result.entityId).toBe("hc-1");
|
|
464
|
+
// The MIGRATED config (legacyMode dropped) is what gets stored.
|
|
465
|
+
expect(mockService.configs[0].config).toEqual({ host: "db.legacy" });
|
|
466
|
+
});
|
|
467
|
+
|
|
468
|
+
it("rejects a genuine typo the migration does not account for (strict)", async () => {
|
|
469
|
+
const v2Schema = z.object({ host: z.string() });
|
|
470
|
+
const versionedStrategy = {
|
|
471
|
+
id: "postgres",
|
|
472
|
+
displayName: "PostgreSQL",
|
|
473
|
+
description: "test",
|
|
474
|
+
config: new Versioned({
|
|
475
|
+
version: 2,
|
|
476
|
+
schema: v2Schema,
|
|
477
|
+
migrations: [
|
|
478
|
+
{
|
|
479
|
+
fromVersion: 1,
|
|
480
|
+
toVersion: 2,
|
|
481
|
+
description: "Drop removed legacyMode key",
|
|
482
|
+
migrate: ({ legacyMode: _legacyMode, ...rest }: Record<string, unknown>) =>
|
|
483
|
+
rest,
|
|
484
|
+
},
|
|
485
|
+
],
|
|
486
|
+
}),
|
|
487
|
+
};
|
|
488
|
+
mockHCRegistry.getStrategiesWithMeta = () =>
|
|
489
|
+
[
|
|
490
|
+
{ strategy: versionedStrategy, ownerPluginId: "mock", qualifiedId: "postgres" },
|
|
491
|
+
] as any;
|
|
492
|
+
|
|
493
|
+
const kind = buildKind();
|
|
494
|
+
|
|
495
|
+
await expect(
|
|
496
|
+
kind.reconcile({
|
|
497
|
+
entity: {
|
|
498
|
+
apiVersion: CHECKSTACK_API_VERSION,
|
|
499
|
+
kind: "Healthcheck",
|
|
500
|
+
metadata: { name: "typo-check" },
|
|
501
|
+
spec: {
|
|
502
|
+
strategy: "postgres",
|
|
503
|
+
intervalSeconds: 30,
|
|
504
|
+
// `hsot` is a genuine typo no migration accounts for.
|
|
505
|
+
config: { host: "db.local", hsot: "oops" },
|
|
506
|
+
},
|
|
507
|
+
},
|
|
508
|
+
context: mockContext,
|
|
509
|
+
}),
|
|
510
|
+
).rejects.toThrow(/config validation failed/);
|
|
511
|
+
});
|
|
512
|
+
|
|
418
513
|
it("validates collector configs against collector registry schemas", async () => {
|
|
419
514
|
const kind = buildKind();
|
|
420
515
|
|
|
@@ -15,6 +15,7 @@ import type {
|
|
|
15
15
|
} from "@checkstack/backend-api";
|
|
16
16
|
import { NotificationPolicySchema } from "@checkstack/healthcheck-common";
|
|
17
17
|
import { HealthCheckService } from "./service";
|
|
18
|
+
import { validateVersionedConfigStrict } from "./validate-configuration";
|
|
18
19
|
import {
|
|
19
20
|
DynamicOperators,
|
|
20
21
|
numericField,
|
|
@@ -154,13 +155,25 @@ export function buildHealthcheckKind(
|
|
|
154
155
|
},
|
|
155
156
|
);
|
|
156
157
|
|
|
157
|
-
//
|
|
158
|
-
|
|
159
|
-
|
|
158
|
+
// Migrate-then-validate-strict: authored gitops YAML may be in an OLD
|
|
159
|
+
// config shape, so run the migration chain (assume-v1-on-read) before
|
|
160
|
+
// strict validation. Old-shape YAML still applies; genuine typos
|
|
161
|
+
// (unknown keys no migration accounts for) are still rejected. Shares the
|
|
162
|
+
// exact strict-validate path the `validateConfiguration` RPC uses, so the
|
|
163
|
+
// two agree on what counts as valid. A strategy config is always a plain
|
|
164
|
+
// object validated by the strategy's own schema, so narrowing the
|
|
165
|
+
// `unknown` result to the stored `Record` shape is safe.
|
|
166
|
+
const strategyResult = await validateVersionedConfigStrict({
|
|
167
|
+
config: strategy.config,
|
|
168
|
+
value: resolvedConfig,
|
|
169
|
+
basePath: ["config"],
|
|
170
|
+
});
|
|
171
|
+
if (!strategyResult.ok) {
|
|
160
172
|
throw new Error(
|
|
161
|
-
`Strategy "${spec.strategy}" config validation failed: ${
|
|
173
|
+
`Strategy "${spec.strategy}" config validation failed: ${formatIssues(strategyResult.issues)}`,
|
|
162
174
|
);
|
|
163
175
|
}
|
|
176
|
+
const migratedConfig = strategyResult.value as Record<string, unknown>;
|
|
164
177
|
|
|
165
178
|
// Resolve and validate collector configs using their registry schemas
|
|
166
179
|
const resolvedCollectors = spec.collectors
|
|
@@ -190,17 +203,30 @@ export function buildHealthcheckKind(
|
|
|
190
203
|
schema: registered.collector.config.schema,
|
|
191
204
|
});
|
|
192
205
|
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
206
|
+
// Migrate-then-validate-strict: authored gitops YAML may use an
|
|
207
|
+
// OLD collector config shape. Run the migration chain before
|
|
208
|
+
// strict validation so old-shape YAML still applies while
|
|
209
|
+
// genuine typos are still rejected. Shares the exact strict-
|
|
210
|
+
// validate path the `validateConfiguration` RPC uses. A collector
|
|
211
|
+
// config is always a plain object validated by the collector's
|
|
212
|
+
// schema, so narrowing the `unknown` result to the stored
|
|
213
|
+
// `Record` shape is safe.
|
|
214
|
+
const collectorResult = await validateVersionedConfigStrict({
|
|
215
|
+
config: registered.collector.config,
|
|
216
|
+
value: resolvedCollectorConfig,
|
|
217
|
+
basePath: ["config"],
|
|
218
|
+
});
|
|
219
|
+
if (!collectorResult.ok) {
|
|
198
220
|
throw new Error(
|
|
199
|
-
`Collector "${c.collectorId}" config validation failed: ${
|
|
221
|
+
`Collector "${c.collectorId}" config validation failed: ${formatIssues(collectorResult.issues)}`,
|
|
200
222
|
);
|
|
201
223
|
}
|
|
224
|
+
const migratedCollectorConfig = collectorResult.value as Record<
|
|
225
|
+
string,
|
|
226
|
+
unknown
|
|
227
|
+
>;
|
|
202
228
|
|
|
203
|
-
return { ...c, config:
|
|
229
|
+
return { ...c, config: migratedCollectorConfig };
|
|
204
230
|
}),
|
|
205
231
|
)
|
|
206
232
|
: undefined;
|
|
@@ -212,7 +238,7 @@ export function buildHealthcheckKind(
|
|
|
212
238
|
await service.updateConfiguration(existingEntityId, {
|
|
213
239
|
name: displayName,
|
|
214
240
|
strategyId: spec.strategy,
|
|
215
|
-
config:
|
|
241
|
+
config: migratedConfig,
|
|
216
242
|
intervalSeconds: spec.intervalSeconds,
|
|
217
243
|
collectors: resolvedCollectors?.map((c) => ({
|
|
218
244
|
id: c.collectorId,
|
|
@@ -230,7 +256,7 @@ export function buildHealthcheckKind(
|
|
|
230
256
|
const config = await service.createConfiguration({
|
|
231
257
|
name: displayName,
|
|
232
258
|
strategyId: spec.strategy,
|
|
233
|
-
config:
|
|
259
|
+
config: migratedConfig,
|
|
234
260
|
intervalSeconds: spec.intervalSeconds,
|
|
235
261
|
collectors: resolvedCollectors?.map((c) => ({
|
|
236
262
|
id: c.collectorId,
|
|
@@ -517,6 +543,23 @@ export function registerHealthcheckGitOpsDocumentation({
|
|
|
517
543
|
}
|
|
518
544
|
}
|
|
519
545
|
|
|
546
|
+
/**
|
|
547
|
+
* Render the structured issues from {@link validateVersionedConfigStrict} into
|
|
548
|
+
* a single human-readable message for the thrown GitOps reconcile error,
|
|
549
|
+
* preserving the per-field path (e.g. `config.url: Invalid url`).
|
|
550
|
+
*/
|
|
551
|
+
function formatIssues(
|
|
552
|
+
issues: Array<{ path: Array<string | number>; message: string }>,
|
|
553
|
+
): string {
|
|
554
|
+
return issues
|
|
555
|
+
.map((issue) =>
|
|
556
|
+
issue.path.length > 0
|
|
557
|
+
? `${issue.path.join(".")}: ${issue.message}`
|
|
558
|
+
: issue.message,
|
|
559
|
+
)
|
|
560
|
+
.join("; ");
|
|
561
|
+
}
|
|
562
|
+
|
|
520
563
|
function unwrapZodType(type: z.ZodTypeAny): z.ZodTypeAny {
|
|
521
564
|
let current = type;
|
|
522
565
|
while (current) {
|
package/src/index.ts
CHANGED
|
@@ -17,6 +17,12 @@ import {
|
|
|
17
17
|
NotificationApi,
|
|
18
18
|
specToRegistration,
|
|
19
19
|
} from "@checkstack/notification-common";
|
|
20
|
+
import {
|
|
21
|
+
aiToolExtensionPoint,
|
|
22
|
+
aiToolProjectionExtensionPoint,
|
|
23
|
+
deferredProjectionExecute,
|
|
24
|
+
} from "@checkstack/ai-backend";
|
|
25
|
+
import { buildHealthcheckAiTools } from "./ai/register-ai-tools";
|
|
20
26
|
import {
|
|
21
27
|
createBackendPlugin,
|
|
22
28
|
coreServices,
|
|
@@ -198,6 +204,7 @@ export default createBackendPlugin({
|
|
|
198
204
|
cacheManager: coreServices.cacheManager,
|
|
199
205
|
config: coreServices.config,
|
|
200
206
|
secretResolver: secretResolverRef,
|
|
207
|
+
advisoryLock: coreServices.advisoryLock,
|
|
201
208
|
},
|
|
202
209
|
// Phase 2: Register router and setup worker
|
|
203
210
|
init: async ({
|
|
@@ -212,6 +219,7 @@ export default createBackendPlugin({
|
|
|
212
219
|
cacheManager,
|
|
213
220
|
config,
|
|
214
221
|
secretResolver,
|
|
222
|
+
advisoryLock,
|
|
215
223
|
}) => {
|
|
216
224
|
logger.debug("🏥 Initializing Health Check Backend...");
|
|
217
225
|
|
|
@@ -232,6 +240,30 @@ export default createBackendPlugin({
|
|
|
232
240
|
collectorRegistry,
|
|
233
241
|
);
|
|
234
242
|
|
|
243
|
+
// Register this plugin's AI tools (propose/update/delete) into the AI
|
|
244
|
+
// registry via the extension point - owned here, not in ai-backend.
|
|
245
|
+
const aiToolExt = env.getExtensionPoint(aiToolExtensionPoint);
|
|
246
|
+
for (const tool of buildHealthcheckAiTools()) {
|
|
247
|
+
aiToolExt.registerTool(tool, pluginMetadata);
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
// Expose this plugin's OWN read-only AI projection of the existing
|
|
251
|
+
// `getConfigurations` query via aiToolProjectionExtensionPoint - owned
|
|
252
|
+
// here, not in ai-backend. The projected read tool is routed by the
|
|
253
|
+
// transport (MCP / chat) AS the principal, so `getConfigurations`'
|
|
254
|
+
// own contract access rules gate it; `deferredProjectionExecute` is
|
|
255
|
+
// the fail-closed net if a transport ever forgot to route.
|
|
256
|
+
env.getExtensionPoint(aiToolProjectionExtensionPoint).expose({
|
|
257
|
+
procedure: healthCheckContract.getConfigurations,
|
|
258
|
+
sourcePluginMetadata: pluginMetadata,
|
|
259
|
+
procedureKey: "getConfigurations",
|
|
260
|
+
name: "healthcheck.status",
|
|
261
|
+
description:
|
|
262
|
+
"List health-check configurations and their current status. Read-only.",
|
|
263
|
+
effect: "read",
|
|
264
|
+
execute: deferredProjectionExecute,
|
|
265
|
+
});
|
|
266
|
+
|
|
235
267
|
// Create catalog client for notification delegation
|
|
236
268
|
const catalogClient = rpcClient.forPlugin(CatalogApi);
|
|
237
269
|
|
|
@@ -258,6 +290,7 @@ export default createBackendPlugin({
|
|
|
258
290
|
await setupHealthCheckWorker({
|
|
259
291
|
notificationClient,
|
|
260
292
|
db: database,
|
|
293
|
+
advisoryLock,
|
|
261
294
|
registry: healthCheckRegistry,
|
|
262
295
|
collectorRegistry,
|
|
263
296
|
logger,
|