@checkstack/healthcheck-backend 1.4.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. package/CHANGELOG.md +303 -0
  2. package/drizzle/0018_abnormal_preak.sql +10 -0
  3. package/drizzle/meta/0018_snapshot.json +600 -0
  4. package/drizzle/meta/_journal.json +7 -0
  5. package/package.json +26 -21
  6. package/src/ai/assertion-validation.test.ts +117 -0
  7. package/src/ai/assertion-validation.ts +147 -0
  8. package/src/ai/healthcheck-capabilities.test.ts +158 -0
  9. package/src/ai/healthcheck-capabilities.ts +217 -0
  10. package/src/ai/healthcheck-delete.test.ts +81 -0
  11. package/src/ai/healthcheck-delete.ts +81 -0
  12. package/src/ai/healthcheck-projection.test.ts +36 -0
  13. package/src/ai/healthcheck-propose.test.ts +268 -0
  14. package/src/ai/healthcheck-propose.ts +290 -0
  15. package/src/ai/healthcheck-script-tools.test.ts +93 -0
  16. package/src/ai/healthcheck-script-tools.ts +179 -0
  17. package/src/ai/healthcheck-update.test.ts +123 -0
  18. package/src/ai/healthcheck-update.ts +123 -0
  19. package/src/ai/notify-subscribers.test.ts +109 -0
  20. package/src/ai/notify-subscribers.ts +176 -0
  21. package/src/ai/register-ai-tools.test.ts +41 -0
  22. package/src/ai/register-ai-tools.ts +53 -0
  23. package/src/ai/shell-env-table.test.ts +47 -0
  24. package/src/automations.test.ts +2 -1
  25. package/src/automations.ts +9 -1
  26. package/src/collector-script-test.test.ts +53 -1
  27. package/src/collector-script-test.ts +59 -7
  28. package/src/effective-environments.test.ts +93 -0
  29. package/src/effective-environments.ts +64 -0
  30. package/src/health-entity-id.ts +57 -0
  31. package/src/health-entity.test.ts +405 -31
  32. package/src/health-entity.ts +99 -43
  33. package/src/health-state.ts +41 -4
  34. package/src/healthcheck-gitops-kinds.test.ts +95 -0
  35. package/src/healthcheck-gitops-kinds.ts +56 -13
  36. package/src/index.ts +33 -0
  37. package/src/migration-chain-contract.test.ts +57 -0
  38. package/src/queue-executor.test.ts +814 -0
  39. package/src/queue-executor.ts +342 -50
  40. package/src/realtime-aggregation.test.ts +30 -0
  41. package/src/realtime-aggregation.ts +16 -0
  42. package/src/retention-job.ts +167 -93
  43. package/src/retention-rollup.test.ts +118 -0
  44. package/src/router.test.ts +120 -1
  45. package/src/router.ts +20 -0
  46. package/src/schema.ts +44 -6
  47. package/src/service.ts +199 -43
  48. package/src/state-evaluator.test.ts +50 -5
  49. package/src/state-evaluator.ts +9 -2
  50. package/src/state-transitions.test.ts +104 -0
  51. package/src/state-transitions.ts +39 -1
  52. package/src/validate-configuration.test.ts +205 -0
  53. package/src/validate-configuration.ts +159 -0
  54. package/tsconfig.json +9 -0
@@ -23,7 +23,7 @@
23
23
  */
24
24
  import { z } from "zod";
25
25
  import { HealthCheckStatusSchema } from "@checkstack/healthcheck-common";
26
- import { withXactLock, type SafeDatabase } from "@checkstack/backend-api";
26
+ import type { AdvisoryLockService } from "@checkstack/backend-api";
27
27
  import type {
28
28
  EntityChangeDeriver,
29
29
  EntityChangePayloadMapper,
@@ -31,13 +31,9 @@ import type {
31
31
  EntityRead,
32
32
  } from "@checkstack/automation-backend";
33
33
  import type { HealthCheckService } from "./service";
34
- import * as schema from "./schema";
35
- // Re-export the change type through automation-backend's barrel (it
36
- // re-exports it from automation-common) so this domain needs no extra dep.
34
+ import { parseHealthEntityId } from "./health-entity-id";
37
35
 
38
- type Db = SafeDatabase<typeof schema>;
39
-
40
- /** Entity kind id for the per-system aggregated health. */
36
+ /** Entity kind id for the aggregated health (system rollup + per-environment). */
41
37
  export const HEALTH_ENTITY_KIND = "health";
42
38
 
43
39
  /**
@@ -126,15 +122,23 @@ function readNumber(
126
122
  * Restores the keys operators read (`trigger.payload.systemId`,
127
123
  * `.previousStatus`, …) that the generic change shape omits.
128
124
  *
129
- * `systemId` is the entity id; `previousStatus` is `prev.status` and `newStatus`
130
- * is `next.status`; `healthyChecks` / `totalChecks` come from `next`;
131
- * `timestamp` is the change's `occurredAt`. `systemName` is not derivable from a
132
- * health change (it lives in the catalog) and is OPTIONAL on the schemas, so it
133
- * is omitted.
125
+ * The entity id is now env-qualified (Phase 3b): `payload.systemId` is ALWAYS
126
+ * the systemId portion (so existing automations reading `trigger.payload.systemId`
127
+ * are unaffected — the rollup carries the bare systemId), and the NEW optional
128
+ * `payload.environmentId` is the env portion — present only for a per-environment
129
+ * change, absent (undefined) for the system rollup. `previousStatus` is
130
+ * `prev.status` and `newStatus` is `next.status`; `healthyChecks` / `totalChecks`
131
+ * come from `next`; `timestamp` is the change's `occurredAt`. `systemName` is not
132
+ * derivable from a health change (it lives in the catalog) and is OPTIONAL on the
133
+ * schemas, so it is omitted.
134
134
  */
135
135
  export const healthChangeToPayload: EntityChangePayloadMapper = (changed) => {
136
+ const { systemId, environmentId } = parseHealthEntityId(changed.id);
136
137
  return {
137
- systemId: changed.id,
138
+ systemId,
139
+ // Present only for a per-env change; omitted for the rollup so the field
140
+ // is `undefined` (the optional schema accepts both).
141
+ ...(environmentId === null ? {} : { environmentId }),
138
142
  previousStatus: readStatus(changed.prev) ?? undefined,
139
143
  newStatus: readStatus(changed.next) ?? undefined,
140
144
  healthyChecks: readNumber(changed.next, "healthyChecks") ?? 0,
@@ -157,6 +161,12 @@ export const healthChangeToPayload: EntityChangePayloadMapper = (changed) => {
157
161
  */
158
162
  export interface HealthChangeClassification {
159
163
  systemId: string;
164
+ /**
165
+ * The environment portion of the entity id (Phase 3b). `null` for the
166
+ * system rollup change; the env id for a per-environment change. Cross-plugin
167
+ * consumers that only care about the system (SLO / dependency) can ignore it.
168
+ */
169
+ environmentId: string | null;
160
170
  previousStatus: string | null;
161
171
  newStatus: string | null;
162
172
  degraded: boolean;
@@ -168,6 +178,7 @@ export function classifyHealthChange(changed: {
168
178
  prev: Record<string, unknown> | null;
169
179
  next: Record<string, unknown> | null;
170
180
  }): HealthChangeClassification {
181
+ const { systemId, environmentId } = parseHealthEntityId(changed.id);
171
182
  const previousStatus = readStatus(changed.prev);
172
183
  const newStatus = readStatus(changed.next);
173
184
  const bothPresent = previousStatus !== null && newStatus !== null;
@@ -176,7 +187,8 @@ export function classifyHealthChange(changed: {
176
187
  const recovered =
177
188
  bothPresent && newStatus === "healthy" && previousStatus !== "healthy";
178
189
  return {
179
- systemId: changed.id,
190
+ systemId,
191
+ environmentId,
180
192
  previousStatus,
181
193
  newStatus,
182
194
  degraded,
@@ -214,9 +226,17 @@ export function classifyHealthChange(changed: {
214
226
  export async function computeHealthEntityState(args: {
215
227
  service: HealthCheckService;
216
228
  systemId: string;
229
+ /**
230
+ * Environment to compute the view for (Phase 3b). `undefined` = the SYSTEM
231
+ * ROLLUP (worst status across all environments + env-less runs — the
232
+ * all-runs aggregate, §7.4.2). `null` = the env-less slice. A string = that
233
+ * environment's per-env view. The existence gate (`checkStatuses.length`) is
234
+ * env-independent, so a per-env view and the rollup agree on totalChecks.
235
+ */
236
+ environmentId?: string | null;
217
237
  }): Promise<HealthEntityState | undefined> {
218
- const { service, systemId } = args;
219
- const overview = await service.getSystemHealthStatus(systemId);
238
+ const { service, systemId, environmentId } = args;
239
+ const overview = await service.getSystemHealthStatus(systemId, environmentId);
220
240
  // No enabled check associations ⇒ no health entity for this system.
221
241
  if (overview.checkStatuses.length === 0) return undefined;
222
242
  return {
@@ -229,10 +249,16 @@ export async function computeHealthEntityState(args: {
229
249
 
230
250
  /**
231
251
  * Build the PLUGIN-BACKED + COMPUTED `read` accessor for the `health` entity.
232
- * For each systemId, assembles the view via {@link computeHealthEntityState}
233
- * (systems with no runs omitted). This is the single source of truth that
234
- * `handle.mutate` snapshots `prev` from and `get`/`getMany`/scope enrichment
235
- * route through — no framework `entity_state` storage.
252
+ *
253
+ * Env-aware id parsing (Phase 3b, §7.4.2): each incoming id is parsed via
254
+ * {@link parseHealthEntityId}. A BARE `"<systemId>"` resolves the SYSTEM
255
+ * ROLLUP; a `"<systemId>::<environmentId>"` resolves that environment's
256
+ * per-env view. The result is keyed by the ORIGINAL id, so the reactive
257
+ * engine, `getMany`, and scope enrichment all see the right view for the id
258
+ * they asked for. Systems with no enabled check associations are omitted
259
+ * (existence gate). No framework `entity_state` storage — compute-on-read from
260
+ * the durable, env-keyed `health_check_runs`, so a read returns the same answer
261
+ * on every pod (state-and-scale).
236
262
  */
237
263
  export function createHealthEntityRead(deps: {
238
264
  service: HealthCheckService;
@@ -242,9 +268,20 @@ export function createHealthEntityRead(deps: {
242
268
  if (ids.length === 0) return {};
243
269
  const out: Record<string, HealthEntityState> = {};
244
270
  await Promise.all(
245
- ids.map(async (systemId) => {
246
- const state = await computeHealthEntityState({ service, systemId });
247
- if (state) out[systemId] = state;
271
+ ids.map(async (id) => {
272
+ const { systemId, environmentId } = parseHealthEntityId(id);
273
+ const state = await computeHealthEntityState({
274
+ service,
275
+ systemId,
276
+ // A bare `<systemId>` id is the ROLLUP: `parseHealthEntityId`
277
+ // returns `environmentId: null` for it (so the payload mapper can
278
+ // tell "rollup → omit environmentId"), but the rollup must read ALL
279
+ // runs — `undefined` — NOT the env-less slice (`null`, which filters
280
+ // to `env_id IS NULL`). Reserve `null` for an explicit env-less
281
+ // read; map the rollup's null to undefined here.
282
+ environmentId: environmentId === null ? undefined : environmentId,
283
+ });
284
+ if (state) out[id] = state;
248
285
  }),
249
286
  );
250
287
  return out;
@@ -298,19 +335,28 @@ export function createHealthEntityRead(deps: {
298
335
  */
299
336
  export async function writeHealthEntity(args: {
300
337
  handle: EntityHandle<HealthEntityState> | undefined;
301
- systemId: string;
338
+ /**
339
+ * The `health` entity id to mutate (Phase 3b): the env-qualified
340
+ * `"<systemId>::<environmentId>"` for a per-env write, or the bare
341
+ * `"<systemId>"` for the env-less / system-rollup write. This is the id the
342
+ * framework diffs/emits, so it drives both the per-env and rollup
343
+ * `ENTITY_CHANGED`.
344
+ */
345
+ entityId: string;
302
346
  apply: () => Promise<HealthEntityState>;
303
347
  onError?: (error: unknown) => void;
304
348
  /**
305
- * Optional per-`systemId` critical section wrapping the snapshot-prev +
349
+ * Optional per-`entityId` critical section wrapping the snapshot-prev +
306
350
  * apply + diff + emit. The executor supplies a transaction-scoped advisory
307
- * lock (`withXactLock`, key `health:<systemId>`) so concurrent evaluations
308
- * of one system can't double-emit a single logical transition. Identity by
309
- * default (no serialization) for the unbound-handle / test paths.
351
+ * lock (`withXactLock`, key `health:<entityId>`) so concurrent evaluations
352
+ * of one (system, environment) — or of the rollup — can't double-emit a
353
+ * single logical transition, and per-env + rollup writes serialize against
354
+ * their OWN keys (distinct envs / the rollup don't block each other).
355
+ * Identity by default (no serialization) for the unbound-handle / test paths.
310
356
  */
311
357
  serialize?: <T>(fn: () => Promise<T>) => Promise<T>;
312
358
  }): Promise<HealthEntityState> {
313
- const { handle, systemId, apply, onError, serialize } = args;
359
+ const { handle, entityId, apply, onError, serialize } = args;
314
360
  if (!handle) {
315
361
  // No reactivity bound — run the durable write directly.
316
362
  return apply();
@@ -323,7 +369,7 @@ export async function writeHealthEntity(args: {
323
369
  // call, and we wrap that whole call so two concurrent evals serialize.
324
370
  return await run(() =>
325
371
  handle.mutate({
326
- id: systemId,
372
+ id: entityId,
327
373
  apply: async () => {
328
374
  durableState = await apply();
329
375
  return durableState;
@@ -340,19 +386,26 @@ export async function writeHealthEntity(args: {
340
386
  }
341
387
  }
342
388
 
343
- /** Advisory-lock key namespace for the per-system health critical section. */
344
- export function healthSystemLockKey(systemId: string): string {
345
- return `health:${systemId}`;
389
+ /**
390
+ * Advisory-lock key namespace for the per-entity health critical section. The
391
+ * argument is the FULL `health` entity id (Phase 3b): the bare `"<systemId>"`
392
+ * for the rollup or `"<systemId>::<environmentId>"` for a per-env write. Two
393
+ * different envs (or an env vs the rollup) get DIFFERENT keys, so they
394
+ * serialize independently and never block each other.
395
+ */
396
+ export function healthEntityLockKey(entityId: string): string {
397
+ return `health:${entityId}`;
346
398
  }
347
399
 
348
400
  /**
349
- * Build the per-`systemId` serializer for {@link writeHealthEntity} backed by
401
+ * Build the per-`entityId` serializer for {@link writeHealthEntity} backed by
350
402
  * a transaction-scoped advisory lock (`withXactLock`, key
351
- * `health:<systemId>`). The returned function blocks until it holds the
352
- * system's lock, runs `fn` (the whole snapshot-prev + apply + diff + emit), and
403
+ * `health:<entityId>`). The returned function blocks until it holds the
404
+ * entity's lock, runs `fn` (the whole snapshot-prev + apply + diff + emit), and
353
405
  * auto-releases the lock at COMMIT/ROLLBACK. Two concurrent evaluations of one
354
- * system therefore serialize — exactly one logical `healthy → degraded`
355
- * transition emits exactly one `ENTITY_CHANGED` + one transition row.
406
+ * (system, environment) — or of the rollup — therefore serialize, while
407
+ * distinct envs proceed in parallel. Exactly one logical transition per entity
408
+ * emits exactly one `ENTITY_CHANGED` + one transition row.
356
409
  *
357
410
  * `fn` does its own durable writes on the outer pool; the lock only gates
358
411
  * ENTRY to the critical section, so its connection affinity is irrelevant —
@@ -360,10 +413,13 @@ export function healthSystemLockKey(systemId: string): string {
360
413
  * commits.
361
414
  */
362
415
  export function createHealthEntitySerializer(deps: {
363
- db: Db;
364
- }): (systemId: string) => <T>(fn: () => Promise<T>) => Promise<T> {
365
- const { db } = deps;
366
- return (systemId) =>
416
+ advisoryLock: AdvisoryLockService;
417
+ }): (entityId: string) => <T>(fn: () => Promise<T>) => Promise<T> {
418
+ const { advisoryLock } = deps;
419
+ return (entityId) =>
367
420
  <T>(fn: () => Promise<T>) =>
368
- withXactLock({ db, key: healthSystemLockKey(systemId), fn: () => fn() });
421
+ advisoryLock.withXactLock({
422
+ key: healthEntityLockKey(entityId),
423
+ fn: () => fn(),
424
+ });
369
425
  }
@@ -1,4 +1,4 @@
1
- import { and, desc, eq, gte } from "drizzle-orm";
1
+ import { and, desc, eq, gte, isNull } from "drizzle-orm";
2
2
  import type { HealthCheckStatus } from "@checkstack/healthcheck-common";
3
3
  import type { Logger, SafeDatabase } from "@checkstack/backend-api";
4
4
  import type { InferClient } from "@checkstack/common";
@@ -122,15 +122,28 @@ export async function findLatestRun({
122
122
  db,
123
123
  systemId,
124
124
  configurationId,
125
+ environmentId,
125
126
  }: {
126
127
  db: Db;
127
128
  systemId: string;
128
129
  configurationId?: string;
130
+ /**
131
+ * Environment to scope the run lookup to (Phase 3b). `undefined` = any
132
+ * environment (rollup). `null` = env-less runs only. A string = that env.
133
+ */
134
+ environmentId?: string | null;
129
135
  }): Promise<{ latencyMs?: number; lastRunAt?: Date }> {
130
136
  const conditions = [eq(healthCheckRuns.systemId, systemId)];
131
137
  if (configurationId) {
132
138
  conditions.push(eq(healthCheckRuns.configurationId, configurationId));
133
139
  }
140
+ if (environmentId !== undefined) {
141
+ conditions.push(
142
+ environmentId === null
143
+ ? isNull(healthCheckRuns.environmentId)
144
+ : eq(healthCheckRuns.environmentId, environmentId),
145
+ );
146
+ }
134
147
 
135
148
  const [row] = await db
136
149
  .select({
@@ -161,12 +174,19 @@ export async function computeWindowedMetrics({
161
174
  db,
162
175
  systemId,
163
176
  configurationId,
177
+ environmentId,
164
178
  now = new Date(),
165
179
  windowHours = DEFAULT_METRICS_WINDOW_HOURS,
166
180
  }: {
167
181
  db: Db;
168
182
  systemId: string;
169
183
  configurationId?: string;
184
+ /**
185
+ * Environment to scope the windowed metrics to (Phase 3b). `undefined` =
186
+ * any environment (rollup). `null` = env-less aggregates only. A string =
187
+ * that environment's aggregate buckets only.
188
+ */
189
+ environmentId?: string | null;
170
190
  now?: Date;
171
191
  windowHours?: number;
172
192
  }): Promise<{
@@ -185,6 +205,13 @@ export async function computeWindowedMetrics({
185
205
  eq(healthCheckAggregates.configurationId, configurationId),
186
206
  );
187
207
  }
208
+ if (environmentId !== undefined) {
209
+ conditions.push(
210
+ environmentId === null
211
+ ? isNull(healthCheckAggregates.environmentId)
212
+ : eq(healthCheckAggregates.environmentId, environmentId),
213
+ );
214
+ }
188
215
 
189
216
  const buckets = await db
190
217
  .select({
@@ -284,6 +311,7 @@ export async function computeHealthState({
284
311
  db,
285
312
  systemId,
286
313
  configurationId,
314
+ environmentId,
287
315
  resolveStatus,
288
316
  maintenanceClient,
289
317
  logger,
@@ -293,6 +321,14 @@ export async function computeHealthState({
293
321
  db: Db;
294
322
  systemId: string;
295
323
  configurationId?: string;
324
+ /**
325
+ * Environment to scope EVERY durable read to (Phase 3b). `undefined` = the
326
+ * system rollup (all environments + env-less). `null` = the env-less slice.
327
+ * A string = that environment. `inStatusSince`, latest run, windowed
328
+ * metrics, and the transition count all narrow to this env so a per-env
329
+ * health snapshot reflects only that environment's runs/transitions.
330
+ */
331
+ environmentId?: string | null;
296
332
  /** Returns the aggregate status for the system (per-check when scoped). */
297
333
  resolveStatus: () => Promise<HealthCheckStatus>;
298
334
  maintenanceClient?: MaintenanceClient;
@@ -305,14 +341,15 @@ export async function computeHealthState({
305
341
 
306
342
  const [inStatusSince, latest, windowed, inMaintenance, transitionsInWindow] =
307
343
  await Promise.all([
308
- findInStatusSince({ db, systemId, status }),
309
- findLatestRun({ db, systemId, configurationId }),
310
- computeWindowedMetrics({ db, systemId, configurationId, now }),
344
+ findInStatusSince({ db, systemId, status, environmentId }),
345
+ findLatestRun({ db, systemId, configurationId, environmentId }),
346
+ computeWindowedMetrics({ db, systemId, configurationId, environmentId, now }),
311
347
  resolveInMaintenance({ maintenanceClient, systemId, logger }),
312
348
  countStateTransitionsInWindow({
313
349
  db,
314
350
  systemId,
315
351
  windowMinutes: transitionWindowMinutes,
352
+ environmentId,
316
353
  now,
317
354
  }),
318
355
  ]);
@@ -415,6 +415,101 @@ describe("Healthcheck GitOps Kind: Healthcheck", () => {
415
415
  ).rejects.toThrow(/config validation failed/);
416
416
  });
417
417
 
418
+ it("migrates an OLD-shape authored config forward and stores the migrated value", async () => {
419
+ // A strategy at version 2 whose v1->v2 migration drops a removed
420
+ // `legacyMode` key. Authored gitops YAML still in the v1 shape (carrying
421
+ // `legacyMode`) must be migrated forward and applied, not rejected.
422
+ const v2Schema = z.object({ host: z.string() });
423
+ const versionedStrategy = {
424
+ id: "postgres",
425
+ displayName: "PostgreSQL",
426
+ description: "test",
427
+ config: new Versioned({
428
+ version: 2,
429
+ schema: v2Schema,
430
+ migrations: [
431
+ {
432
+ fromVersion: 1,
433
+ toVersion: 2,
434
+ description: "Drop removed legacyMode key",
435
+ migrate: ({ legacyMode: _legacyMode, ...rest }: Record<string, unknown>) =>
436
+ rest,
437
+ },
438
+ ],
439
+ }),
440
+ };
441
+ mockHCRegistry.getStrategiesWithMeta = () =>
442
+ [
443
+ { strategy: versionedStrategy, ownerPluginId: "mock", qualifiedId: "postgres" },
444
+ ] as any;
445
+
446
+ const kind = buildKind();
447
+
448
+ const result = await kind.reconcile({
449
+ entity: {
450
+ apiVersion: CHECKSTACK_API_VERSION,
451
+ kind: "Healthcheck",
452
+ metadata: { name: "legacy-check" },
453
+ spec: {
454
+ strategy: "postgres",
455
+ intervalSeconds: 30,
456
+ // Old v1 shape: carries the now-removed `legacyMode`.
457
+ config: { host: "db.legacy", legacyMode: true },
458
+ },
459
+ },
460
+ context: mockContext,
461
+ });
462
+
463
+ expect(result.entityId).toBe("hc-1");
464
+ // The MIGRATED config (legacyMode dropped) is what gets stored.
465
+ expect(mockService.configs[0].config).toEqual({ host: "db.legacy" });
466
+ });
467
+
468
+ it("rejects a genuine typo the migration does not account for (strict)", async () => {
469
+ const v2Schema = z.object({ host: z.string() });
470
+ const versionedStrategy = {
471
+ id: "postgres",
472
+ displayName: "PostgreSQL",
473
+ description: "test",
474
+ config: new Versioned({
475
+ version: 2,
476
+ schema: v2Schema,
477
+ migrations: [
478
+ {
479
+ fromVersion: 1,
480
+ toVersion: 2,
481
+ description: "Drop removed legacyMode key",
482
+ migrate: ({ legacyMode: _legacyMode, ...rest }: Record<string, unknown>) =>
483
+ rest,
484
+ },
485
+ ],
486
+ }),
487
+ };
488
+ mockHCRegistry.getStrategiesWithMeta = () =>
489
+ [
490
+ { strategy: versionedStrategy, ownerPluginId: "mock", qualifiedId: "postgres" },
491
+ ] as any;
492
+
493
+ const kind = buildKind();
494
+
495
+ await expect(
496
+ kind.reconcile({
497
+ entity: {
498
+ apiVersion: CHECKSTACK_API_VERSION,
499
+ kind: "Healthcheck",
500
+ metadata: { name: "typo-check" },
501
+ spec: {
502
+ strategy: "postgres",
503
+ intervalSeconds: 30,
504
+ // `hsot` is a genuine typo no migration accounts for.
505
+ config: { host: "db.local", hsot: "oops" },
506
+ },
507
+ },
508
+ context: mockContext,
509
+ }),
510
+ ).rejects.toThrow(/config validation failed/);
511
+ });
512
+
418
513
  it("validates collector configs against collector registry schemas", async () => {
419
514
  const kind = buildKind();
420
515
 
@@ -15,6 +15,7 @@ import type {
15
15
  } from "@checkstack/backend-api";
16
16
  import { NotificationPolicySchema } from "@checkstack/healthcheck-common";
17
17
  import { HealthCheckService } from "./service";
18
+ import { validateVersionedConfigStrict } from "./validate-configuration";
18
19
  import {
19
20
  DynamicOperators,
20
21
  numericField,
@@ -154,13 +155,25 @@ export function buildHealthcheckKind(
154
155
  },
155
156
  );
156
157
 
157
- // Validate resolved config against strategy's Zod schema
158
- const configValidation = strategy.config.schema.safeParse(resolvedConfig);
159
- if (!configValidation.success) {
158
+ // Migrate-then-validate-strict: authored gitops YAML may be in an OLD
159
+ // config shape, so run the migration chain (assume-v1-on-read) before
160
+ // strict validation. Old-shape YAML still applies; genuine typos
161
+ // (unknown keys no migration accounts for) are still rejected. Shares the
162
+ // exact strict-validate path the `validateConfiguration` RPC uses, so the
163
+ // two agree on what counts as valid. A strategy config is always a plain
164
+ // object validated by the strategy's own schema, so narrowing the
165
+ // `unknown` result to the stored `Record` shape is safe.
166
+ const strategyResult = await validateVersionedConfigStrict({
167
+ config: strategy.config,
168
+ value: resolvedConfig,
169
+ basePath: ["config"],
170
+ });
171
+ if (!strategyResult.ok) {
160
172
  throw new Error(
161
- `Strategy "${spec.strategy}" config validation failed: ${configValidation.error.message}`,
173
+ `Strategy "${spec.strategy}" config validation failed: ${formatIssues(strategyResult.issues)}`,
162
174
  );
163
175
  }
176
+ const migratedConfig = strategyResult.value as Record<string, unknown>;
164
177
 
165
178
  // Resolve and validate collector configs using their registry schemas
166
179
  const resolvedCollectors = spec.collectors
@@ -190,17 +203,30 @@ export function buildHealthcheckKind(
190
203
  schema: registered.collector.config.schema,
191
204
  });
192
205
 
193
- const collectorConfigValidation =
194
- registered.collector.config.schema.safeParse(
195
- resolvedCollectorConfig,
196
- );
197
- if (!collectorConfigValidation.success) {
206
+ // Migrate-then-validate-strict: authored gitops YAML may use an
207
+ // OLD collector config shape. Run the migration chain before
208
+ // strict validation so old-shape YAML still applies while
209
+ // genuine typos are still rejected. Shares the exact strict-
210
+ // validate path the `validateConfiguration` RPC uses. A collector
211
+ // config is always a plain object validated by the collector's
212
+ // schema, so narrowing the `unknown` result to the stored
213
+ // `Record` shape is safe.
214
+ const collectorResult = await validateVersionedConfigStrict({
215
+ config: registered.collector.config,
216
+ value: resolvedCollectorConfig,
217
+ basePath: ["config"],
218
+ });
219
+ if (!collectorResult.ok) {
198
220
  throw new Error(
199
- `Collector "${c.collectorId}" config validation failed: ${collectorConfigValidation.error.message}`,
221
+ `Collector "${c.collectorId}" config validation failed: ${formatIssues(collectorResult.issues)}`,
200
222
  );
201
223
  }
224
+ const migratedCollectorConfig = collectorResult.value as Record<
225
+ string,
226
+ unknown
227
+ >;
202
228
 
203
- return { ...c, config: resolvedCollectorConfig };
229
+ return { ...c, config: migratedCollectorConfig };
204
230
  }),
205
231
  )
206
232
  : undefined;
@@ -212,7 +238,7 @@ export function buildHealthcheckKind(
212
238
  await service.updateConfiguration(existingEntityId, {
213
239
  name: displayName,
214
240
  strategyId: spec.strategy,
215
- config: resolvedConfig,
241
+ config: migratedConfig,
216
242
  intervalSeconds: spec.intervalSeconds,
217
243
  collectors: resolvedCollectors?.map((c) => ({
218
244
  id: c.collectorId,
@@ -230,7 +256,7 @@ export function buildHealthcheckKind(
230
256
  const config = await service.createConfiguration({
231
257
  name: displayName,
232
258
  strategyId: spec.strategy,
233
- config: resolvedConfig,
259
+ config: migratedConfig,
234
260
  intervalSeconds: spec.intervalSeconds,
235
261
  collectors: resolvedCollectors?.map((c) => ({
236
262
  id: c.collectorId,
@@ -517,6 +543,23 @@ export function registerHealthcheckGitOpsDocumentation({
517
543
  }
518
544
  }
519
545
 
546
+ /**
547
+ * Render the structured issues from {@link validateVersionedConfigStrict} into
548
+ * a single human-readable message for the thrown GitOps reconcile error,
549
+ * preserving the per-field path (e.g. `config.url: Invalid url`).
550
+ */
551
+ function formatIssues(
552
+ issues: Array<{ path: Array<string | number>; message: string }>,
553
+ ): string {
554
+ return issues
555
+ .map((issue) =>
556
+ issue.path.length > 0
557
+ ? `${issue.path.join(".")}: ${issue.message}`
558
+ : issue.message,
559
+ )
560
+ .join("; ");
561
+ }
562
+
520
563
  function unwrapZodType(type: z.ZodTypeAny): z.ZodTypeAny {
521
564
  let current = type;
522
565
  while (current) {
package/src/index.ts CHANGED
@@ -17,6 +17,12 @@ import {
17
17
  NotificationApi,
18
18
  specToRegistration,
19
19
  } from "@checkstack/notification-common";
20
+ import {
21
+ aiToolExtensionPoint,
22
+ aiToolProjectionExtensionPoint,
23
+ deferredProjectionExecute,
24
+ } from "@checkstack/ai-backend";
25
+ import { buildHealthcheckAiTools } from "./ai/register-ai-tools";
20
26
  import {
21
27
  createBackendPlugin,
22
28
  coreServices,
@@ -198,6 +204,7 @@ export default createBackendPlugin({
198
204
  cacheManager: coreServices.cacheManager,
199
205
  config: coreServices.config,
200
206
  secretResolver: secretResolverRef,
207
+ advisoryLock: coreServices.advisoryLock,
201
208
  },
202
209
  // Phase 2: Register router and setup worker
203
210
  init: async ({
@@ -212,6 +219,7 @@ export default createBackendPlugin({
212
219
  cacheManager,
213
220
  config,
214
221
  secretResolver,
222
+ advisoryLock,
215
223
  }) => {
216
224
  logger.debug("🏥 Initializing Health Check Backend...");
217
225
 
@@ -232,6 +240,30 @@ export default createBackendPlugin({
232
240
  collectorRegistry,
233
241
  );
234
242
 
243
+ // Register this plugin's AI tools (propose/update/delete) into the AI
244
+ // registry via the extension point - owned here, not in ai-backend.
245
+ const aiToolExt = env.getExtensionPoint(aiToolExtensionPoint);
246
+ for (const tool of buildHealthcheckAiTools()) {
247
+ aiToolExt.registerTool(tool, pluginMetadata);
248
+ }
249
+
250
+ // Expose this plugin's OWN read-only AI projection of the existing
251
+ // `getConfigurations` query via aiToolProjectionExtensionPoint - owned
252
+ // here, not in ai-backend. The projected read tool is routed by the
253
+ // transport (MCP / chat) AS the principal, so `getConfigurations`'
254
+ // own contract access rules gate it; `deferredProjectionExecute` is
255
+ // the fail-closed net if a transport ever forgot to route.
256
+ env.getExtensionPoint(aiToolProjectionExtensionPoint).expose({
257
+ procedure: healthCheckContract.getConfigurations,
258
+ sourcePluginMetadata: pluginMetadata,
259
+ procedureKey: "getConfigurations",
260
+ name: "healthcheck.status",
261
+ description:
262
+ "List health-check configurations and their current status. Read-only.",
263
+ effect: "read",
264
+ execute: deferredProjectionExecute,
265
+ });
266
+
235
267
  // Create catalog client for notification delegation
236
268
  const catalogClient = rpcClient.forPlugin(CatalogApi);
237
269
 
@@ -258,6 +290,7 @@ export default createBackendPlugin({
258
290
  await setupHealthCheckWorker({
259
291
  notificationClient,
260
292
  db: database,
293
+ advisoryLock,
261
294
  registry: healthCheckRegistry,
262
295
  collectorRegistry,
263
296
  logger,