@checkstack/healthcheck-backend 1.5.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/CHANGELOG.md +223 -0
  2. package/drizzle/0018_abnormal_preak.sql +10 -0
  3. package/drizzle/meta/0018_snapshot.json +600 -0
  4. package/drizzle/meta/_journal.json +7 -0
  5. package/package.json +26 -21
  6. package/src/ai/assertion-validation.test.ts +117 -0
  7. package/src/ai/assertion-validation.ts +147 -0
  8. package/src/ai/healthcheck-capabilities.test.ts +158 -0
  9. package/src/ai/healthcheck-capabilities.ts +217 -0
  10. package/src/ai/healthcheck-delete.test.ts +81 -0
  11. package/src/ai/healthcheck-delete.ts +81 -0
  12. package/src/ai/healthcheck-projection.test.ts +36 -0
  13. package/src/ai/healthcheck-propose.test.ts +268 -0
  14. package/src/ai/healthcheck-propose.ts +290 -0
  15. package/src/ai/healthcheck-script-tools.test.ts +93 -0
  16. package/src/ai/healthcheck-script-tools.ts +179 -0
  17. package/src/ai/healthcheck-update.test.ts +123 -0
  18. package/src/ai/healthcheck-update.ts +123 -0
  19. package/src/ai/notify-subscribers.test.ts +109 -0
  20. package/src/ai/notify-subscribers.ts +176 -0
  21. package/src/ai/register-ai-tools.test.ts +41 -0
  22. package/src/ai/register-ai-tools.ts +53 -0
  23. package/src/ai/shell-env-table.test.ts +47 -0
  24. package/src/automations.test.ts +2 -1
  25. package/src/automations.ts +9 -1
  26. package/src/collector-script-test.test.ts +53 -1
  27. package/src/collector-script-test.ts +59 -7
  28. package/src/effective-environments.test.ts +93 -0
  29. package/src/effective-environments.ts +64 -0
  30. package/src/health-entity-id.ts +57 -0
  31. package/src/health-entity.test.ts +384 -6
  32. package/src/health-entity.ts +93 -35
  33. package/src/health-state.ts +41 -4
  34. package/src/healthcheck-gitops-kinds.test.ts +95 -0
  35. package/src/healthcheck-gitops-kinds.ts +56 -13
  36. package/src/index.ts +30 -0
  37. package/src/migration-chain-contract.test.ts +57 -0
  38. package/src/queue-executor.test.ts +801 -0
  39. package/src/queue-executor.ts +336 -52
  40. package/src/realtime-aggregation.test.ts +30 -0
  41. package/src/realtime-aggregation.ts +16 -0
  42. package/src/retention-job.ts +167 -93
  43. package/src/retention-rollup.test.ts +118 -0
  44. package/src/router.test.ts +120 -1
  45. package/src/router.ts +20 -0
  46. package/src/schema.ts +44 -6
  47. package/src/service.ts +199 -43
  48. package/src/state-transitions.test.ts +104 -0
  49. package/src/state-transitions.ts +39 -1
  50. package/src/validate-configuration.test.ts +205 -0
  51. package/src/validate-configuration.ts +159 -0
  52. package/tsconfig.json +9 -0
@@ -1,4 +1,4 @@
1
- import { and, desc, eq, gte } from "drizzle-orm";
1
+ import { and, desc, eq, gte, isNull } from "drizzle-orm";
2
2
  import type { HealthCheckStatus } from "@checkstack/healthcheck-common";
3
3
  import type { Logger, SafeDatabase } from "@checkstack/backend-api";
4
4
  import type { InferClient } from "@checkstack/common";
@@ -122,15 +122,28 @@ export async function findLatestRun({
122
122
  db,
123
123
  systemId,
124
124
  configurationId,
125
+ environmentId,
125
126
  }: {
126
127
  db: Db;
127
128
  systemId: string;
128
129
  configurationId?: string;
130
+ /**
131
+ * Environment to scope the run lookup to (Phase 3b). `undefined` = any
132
+ * environment (rollup). `null` = env-less runs only. A string = that env.
133
+ */
134
+ environmentId?: string | null;
129
135
  }): Promise<{ latencyMs?: number; lastRunAt?: Date }> {
130
136
  const conditions = [eq(healthCheckRuns.systemId, systemId)];
131
137
  if (configurationId) {
132
138
  conditions.push(eq(healthCheckRuns.configurationId, configurationId));
133
139
  }
140
+ if (environmentId !== undefined) {
141
+ conditions.push(
142
+ environmentId === null
143
+ ? isNull(healthCheckRuns.environmentId)
144
+ : eq(healthCheckRuns.environmentId, environmentId),
145
+ );
146
+ }
134
147
 
135
148
  const [row] = await db
136
149
  .select({
@@ -161,12 +174,19 @@ export async function computeWindowedMetrics({
161
174
  db,
162
175
  systemId,
163
176
  configurationId,
177
+ environmentId,
164
178
  now = new Date(),
165
179
  windowHours = DEFAULT_METRICS_WINDOW_HOURS,
166
180
  }: {
167
181
  db: Db;
168
182
  systemId: string;
169
183
  configurationId?: string;
184
+ /**
185
+ * Environment to scope the windowed metrics to (Phase 3b). `undefined` =
186
+ * any environment (rollup). `null` = env-less aggregates only. A string =
187
+ * that environment's aggregate buckets only.
188
+ */
189
+ environmentId?: string | null;
170
190
  now?: Date;
171
191
  windowHours?: number;
172
192
  }): Promise<{
@@ -185,6 +205,13 @@ export async function computeWindowedMetrics({
185
205
  eq(healthCheckAggregates.configurationId, configurationId),
186
206
  );
187
207
  }
208
+ if (environmentId !== undefined) {
209
+ conditions.push(
210
+ environmentId === null
211
+ ? isNull(healthCheckAggregates.environmentId)
212
+ : eq(healthCheckAggregates.environmentId, environmentId),
213
+ );
214
+ }
188
215
 
189
216
  const buckets = await db
190
217
  .select({
@@ -284,6 +311,7 @@ export async function computeHealthState({
284
311
  db,
285
312
  systemId,
286
313
  configurationId,
314
+ environmentId,
287
315
  resolveStatus,
288
316
  maintenanceClient,
289
317
  logger,
@@ -293,6 +321,14 @@ export async function computeHealthState({
293
321
  db: Db;
294
322
  systemId: string;
295
323
  configurationId?: string;
324
+ /**
325
+ * Environment to scope EVERY durable read to (Phase 3b). `undefined` = the
326
+ * system rollup (all environments + env-less). `null` = the env-less slice.
327
+ * A string = that environment. `inStatusSince`, latest run, windowed
328
+ * metrics, and the transition count all narrow to this env so a per-env
329
+ * health snapshot reflects only that environment's runs/transitions.
330
+ */
331
+ environmentId?: string | null;
296
332
  /** Returns the aggregate status for the system (per-check when scoped). */
297
333
  resolveStatus: () => Promise<HealthCheckStatus>;
298
334
  maintenanceClient?: MaintenanceClient;
@@ -305,14 +341,15 @@ export async function computeHealthState({
305
341
 
306
342
  const [inStatusSince, latest, windowed, inMaintenance, transitionsInWindow] =
307
343
  await Promise.all([
308
- findInStatusSince({ db, systemId, status }),
309
- findLatestRun({ db, systemId, configurationId }),
310
- computeWindowedMetrics({ db, systemId, configurationId, now }),
344
+ findInStatusSince({ db, systemId, status, environmentId }),
345
+ findLatestRun({ db, systemId, configurationId, environmentId }),
346
+ computeWindowedMetrics({ db, systemId, configurationId, environmentId, now }),
311
347
  resolveInMaintenance({ maintenanceClient, systemId, logger }),
312
348
  countStateTransitionsInWindow({
313
349
  db,
314
350
  systemId,
315
351
  windowMinutes: transitionWindowMinutes,
352
+ environmentId,
316
353
  now,
317
354
  }),
318
355
  ]);
@@ -415,6 +415,101 @@ describe("Healthcheck GitOps Kind: Healthcheck", () => {
415
415
  ).rejects.toThrow(/config validation failed/);
416
416
  });
417
417
 
418
+ it("migrates an OLD-shape authored config forward and stores the migrated value", async () => {
419
+ // A strategy at version 2 whose v1->v2 migration drops a removed
420
+ // `legacyMode` key. Authored gitops YAML still in the v1 shape (carrying
421
+ // `legacyMode`) must be migrated forward and applied, not rejected.
422
+ const v2Schema = z.object({ host: z.string() });
423
+ const versionedStrategy = {
424
+ id: "postgres",
425
+ displayName: "PostgreSQL",
426
+ description: "test",
427
+ config: new Versioned({
428
+ version: 2,
429
+ schema: v2Schema,
430
+ migrations: [
431
+ {
432
+ fromVersion: 1,
433
+ toVersion: 2,
434
+ description: "Drop removed legacyMode key",
435
+ migrate: ({ legacyMode: _legacyMode, ...rest }: Record<string, unknown>) =>
436
+ rest,
437
+ },
438
+ ],
439
+ }),
440
+ };
441
+ mockHCRegistry.getStrategiesWithMeta = () =>
442
+ [
443
+ { strategy: versionedStrategy, ownerPluginId: "mock", qualifiedId: "postgres" },
444
+ ] as any;
445
+
446
+ const kind = buildKind();
447
+
448
+ const result = await kind.reconcile({
449
+ entity: {
450
+ apiVersion: CHECKSTACK_API_VERSION,
451
+ kind: "Healthcheck",
452
+ metadata: { name: "legacy-check" },
453
+ spec: {
454
+ strategy: "postgres",
455
+ intervalSeconds: 30,
456
+ // Old v1 shape: carries the now-removed `legacyMode`.
457
+ config: { host: "db.legacy", legacyMode: true },
458
+ },
459
+ },
460
+ context: mockContext,
461
+ });
462
+
463
+ expect(result.entityId).toBe("hc-1");
464
+ // The MIGRATED config (legacyMode dropped) is what gets stored.
465
+ expect(mockService.configs[0].config).toEqual({ host: "db.legacy" });
466
+ });
467
+
468
+ it("rejects a genuine typo the migration does not account for (strict)", async () => {
469
+ const v2Schema = z.object({ host: z.string() });
470
+ const versionedStrategy = {
471
+ id: "postgres",
472
+ displayName: "PostgreSQL",
473
+ description: "test",
474
+ config: new Versioned({
475
+ version: 2,
476
+ schema: v2Schema,
477
+ migrations: [
478
+ {
479
+ fromVersion: 1,
480
+ toVersion: 2,
481
+ description: "Drop removed legacyMode key",
482
+ migrate: ({ legacyMode: _legacyMode, ...rest }: Record<string, unknown>) =>
483
+ rest,
484
+ },
485
+ ],
486
+ }),
487
+ };
488
+ mockHCRegistry.getStrategiesWithMeta = () =>
489
+ [
490
+ { strategy: versionedStrategy, ownerPluginId: "mock", qualifiedId: "postgres" },
491
+ ] as any;
492
+
493
+ const kind = buildKind();
494
+
495
+ await expect(
496
+ kind.reconcile({
497
+ entity: {
498
+ apiVersion: CHECKSTACK_API_VERSION,
499
+ kind: "Healthcheck",
500
+ metadata: { name: "typo-check" },
501
+ spec: {
502
+ strategy: "postgres",
503
+ intervalSeconds: 30,
504
+ // `hsot` is a genuine typo no migration accounts for.
505
+ config: { host: "db.local", hsot: "oops" },
506
+ },
507
+ },
508
+ context: mockContext,
509
+ }),
510
+ ).rejects.toThrow(/config validation failed/);
511
+ });
512
+
418
513
  it("validates collector configs against collector registry schemas", async () => {
419
514
  const kind = buildKind();
420
515
 
@@ -15,6 +15,7 @@ import type {
15
15
  } from "@checkstack/backend-api";
16
16
  import { NotificationPolicySchema } from "@checkstack/healthcheck-common";
17
17
  import { HealthCheckService } from "./service";
18
+ import { validateVersionedConfigStrict } from "./validate-configuration";
18
19
  import {
19
20
  DynamicOperators,
20
21
  numericField,
@@ -154,13 +155,25 @@ export function buildHealthcheckKind(
154
155
  },
155
156
  );
156
157
 
157
- // Validate resolved config against strategy's Zod schema
158
- const configValidation = strategy.config.schema.safeParse(resolvedConfig);
159
- if (!configValidation.success) {
158
+ // Migrate-then-validate-strict: authored gitops YAML may be in an OLD
159
+ // config shape, so run the migration chain (assume-v1-on-read) before
160
+ // strict validation. Old-shape YAML still applies; genuine typos
161
+ // (unknown keys no migration accounts for) are still rejected. Shares the
162
+ // exact strict-validate path the `validateConfiguration` RPC uses, so the
163
+ // two agree on what counts as valid. A strategy config is always a plain
164
+ // object validated by the strategy's own schema, so narrowing the
165
+ // `unknown` result to the stored `Record` shape is safe.
166
+ const strategyResult = await validateVersionedConfigStrict({
167
+ config: strategy.config,
168
+ value: resolvedConfig,
169
+ basePath: ["config"],
170
+ });
171
+ if (!strategyResult.ok) {
160
172
  throw new Error(
161
- `Strategy "${spec.strategy}" config validation failed: ${configValidation.error.message}`,
173
+ `Strategy "${spec.strategy}" config validation failed: ${formatIssues(strategyResult.issues)}`,
162
174
  );
163
175
  }
176
+ const migratedConfig = strategyResult.value as Record<string, unknown>;
164
177
 
165
178
  // Resolve and validate collector configs using their registry schemas
166
179
  const resolvedCollectors = spec.collectors
@@ -190,17 +203,30 @@ export function buildHealthcheckKind(
190
203
  schema: registered.collector.config.schema,
191
204
  });
192
205
 
193
- const collectorConfigValidation =
194
- registered.collector.config.schema.safeParse(
195
- resolvedCollectorConfig,
196
- );
197
- if (!collectorConfigValidation.success) {
206
+ // Migrate-then-validate-strict: authored gitops YAML may use an
207
+ // OLD collector config shape. Run the migration chain before
208
+ // strict validation so old-shape YAML still applies while
209
+ // genuine typos are still rejected. Shares the exact strict-
210
+ // validate path the `validateConfiguration` RPC uses. A collector
211
+ // config is always a plain object validated by the collector's
212
+ // schema, so narrowing the `unknown` result to the stored
213
+ // `Record` shape is safe.
214
+ const collectorResult = await validateVersionedConfigStrict({
215
+ config: registered.collector.config,
216
+ value: resolvedCollectorConfig,
217
+ basePath: ["config"],
218
+ });
219
+ if (!collectorResult.ok) {
198
220
  throw new Error(
199
- `Collector "${c.collectorId}" config validation failed: ${collectorConfigValidation.error.message}`,
221
+ `Collector "${c.collectorId}" config validation failed: ${formatIssues(collectorResult.issues)}`,
200
222
  );
201
223
  }
224
+ const migratedCollectorConfig = collectorResult.value as Record<
225
+ string,
226
+ unknown
227
+ >;
202
228
 
203
- return { ...c, config: resolvedCollectorConfig };
229
+ return { ...c, config: migratedCollectorConfig };
204
230
  }),
205
231
  )
206
232
  : undefined;
@@ -212,7 +238,7 @@ export function buildHealthcheckKind(
212
238
  await service.updateConfiguration(existingEntityId, {
213
239
  name: displayName,
214
240
  strategyId: spec.strategy,
215
- config: resolvedConfig,
241
+ config: migratedConfig,
216
242
  intervalSeconds: spec.intervalSeconds,
217
243
  collectors: resolvedCollectors?.map((c) => ({
218
244
  id: c.collectorId,
@@ -230,7 +256,7 @@ export function buildHealthcheckKind(
230
256
  const config = await service.createConfiguration({
231
257
  name: displayName,
232
258
  strategyId: spec.strategy,
233
- config: resolvedConfig,
259
+ config: migratedConfig,
234
260
  intervalSeconds: spec.intervalSeconds,
235
261
  collectors: resolvedCollectors?.map((c) => ({
236
262
  id: c.collectorId,
@@ -517,6 +543,23 @@ export function registerHealthcheckGitOpsDocumentation({
517
543
  }
518
544
  }
519
545
 
546
+ /**
547
+ * Render the structured issues from {@link validateVersionedConfigStrict} into
548
+ * a single human-readable message for the thrown GitOps reconcile error,
549
+ * preserving the per-field path (e.g. `config.url: Invalid url`).
550
+ */
551
+ function formatIssues(
552
+ issues: Array<{ path: Array<string | number>; message: string }>,
553
+ ): string {
554
+ return issues
555
+ .map((issue) =>
556
+ issue.path.length > 0
557
+ ? `${issue.path.join(".")}: ${issue.message}`
558
+ : issue.message,
559
+ )
560
+ .join("; ");
561
+ }
562
+
520
563
  function unwrapZodType(type: z.ZodTypeAny): z.ZodTypeAny {
521
564
  let current = type;
522
565
  while (current) {
package/src/index.ts CHANGED
@@ -17,6 +17,12 @@ import {
17
17
  NotificationApi,
18
18
  specToRegistration,
19
19
  } from "@checkstack/notification-common";
20
+ import {
21
+ aiToolExtensionPoint,
22
+ aiToolProjectionExtensionPoint,
23
+ deferredProjectionExecute,
24
+ } from "@checkstack/ai-backend";
25
+ import { buildHealthcheckAiTools } from "./ai/register-ai-tools";
20
26
  import {
21
27
  createBackendPlugin,
22
28
  coreServices,
@@ -234,6 +240,30 @@ export default createBackendPlugin({
234
240
  collectorRegistry,
235
241
  );
236
242
 
243
+ // Register this plugin's AI tools (propose/update/delete) into the AI
244
+ // registry via the extension point - owned here, not in ai-backend.
245
+ const aiToolExt = env.getExtensionPoint(aiToolExtensionPoint);
246
+ for (const tool of buildHealthcheckAiTools()) {
247
+ aiToolExt.registerTool(tool, pluginMetadata);
248
+ }
249
+
250
+ // Expose this plugin's OWN read-only AI projection of the existing
251
+ // `getConfigurations` query via aiToolProjectionExtensionPoint - owned
252
+ // here, not in ai-backend. The projected read tool is routed by the
253
+ // transport (MCP / chat) AS the principal, so `getConfigurations`'
254
+ // own contract access rules gate it; `deferredProjectionExecute` is
255
+ // the fail-closed net if a transport ever forgot to route.
256
+ env.getExtensionPoint(aiToolProjectionExtensionPoint).expose({
257
+ procedure: healthCheckContract.getConfigurations,
258
+ sourcePluginMetadata: pluginMetadata,
259
+ procedureKey: "getConfigurations",
260
+ name: "healthcheck.status",
261
+ description:
262
+ "List health-check configurations and their current status. Read-only.",
263
+ effect: "read",
264
+ execute: deferredProjectionExecute,
265
+ });
266
+
237
267
  // Create catalog client for notification delegation
238
268
  const catalogClient = rpcClient.forPlugin(CatalogApi);
239
269
 
@@ -0,0 +1,57 @@
1
+ /**
2
+ * Contract test: every healthcheck-backend-owned Versioned config that is
3
+ * stored and read back via the migration chain MUST have a COMPLETE,
4
+ * contiguous chain from version 1 to its current `version`. Pure STRUCTURAL
5
+ * check (`validateMigrationChainFromV1` — no `migrate()` is run), so it carries
6
+ * zero per-config upkeep: the day someone bumps a config's `version` without
7
+ * shipping a covering migration, the read path would silently fail at runtime
8
+ * on a genuinely-v1 stored blob — this test turns that into a CI failure
9
+ * instead. See the HTTP plugin's equivalent test for the full rationale.
10
+ *
11
+ * Covers the configs this CORE package owns: the per-assignment state
12
+ * thresholds wrapper and every built-in automation action config. The
13
+ * strategy / collector `config` / `result` / `aggregatedResult` Versioned
14
+ * schemas are registered by the healthcheck strategy plugins (e.g.
15
+ * healthcheck-http-backend) and are guarded by an equivalent contract test in
16
+ * each plugin package.
17
+ */
18
+ import { describe, expect, it } from "bun:test";
19
+ import type { QueueManager } from "@checkstack/queue-api";
20
+ import type { Hook } from "@checkstack/backend-api";
21
+ import { stateThresholds } from "./state-thresholds-migrations";
22
+ import { createHealthCheckActions } from "./automations";
23
+ import type { HealthCheckService } from "./service";
24
+
25
+ // `createHealthCheckActions` only constructs the action definitions; the deps
26
+ // are touched lazily inside `execute()`, which the contract check never calls.
27
+ // Stubs are sufficient.
28
+ const stubService = {} as unknown as HealthCheckService;
29
+ const stubQueueManager = {} as unknown as QueueManager;
30
+ const stubEmitHook = async <T>(_hook: Hook<T>, _payload: T): Promise<void> => {};
31
+
32
+ describe("healthcheck config migration-chain contract", () => {
33
+ it("the state-thresholds config has a complete v1->version chain", () => {
34
+ const problem = stateThresholds.validateMigrationChainFromV1();
35
+ expect(
36
+ problem,
37
+ `state thresholds config (version ${stateThresholds.version}) has a broken migration chain: ${problem}`,
38
+ ).toBeUndefined();
39
+ });
40
+
41
+ it("every built-in action config has a complete v1->version chain", () => {
42
+ const actions = createHealthCheckActions({
43
+ service: stubService,
44
+ queueManager: stubQueueManager,
45
+ emitHook: stubEmitHook,
46
+ });
47
+ expect(actions.length).toBeGreaterThan(0);
48
+
49
+ for (const action of actions) {
50
+ const problem = action.config.validateMigrationChainFromV1();
51
+ expect(
52
+ problem,
53
+ `Action "${action.id}" config (version ${action.config.version}) has a broken migration chain: ${problem}`,
54
+ ).toBeUndefined();
55
+ }
56
+ });
57
+ });