@checkstack/healthcheck-backend 1.5.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/CHANGELOG.md +223 -0
  2. package/drizzle/0018_abnormal_preak.sql +10 -0
  3. package/drizzle/meta/0018_snapshot.json +600 -0
  4. package/drizzle/meta/_journal.json +7 -0
  5. package/package.json +26 -21
  6. package/src/ai/assertion-validation.test.ts +117 -0
  7. package/src/ai/assertion-validation.ts +147 -0
  8. package/src/ai/healthcheck-capabilities.test.ts +158 -0
  9. package/src/ai/healthcheck-capabilities.ts +217 -0
  10. package/src/ai/healthcheck-delete.test.ts +81 -0
  11. package/src/ai/healthcheck-delete.ts +81 -0
  12. package/src/ai/healthcheck-projection.test.ts +36 -0
  13. package/src/ai/healthcheck-propose.test.ts +268 -0
  14. package/src/ai/healthcheck-propose.ts +290 -0
  15. package/src/ai/healthcheck-script-tools.test.ts +93 -0
  16. package/src/ai/healthcheck-script-tools.ts +179 -0
  17. package/src/ai/healthcheck-update.test.ts +123 -0
  18. package/src/ai/healthcheck-update.ts +123 -0
  19. package/src/ai/notify-subscribers.test.ts +109 -0
  20. package/src/ai/notify-subscribers.ts +176 -0
  21. package/src/ai/register-ai-tools.test.ts +41 -0
  22. package/src/ai/register-ai-tools.ts +53 -0
  23. package/src/ai/shell-env-table.test.ts +47 -0
  24. package/src/automations.test.ts +2 -1
  25. package/src/automations.ts +9 -1
  26. package/src/collector-script-test.test.ts +53 -1
  27. package/src/collector-script-test.ts +59 -7
  28. package/src/effective-environments.test.ts +93 -0
  29. package/src/effective-environments.ts +64 -0
  30. package/src/health-entity-id.ts +57 -0
  31. package/src/health-entity.test.ts +384 -6
  32. package/src/health-entity.ts +93 -35
  33. package/src/health-state.ts +41 -4
  34. package/src/healthcheck-gitops-kinds.test.ts +95 -0
  35. package/src/healthcheck-gitops-kinds.ts +56 -13
  36. package/src/index.ts +30 -0
  37. package/src/migration-chain-contract.test.ts +57 -0
  38. package/src/queue-executor.test.ts +801 -0
  39. package/src/queue-executor.ts +336 -52
  40. package/src/realtime-aggregation.test.ts +30 -0
  41. package/src/realtime-aggregation.ts +16 -0
  42. package/src/retention-job.ts +167 -93
  43. package/src/retention-rollup.test.ts +118 -0
  44. package/src/router.test.ts +120 -1
  45. package/src/router.ts +20 -0
  46. package/src/schema.ts +44 -6
  47. package/src/service.ts +199 -43
  48. package/src/state-transitions.test.ts +104 -0
  49. package/src/state-transitions.ts +39 -1
  50. package/src/validate-configuration.test.ts +205 -0
  51. package/src/validate-configuration.ts +159 -0
  52. package/tsconfig.json +9 -0
package/src/schema.ts CHANGED
@@ -97,6 +97,20 @@ export const systemHealthChecks = pgTable(
97
97
  * When set, the check runs on these satellite nodes in addition to (or instead of) the core.
98
98
  */
99
99
  satelliteIds: jsonb("satellite_ids").$type<string[]>(),
100
+ /**
101
+ * Per-assignment environment selector for per-environment fan-out.
102
+ *
103
+ * Semantics (null vs [] are SEMANTICALLY DISTINCT here, unlike most
104
+ * nullable jsonb in this schema):
105
+ * - `null` => all environments the system currently belongs to.
106
+ * - `[]` => opt out: run ONCE with no environment in context.
107
+ * - non-empty => exactly those environment ids, intersected with the
108
+ * system's current membership (a removed env silently drops out).
109
+ *
110
+ * The service distinguishes `row.environmentIds === null` from
111
+ * `length === 0`; jsonb stores both faithfully.
112
+ */
113
+ environmentIds: jsonb("environment_ids").$type<string[]>(),
100
114
  /**
101
115
  * Whether to also run this check locally on the core instance.
102
116
  * Defaults to true. Only relevant when satelliteIds is set.
@@ -145,22 +159,30 @@ export const healthCheckStateTransitions = pgTable(
145
159
  configurationId: uuid("configuration_id")
146
160
  .notNull()
147
161
  .references(() => healthCheckConfigurations.id, { onDelete: "cascade" }),
162
+ /**
163
+ * Environment this transition belongs to. null = a transition recorded
164
+ * for an env-less run (the opt-out / no-membership case). Per-environment
165
+ * transitions stay distinct so "in status since" is env-scoped.
166
+ */
167
+ environmentId: text("environment_id"),
148
168
  fromStatus: healthCheckStatusEnum("from_status"),
149
169
  toStatus: healthCheckStatusEnum("to_status").notNull(),
150
170
  transitionedAt: timestamp("transitioned_at").defaultNow().notNull(),
151
171
  },
152
172
  (t) => ({
153
- // Powers "most recent transition into status X for this system"
154
- // (WHERE system_id = ? AND to_status = ? ORDER BY transitioned_at DESC).
173
+ // Powers "most recent transition into status X for this system+env"
174
+ // (WHERE system_id = ? AND environment_id = ? AND to_status = ?
175
+ // ORDER BY transitioned_at DESC).
155
176
  lookupIdx: index("health_check_state_transitions_lookup_idx").on(
156
177
  t.systemId,
178
+ t.environmentId,
157
179
  t.toStatus,
158
180
  t.transitionedAt,
159
181
  ),
160
- // Powers the retention "keep newest per system" sweep.
182
+ // Powers the retention "keep newest per system+env" sweep.
161
183
  systemRecentIdx: index(
162
184
  "health_check_state_transitions_system_recent_idx",
163
- ).on(t.systemId, t.transitionedAt),
185
+ ).on(t.systemId, t.environmentId, t.transitionedAt),
164
186
  }),
165
187
  );
166
188
 
@@ -170,6 +192,15 @@ export const healthCheckRuns = pgTable("health_check_runs", {
170
192
  .notNull()
171
193
  .references(() => healthCheckConfigurations.id, { onDelete: "cascade" }),
172
194
  systemId: text("system_id").notNull(),
195
+ /**
196
+ * Environment this run was executed for (per-environment fan-out).
197
+ * null = ran with no environment (the opt-out / no-membership case,
198
+ * which is exactly the pre-feature behavior). Nullable text, NOT a FK
199
+ * to the catalog `environments` table (healthcheck and catalog are
200
+ * separate plugins with separate Postgres schemas, mirroring how
201
+ * `systemId` is a bare text with no FK to `systems`).
202
+ */
203
+ environmentId: text("environment_id"),
173
204
  status: healthCheckStatusEnum("status").notNull(),
174
205
  /** Execution duration in milliseconds */
175
206
  latencyMs: integer("latency_ms"),
@@ -207,6 +238,11 @@ export const healthCheckAggregates = pgTable(
207
238
  .notNull()
208
239
  .references(() => healthCheckConfigurations.id, { onDelete: "cascade" }),
209
240
  systemId: text("system_id").notNull(),
241
+ /**
242
+ * Environment this bucket aggregates. null = env-less runs. Part of the
243
+ * unique key so per-environment buckets stay separate.
244
+ */
245
+ environmentId: text("environment_id"),
210
246
  bucketStart: timestamp("bucket_start").notNull(),
211
247
  bucketSize: bucketSizeEnum("bucket_size").notNull(),
212
248
 
@@ -238,12 +274,14 @@ export const healthCheckAggregates = pgTable(
238
274
  sourceLabel: text("source_label"),
239
275
  },
240
276
  (t) => ({
241
- // Unique constraint includes sourceId for per-region aggregation.
242
- // NULLS NOT DISTINCT ensures local runs (sourceId=NULL) correctly
277
+ // Unique constraint includes environmentId (per-environment fan-out)
278
+ // and sourceId (per-region aggregation). NULLS NOT DISTINCT ensures
279
+ // env-less / local runs (environmentId / sourceId = NULL) correctly
243
280
  // conflict-match instead of creating duplicate rows per hour.
244
281
  bucketUnique: unique("health_check_aggregates_bucket_unique").on(
245
282
  t.configurationId,
246
283
  t.systemId,
284
+ t.environmentId,
247
285
  t.bucketStart,
248
286
  t.bucketSize,
249
287
  t.sourceId,
package/src/service.ts CHANGED
@@ -9,6 +9,7 @@ import {
9
9
  type NotificationPolicy,
10
10
  NotificationPolicySchema,
11
11
  DEFAULT_NOTIFICATION_POLICY,
12
+ type CollectorConfigEntry,
12
13
  } from "@checkstack/healthcheck-common";
13
14
  import type { ConfigService } from "@checkstack/backend-api";
14
15
  import type { InferClient } from "@checkstack/common";
@@ -39,6 +40,7 @@ import {
39
40
  import { ORPCError } from "@orpc/server";
40
41
  import { evaluateHealthStatus } from "./state-evaluator";
41
42
  import { computeHealthState, type HealthState } from "./health-state";
43
+ import { parseHealthEntityId } from "./health-entity-id";
42
44
  import { stateThresholds } from "./state-thresholds-migrations";
43
45
  import type { MaintenanceApi } from "@checkstack/maintenance-common";
44
46
  import type { Logger } from "@checkstack/backend-api";
@@ -61,6 +63,16 @@ import {
61
63
  // Drizzle type helper - uses SafeDatabase to prevent relational query API usage
62
64
  type Db = SafeDatabase<typeof schema>;
63
65
 
66
+ /**
67
+ * Narrow a migrated config (typed `unknown` by the versioning chain) to a
68
+ * spreadable record. Every registered strategy/collector config schema is an
69
+ * object, so a successfully validated value is always object-shaped at
70
+ * runtime; this guard keeps the type-level handling cast-free.
71
+ */
72
+ function isConfigRecord(value: unknown): value is Record<string, unknown> {
73
+ return typeof value === "object" && value !== null;
74
+ }
75
+
64
76
  // Catalog client type used to resolve human-readable system names for
65
77
  // satellite assignment run-context. Optional on the service.
66
78
  type CatalogClient = InferClient<typeof CatalogApi>;
@@ -208,7 +220,7 @@ export class HealthCheckService {
208
220
 
209
221
  async getConfigurations(): Promise<HealthCheckConfiguration[]> {
210
222
  const configs = await this.db.select().from(healthCheckConfigurations);
211
- return configs.map((c) => this.mapConfig(c));
223
+ return Promise.all(configs.map((c) => this.mapConfig(c)));
212
224
  }
213
225
 
214
226
  async associateSystem(props: {
@@ -217,6 +229,13 @@ export class HealthCheckService {
217
229
  enabled?: boolean;
218
230
  stateThresholds?: StateThresholds;
219
231
  satelliteIds?: string[];
232
+ /**
233
+ * Per-assignment environment selector. `null` (or `undefined`) = all
234
+ * current environments; `[]` = opt out (env-less); non-empty = those
235
+ * ids. `null` and `[]` are stored distinctly so the run-time resolver
236
+ * can tell "all" from "opt out". `undefined` is normalized to `null`.
237
+ */
238
+ environmentIds?: string[] | null;
220
239
  includeLocal?: boolean;
221
240
  notificationPolicy?: NotificationPolicy;
222
241
  }) {
@@ -226,10 +245,16 @@ export class HealthCheckService {
226
245
  enabled = true,
227
246
  stateThresholds: stateThresholds_,
228
247
  satelliteIds,
248
+ environmentIds,
229
249
  includeLocal = true,
230
250
  notificationPolicy,
231
251
  } = props;
232
252
 
253
+ // Preserve the null/[]/list distinction faithfully. `undefined` props
254
+ // mean "not provided" -> treat as `null` ("all current environments"),
255
+ // the default fan-out behavior. `[]` is kept verbatim (opt-out).
256
+ const environmentIdsValue: string[] | null = environmentIds ?? null;
257
+
233
258
  // Wrap thresholds in versioned config if provided
234
259
  const versionedThresholds: VersionedStateThresholds | undefined =
235
260
  stateThresholds_ ? stateThresholds.create(stateThresholds_) : undefined;
@@ -242,6 +267,7 @@ export class HealthCheckService {
242
267
  enabled,
243
268
  stateThresholds: versionedThresholds,
244
269
  satelliteIds: satelliteIds ?? undefined,
270
+ environmentIds: environmentIdsValue,
245
271
  includeLocal,
246
272
  notificationPolicy: notificationPolicy ?? undefined,
247
273
  })
@@ -254,6 +280,7 @@ export class HealthCheckService {
254
280
  enabled,
255
281
  stateThresholds: versionedThresholds,
256
282
  satelliteIds: satelliteIds ?? undefined,
283
+ environmentIds: environmentIdsValue,
257
284
  includeLocal,
258
285
  notificationPolicy: notificationPolicy ?? undefined,
259
286
  updatedAt: new Date(),
@@ -385,7 +412,7 @@ export class HealthCheckService {
385
412
  )
386
413
  .where(eq(systemHealthChecks.systemId, systemId));
387
414
 
388
- return rows.map((r) => this.mapConfig(r.config));
415
+ return Promise.all(rows.map((r) => this.mapConfig(r.config)));
389
416
  }
390
417
 
391
418
  /**
@@ -399,6 +426,7 @@ export class HealthCheckService {
399
426
  enabled: systemHealthChecks.enabled,
400
427
  stateThresholds: systemHealthChecks.stateThresholds,
401
428
  satelliteIds: systemHealthChecks.satelliteIds,
429
+ environmentIds: systemHealthChecks.environmentIds,
402
430
  includeLocal: systemHealthChecks.includeLocal,
403
431
  notificationPolicy: systemHealthChecks.notificationPolicy,
404
432
  })
@@ -422,6 +450,9 @@ export class HealthCheckService {
422
450
  enabled: row.enabled,
423
451
  stateThresholds: thresholds,
424
452
  satelliteIds: row.satelliteIds ?? undefined,
453
+ // Preserve the null/[]/list distinction (null = all envs, [] = opt
454
+ // out). Do NOT collapse null to undefined via `??`.
455
+ environmentIds: row.environmentIds,
425
456
  includeLocal: row.includeLocal,
426
457
  notificationPolicy: row.notificationPolicy ?? undefined,
427
458
  });
@@ -475,9 +506,25 @@ export class HealthCheckService {
475
506
  /**
476
507
  * Get the evaluated health status for a system based on configured thresholds.
477
508
  * Aggregates status from all health check configurations for this system.
509
+ *
510
+ * Environment dimension (Phase 3b, §7.4.2):
511
+ * - `environmentId` OMITTED (or `undefined`) ⇒ the **system rollup**: all
512
+ * runs for the system regardless of environment. "Any env unhealthy ⇒ at
513
+ * least one unhealthy run in the window" already yields worst-status
514
+ * semantics for the window-based evaluator, and it exactly matches the
515
+ * pre-3b behavior when no environments exist (no extra catalog read).
516
+ * - `environmentId` a STRING ⇒ the per-environment slice: only runs whose
517
+ * `environment_id` equals that id.
518
+ * - `environmentId` `null` ⇒ the ENV-LESS slice: only runs with
519
+ * `environment_id IS NULL` (the opt-out / no-membership case).
520
+ *
521
+ * The env filter narrows ONLY the per-check run window; the set of enabled
522
+ * associations (and thus `checkStatuses.length`, the existence gate) is the
523
+ * same across views, so a per-env view and the rollup agree on totalChecks.
478
524
  */
479
525
  async getSystemHealthStatus(
480
526
  systemId: string,
527
+ environmentId?: string | null,
481
528
  ): Promise<SystemHealthStatusResponse> {
482
529
  // Get all associations for this system with their thresholds and config names
483
530
  const associations = await this.db
@@ -512,6 +559,17 @@ export class HealthCheckService {
512
559
  const checkStatuses: SystemCheckStatus[] = [];
513
560
  const maxWindowSize = 100; // Max configurable window size
514
561
 
562
+ // Environment filter for the per-check run window. `undefined` (rollup)
563
+ // adds no predicate; `null` filters to the env-less slice; a string
564
+ // filters to that environment. The lookup index leads with
565
+ // (system_id, environment_id, …) so the env-scoped query is index-efficient.
566
+ const envFilter =
567
+ environmentId === undefined
568
+ ? undefined
569
+ : environmentId === null
570
+ ? isNull(healthCheckRuns.environmentId)
571
+ : eq(healthCheckRuns.environmentId, environmentId);
572
+
515
573
  for (const assoc of associations) {
516
574
  const runs = await this.db
517
575
  .select({
@@ -523,6 +581,7 @@ export class HealthCheckService {
523
581
  and(
524
582
  eq(healthCheckRuns.systemId, systemId),
525
583
  eq(healthCheckRuns.configurationId, assoc.configurationId),
584
+ ...(envFilter ? [envFilter] : []),
526
585
  ),
527
586
  )
528
587
  .orderBy(desc(healthCheckRuns.timestamp))
@@ -577,6 +636,7 @@ export class HealthCheckService {
577
636
  async getHealthState({
578
637
  systemId,
579
638
  configurationId,
639
+ environmentId,
580
640
  maintenanceClient,
581
641
  logger,
582
642
  transitionWindowMinutes,
@@ -584,6 +644,13 @@ export class HealthCheckService {
584
644
  }: {
585
645
  systemId: string;
586
646
  configurationId?: string;
647
+ /**
648
+ * Environment to scope the snapshot to (Phase 3b). `undefined` = the
649
+ * system rollup; `null` = the env-less slice; a string = that env. Threads
650
+ * into both the status resolver and every durable read in
651
+ * `computeHealthState`.
652
+ */
653
+ environmentId?: string | null;
587
654
  maintenanceClient?: MaintenanceClient;
588
655
  logger?: Logger;
589
656
  transitionWindowMinutes?: number;
@@ -593,12 +660,16 @@ export class HealthCheckService {
593
660
  db: this.db,
594
661
  systemId,
595
662
  configurationId,
663
+ environmentId,
596
664
  maintenanceClient,
597
665
  logger,
598
666
  transitionWindowMinutes,
599
667
  now,
600
668
  resolveStatus: async () => {
601
- const overview = await this.getSystemHealthStatus(systemId);
669
+ const overview = await this.getSystemHealthStatus(
670
+ systemId,
671
+ environmentId,
672
+ );
602
673
  if (!configurationId) return overview.status;
603
674
  const check = overview.checkStatuses.find(
604
675
  (c) => c.configurationId === configurationId,
@@ -611,10 +682,17 @@ export class HealthCheckService {
611
682
  }
612
683
 
613
684
  /**
614
- * Bulk variant of {@link getHealthState}. Resolves every system in
615
- * parallel against a single shared `now` so durations are consistent
616
- * across the batch. Avoids N+1 from dashboards and multi-system
617
- * automation rules.
685
+ * Bulk variant of {@link getHealthState}. Resolves every id in parallel
686
+ * against a single shared `now` so durations are consistent across the
687
+ * batch. Avoids N+1 from dashboards and multi-system automation rules.
688
+ *
689
+ * Environment-aware (Phase 3b, §7.4.4): an id may be the bare `"<systemId>"`
690
+ * (the system rollup) OR the env-qualified `"<systemId>::<environmentId>"`
691
+ * (a per-environment view). Each id is parsed via {@link parseHealthEntityId}
692
+ * and resolved against the right env slice, and the result is keyed by the
693
+ * ORIGINAL id string. So scope enrichment that reads
694
+ * `health.systems["<systemId>::<environmentId>"]` gets the per-env snapshot
695
+ * and `health.systems["<systemId>"]` gets the rollup, with no caller change.
618
696
  */
619
697
  async getBulkHealthState({
620
698
  systemIds,
@@ -623,6 +701,7 @@ export class HealthCheckService {
623
701
  transitionWindowMinutes,
624
702
  now = new Date(),
625
703
  }: {
704
+ /** Health entity ids — bare systemId (rollup) or `systemId::environmentId`. */
626
705
  systemIds: string[];
627
706
  maintenanceClient?: MaintenanceClient;
628
707
  logger?: Logger;
@@ -630,19 +709,25 @@ export class HealthCheckService {
630
709
  now?: Date;
631
710
  }): Promise<Record<string, HealthState>> {
632
711
  const entries = await Promise.all(
633
- systemIds.map(
634
- async (systemId) =>
635
- [
712
+ systemIds.map(async (id) => {
713
+ const { systemId, environmentId } = parseHealthEntityId(id);
714
+ return [
715
+ id,
716
+ await this.getHealthState({
636
717
  systemId,
637
- await this.getHealthState({
638
- systemId,
639
- maintenanceClient,
640
- logger,
641
- transitionWindowMinutes,
642
- now,
643
- }),
644
- ] as const,
645
- ),
718
+ // A bare `<systemId>` id is the ROLLUP and must read ALL runs
719
+ // (`undefined`), NOT the env-less slice (`null`, i.e.
720
+ // `env_id IS NULL`). `parseHealthEntityId` returns `null` for a
721
+ // bare id; map it to `undefined` here. `null` stays reserved for
722
+ // an explicit env-less read.
723
+ environmentId: environmentId === null ? undefined : environmentId,
724
+ maintenanceClient,
725
+ logger,
726
+ transitionWindowMinutes,
727
+ now,
728
+ }),
729
+ ] as const;
730
+ }),
646
731
  );
647
732
  return Object.fromEntries(entries);
648
733
  }
@@ -797,6 +882,7 @@ export class HealthCheckService {
797
882
  status: run.status,
798
883
  timestamp: run.timestamp,
799
884
  latencyMs: run.latencyMs ?? undefined,
885
+ environmentId: run.environmentId ?? undefined,
800
886
  sourceId: run.sourceId ?? undefined,
801
887
  sourceLabel: run.sourceLabel ?? undefined,
802
888
  })),
@@ -875,6 +961,7 @@ export class HealthCheckService {
875
961
  result: run.result ?? {},
876
962
  timestamp: run.timestamp,
877
963
  latencyMs: run.latencyMs ?? undefined,
964
+ environmentId: run.environmentId ?? undefined,
878
965
  sourceId: run.sourceId ?? undefined,
879
966
  sourceLabel: run.sourceLabel ?? undefined,
880
967
  })),
@@ -905,6 +992,7 @@ export class HealthCheckService {
905
992
  result: r.result ?? {},
906
993
  timestamp: r.timestamp,
907
994
  latencyMs: r.latencyMs ?? undefined,
995
+ environmentId: r.environmentId ?? undefined,
908
996
  sourceId: r.sourceId ?? undefined,
909
997
  sourceLabel: r.sourceLabel ?? undefined,
910
998
  };
@@ -1255,15 +1343,23 @@ export class HealthCheckService {
1255
1343
  return new Date(rangeStart.getTime() + bucketIndex * intervalMs);
1256
1344
  }
1257
1345
 
1258
- private mapConfig(
1346
+ /**
1347
+ * Map a stored configuration row to the public DTO, migrating the
1348
+ * (UNVERSIONED) strategy + collector configs via assume-v1-on-read so the
1349
+ * read API (router / frontend / gitops `getConfiguration`) returns migrated
1350
+ * shapes. Migrations are idempotent, so an already-current config is a
1351
+ * no-op. An unregistered strategy/collector or a failed migrate falls back
1352
+ * to the raw stored blob rather than dropping the configuration.
1353
+ */
1354
+ private async mapConfig(
1259
1355
  row: InferSelectModel<typeof healthCheckConfigurations>,
1260
- ): HealthCheckConfiguration {
1356
+ ): Promise<HealthCheckConfiguration> {
1261
1357
  return {
1262
1358
  id: row.id,
1263
1359
  name: row.name,
1264
1360
  strategyId: row.strategyId,
1265
- config: row.config,
1266
- collectors: row.collectors ?? undefined,
1361
+ config: await this.migrateStrategyConfig(row.strategyId, row.config),
1362
+ collectors: await this.migrateCollectorEntries(row.collectors),
1267
1363
  intervalSeconds: row.intervalSeconds,
1268
1364
  paused: row.paused,
1269
1365
  createdAt: row.createdAt,
@@ -1271,6 +1367,56 @@ export class HealthCheckService {
1271
1367
  };
1272
1368
  }
1273
1369
 
1370
+ /**
1371
+ * Migrate a stored strategy config via assume-v1-on-read. Falls back to the
1372
+ * raw blob when the strategy is not registered or the migrate/validate
1373
+ * throws, so a read never drops a configuration on a transient mismatch.
1374
+ */
1375
+ private async migrateStrategyConfig(
1376
+ strategyId: string,
1377
+ rawConfig: Record<string, unknown>,
1378
+ ): Promise<Record<string, unknown>> {
1379
+ const strategy = this.registry?.getStrategy(strategyId);
1380
+ if (!strategy) return rawConfig;
1381
+ try {
1382
+ const migrated = await strategy.config.parseAssumingV1(rawConfig);
1383
+ return { ...migrated };
1384
+ } catch {
1385
+ return rawConfig;
1386
+ }
1387
+ }
1388
+
1389
+ /**
1390
+ * Migrate each collector entry's stored config via assume-v1-on-read,
1391
+ * preserving id/collectorId/assertions. Falls back to the raw entry config
1392
+ * when the collector is not registered or migrate/validate throws.
1393
+ */
1394
+ private async migrateCollectorEntries(
1395
+ collectors: CollectorConfigEntry[] | null,
1396
+ ): Promise<CollectorConfigEntry[] | undefined> {
1397
+ if (!collectors || collectors.length === 0) return undefined;
1398
+ return Promise.all(
1399
+ collectors.map(async (entry) => {
1400
+ const registered = this.collectorRegistry?.getCollector(
1401
+ entry.collectorId,
1402
+ );
1403
+ if (!registered) return entry;
1404
+ try {
1405
+ const migrated = await registered.collector.config.parseAssumingV1(
1406
+ entry.config,
1407
+ );
1408
+ // A registered collector's config schema is always an object, so a
1409
+ // successful migrate yields a record; fall back to the raw entry if
1410
+ // the validated value is somehow not object-shaped.
1411
+ if (!isConfigRecord(migrated)) return entry;
1412
+ return { ...entry, config: { ...migrated } };
1413
+ } catch {
1414
+ return entry;
1415
+ }
1416
+ }),
1417
+ );
1418
+ }
1419
+
1274
1420
  /**
1275
1421
  * Remove a satellite ID from all systemHealthChecks.satelliteIds arrays.
1276
1422
  * Called when a satellite is deleted via the satellite.removed hook.
@@ -1450,27 +1596,37 @@ export class HealthCheckService {
1450
1596
  ? ({ ...result } as Record<string, unknown>)
1451
1597
  : {};
1452
1598
 
1453
- await this.db.insert(healthCheckRuns).values({
1454
- configurationId: configId,
1455
- systemId,
1456
- status,
1457
- latencyMs,
1458
- result: resultRecord,
1459
- sourceId,
1460
- sourceLabel,
1461
- });
1599
+ // Atomic: the run row and the hourly-aggregate increment it feeds must
1600
+ // commit together. Without the transaction a failure on the (non-idempotent
1601
+ // `runCount + 1`) aggregate left a committed run that the aggregate never
1602
+ // counted - or, on the reverse ordering, an aggregate with no backing run.
1603
+ // NOTE: this guarantees run/aggregate consistency, but does NOT make a
1604
+ // *duplicate satellite delivery* (a re-POST after a committed write)
1605
+ // idempotent - that requires a dedupe key on the high-volume runs table and
1606
+ // is tracked as a separate follow-up.
1607
+ await this.db.transaction(async (tx) => {
1608
+ await tx.insert(healthCheckRuns).values({
1609
+ configurationId: configId,
1610
+ systemId,
1611
+ status,
1612
+ latencyMs,
1613
+ result: resultRecord,
1614
+ sourceId,
1615
+ sourceLabel,
1616
+ });
1462
1617
 
1463
- // Trigger incremental hourly aggregation — same as local executor
1464
- await incrementHourlyAggregate({
1465
- db: this.db,
1466
- systemId,
1467
- configurationId: configId,
1468
- status,
1469
- latencyMs,
1470
- runTimestamp: new Date(props.executedAt),
1471
- result: resultRecord,
1472
- collectorRegistry: this.collectorRegistry,
1473
- sourceLabel,
1618
+ // Trigger incremental hourly aggregation — same as local executor
1619
+ await incrementHourlyAggregate({
1620
+ db: tx,
1621
+ systemId,
1622
+ configurationId: configId,
1623
+ status,
1624
+ latencyMs,
1625
+ runTimestamp: new Date(props.executedAt),
1626
+ result: resultRecord,
1627
+ collectorRegistry: this.collectorRegistry,
1628
+ sourceLabel,
1629
+ });
1474
1630
  });
1475
1631
  }
1476
1632
  }
@@ -23,6 +23,33 @@ function selectMockDb(rows: Array<{ transitionedAt: Date }>) {
23
23
  };
24
24
  }
25
25
 
26
+ /**
27
+ * Capturing variant: records the AND-condition list passed to `where` so a
28
+ * test can assert the env predicate was (or was not) added. Drizzle's `and(...)`
29
+ * is opaque here, so we count the conditions instead of inspecting SQL: the
30
+ * env-scoped query adds one extra condition over the system-wide query.
31
+ */
32
+ function whereCapturingDb(rows: Array<{ transitionedAt?: Date; count?: number }>) {
33
+ const whereArgs: unknown[][] = [];
34
+ return {
35
+ db: {
36
+ select: mock(() => ({
37
+ from: mock(() => ({
38
+ where: mock((...args: unknown[]) => {
39
+ whereArgs.push(args);
40
+ return Object.assign(Promise.resolve(rows), {
41
+ orderBy: mock(() => ({
42
+ limit: mock(() => Promise.resolve(rows)),
43
+ })),
44
+ });
45
+ }),
46
+ })),
47
+ })),
48
+ },
49
+ whereArgs,
50
+ };
51
+ }
52
+
26
53
  describe("findInStatusSince", () => {
27
54
  it("returns the most-recent transitionedAt for the status", async () => {
28
55
  const since = new Date("2026-05-30T10:00:00.000Z");
@@ -35,6 +62,30 @@ describe("findInStatusSince", () => {
35
62
  expect(result).toBe(since);
36
63
  });
37
64
 
65
+ it("adds an environment predicate when environmentId is provided (env-scoped)", async () => {
66
+ const { db, whereArgs } = whereCapturingDb([]);
67
+ await findInStatusSince({
68
+ db: db as never,
69
+ systemId: "system-1",
70
+ status: "unhealthy",
71
+ environmentId: "prod",
72
+ });
73
+ // and(systemId, toStatus, envFilter) — the env predicate is present.
74
+ expect(whereArgs[0]?.[0]).toBeDefined();
75
+ });
76
+
77
+ it("scopes to the env-less slice when environmentId is null", async () => {
78
+ const { db } = whereCapturingDb([]);
79
+ const result = await findInStatusSince({
80
+ db: db as never,
81
+ systemId: "system-1",
82
+ status: "unhealthy",
83
+ environmentId: null,
84
+ });
85
+ // No row in the env-less slice ⇒ fail-safe null (no throw).
86
+ expect(result).toBeNull();
87
+ });
88
+
38
89
  it("returns null (fail-safe) when no transition row exists", async () => {
39
90
  const db = selectMockDb([]);
40
91
  const result = await findInStatusSince({
@@ -68,12 +119,53 @@ describe("recordStateTransition", () => {
68
119
  expect(values.mock.calls[0]?.[0]).toEqual({
69
120
  systemId: "system-1",
70
121
  configurationId: "config-1",
122
+ environmentId: null,
71
123
  fromStatus: "healthy",
72
124
  toStatus: "unhealthy",
73
125
  transitionedAt: now,
74
126
  });
75
127
  });
76
128
 
129
+ it("records the environmentId on a per-environment transition", async () => {
130
+ const values =
131
+ mock<(v: Record<string, unknown>) => Promise<void>>(() =>
132
+ Promise.resolve(),
133
+ );
134
+ const db = { insert: mock(() => ({ values })) };
135
+
136
+ await recordStateTransition({
137
+ db: db as never,
138
+ systemId: "system-1",
139
+ configurationId: "config-1",
140
+ environmentId: "prod",
141
+ fromStatus: "healthy",
142
+ toStatus: "unhealthy",
143
+ });
144
+
145
+ const arg = values.mock.calls[0]?.[0] as { environmentId: unknown };
146
+ expect(arg.environmentId).toBe("prod");
147
+ });
148
+
149
+ it("normalizes an env-less transition to environmentId = null", async () => {
150
+ const values =
151
+ mock<(v: Record<string, unknown>) => Promise<void>>(() =>
152
+ Promise.resolve(),
153
+ );
154
+ const db = { insert: mock(() => ({ values })) };
155
+
156
+ await recordStateTransition({
157
+ db: db as never,
158
+ systemId: "system-1",
159
+ configurationId: "config-1",
160
+ // environmentId omitted
161
+ fromStatus: "healthy",
162
+ toStatus: "degraded",
163
+ });
164
+
165
+ const arg = values.mock.calls[0]?.[0] as { environmentId: unknown };
166
+ expect(arg.environmentId).toBeNull();
167
+ });
168
+
77
169
  it("stores null fromStatus on the first-ever transition", async () => {
78
170
  const values =
79
171
  mock<(v: Record<string, unknown>) => Promise<void>>(() =>
@@ -123,4 +215,16 @@ describe("countStateTransitionsInWindow", () => {
123
215
  });
124
216
  expect(result).toBe(0);
125
217
  });
218
+
219
+ it("accepts an environmentId to scope the count to one environment", async () => {
220
+ const { db } = countMockDb(2);
221
+ const result = await countStateTransitionsInWindow({
222
+ db: db as never,
223
+ systemId: "system-1",
224
+ windowMinutes: 60,
225
+ environmentId: "prod",
226
+ });
227
+ // The env predicate is added without error; the mock yields the count.
228
+ expect(result).toBe(2);
229
+ });
126
230
  });