@celilo/cli 0.5.0-alpha.4 → 0.5.0-alpha.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. package/drizzle/0010_dns_internal_records.sql +12 -0
  2. package/drizzle/0011_backups_name.sql +1 -0
  3. package/drizzle/meta/_journal.json +14 -0
  4. package/package.json +2 -2
  5. package/src/ansible/inventory.test.ts +10 -10
  6. package/src/ansible/validation.test.ts +25 -15
  7. package/src/cli/command-registry.ts +13 -2
  8. package/src/cli/commands/events.test.ts +4 -4
  9. package/src/cli/commands/events.ts +2 -2
  10. package/src/cli/commands/service-add-proxmox.ts +9 -0
  11. package/src/cli/commands/system-doctor.ts +135 -40
  12. package/src/cli/commands/system-migrate.test.ts +40 -0
  13. package/src/cli/commands/system-migrate.ts +65 -0
  14. package/src/cli/completion.ts +1 -0
  15. package/src/cli/index.ts +7 -2
  16. package/src/config/paths.test.ts +61 -48
  17. package/src/db/client.ts +15 -146
  18. package/src/db/migrate.ts +14 -6
  19. package/src/db/schema-introspection.ts +88 -0
  20. package/src/db/schema.ts +38 -0
  21. package/src/hooks/capability-loader-firewall.test.ts +3 -3
  22. package/src/hooks/capability-loader.ts +24 -15
  23. package/src/infrastructure/property-extractor.test.ts +15 -0
  24. package/src/infrastructure/property-extractor.ts +12 -0
  25. package/src/manifest/schema.ts +7 -0
  26. package/src/manifest/validate.test.ts +53 -0
  27. package/src/services/bus-interview.test.ts +2 -2
  28. package/src/services/bus-secret-flow.test.ts +2 -2
  29. package/src/services/celilo-mgmt-hooks.test.ts +3 -2
  30. package/src/services/deploy-preflight.ts +25 -0
  31. package/src/services/deploy-validation.test.ts +2 -2
  32. package/src/services/dns-internal-records.test.ts +126 -0
  33. package/src/services/dns-internal-records.ts +119 -0
  34. package/src/services/dns-provider-backfill.test.ts +2 -2
  35. package/src/services/dns-registrations.test.ts +10 -10
  36. package/src/services/fleet-checks.test.ts +495 -0
  37. package/src/services/fleet-checks.ts +663 -0
  38. package/src/services/module-build.test.ts +43 -38
  39. package/src/templates/generator.test.ts +62 -12
  40. package/src/templates/generator.ts +69 -50
  41. package/src/test-utils/fixtures.test.ts +1 -1
  42. package/src/test-utils/integration-guard.ts +33 -0
  43. package/src/types/infrastructure.ts +6 -0
  44. package/src/variables/computed/computed-integration.test.ts +3 -3
  45. package/src/variables/computed/computed.test.ts +5 -5
  46. package/src/variables/declarative-derivation.test.ts +6 -6
@@ -0,0 +1,663 @@
1
+ /**
2
+ * Fleet runtime drift detection — the predicates behind `celilo system
3
+ * doctor`'s fleet section (designs/CELILO_DOCTOR_FLEET_DRIFT.md, ISS-0113).
4
+ *
5
+ * Every facet of post-migration drift that workstream B caught by hand —
6
+ * a dead/stale dispatcher (ISS-0086), an empty `subscribers` table
7
+ * (ISS-0088), a capability chain that never re-derived (ISS-0095 /
8
+ * idp_dmz_ip) — becomes one check here. Each check asserts the *outcome*,
9
+ * not a proxy for it (design D5): "a dispatcher process exists" is not the
10
+ * same as "it's the supervised, current one that's actually emitting timer
11
+ * ticks", and the difference is exactly the bug B hit.
12
+ *
13
+ * Detection is read-only and cheap (design D3). These predicates are also
14
+ * the building blocks the defensive wiring (workstream D) and the celilo
15
+ * MCP (ISS-0112) call — so they take their bus + DB handles as arguments
16
+ * and return structured findings rather than rendering or exiting. The
17
+ * rendering + `--fix` orchestration lives in the doctor command.
18
+ */
19
+
20
+ import type { Bus } from '@celilo/event-bus';
21
+ import { inArray } from 'drizzle-orm';
22
+ import { getModuleStoragePath } from '../config/paths';
23
+ import type { DbClient } from '../db/client';
24
+ import { capabilities as capabilitiesTable, modules } from '../db/schema';
25
+ import { findSchemaDrift } from '../db/schema-introspection';
26
+ import { resolveFirewallNatIp } from '../hooks/capability-loader';
27
+ import type { ModuleManifest } from '../manifest/schema';
28
+ import { getModuleSystems } from './deployed-systems';
29
+ import { listDnsInternalRecords } from './dns-internal-records';
30
+ import { type SupervisorPlatform, readInstalledUnit } from './events-daemon';
31
+ import { resolveSubscription } from './module-subscriptions';
32
+
33
+ /**
+ * The one placement zone treated as reachable from the operator's LAN.
+ *
+ * Any other celilo placement zone is firewall-segmented: an unmanaged LAN
+ * device has no route into it, so an internal-DNS record pointing at a
+ * container IP there is unreachable (the bug checkServiceDns reports). The
+ * `internal` zone IS the LAN, so a record at one of its systems' IPs is
+ * fine. Never pin literal subnets to zones — compare by the zone role the
+ * system carries.
+ */
41
+ const LAN_REACHABLE_ZONE = 'internal';
42
+
43
+ export type FleetFindingStatus = 'ok' | 'warn' | 'fail'; // severity of a finding, listed least to most severe (see `worst`)
44
+
45
+ /**
+ * One drift facet's verdict — the structured result every check in this
+ * file returns instead of rendering or exiting.
+ *
+ * `autoFixable` marks the checks `--fix` may run unattended (today:
+ * subscribers resync only) — everything else is report + a named manual
+ * remediation, never a surprise prod redeploy.
+ */
50
+ export interface FleetFinding {
51
+ /** Stable id for the check (e.g. 'dispatcher'); not user-facing prose. */
52
+ id: string;
53
+ title: string; // human-readable statement of what the check asserts
54
+ status: FleetFindingStatus;
55
+ summary: string; // one-line verdict for the current status
56
+ /** Extra context lines, rendered indented under the summary. */
57
+ detail: string[];
58
+ /** A concrete next step, or null when there is none to offer (always null for ok). */
59
+ remediation: string | null;
60
+ autoFixable: boolean; // true only when `--fix` may repair this without operator input
61
+ }
62
+
63
+ /**
+ * Maximum age (ms) of the newest timer tick before checkDispatcher warns:
+ * 20 minutes.
+ *
+ * A timer subscriber should see a fresh tick within its interval plus
+ * slack. 15m is the shortest DDNS-refresh cadence (namecheap), so 20m is
+ * that interval plus 5m of slack; a tick older than this means the
+ * dispatcher isn't emitting (the workstream-B stale-orphan symptom). One
+ * window covers the common case without parsing every interval name.
+ *
+ * NOTE(review): a subscriber on a longer interval would presumably exceed
+ * this window between two healthy ticks — confirm no such subscription
+ * is expected to be picked up by the check.
+ */
70
+ const TIMER_TICK_MAX_AGE_MS = 20 * 60 * 1000;
71
+
72
+ const UNRESOLVED_REF = /\$\{?(?:self|capability|infra|infrastructure|system|secret):/; // matches a still-unsubstituted `$ns:` / `${ns:` template reference anywhere in a string
73
+
/**
 * Collapse a list of statuses to the single most severe one.
 *
 * Severity order is fail > warn > ok. An empty list yields 'ok'.
 */
function worst(statuses: FleetFindingStatus[]): FleetFindingStatus {
  let verdict: FleetFindingStatus = 'ok';
  for (const status of statuses) {
    // Nothing outranks a failure, so stop at the first one.
    if (status === 'fail') return 'fail';
    if (status === 'warn') verdict = 'warn';
  }
  return verdict;
}
80
+
/**
 * Verify that the database actually has the schema the running CLI expects.
 *
 * celilo only runs drizzle's `migrate()` on a FRESH database; an existing
 * install gets a hand-maintained CREATE/ALTER list in db/client.ts instead
 * (ISS-0100). When that list drifts from the shipped migrations, the running
 * code expects a table or column the DB doesn't have and features fail at
 * runtime.
 *
 * The check is deliberately about presence, not migration bookkeeping: it
 * asks `findSchemaDrift` which tables and columns of the code's drizzle
 * schema are missing, and fails if any are. It does NOT count migration
 * rows — an existing DB patched via the hand list legitimately lags
 * `__drizzle_migrations` while its schema is current.
 *
 * @param db - celilo DB client; its underlying `$client` handle is inspected.
 * @returns a 'schema' finding: 'ok' when nothing is missing, otherwise
 *   'fail' with one detail line per kind of gap (tables, then columns).
 */
export function checkSchemaDrift(db: DbClient): FleetFinding {
  const drift = findSchemaDrift(db.$client);
  const id = 'schema';
  const title = 'database schema matches the running CLI (migrations applied)';

  const detail: string[] = [];
  if (drift.missingTables.length > 0) {
    detail.push(`missing table(s): ${drift.missingTables.join(', ')}`);
  }
  if (drift.missingColumns.length > 0) {
    detail.push(`missing column(s): ${drift.missingColumns.join(', ')}`);
  }

  // No gap of either kind: the schema is current.
  if (detail.length === 0) {
    return {
      id,
      title,
      status: 'ok',
      summary: `all ${drift.tableCount} schema tables present`,
      detail,
      remediation: null,
      autoFixable: false,
    };
  }

  return {
    id,
    title,
    status: 'fail',
    summary: 'database schema is behind the running CLI — migrations not applied',
    detail,
    remediation:
      'run `celilo system migrate` to apply pending migrations on this box — see ISS-0100',
    autoFixable: false,
  };
}
119
+
120
+ interface HeartbeatRow {
121
+ dispatcher_id: string;
122
+ last_heartbeat: number;
123
+ started_at: number;
124
+ pid: number;
125
+ version: string;
126
+ }
127
+
128
+ export interface DispatcherCheckOptions {
129
+ now?: number;
130
+ /**
131
+ * mtime (ms) of the installed dispatcher code (`@celilo/event-bus`
132
+ * package.json). A dispatcher whose `started_at` predates this is
133
+ * running stale in-memory code — the exact workstream-B orphan. Omit
134
+ * to skip the staleness aspect (e.g. unit tests, or when the package
135
+ * can't be located).
136
+ */
137
+ installedCodeMtimeMs?: number | null;
138
+ /** Override for readInstalledUnit — tests point this at a temp home. */
139
+ home?: string;
140
+ platform?: SupervisorPlatform;
141
+ }
142
+
143
+ /**
144
+ * The dispatcher check is four-part (design D5): a dispatcher is (1)
145
+ * running, (2) the *supervised* one (survives reboot, not an orphan),
146
+ * (3) running *current* code, and (4) actually emitting timer ticks +
147
+ * draining deliveries. A naive "is a process up?" check reports green
148
+ * while broken — that's the trap this exists to avoid.
149
+ */
150
+ export function checkDispatcher(bus: Bus, opts: DispatcherCheckOptions = {}): FleetFinding {
151
+ const now = opts.now ?? Date.now();
152
+ const health = bus.health();
153
+ const hb = bus.db
154
+ .query<HeartbeatRow, []>(
155
+ 'SELECT dispatcher_id, last_heartbeat, started_at, pid, version FROM dispatcher_heartbeat ORDER BY last_heartbeat DESC LIMIT 1',
156
+ )
157
+ .get();
158
+
159
+ const detail: string[] = [];
160
+ const statuses: FleetFindingStatus[] = [];
161
+ const remediations: string[] = [];
162
+
163
+ // (1) running — a fresh heartbeat. health() already classifies a
164
+ // stale/absent heartbeat as no_dispatcher.
165
+ if (health.status === 'no_dispatcher' || !hb) {
166
+ statuses.push('fail');
167
+ detail.push(
168
+ 'no live dispatcher — heartbeat absent or stale (events are queueing, not delivered)',
169
+ );
170
+ remediations.push(
171
+ 'start the dispatcher: `systemctl --user enable --now celilo-events.service` (or `celilo events install-daemon` then enable it)',
172
+ );
173
+ // Without a heartbeat there's nothing more to assert about it.
174
+ return {
175
+ id: 'dispatcher',
176
+ title: 'event dispatcher running, supervised & current',
177
+ status: 'fail',
178
+ summary: 'no live event dispatcher',
179
+ detail,
180
+ remediation: remediations.join('; '),
181
+ autoFixable: false,
182
+ };
183
+ }
184
+
185
+ const ageMs = health.lastHeartbeatAgeMs ?? now - hb.last_heartbeat;
186
+ detail.push(
187
+ `running (pid ${hb.pid}, heartbeat ${Math.round(ageMs / 1000)}s ago, code v${hb.version})`,
188
+ );
189
+ if (health.status === 'stuck') {
190
+ statuses.push('warn');
191
+ detail.push(
192
+ `${health.stuckRunningCount} delivery(ies) stuck in 'running' — possible crashed handler`,
193
+ );
194
+ remediations.push('`celilo events repair` to sweep stuck deliveries');
195
+ }
196
+
197
+ // (2) supervised — a unit file exists (user or system scope). A
198
+ // running dispatcher with NO unit is the orphan case: works now, gone
199
+ // after reboot.
200
+ const supervised =
201
+ readInstalledUnit({ scope: 'user', home: opts.home, platform: opts.platform }).exists ||
202
+ readInstalledUnit({ scope: 'system', home: opts.home, platform: opts.platform }).exists;
203
+ if (!supervised) {
204
+ statuses.push('warn');
205
+ detail.push('not under a supervisor unit — will not survive a reboot (orphan process)');
206
+ remediations.push('`celilo events install-daemon` then enable the unit so it is supervised');
207
+ }
208
+
209
+ // (3) current — started before the installed code was last written ⇒
210
+ // running stale in-memory code (delivers, but may not emit new event
211
+ // types like timer ticks). The workstream-B orphan, exactly.
212
+ if (opts.installedCodeMtimeMs != null && hb.started_at < opts.installedCodeMtimeMs) {
213
+ statuses.push('warn');
214
+ const startedAgoMin = Math.round((now - hb.started_at) / 60000);
215
+ detail.push(
216
+ `started ${startedAgoMin}min ago — before the last code update; running stale code, restart to pick it up`,
217
+ );
218
+ remediations.push('restart the dispatcher: `systemctl --user restart celilo-events.service`');
219
+ }
220
+
221
+ // (4) emitting + delivering. Only assert ticks if something subscribes
222
+ // to a timer (no subscriber ⇒ no expectation). Assert no piled-up
223
+ // failed deliveries either way.
224
+ const timerSub = bus.db
225
+ .query<{ pattern: string }, []>(
226
+ "SELECT pattern FROM subscribers WHERE pattern LIKE 'timer.tick.%' LIMIT 1",
227
+ )
228
+ .get();
229
+ if (timerSub) {
230
+ const latestTick = bus.recentEvents({ type: timerSub.pattern, limit: 1 })[0];
231
+ if (!latestTick) {
232
+ statuses.push('warn');
233
+ detail.push(
234
+ `a subscriber wants '${timerSub.pattern}' but no such tick has ever been emitted (refresh/DDNS not firing)`,
235
+ );
236
+ remediations.push('restart the dispatcher so it emits timer ticks');
237
+ } else if (now - latestTick.emittedAt > TIMER_TICK_MAX_AGE_MS) {
238
+ statuses.push('warn');
239
+ const ageMin = Math.round((now - latestTick.emittedAt) / 60000);
240
+ detail.push(
241
+ `last '${timerSub.pattern}' was ${ageMin}min ago — dispatcher not emitting on schedule`,
242
+ );
243
+ remediations.push('restart the dispatcher so it resumes emitting timer ticks');
244
+ }
245
+ }
246
+
247
+ const failed = bus.failedDeliveries({ limit: 50 });
248
+ if (failed.length > 0) {
249
+ statuses.push('warn');
250
+ const sample = failed[0]?.lastError ? ` (e.g. ${failed[0].lastError.split('\n')[0]})` : '';
251
+ detail.push(`${failed.length} failed/abandoned delivery(ies)${sample}`);
252
+ remediations.push('inspect failed deliveries and re-emit/repair as needed');
253
+ }
254
+
255
+ const status = worst(statuses);
256
+ return {
257
+ id: 'dispatcher',
258
+ title: 'event dispatcher running, supervised & current',
259
+ status,
260
+ summary:
261
+ status === 'ok'
262
+ ? 'dispatcher healthy, supervised, current, and emitting'
263
+ : 'dispatcher running but degraded',
264
+ detail,
265
+ remediation: remediations.length > 0 ? remediations.join('; ') : null,
266
+ autoFixable: false,
267
+ };
268
+ }
269
+
270
+ /** A deployed module + its parsed manifest (INSTALLED/VERIFIED only). */
271
+ interface DeployedModule {
272
+ id: string;
273
+ manifest: ModuleManifest;
274
+ }
275
+
276
+ function loadDeployedModules(db: DbClient): DeployedModule[] {
277
+ const rows = db
278
+ .select()
279
+ .from(modules)
280
+ .where(inArray(modules.state, ['INSTALLED', 'VERIFIED']))
281
+ .all();
282
+ return rows.map((m) => ({ id: m.id, manifest: m.manifestData as unknown as ModuleManifest }));
283
+ }
284
+
285
+ /**
286
+ * The bus `subscribers` table must reflect what the deployed fleet's
287
+ * manifests declare. A restore/migration starts it EMPTY (ISS-0088), so
288
+ * every reactive subscription silently vanishes until a resync or a
289
+ * redeploy. Missing rows fail; stale rows (a since-removed module) warn.
290
+ */
291
+ export function checkSubscribers(bus: Bus, db: DbClient): FleetFinding {
292
+ const deployed = loadDeployedModules(db);
293
+
294
+ // Expected: every (scoped name → pattern) the deployed manifests declare.
295
+ const expected = new Map<string, string>();
296
+ for (const mod of deployed) {
297
+ const subs = mod.manifest.subscriptions ?? [];
298
+ const modulePath = `${getModuleStoragePath()}/${mod.id}`;
299
+ for (const sub of subs) {
300
+ const resolved = resolveSubscription(sub, mod.id, modulePath);
301
+ expected.set(resolved.name, resolved.pattern);
302
+ }
303
+ }
304
+
305
+ const actualRows = bus.db
306
+ .query<{ name: string; pattern: string }, []>('SELECT name, pattern FROM subscribers')
307
+ .all();
308
+ const actual = new Map(actualRows.map((r) => [r.name, r.pattern]));
309
+
310
+ const missing: string[] = [];
311
+ const mismatched: string[] = [];
312
+ for (const [name, pattern] of expected) {
313
+ const have = actual.get(name);
314
+ if (have === undefined) missing.push(name);
315
+ else if (have !== pattern) mismatched.push(`${name} (manifest: ${pattern}, bus: ${have})`);
316
+ }
317
+ const stale = actualRows.map((r) => r.name).filter((name) => !expected.has(name));
318
+
319
+ const detail: string[] = [];
320
+ const statuses: FleetFindingStatus[] = [];
321
+ if (missing.length > 0) {
322
+ statuses.push('fail');
323
+ detail.push(
324
+ `${missing.length} subscription(s) declared by the fleet but missing from the bus: ${missing.join(', ')}`,
325
+ );
326
+ }
327
+ if (mismatched.length > 0) {
328
+ statuses.push('warn');
329
+ detail.push(
330
+ `${mismatched.length} subscription(s) with a pattern the bus disagrees on: ${mismatched.join('; ')}`,
331
+ );
332
+ }
333
+ if (stale.length > 0) {
334
+ statuses.push('warn');
335
+ detail.push(
336
+ `${stale.length} subscriber(s) on the bus with no deployed module: ${stale.join(', ')}`,
337
+ );
338
+ }
339
+
340
+ const status = worst(statuses);
341
+ return {
342
+ id: 'subscribers',
343
+ title: 'bus subscribers reflect the deployed fleet',
344
+ status,
345
+ summary:
346
+ status === 'ok'
347
+ ? `${expected.size} subscription(s) match the deployed fleet`
348
+ : 'bus subscribers drifted from the deployed fleet',
349
+ detail,
350
+ remediation: status === 'ok' ? null : '`celilo events resync-subscriptions` (safe, idempotent)',
351
+ autoFixable: status !== 'ok',
352
+ };
353
+ }
354
+
355
+ /** A `$capability:<name>.<path>` reference parsed out of a derive_from. */
356
+ interface CapabilityRef {
357
+ variable: string;
358
+ capability: string;
359
+ path: string;
360
+ }
361
+
362
+ export type CapabilityDerivationReason = 'no-provider' | 'empty-value' | 'unresolved-ref';
363
+
364
+ /**
365
+ * A broken `source: capability` derivation found on a consumer module.
366
+ * - `no-provider`: nothing in the capabilities map provides `capability`.
367
+ * - `empty-value`: the field exists but is null/undefined/empty.
368
+ * - `unresolved-ref`: the field is itself an unresolved template ref
369
+ * (e.g. authentik's `idp.dmz_ip = $self:caddy_dmz_ip`) — the chain is
370
+ * broken one+ hops upstream.
371
+ */
372
+ export interface CapabilityDerivationProblem {
373
+ consumerModule: string;
374
+ variable: string;
375
+ capability: string;
376
+ path: string;
377
+ reason: CapabilityDerivationReason;
378
+ /** The offending value, for `unresolved-ref`. */
379
+ value?: string;
380
+ }
381
+
382
+ function parseCapabilityRefs(manifest: ModuleManifest): CapabilityRef[] {
383
+ const refs: CapabilityRef[] = [];
384
+ for (const v of manifest.variables?.owns ?? []) {
385
+ if (v.source !== 'capability' || !v.derive_from) continue;
386
+ const re = /\$\{?capability:([\w-]+)\.([\w.]+)/g;
387
+ let m: RegExpExecArray | null = re.exec(v.derive_from);
388
+ while (m !== null) {
389
+ refs.push({ variable: v.name, capability: m[1], path: m[2] });
390
+ m = re.exec(v.derive_from);
391
+ }
392
+ }
393
+ return refs;
394
+ }
395
+
396
+ /** Walk a dotted path into a JSON object; undefined if any hop is absent. */
397
+ function getNested(data: Record<string, unknown>, path: string): unknown {
398
+ let cur: unknown = data;
399
+ for (const seg of path.split('.')) {
400
+ if (cur == null || typeof cur !== 'object') return undefined;
401
+ cur = (cur as Record<string, unknown>)[seg];
402
+ }
403
+ return cur;
404
+ }
405
+
406
+ /**
407
+ * The shared capability-derivation predicate (design D4: build once, call
408
+ * from the detector AND the preventer). For a consumer manifest and a map
409
+ * of capability-name → data, return every `source: capability` derivation
410
+ * that won't resolve.
411
+ *
412
+ * The `capabilities` map can be either RAW (the doctor reads capability
413
+ * rows straight from the DB, so a still-derived field shows up as
414
+ * `unresolved-ref`) or RESOLVED (deploy preflight / generate pass
415
+ * `ResolutionContext.capabilities`, where `$self:` refs are already
416
+ * substituted against the provider's config — so a broken upstream link
417
+ * shows up as `empty-value` or `unresolved-ref`). Callers decide severity:
418
+ * the doctor treats `unresolved-ref` as "needs the ISS-0114 chain trace"
419
+ * (a note), while preflight/generate treat every reason as a hard error.
420
+ */
421
+ export function findBrokenCapabilityDerivations(
422
+ consumerModule: string,
423
+ manifest: ModuleManifest,
424
+ capabilities: Record<string, Record<string, unknown> | undefined>,
425
+ ): CapabilityDerivationProblem[] {
426
+ const problems: CapabilityDerivationProblem[] = [];
427
+ for (const ref of parseCapabilityRefs(manifest)) {
428
+ const base = {
429
+ consumerModule,
430
+ variable: ref.variable,
431
+ capability: ref.capability,
432
+ path: ref.path,
433
+ };
434
+ const data = capabilities[ref.capability];
435
+ if (!data) {
436
+ problems.push({ ...base, reason: 'no-provider' });
437
+ continue;
438
+ }
439
+ const value = getNested(data, ref.path);
440
+ if (value === undefined || value === null || value === '') {
441
+ problems.push({ ...base, reason: 'empty-value' });
442
+ continue;
443
+ }
444
+ if (typeof value === 'string' && UNRESOLVED_REF.test(value)) {
445
+ problems.push({ ...base, reason: 'unresolved-ref', value });
446
+ }
447
+ }
448
+ return problems;
449
+ }
450
+
451
+ /** One-line human description of a broken derivation, shared by all callers. */
452
+ export function describeCapabilityProblem(p: CapabilityDerivationProblem): string {
453
+ const head = `${p.consumerModule}.${p.variable} derives from $capability:${p.capability}.${p.path}`;
454
+ switch (p.reason) {
455
+ case 'no-provider':
456
+ return `${head}, but no deployed module provides '${p.capability}' — deploy/redeploy its provider first`;
457
+ case 'empty-value':
458
+ return `${head}, but the provider's '${p.capability}' data has no value there — redeploy the provider so it re-registers`;
459
+ case 'unresolved-ref':
460
+ return `${head}, which resolves to an unresolved ref (${p.value}) — its own upstream chain is broken; redeploy the provider chain (provider → consumer)`;
461
+ }
462
+ }
463
+
464
+ /**
465
+ * Every `source: capability` variable a deployed module derives must have
466
+ * a live provider whose capability data carries the referenced field
467
+ * (the forgejo `$self:idp_dmz_ip not found` class, ISS-0095/ISS-0115).
468
+ *
469
+ * Reads RAW capability data, so a present-but-still-derived field (e.g.
470
+ * authentik's `idp.dmz_ip = $self:caddy_dmz_ip`) can't be verified here
471
+ * without the backward chain-walker (ISS-0114). Rather than ship a second
472
+ * walker (design D2.1), those `unresolved-ref` cases are flagged as "needs
473
+ * the chain trace" — a note pointing at `celilo capability chain`, not a
474
+ * false-positive fail. (Deploy preflight + generate run the same predicate
475
+ * against the RESOLVED context, where the same break IS a hard error.)
476
+ */
477
+ export function checkCapabilityProviders(db: DbClient): FleetFinding {
478
+ const deployed = loadDeployedModules(db);
479
+ const capRows = db
480
+ .select({ name: capabilitiesTable.capabilityName, data: capabilitiesTable.data })
481
+ .from(capabilitiesTable)
482
+ .all();
483
+ const rawMap: Record<string, Record<string, unknown>> = {};
484
+ for (const r of capRows) rawMap[r.name] = r.data;
485
+
486
+ const breaks: string[] = [];
487
+ const traceNeeded: string[] = [];
488
+ let refCount = 0;
489
+
490
+ for (const mod of deployed) {
491
+ const problems = findBrokenCapabilityDerivations(mod.id, mod.manifest, rawMap);
492
+ refCount += parseCapabilityRefs(mod.manifest).length;
493
+ for (const p of problems) {
494
+ if (p.reason === 'unresolved-ref') {
495
+ traceNeeded.push(
496
+ `${p.consumerModule}.${p.variable} ← ${p.capability}.${p.path} (= ${p.value})`,
497
+ );
498
+ } else {
499
+ breaks.push(describeCapabilityProblem(p));
500
+ }
501
+ }
502
+ }
503
+
504
+ const detail: string[] = [];
505
+ let status: FleetFindingStatus = 'ok';
506
+ if (breaks.length > 0) {
507
+ status = 'fail';
508
+ detail.push(...breaks);
509
+ }
510
+ if (traceNeeded.length > 0) {
511
+ detail.push(
512
+ `${traceNeeded.length} derived value(s) resolve through another capability — verify with \`celilo capability chain <module> <var>\` (ISS-0114): ${traceNeeded.join('; ')}`,
513
+ );
514
+ }
515
+
516
+ return {
517
+ id: 'capability-derived',
518
+ title: 'capability-derived config has live providers',
519
+ status,
520
+ summary:
521
+ status === 'ok'
522
+ ? `${refCount} capability-derived reference(s) have providers`
523
+ : 'capability-derived config is missing a provider',
524
+ detail,
525
+ remediation:
526
+ status === 'ok'
527
+ ? null
528
+ : 'redeploy the provider module(s) so they re-register capability data, then redeploy the consumer (provider → consumer order)',
529
+ autoFixable: false,
530
+ };
531
+ }
532
+
533
+ /**
534
+ * Internal split-horizon DNS records for service hostnames must resolve to
535
+ * the firewall natIp (the LAN-reachable DNAT ingress), not a zone-side
536
+ * container IP a LAN device can't route to (ISS-0094 / ISS-0111). Reads the
537
+ * dns_internal ledger offline — every `registerRecord({type:'A'})` the
538
+ * capability loader saw — and compares each to the natIp:
539
+ * - == natIp → ok
540
+ * - a segmented-zone system's container IP → fail (unroutable from the LAN)
541
+ * - an `internal`-zone system's IP → ok (that zone IS the LAN)
542
+ * - anything else → warn (unknown / possibly stale)
543
+ *
544
+ * Skipped cleanly when no firewall advertises a natIp (a flat network has no
545
+ * segmented zones, so container IPs are reachable).
546
+ */
547
+ export async function checkServiceDns(db: DbClient): Promise<FleetFinding> {
548
+ const base = {
549
+ id: 'service-dns',
550
+ title: 'service DNS points at the firewall natIp',
551
+ autoFixable: false,
552
+ } as const;
553
+
554
+ // The ledger table may be absent on a DB whose schema is behind (ISS-0100).
555
+ // Don't crash the whole doctor — the schema-drift check owns that signal.
556
+ let records: ReturnType<typeof listDnsInternalRecords>;
557
+ try {
558
+ records = listDnsInternalRecords(db);
559
+ } catch {
560
+ return {
561
+ ...base,
562
+ status: 'ok',
563
+ summary: 'internal-DNS ledger not present (schema behind — see the schema check)',
564
+ detail: [],
565
+ remediation: null,
566
+ };
567
+ }
568
+
569
+ if (records.length === 0) {
570
+ return {
571
+ ...base,
572
+ status: 'ok',
573
+ summary: 'no internal DNS records registered',
574
+ detail: [],
575
+ remediation: null,
576
+ };
577
+ }
578
+
579
+ const natIp = await resolveFirewallNatIp(db);
580
+ if (!natIp) {
581
+ return {
582
+ ...base,
583
+ status: 'ok',
584
+ summary: `${records.length} internal DNS record(s); no firewall natIp to check against`,
585
+ detail: ['no firewall advertises a natIp — flat network, container IPs are LAN-reachable'],
586
+ remediation: null,
587
+ };
588
+ }
589
+
590
+ // Map every deployed system's container IP → its zone, so a record can be
591
+ // recognized as pointing at a segmented-zone container (the bug) vs an
592
+ // internal-zone (LAN) system.
593
+ const ipZone = new Map<string, { moduleId: string; zone: string }>();
594
+ for (const mod of loadDeployedModules(db)) {
595
+ for (const sys of getModuleSystems(mod.id, db)) {
596
+ if (sys.ipv4_address) ipZone.set(sys.ipv4_address, { moduleId: mod.id, zone: sys.zone });
597
+ }
598
+ }
599
+
600
+ const atContainer: string[] = [];
601
+ const atOther: string[] = [];
602
+ for (const r of records) {
603
+ if (r.ip === natIp) continue;
604
+ const owner = ipZone.get(r.ip);
605
+ if (owner && owner.zone !== LAN_REACHABLE_ZONE) {
606
+ atContainer.push(
607
+ `${r.host} → ${r.ip} (${owner.moduleId}'s ${owner.zone}-zone container IP — a LAN device can't route there; should be the natIp ${natIp})`,
608
+ );
609
+ } else if (!owner) {
610
+ atOther.push(`${r.host} → ${r.ip} (neither the natIp ${natIp} nor a known system IP)`);
611
+ }
612
+ }
613
+
614
+ const detail: string[] = [];
615
+ const statuses: FleetFindingStatus[] = [];
616
+ if (atContainer.length > 0) {
617
+ statuses.push('fail');
618
+ detail.push(...atContainer);
619
+ }
620
+ if (atOther.length > 0) {
621
+ statuses.push('warn');
622
+ detail.push(...atOther);
623
+ }
624
+
625
+ const status = worst(statuses);
626
+ return {
627
+ ...base,
628
+ status,
629
+ summary:
630
+ status === 'ok'
631
+ ? `${records.length} internal DNS record(s) resolve to the natIp or a LAN-reachable system`
632
+ : 'internal DNS records point at zone-side IPs unreachable from the LAN',
633
+ detail,
634
+ remediation:
635
+ status === 'fail'
636
+ ? 'redeploy the provider so it registers the record at the firewall natIp (firewall.exposeService result), not the container IP'
637
+ : null,
638
+ };
639
+ }
640
+
641
+ export interface RunFleetChecksOptions {
642
+ now?: number;
643
+ installedCodeMtimeMs?: number | null;
644
+ }
645
+
646
+ /**
647
+ * Run every fleet check against the given handles. The caller owns
648
+ * gating (skip when there's no celilo DB) and rendering; this just
649
+ * returns the findings, in the order they're shown.
650
+ */
651
+ export async function runFleetChecks(
652
+ bus: Bus,
653
+ db: DbClient,
654
+ opts: RunFleetChecksOptions = {},
655
+ ): Promise<FleetFinding[]> {
656
+ return [
657
+ checkSchemaDrift(db),
658
+ checkDispatcher(bus, { now: opts.now, installedCodeMtimeMs: opts.installedCodeMtimeMs }),
659
+ checkSubscribers(bus, db),
660
+ checkCapabilityProviders(db),
661
+ await checkServiceDns(db),
662
+ ];
663
+ }