@celilo/cli 0.5.0-alpha.4 → 0.5.0-alpha.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/drizzle/0010_dns_internal_records.sql +12 -0
- package/drizzle/0011_backups_name.sql +1 -0
- package/drizzle/meta/_journal.json +14 -0
- package/package.json +2 -2
- package/src/cli/command-registry.ts +13 -2
- package/src/cli/commands/system-doctor.ts +135 -40
- package/src/cli/commands/system-migrate.test.ts +40 -0
- package/src/cli/commands/system-migrate.ts +65 -0
- package/src/cli/completion.ts +1 -0
- package/src/cli/index.ts +5 -0
- package/src/db/client.ts +15 -146
- package/src/db/migrate.ts +14 -6
- package/src/db/schema-introspection.ts +88 -0
- package/src/db/schema.ts +38 -0
- package/src/hooks/capability-loader.ts +24 -15
- package/src/services/deploy-preflight.ts +25 -0
- package/src/services/dns-internal-records.test.ts +126 -0
- package/src/services/dns-internal-records.ts +119 -0
- package/src/services/fleet-checks.test.ts +495 -0
- package/src/services/fleet-checks.ts +663 -0
- package/src/templates/generator.ts +21 -0
|
@@ -0,0 +1,663 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Fleet runtime drift detection — the predicates behind `celilo system
|
|
3
|
+
* doctor`'s fleet section (designs/CELILO_DOCTOR_FLEET_DRIFT.md, ISS-0113).
|
|
4
|
+
*
|
|
5
|
+
* Every facet of post-migration drift that workstream B caught by hand —
|
|
6
|
+
* a dead/stale dispatcher (ISS-0086), an empty `subscribers` table
|
|
7
|
+
* (ISS-0088), a capability chain that never re-derived (ISS-0095 /
|
|
8
|
+
* idp_dmz_ip) — becomes one check here. Each check asserts the *outcome*,
|
|
9
|
+
* not a proxy for it (design D5): "a dispatcher process exists" is not the
|
|
10
|
+
* same as "it's the supervised, current one that's actually emitting timer
|
|
11
|
+
* ticks", and the difference is exactly the bug B hit.
|
|
12
|
+
*
|
|
13
|
+
* Detection is read-only and cheap (design D3). These predicates are also
|
|
14
|
+
* the building blocks the defensive wiring (workstream D) and the celilo
|
|
15
|
+
* MCP (ISS-0112) call — so they take their bus + DB handles as arguments
|
|
16
|
+
* and return structured findings rather than rendering or exiting. The
|
|
17
|
+
* rendering + `--fix` orchestration lives in the doctor command.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
import type { Bus } from '@celilo/event-bus';
|
|
21
|
+
import { inArray } from 'drizzle-orm';
|
|
22
|
+
import { getModuleStoragePath } from '../config/paths';
|
|
23
|
+
import type { DbClient } from '../db/client';
|
|
24
|
+
import { capabilities as capabilitiesTable, modules } from '../db/schema';
|
|
25
|
+
import { findSchemaDrift } from '../db/schema-introspection';
|
|
26
|
+
import { resolveFirewallNatIp } from '../hooks/capability-loader';
|
|
27
|
+
import type { ModuleManifest } from '../manifest/schema';
|
|
28
|
+
import { getModuleSystems } from './deployed-systems';
|
|
29
|
+
import { listDnsInternalRecords } from './dns-internal-records';
|
|
30
|
+
import { type SupervisorPlatform, readInstalledUnit } from './events-daemon';
|
|
31
|
+
import { resolveSubscription } from './module-subscriptions';
|
|
32
|
+
|
|
33
|
+
/**
 * The one placement zone that is directly reachable from the operator's LAN.
 *
 * Any other celilo zone is firewall-segmented: an unmanaged LAN device has no
 * route into it, so an internal-DNS record that targets a container IP there
 * cannot be reached (the bug this guards against). The `internal` zone is the
 * LAN itself, so a record at one of its systems' IPs is fine. Compare by the
 * zone role a system carries — never pin literal subnets to zones.
 */
const LAN_REACHABLE_ZONE = 'internal';

/** Severity of a single finding; ordered fail > warn > ok when aggregated. */
export type FleetFindingStatus = 'ok' | 'warn' | 'fail';
|
|
44
|
+
|
|
45
|
+
/**
 * The verdict for one drift facet.
 *
 * `autoFixable` flags the checks that `--fix` is allowed to run unattended
 * (today: the subscribers resync only). Every other finding is a report plus
 * a named manual remediation — never a surprise prod redeploy.
 */
export interface FleetFinding {
  /** Stable machine id for the check (e.g. 'dispatcher'); not user-facing prose. */
  id: string;
  /** Short statement of what the check asserts. */
  title: string;
  /** Overall verdict for this facet. */
  status: FleetFindingStatus;
  /** One-line outcome for the current status. */
  summary: string;
  /** Additional context lines, rendered indented beneath the summary. */
  detail: string[];
  /** A concrete next step, or null when status is ok. */
  remediation: string | null;
  /** True when `--fix` may apply the remediation without operator input. */
  autoFixable: boolean;
}
|
|
62
|
+
|
|
63
|
+
/**
 * Maximum tolerated age of the newest timer tick (20 minutes).
 *
 * A timer subscriber should see a fresh tick within its interval plus some
 * slack. 15m is the shortest DDNS-refresh cadence (namecheap), so a tick older
 * than this window means the dispatcher is not emitting (the workstream-B
 * stale-orphan symptom). A single window covers the common case without
 * parsing every interval name.
 */
const TIMER_TICK_MAX_AGE_MS = 20 * 60 * 1000;

/**
 * Matches the start of a template reference — `$self:`, `$capability:`,
 * `$infra:`, `$infrastructure:`, `$system:` or `$secret:`, with or without an
 * opening brace after the `$` — anywhere in a string. A value that still
 * matches has not been substituted yet.
 */
const UNRESOLVED_REF = /\$\{?(?:self|capability|infra|infrastructure|system|secret):/;
|
|
73
|
+
|
|
74
|
+
/**
 * Collapse a list of statuses into the most severe one present
 * (fail outranks warn, warn outranks ok). An empty list yields 'ok'.
 */
function worst(statuses: FleetFindingStatus[]): FleetFindingStatus {
  const present = new Set(statuses);
  // Probe from most to least severe; the first hit wins.
  for (const level of ['fail', 'warn'] as const) {
    if (present.has(level)) return level;
  }
  return 'ok';
}
|
|
80
|
+
|
|
81
|
+
/**
 * Verify that the database actually has the schema the running CLI expects.
 *
 * celilo only runs drizzle's `migrate()` on a FRESH database; an existing
 * install is patched from a hand-maintained CREATE/ALTER list in db/client.ts
 * (ISS-0100). When that list falls behind the shipped migrations, the code
 * expects a table or column the DB lacks and features fail at runtime.
 *
 * The check asserts the outcome directly: every table and column reported by
 * `findSchemaDrift` as missing is a failure. It deliberately does NOT count
 * migration rows — a DB patched via the hand list can legitimately lag
 * `__drizzle_migrations` while its schema is current.
 *
 * @param db celilo DB client; its underlying `$client` is handed to the introspection helper.
 * @returns A 'schema' finding: 'fail' if anything is missing, otherwise 'ok'.
 */
export function checkSchemaDrift(db: DbClient): FleetFinding {
  const drift = findSchemaDrift(db.$client);

  // One detail line per kind of gap, tables first, only when non-empty.
  const detail = [
    { label: 'missing table(s)', names: drift.missingTables },
    { label: 'missing column(s)', names: drift.missingColumns },
  ]
    .filter((gap) => gap.names.length > 0)
    .map((gap) => `${gap.label}: ${gap.names.join(', ')}`);

  const id = 'schema';
  const title = 'database schema matches the running CLI (migrations applied)';

  if (detail.length === 0) {
    return {
      id,
      title,
      status: 'ok',
      summary: `all ${drift.tableCount} schema tables present`,
      detail,
      remediation: null,
      autoFixable: false,
    };
  }

  return {
    id,
    title,
    status: 'fail',
    summary: 'database schema is behind the running CLI — migrations not applied',
    detail,
    remediation:
      'run `celilo system migrate` to apply pending migrations on this box — see ISS-0100',
    autoFixable: false,
  };
}
|
|
119
|
+
|
|
120
|
+
/** One row of the bus `dispatcher_heartbeat` table, as selected by the dispatcher check. */
interface HeartbeatRow {
  /** Presumably the identity of the dispatcher that wrote the row — not read by the check. */
  dispatcher_id: string;
  /** Time of the last check-in; subtracted from `now` (ms) to get the heartbeat age. */
  last_heartbeat: number;
  /** Process start time; compared against the installed code's mtime (ms). */
  started_at: number;
  /** Process id, echoed in the finding's detail line. */
  pid: number;
  /** Dispatcher code version, echoed in the finding's detail line. */
  version: string;
}
|
|
127
|
+
|
|
128
|
+
/** Optional knobs for the dispatcher check; all exist for testability or degraded environments. */
export interface DispatcherCheckOptions {
  /** Clock override in milliseconds; the check falls back to `Date.now()` when omitted. */
  now?: number;
  /**
   * mtime (ms) of the installed dispatcher code (`@celilo/event-bus`
   * package.json). A dispatcher whose `started_at` is older than this is
   * running stale in-memory code — the exact workstream-B orphan. Leave it
   * out (or null) to skip the staleness aspect, e.g. in unit tests or when
   * the package cannot be located.
   */
  installedCodeMtimeMs?: number | null;
  /** Override forwarded to readInstalledUnit — tests point this at a temp home. */
  home?: string;
  /** Supervisor platform forwarded to readInstalledUnit. */
  platform?: SupervisorPlatform;
}
|
|
142
|
+
|
|
143
|
+
/**
 * Four-part dispatcher check (design D5). A healthy dispatcher is
 * (1) running, (2) the *supervised* one (survives reboot, not an orphan),
 * (3) running *current* code, and (4) actually emitting timer ticks and
 * draining deliveries. A naive "is a process up?" probe reports green while
 * broken — avoiding that trap is the whole point of this check.
 *
 * @param bus Event bus handle; its health, DB, recent events and failed deliveries are read.
 * @param opts Clock / code-mtime / supervisor-lookup overrides.
 * @returns A 'dispatcher' finding: 'fail' when no live dispatcher exists,
 *   'warn' when it runs but is degraded in any aspect, otherwise 'ok'.
 */
export function checkDispatcher(bus: Bus, opts: DispatcherCheckOptions = {}): FleetFinding {
  const id = 'dispatcher';
  const title = 'event dispatcher running, supervised & current';
  const now = opts.now ?? Date.now();
  const health = bus.health();
  const heartbeat = bus.db
    .query<HeartbeatRow, []>(
      'SELECT dispatcher_id, last_heartbeat, started_at, pid, version FROM dispatcher_heartbeat ORDER BY last_heartbeat DESC LIMIT 1',
    )
    .get();

  // (1) running — needs a fresh heartbeat. health() already classifies a
  // stale/absent heartbeat as no_dispatcher. Without one there is nothing
  // further to assert, so bail out with a hard failure.
  if (health.status === 'no_dispatcher' || !heartbeat) {
    return {
      id,
      title,
      status: 'fail',
      summary: 'no live event dispatcher',
      detail: [
        'no live dispatcher — heartbeat absent or stale (events are queueing, not delivered)',
      ],
      remediation:
        'start the dispatcher: `systemctl --user enable --now celilo-events.service` (or `celilo events install-daemon` then enable it)',
      autoFixable: false,
    };
  }

  const heartbeatAgeMs = health.lastHeartbeatAgeMs ?? now - heartbeat.last_heartbeat;
  const detail: string[] = [
    `running (pid ${heartbeat.pid}, heartbeat ${Math.round(heartbeatAgeMs / 1000)}s ago, code v${heartbeat.version})`,
  ];
  const remediations: string[] = [];
  // Every degraded aspect below is a warning that carries its own next step,
  // so recording the note and the fix together keeps the two lists aligned.
  const degrade = (note: string, fix: string): void => {
    detail.push(note);
    remediations.push(fix);
  };

  if (health.status === 'stuck') {
    degrade(
      `${health.stuckRunningCount} delivery(ies) stuck in 'running' — possible crashed handler`,
      '`celilo events repair` to sweep stuck deliveries',
    );
  }

  // (2) supervised — a unit file exists in user or system scope. A running
  // dispatcher with NO unit is the orphan case: works now, gone after reboot.
  const hasSupervisorUnit = (['user', 'system'] as const).some(
    (scope) => readInstalledUnit({ scope, home: opts.home, platform: opts.platform }).exists,
  );
  if (!hasSupervisorUnit) {
    degrade(
      'not under a supervisor unit — will not survive a reboot (orphan process)',
      '`celilo events install-daemon` then enable the unit so it is supervised',
    );
  }

  // (3) current — a process started before the installed code was last
  // written is running stale in-memory code (it delivers, but may not emit
  // newer event types such as timer ticks). The workstream-B orphan, exactly.
  const codeMtimeMs = opts.installedCodeMtimeMs;
  if (codeMtimeMs != null && heartbeat.started_at < codeMtimeMs) {
    const startedAgoMin = Math.round((now - heartbeat.started_at) / 60000);
    degrade(
      `started ${startedAgoMin}min ago — before the last code update; running stale code, restart to pick it up`,
      'restart the dispatcher: `systemctl --user restart celilo-events.service`',
    );
  }

  // (4) emitting + delivering. Ticks are only expected when something
  // subscribes to a timer (no subscriber ⇒ no expectation).
  const timerSubscriber = bus.db
    .query<{ pattern: string }, []>(
      "SELECT pattern FROM subscribers WHERE pattern LIKE 'timer.tick.%' LIMIT 1",
    )
    .get();
  if (timerSubscriber) {
    const { pattern } = timerSubscriber;
    const newestTick = bus.recentEvents({ type: pattern, limit: 1 })[0];
    if (!newestTick) {
      degrade(
        `a subscriber wants '${pattern}' but no such tick has ever been emitted (refresh/DDNS not firing)`,
        'restart the dispatcher so it emits timer ticks',
      );
    } else if (now - newestTick.emittedAt > TIMER_TICK_MAX_AGE_MS) {
      const tickAgeMin = Math.round((now - newestTick.emittedAt) / 60000);
      degrade(
        `last '${pattern}' was ${tickAgeMin}min ago — dispatcher not emitting on schedule`,
        'restart the dispatcher so it resumes emitting timer ticks',
      );
    }
  }

  // Piled-up failed deliveries are flagged whether or not timers are in play.
  const failedDeliveries = bus.failedDeliveries({ limit: 50 });
  if (failedDeliveries.length > 0) {
    const firstError = failedDeliveries[0]?.lastError;
    // Quote only the first line of the first error as an example.
    const example = firstError ? ` (e.g. ${firstError.split('\n')[0]})` : '';
    degrade(
      `${failedDeliveries.length} failed/abandoned delivery(ies)${example}`,
      'inspect failed deliveries and re-emit/repair as needed',
    );
  }

  const degraded = remediations.length > 0;
  return {
    id,
    title,
    status: degraded ? 'warn' : 'ok',
    summary: degraded
      ? 'dispatcher running but degraded'
      : 'dispatcher healthy, supervised, current, and emitting',
    detail,
    remediation: degraded ? remediations.join('; ') : null,
    autoFixable: false,
  };
}
|
|
269
|
+
|
|
270
|
+
/** A module in the INSTALLED or VERIFIED state, paired with its manifest. */
interface DeployedModule {
  /** Module id as stored in the `modules` table. */
  id: string;
  /** The module's stored manifest data, viewed as a ModuleManifest. */
  manifest: ModuleManifest;
}
|
|
275
|
+
|
|
276
|
+
/**
 * Read every module whose state is INSTALLED or VERIFIED and pair its id with
 * its stored manifest. The manifest column is cast, not validated.
 */
function loadDeployedModules(db: DbClient): DeployedModule[] {
  const isDeployed = inArray(modules.state, ['INSTALLED', 'VERIFIED']);
  const deployed: DeployedModule[] = [];
  for (const row of db.select().from(modules).where(isDeployed).all()) {
    deployed.push({ id: row.id, manifest: row.manifestData as unknown as ModuleManifest });
  }
  return deployed;
}
|
|
284
|
+
|
|
285
|
+
/**
 * Compare the bus `subscribers` table with what the deployed fleet's
 * manifests declare. A restore/migration starts the table EMPTY (ISS-0088),
 * so every reactive subscription silently vanishes until a resync or a
 * redeploy.
 *
 * Verdicts: a declared subscription missing from the bus fails; a pattern
 * mismatch or a stale row (no deployed module declares it) warns.
 *
 * @param bus Event bus handle whose `subscribers` table is read.
 * @param db celilo DB client used to load the deployed modules.
 * @returns A 'subscribers' finding; any non-ok result is marked auto-fixable.
 */
export function checkSubscribers(bus: Bus, db: DbClient): FleetFinding {
  // Expected: scoped subscriber name → pattern, for every declared subscription.
  const expected = new Map<string, string>();
  for (const { id: moduleId, manifest } of loadDeployedModules(db)) {
    const declared = manifest.subscriptions ?? [];
    const modulePath = `${getModuleStoragePath()}/${moduleId}`;
    for (const subscription of declared) {
      const { name, pattern } = resolveSubscription(subscription, moduleId, modulePath);
      expected.set(name, pattern);
    }
  }

  const busRows = bus.db
    .query<{ name: string; pattern: string }, []>('SELECT name, pattern FROM subscribers')
    .all();
  const onBus = new Map(busRows.map((row) => [row.name, row.pattern]));

  const missing: string[] = [];
  const mismatched: string[] = [];
  for (const [name, pattern] of expected) {
    const busPattern = onBus.get(name);
    if (busPattern === undefined) {
      missing.push(name);
    } else if (busPattern !== pattern) {
      mismatched.push(`${name} (manifest: ${pattern}, bus: ${busPattern})`);
    }
  }
  const stale = busRows.filter((row) => !expected.has(row.name)).map((row) => row.name);

  // Each drift facet: the offending names, the severity, and the report line.
  const facets: Array<{ names: string[]; severity: FleetFindingStatus; line: string }> = [
    {
      names: missing,
      severity: 'fail',
      line: `${missing.length} subscription(s) declared by the fleet but missing from the bus: ${missing.join(', ')}`,
    },
    {
      names: mismatched,
      severity: 'warn',
      line: `${mismatched.length} subscription(s) with a pattern the bus disagrees on: ${mismatched.join('; ')}`,
    },
    {
      names: stale,
      severity: 'warn',
      line: `${stale.length} subscriber(s) on the bus with no deployed module: ${stale.join(', ')}`,
    },
  ];
  const triggered = facets.filter((facet) => facet.names.length > 0);
  const status = worst(triggered.map((facet) => facet.severity));
  const healthy = status === 'ok';

  return {
    id: 'subscribers',
    title: 'bus subscribers reflect the deployed fleet',
    status,
    summary: healthy
      ? `${expected.size} subscription(s) match the deployed fleet`
      : 'bus subscribers drifted from the deployed fleet',
    detail: triggered.map((facet) => facet.line),
    remediation: healthy ? null : '`celilo events resync-subscriptions` (safe, idempotent)',
    autoFixable: !healthy,
  };
}
|
|
354
|
+
|
|
355
|
+
/** One `$capability:<name>.<path>` reference extracted from a variable's `derive_from`. */
interface CapabilityRef {
  /** Name of the manifest variable whose `derive_from` holds the reference. */
  variable: string;
  /** Capability name — the part between `capability:` and the first dot. */
  capability: string;
  /** Dotted field path inside the capability's data. */
  path: string;
}
|
|
361
|
+
|
|
362
|
+
/**
 * Why a capability derivation will not resolve:
 *  - `no-provider`: nothing in the capabilities map provides the capability.
 *  - `empty-value`: the referenced field is null, undefined or an empty string.
 *  - `unresolved-ref`: the field is itself still a template ref (e.g.
 *    authentik's `idp.dmz_ip = $self:caddy_dmz_ip`) — the chain is broken
 *    one or more hops upstream.
 */
export type CapabilityDerivationReason = 'no-provider' | 'empty-value' | 'unresolved-ref';

/** A broken `source: capability` derivation found on a consumer module. */
export interface CapabilityDerivationProblem {
  /** Id of the module that consumes the capability. */
  consumerModule: string;
  /** The consumer's variable that cannot be derived. */
  variable: string;
  /** Capability name the variable derives from. */
  capability: string;
  /** Dotted field path inside the capability's data. */
  path: string;
  /** Which way the derivation is broken. */
  reason: CapabilityDerivationReason;
  /** The offending value; set only for `unresolved-ref`. */
  value?: string;
}
|
|
381
|
+
|
|
382
|
+
/**
 * Collect every `$capability:<name>.<path>` reference (braced or not) from
 * the `derive_from` of the manifest's owned variables whose source is
 * 'capability'. A single variable may contribute several references.
 */
function parseCapabilityRefs(manifest: ModuleManifest): CapabilityRef[] {
  const refPattern = /\$\{?capability:([\w-]+)\.([\w.]+)/g;
  const owned = manifest.variables?.owns ?? [];
  return owned.flatMap((entry) => {
    if (entry.source !== 'capability' || !entry.derive_from) return [];
    return Array.from(entry.derive_from.matchAll(refPattern), ([, capability, path]) => ({
      variable: entry.name,
      capability,
      path,
    }));
  });
}
|
|
395
|
+
|
|
396
|
+
/**
 * Walk a dotted path into a JSON-like object.
 *
 * Only OWN properties are followed. Without that guard a segment such as
 * `constructor` or `toString` would resolve to an inherited prototype member
 * (a function), and callers that treat "not undefined" as "field present"
 * would accept a field that does not exist in the data.
 *
 * @param data Root object to walk.
 * @param path Dot-separated property names (array indices work as segments).
 * @returns The value at the path, or undefined if any hop is absent, is not an
 *   own property, or is reached through a non-object.
 */
function getNested(data: Record<string, unknown>, path: string): unknown {
  let cur: unknown = data;
  for (const seg of path.split('.')) {
    if (cur == null || typeof cur !== 'object') return undefined;
    // Reject inherited members so prototype names never count as data.
    if (!Object.prototype.hasOwnProperty.call(cur, seg)) return undefined;
    cur = (cur as Record<string, unknown>)[seg];
  }
  return cur;
}
|
|
405
|
+
|
|
406
|
+
/**
 * The shared capability-derivation predicate (design D4: build once, call
 * from the detector AND the preventer). Given a consumer manifest and a map
 * of capability name → data, list every `source: capability` derivation that
 * will not resolve.
 *
 * The map may be RAW (the doctor reads capability rows straight from the DB,
 * so a still-derived field surfaces as `unresolved-ref`) or RESOLVED (deploy
 * preflight / generate pass `ResolutionContext.capabilities`, where `$self:`
 * refs are already substituted — a broken upstream link then surfaces as
 * `empty-value` or `unresolved-ref`). Severity is the caller's decision: the
 * doctor treats `unresolved-ref` as a note pointing at the ISS-0114 chain
 * trace, while preflight/generate treat every reason as a hard error.
 *
 * @param consumerModule Id of the module that owns the manifest.
 * @param manifest The consumer's manifest.
 * @param capabilities Capability name → capability data (raw or resolved).
 * @returns One problem per broken reference, in manifest order; empty when all resolve.
 */
export function findBrokenCapabilityDerivations(
  consumerModule: string,
  manifest: ModuleManifest,
  capabilities: Record<string, Record<string, unknown> | undefined>,
): CapabilityDerivationProblem[] {
  const found: CapabilityDerivationProblem[] = [];

  for (const { variable, capability, path } of parseCapabilityRefs(manifest)) {
    const where = { consumerModule, variable, capability, path };
    const providerData = capabilities[capability];

    if (!providerData) {
      found.push({ ...where, reason: 'no-provider' });
    } else {
      const resolved = getNested(providerData, path);
      if (resolved == null || resolved === '') {
        found.push({ ...where, reason: 'empty-value' });
      } else if (typeof resolved === 'string' && UNRESOLVED_REF.test(resolved)) {
        // The field exists but still holds a template ref: broken upstream.
        found.push({ ...where, reason: 'unresolved-ref', value: resolved });
      }
    }
  }

  return found;
}
|
|
450
|
+
|
|
451
|
+
/**
 * Render a broken derivation as a single human-readable line: what derives
 * from where, followed by a reason-specific explanation and next step.
 * Shared by all callers so the wording stays identical everywhere.
 */
export function describeCapabilityProblem(p: CapabilityDerivationProblem): string {
  const subject = `${p.consumerModule}.${p.variable} derives from $capability:${p.capability}.${p.path}`;
  const explanation: Record<CapabilityDerivationReason, string> = {
    'no-provider': `, but no deployed module provides '${p.capability}' — deploy/redeploy its provider first`,
    'empty-value': `, but the provider's '${p.capability}' data has no value there — redeploy the provider so it re-registers`,
    'unresolved-ref': `, which resolves to an unresolved ref (${p.value}) — its own upstream chain is broken; redeploy the provider chain (provider → consumer)`,
  };
  return subject + explanation[p.reason];
}
|
|
463
|
+
|
|
464
|
+
/**
 * Every `source: capability` variable a deployed module derives must have a
 * live provider whose capability data carries the referenced field (the
 * forgejo `$self:idp_dmz_ip not found` class, ISS-0095/ISS-0115).
 *
 * This reads RAW capability rows, so a field that is present but still
 * derived (e.g. authentik's `idp.dmz_ip = $self:caddy_dmz_ip`) cannot be
 * verified here without the backward chain-walker (ISS-0114). Instead of
 * shipping a second walker (design D2.1), such `unresolved-ref` cases are
 * reported as a note pointing at `celilo capability chain` and do not change
 * the status. Missing providers and empty values are hard failures.
 *
 * @param db celilo DB client; deployed modules and capability rows are read from it.
 * @returns A 'capability-derived' finding: 'fail' on any hard break, otherwise 'ok'.
 */
export function checkCapabilityProviders(db: DbClient): FleetFinding {
  const deployed = loadDeployedModules(db);

  // Capability name → raw data; a later row with the same name replaces an earlier one.
  const providerData: Record<string, Record<string, unknown>> = {};
  const capabilityRows = db
    .select({ name: capabilitiesTable.capabilityName, data: capabilitiesTable.data })
    .from(capabilitiesTable)
    .all();
  for (const { name, data } of capabilityRows) providerData[name] = data;

  const hardBreaks: string[] = [];
  const needsTrace: string[] = [];
  let totalRefs = 0;

  for (const { id: moduleId, manifest } of deployed) {
    const problems = findBrokenCapabilityDerivations(moduleId, manifest, providerData);
    totalRefs += parseCapabilityRefs(manifest).length;
    for (const problem of problems) {
      if (problem.reason !== 'unresolved-ref') {
        hardBreaks.push(describeCapabilityProblem(problem));
        continue;
      }
      needsTrace.push(
        `${problem.consumerModule}.${problem.variable} ← ${problem.capability}.${problem.path} (= ${problem.value})`,
      );
    }
  }

  const broken = hardBreaks.length > 0;
  const detail: string[] = [...hardBreaks];
  if (needsTrace.length > 0) {
    // Informational only: these need the chain trace, not a failure here.
    detail.push(
      `${needsTrace.length} derived value(s) resolve through another capability — verify with \`celilo capability chain <module> <var>\` (ISS-0114): ${needsTrace.join('; ')}`,
    );
  }

  return {
    id: 'capability-derived',
    title: 'capability-derived config has live providers',
    status: broken ? 'fail' : 'ok',
    summary: broken
      ? 'capability-derived config is missing a provider'
      : `${totalRefs} capability-derived reference(s) have providers`,
    detail,
    remediation: broken
      ? 'redeploy the provider module(s) so they re-register capability data, then redeploy the consumer (provider → consumer order)'
      : null,
    autoFixable: false,
  };
}
|
|
532
|
+
|
|
533
|
+
/**
 * Internal split-horizon DNS records for service hostnames must resolve to
 * the firewall natIp (the LAN-reachable DNAT ingress), not a zone-side
 * container IP a LAN device can't route to (ISS-0094 / ISS-0111). Reads the
 * dns_internal ledger offline and compares each record's IP to the natIp:
 *   - == natIp                                  → ok
 *   - a segmented-zone system's container IP    → fail (unroutable from the LAN)
 *   - an `internal`-zone system's IP            → ok (that zone IS the LAN)
 *   - anything else                             → warn (unknown / possibly stale)
 *
 * Skipped cleanly (status ok) when the ledger cannot be read, when it is
 * empty, or when no firewall advertises a natIp (a flat network has no
 * segmented zones, so container IPs are reachable).
 *
 * A warn-only result gets its own summary and a remediation: the finding
 * contract allows a null remediation only when the status is ok, and the
 * fail wording ("zone-side IPs unreachable from the LAN") would misdescribe
 * records that merely point at an unrecognized address.
 *
 * @param db celilo DB client; the DNS ledger, firewall natIp and deployed systems are read from it.
 * @returns A 'service-dns' finding; never auto-fixable.
 */
export async function checkServiceDns(db: DbClient): Promise<FleetFinding> {
  const base = {
    id: 'service-dns',
    title: 'service DNS points at the firewall natIp',
    autoFixable: false,
  } as const;

  // The ledger table may be absent on a DB whose schema is behind (ISS-0100).
  // Don't crash the whole doctor — the schema-drift check owns that signal.
  let records: ReturnType<typeof listDnsInternalRecords>;
  try {
    records = listDnsInternalRecords(db);
  } catch {
    return {
      ...base,
      status: 'ok',
      summary: 'internal-DNS ledger not present (schema behind — see the schema check)',
      detail: [],
      remediation: null,
    };
  }

  if (records.length === 0) {
    return {
      ...base,
      status: 'ok',
      summary: 'no internal DNS records registered',
      detail: [],
      remediation: null,
    };
  }

  const natIp = await resolveFirewallNatIp(db);
  if (!natIp) {
    return {
      ...base,
      status: 'ok',
      summary: `${records.length} internal DNS record(s); no firewall natIp to check against`,
      detail: ['no firewall advertises a natIp — flat network, container IPs are LAN-reachable'],
      remediation: null,
    };
  }

  // Map every deployed system's container IP → its zone, so a record can be
  // recognized as pointing at a segmented-zone container (the bug) vs an
  // internal-zone (LAN) system.
  const ipZone = new Map<string, { moduleId: string; zone: string }>();
  for (const mod of loadDeployedModules(db)) {
    for (const sys of getModuleSystems(mod.id, db)) {
      if (sys.ipv4_address) ipZone.set(sys.ipv4_address, { moduleId: mod.id, zone: sys.zone });
    }
  }

  const atContainer: string[] = [];
  const atOther: string[] = [];
  for (const r of records) {
    if (r.ip === natIp) continue;
    const owner = ipZone.get(r.ip);
    if (owner && owner.zone !== LAN_REACHABLE_ZONE) {
      atContainer.push(
        `${r.host} → ${r.ip} (${owner.moduleId}'s ${owner.zone}-zone container IP — a LAN device can't route there; should be the natIp ${natIp})`,
      );
    } else if (!owner) {
      atOther.push(`${r.host} → ${r.ip} (neither the natIp ${natIp} nor a known system IP)`);
    }
    // Remaining case: an internal-zone system's IP, which is LAN-reachable.
  }

  const detail: string[] = [];
  const statuses: FleetFindingStatus[] = [];
  if (atContainer.length > 0) {
    statuses.push('fail');
    detail.push(...atContainer);
  }
  if (atOther.length > 0) {
    statuses.push('warn');
    detail.push(...atOther);
  }

  const status = worst(statuses);
  let summary: string;
  let remediation: string | null;
  if (status === 'fail') {
    summary = 'internal DNS records point at zone-side IPs unreachable from the LAN';
    remediation =
      'redeploy the provider so it registers the record at the firewall natIp (firewall.exposeService result), not the container IP';
  } else if (status === 'warn') {
    // Warn-only: the targets are unrecognized, not known to be unroutable.
    summary = 'internal DNS records point at IPs that are neither the natIp nor a known system';
    remediation =
      'verify each flagged record; if its target is stale, redeploy the module that registered it so the record is re-registered at the firewall natIp';
  } else {
    summary = `${records.length} internal DNS record(s) resolve to the natIp or a LAN-reachable system`;
    remediation = null;
  }

  return {
    ...base,
    status,
    summary,
    detail,
    remediation,
  };
}
|
|
640
|
+
|
|
641
|
+
/** Options for a full fleet-check run; both are passed through to the dispatcher check. */
export interface RunFleetChecksOptions {
  /** Clock override in milliseconds. */
  now?: number;
  /** mtime (ms) of the installed dispatcher code; omit or null to skip the stale-code aspect. */
  installedCodeMtimeMs?: number | null;
}
|
|
645
|
+
|
|
646
|
+
/**
 * Execute every fleet check against the supplied handles and return the
 * findings in display order: schema, dispatcher, subscribers, capability
 * providers, service DNS. Gating (skipping when there is no celilo DB) and
 * rendering are the caller's responsibility.
 *
 * @param bus Event bus handle for the dispatcher and subscriber checks.
 * @param db celilo DB client for the schema, subscriber, capability and DNS checks.
 * @param opts Clock / code-mtime overrides forwarded to the dispatcher check.
 * @returns The five findings, in the order listed above.
 */
export async function runFleetChecks(
  bus: Bus,
  db: DbClient,
  opts: RunFleetChecksOptions = {},
): Promise<FleetFinding[]> {
  const { now, installedCodeMtimeMs } = opts;

  const schema = checkSchemaDrift(db);
  const dispatcher = checkDispatcher(bus, { now, installedCodeMtimeMs });
  const subscribers = checkSubscribers(bus, db);
  const capabilityProviders = checkCapabilityProviders(db);
  // The only asynchronous check; it runs last, after the synchronous ones.
  const serviceDns = await checkServiceDns(db);

  return [schema, dispatcher, subscribers, capabilityProviders, serviceDns];
}
|
|
@@ -19,6 +19,10 @@ import {
|
|
|
19
19
|
import { getSingularSystemSpec } from '../manifest/schema';
|
|
20
20
|
import type { AnsibleCollection, ModuleManifest } from '../manifest/schema';
|
|
21
21
|
import { validateZoneRequirements } from '../manifest/validate';
|
|
22
|
+
import {
|
|
23
|
+
describeCapabilityProblem,
|
|
24
|
+
findBrokenCapabilityDerivations,
|
|
25
|
+
} from '../services/fleet-checks';
|
|
22
26
|
import { selectInfrastructure } from '../services/infrastructure-selector';
|
|
23
27
|
import { upsertModuleConfig } from '../services/module-config';
|
|
24
28
|
import type { InfrastructureSelection } from '../types/infrastructure';
|
|
@@ -780,6 +784,23 @@ export async function generateTemplates(options: GenerateOptions): Promise<Gener
|
|
|
780
784
|
};
|
|
781
785
|
}
|
|
782
786
|
|
|
787
|
+
// Policy: fail loud at the source on a broken capability chain. A
|
|
788
|
+
// required `source: capability` var whose chain doesn't resolve is
|
|
789
|
+
// silently dropped during derivation, then surfaces downstream as a
|
|
790
|
+
// cryptic `$self:<x> not found` in some template. Assert it here against
|
|
791
|
+
// the resolved capabilities map so generate names the broken link and
|
|
792
|
+
// the provider to redeploy (ISS-0115; the data-plane sibling of ISS-0088).
|
|
793
|
+
const capProblems = findBrokenCapabilityDerivations(moduleId, manifest, context.capabilities);
|
|
794
|
+
if (capProblems.length > 0) {
|
|
795
|
+
return {
|
|
796
|
+
success: false,
|
|
797
|
+
error: `Cannot generate '${moduleId}': ${capProblems.length} capability-derived variable(s) won't resolve:\n${capProblems
|
|
798
|
+
.map((p) => ` - ${describeCapabilityProblem(p)}`)
|
|
799
|
+
.join('\n')}`,
|
|
800
|
+
details: capProblems,
|
|
801
|
+
};
|
|
802
|
+
}
|
|
803
|
+
|
|
783
804
|
// Execution: Store derived variables in module_configs
|
|
784
805
|
// Variables with derive_from are resolved in the context but not stored in module_configs.
|
|
785
806
|
// Store them so hooks and host_vars can access them.
|