@pleri/olam-cli 0.1.169 → 0.1.173
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +38 -0
- package/dist/agent-stream/driver-runner.js +13 -0
- package/dist/commands/auth-status.d.ts +1 -0
- package/dist/commands/auth-status.d.ts.map +1 -1
- package/dist/commands/auth-status.js +45 -4
- package/dist/commands/auth-status.js.map +1 -1
- package/dist/commands/create.d.ts.map +1 -1
- package/dist/commands/create.js +26 -0
- package/dist/commands/create.js.map +1 -1
- package/dist/commands/enter.d.ts.map +1 -1
- package/dist/commands/enter.js +5 -0
- package/dist/commands/enter.js.map +1 -1
- package/dist/commands/resume.d.ts +63 -0
- package/dist/commands/resume.d.ts.map +1 -0
- package/dist/commands/resume.js +174 -0
- package/dist/commands/resume.js.map +1 -0
- package/dist/commands/setup.d.ts +19 -0
- package/dist/commands/setup.d.ts.map +1 -1
- package/dist/commands/setup.js +157 -19
- package/dist/commands/setup.js.map +1 -1
- package/dist/image-digests.json +8 -8
- package/dist/index.js +1025 -577
- package/dist/index.js.map +1 -1
- package/dist/lib/health-probes.d.ts +28 -0
- package/dist/lib/health-probes.d.ts.map +1 -1
- package/dist/lib/health-probes.js +75 -0
- package/dist/lib/health-probes.js.map +1 -1
- package/dist/lib/k8s-context-discovery.d.ts +80 -0
- package/dist/lib/k8s-context-discovery.d.ts.map +1 -0
- package/dist/lib/k8s-context-discovery.js +102 -0
- package/dist/lib/k8s-context-discovery.js.map +1 -0
- package/dist/mcp-server.js +2417 -1060
- package/dist/spawn/home-override.d.ts +82 -0
- package/dist/spawn/home-override.d.ts.map +1 -0
- package/dist/spawn/home-override.js +107 -0
- package/dist/spawn/home-override.js.map +1 -0
- package/hermes-bundle/version.json +1 -1
- package/host-cp/k8s/manifests/30-configmap.yaml +5 -0
- package/host-cp/k8s/manifests/50-deployment.yaml +9 -2
- package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
- package/host-cp/lifecycle/classify.mjs +110 -0
- package/host-cp/lifecycle/emit.mjs +119 -0
- package/host-cp/lifecycle/evidence.mjs +45 -0
- package/host-cp/lifecycle/failure-kinds.mjs +56 -0
- package/host-cp/lifecycle/index.mjs +22 -0
- package/host-cp/lifecycle/phases.mjs +52 -0
- package/host-cp/observability/grafana-port-forward.sh +1 -1
- package/host-cp/observability/kyverno-cardinality-mutate.sh +2 -2
- package/host-cp/observability/loki-ingest.sh +1 -1
- package/host-cp/observability/ndjson-span-sink.mjs +183 -0
- package/host-cp/observability/prom-no-double-grafana.sh +4 -4
- package/host-cp/observability/redactor.mjs +72 -0
- package/host-cp/recovery/engine.mjs +148 -0
- package/host-cp/recovery/index.mjs +16 -0
- package/host-cp/recovery/ledger.mjs +105 -0
- package/host-cp/recovery/recipes.mjs +46 -0
- package/host-cp/recovery/scenarios.mjs +124 -0
- package/host-cp/recovery/step-runners.mjs +263 -0
- package/host-cp/src/docker-events.mjs +30 -6
- package/host-cp/src/linear-sync.mjs +43 -0
- package/host-cp/src/plan-chat-service.mjs +129 -1
- package/host-cp/src/pr-nanny.mjs +55 -3
- package/host-cp/src/server.mjs +261 -0
- package/package.json +1 -1
|
@@ -68,7 +68,7 @@ const SCOPE_ID_RE = /^[A-Za-z0-9_.-]+$/;
|
|
|
68
68
|
// /v1/shape. Only these tables have server-side where-rewrite support; any
|
|
69
69
|
// other table=... param gets a 400. Guards against a client enumerating
|
|
70
70
|
// tables the service doesn't own.
|
|
71
|
-
const ALLOWED_SHAPE_TABLES = new Set(['chunks', 'message_usage']);
|
|
71
|
+
const ALLOWED_SHAPE_TABLES = new Set(['chunks', 'message_usage', 'planning_artifacts']);
|
|
72
72
|
|
|
73
73
|
// B6 (plan-chat-context-window-display Phase B): context-window caps per
|
|
74
74
|
// model. Mirrors CONTEXT_CAPS from @olam/intelligence/src/llm-router/providers/claude.ts.
|
|
@@ -399,6 +399,44 @@ export function createHandler({
|
|
|
399
399
|
}
|
|
400
400
|
}
|
|
401
401
|
}
|
|
402
|
+
// H2 (plan-chat-spa-canonical-surface Phase G) — extract commit_plan /
|
|
403
|
+
// propose_plan tool_use chunks into the planning_artifacts mutable
|
|
404
|
+
// table. The chunk's content carries JSON-stringified {name, input}
|
|
405
|
+
// per the substrate contract; we parse it once + write a single row.
|
|
406
|
+
// Failure to extract is logged + swallowed — the chunk itself is
|
|
407
|
+
// already persisted; artifact row absence is recoverable via
|
|
408
|
+
// re-extraction from chunks in a follow-up batch job.
|
|
409
|
+
if (body.chunk_type === 'tool_use') {
|
|
410
|
+
try {
|
|
411
|
+
const parsed = JSON.parse(body.chunk);
|
|
412
|
+
const toolName = typeof parsed?.name === 'string' ? parsed.name : '';
|
|
413
|
+
const artifactType = (toolName === 'commit_plan' || toolName === 'propose_plan')
|
|
414
|
+
? 'commit_plan'
|
|
415
|
+
: toolName === 'component_scaffold'
|
|
416
|
+
? 'component_scaffold'
|
|
417
|
+
: toolName === 'design_jam'
|
|
418
|
+
? 'design_jam'
|
|
419
|
+
: null;
|
|
420
|
+
if (artifactType && parsed.input && typeof parsed.input === 'object') {
|
|
421
|
+
const input = parsed.input;
|
|
422
|
+
const title = typeof input.title === 'string' && input.title.length > 0
|
|
423
|
+
? input.title.slice(0, 200)
|
|
424
|
+
: `Untitled ${artifactType}`;
|
|
425
|
+
const artifactId = `${body.message_id}:${body.seq}`;
|
|
426
|
+
await pool.query(
|
|
427
|
+
`INSERT INTO planning_artifacts
|
|
428
|
+
(id, world_id, session_id, type, title, body)
|
|
429
|
+
VALUES ($1, $2, $3, $4, $5, $6)
|
|
430
|
+
ON CONFLICT (id) DO NOTHING`,
|
|
431
|
+
[artifactId, body.world_id, body.session_id, artifactType, title, input],
|
|
432
|
+
);
|
|
433
|
+
}
|
|
434
|
+
} catch (artifactErr) {
|
|
435
|
+
console.warn(
|
|
436
|
+
`[plan-chat-service] planning_artifacts extraction failed for message=${body.message_id} seq=${body.seq}: ${artifactErr?.message ?? artifactErr}`,
|
|
437
|
+
);
|
|
438
|
+
}
|
|
439
|
+
}
|
|
402
440
|
} catch (err) {
|
|
403
441
|
if (err && typeof err === 'object' && 'code' in err && err.code === '23505') {
|
|
404
442
|
return send(res, 409, { error: 'duplicate', message: '(message_id, seq) already exists' });
|
|
@@ -634,6 +672,26 @@ export function createHandler({
|
|
|
634
672
|
createWorld,
|
|
635
673
|
destroyWorld,
|
|
636
674
|
});
|
|
675
|
+
// H7 (Phase G) — back-fill crystallized_world_id to all planning_artifacts
|
|
676
|
+
// rows for this session. The status pill state machine (Phase E E4) reads
|
|
677
|
+
// this field; the editor view + diagrams viewer use it for the
|
|
678
|
+
// "View world →" CTA. Failure here is logged + swallowed — the
|
|
679
|
+
// crystallize itself already succeeded; back-fill is a best-effort
|
|
680
|
+
// sync that an operator can re-run via a CLI helper.
|
|
681
|
+
if (result.worldId) {
|
|
682
|
+
try {
|
|
683
|
+
await pool.query(
|
|
684
|
+
`UPDATE planning_artifacts
|
|
685
|
+
SET crystallized_world_id = $1, status = 'crystallized'
|
|
686
|
+
WHERE session_id = $2 AND status = 'open'`,
|
|
687
|
+
[result.worldId, body.session_id],
|
|
688
|
+
);
|
|
689
|
+
} catch (backfillErr) {
|
|
690
|
+
console.warn(
|
|
691
|
+
`[plan-chat-service] crystallize back-fill failed for session=${body.session_id}: ${backfillErr?.message ?? backfillErr}`,
|
|
692
|
+
);
|
|
693
|
+
}
|
|
694
|
+
}
|
|
637
695
|
return send(res, 200, {
|
|
638
696
|
ok: true,
|
|
639
697
|
created_world_id: result.worldId,
|
|
@@ -649,6 +707,68 @@ export function createHandler({
|
|
|
649
707
|
}
|
|
650
708
|
}
|
|
651
709
|
|
|
710
|
+
// H4 (Phase G) — GET + PATCH /v1/artifacts/:id endpoint pair backing
// the SPA's editor-view round-trip. GET returns the artifact row;
// PATCH updates body (the JSON payload), title and/or status and returns
// the updated row (updated_at is expected to be bumped by a DB trigger —
// the trigger itself is not defined in this file).
// Bearer auth identical to the chunks endpoints.
//
// Id validation: artifact ids minted by the tool_use extraction path have
// the shape `${message_id}:${seq}`, which a bare SCOPE_ID_RE check rejects
// because ':' is not in its character class. We therefore accept either a
// single SCOPE_ID_RE segment or exactly two segments joined by one ':'.

/**
 * Returns true when `id` is a syntactically valid planning-artifact id.
 *
 * @param {unknown} id - decoded `:id` path segment.
 * @returns {boolean} true for `<segment>` or `<segment>:<segment>` where each
 *   segment matches SCOPE_ID_RE; false for anything else (including '').
 */
function isArtifactId(id) {
  if (typeof id !== 'string') return false;
  const segments = id.split(':');
  return segments.length <= 2 && segments.every((segment) => SCOPE_ID_RE.test(segment));
}

/**
 * GET /v1/artifacts/:id — fetch a single planning_artifacts row.
 *
 * Responds 401 when auth fails, 400 on a malformed id, 404 when no row
 * matches, 500 (`query-failed`) when the query throws, else 200 with the row.
 *
 * @param {import('node:http').IncomingMessage} req
 * @param {import('node:http').ServerResponse} res
 * @param {string} id - decoded artifact id from the URL path.
 */
async function handleGetArtifact(req, res, id) {
  if (!checkAuth(req)) return unauthorized(res);
  if (!isArtifactId(id)) return badRequest(res, 'invalid artifact id');
  try {
    const result = await pool.query(
      `SELECT id, world_id, session_id, type, title, body, status,
              linear_issue_url, crystallized_world_id,
              created_at, updated_at
         FROM planning_artifacts WHERE id = $1`,
      [id],
    );
    const row = result.rows[0];
    if (!row) return send(res, 404, { error: 'not-found' });
    return send(res, 200, row);
  } catch (err) {
    return send(res, 500, { error: 'query-failed', message: String(err?.message ?? err) });
  }
}

/**
 * PATCH /v1/artifacts/:id — update body, title and/or status of one row.
 *
 * Patchable fields (all optional, at least one required):
 *   - body:   any JSON value; stored into the jsonb column.
 *   - title:  string, truncated to 200 characters.
 *   - status: one of 'open' | 'crystallized' | 'failed' | 'archived'.
 *             Any other value is ignored (not an error on its own).
 *
 * Responds 401 / 400 / 404 / 500 (`update-failed`) analogously to the GET
 * handler, else 200 with the updated row.
 *
 * @param {import('node:http').IncomingMessage} req
 * @param {import('node:http').ServerResponse} res
 * @param {string} id - decoded artifact id from the URL path.
 */
async function handlePatchArtifact(req, res, id) {
  if (!checkAuth(req)) return unauthorized(res);
  if (!isArtifactId(id)) return badRequest(res, 'invalid artifact id');
  let body;
  try {
    body = await readJson(req);
  } catch {
    return badRequest(res, 'malformed JSON body');
  }
  // A syntactically valid payload can still be `null`, a scalar or an array;
  // dereferencing fields on it would throw outside any try/catch.
  if (body === null || typeof body !== 'object' || Array.isArray(body)) {
    return badRequest(res, 'JSON body must be an object');
  }
  const sets = [];
  const values = [id];
  if (body.body !== undefined) {
    // Serialise explicitly: the pg driver turns JS arrays into Postgres array
    // literals and passes strings through untouched, neither of which is valid
    // input for the ::jsonb cast. Plain objects serialise to the same text the
    // driver would have produced. An explicit null stays SQL NULL.
    values.push(body.body === null ? null : JSON.stringify(body.body));
    sets.push(`body = $${values.length}::jsonb`);
  }
  if (typeof body.title === 'string') {
    values.push(body.title.slice(0, 200));
    sets.push(`title = $${values.length}`);
  }
  if (typeof body.status === 'string' && ['open', 'crystallized', 'failed', 'archived'].includes(body.status)) {
    values.push(body.status);
    sets.push(`status = $${values.length}`);
  }
  if (sets.length === 0) {
    return badRequest(res, 'no patchable fields supplied (body | title | status)');
  }
  try {
    // Column names in `sets` are fixed literals above; only values are bound.
    const result = await pool.query(
      `UPDATE planning_artifacts SET ${sets.join(', ')} WHERE id = $1 RETURNING *`,
      values,
    );
    const row = result.rows[0];
    if (!row) return send(res, 404, { error: 'not-found' });
    return send(res, 200, row);
  } catch (err) {
    return send(res, 500, { error: 'update-failed', message: String(err?.message ?? err) });
  }
}
|
|
771
|
+
|
|
652
772
|
return async function handler(req, res) {
|
|
653
773
|
const url = new URL(req.url ?? '/', `http://${req.headers.host}`);
|
|
654
774
|
if (req.method === 'GET' && url.pathname === '/livez') return send(res, 200, { ok: true });
|
|
@@ -656,6 +776,14 @@ export function createHandler({
|
|
|
656
776
|
if (req.method === 'GET' && url.pathname === '/v1/shape') return handleGetShape(req, res, url);
|
|
657
777
|
if (req.method === 'GET' && url.pathname === '/v1/planning-sessions') return handleGetPlanningSessions(req, res, url);
|
|
658
778
|
if (req.method === 'POST' && url.pathname === '/v1/crystallize') return handlePostCrystallize(req, res);
|
|
779
|
+
// H4 — /v1/artifacts/:id pair
|
|
780
|
+
const artifactMatch = /^\/v1\/artifacts\/([^/]+)$/.exec(url.pathname);
|
|
781
|
+
if (artifactMatch) {
|
|
782
|
+
const id = decodeURIComponent(artifactMatch[1]);
|
|
783
|
+
if (req.method === 'GET') return handleGetArtifact(req, res, id);
|
|
784
|
+
if (req.method === 'PATCH') return handlePatchArtifact(req, res, id);
|
|
785
|
+
return send(res, 405, { error: 'method-not-allowed' });
|
|
786
|
+
}
|
|
659
787
|
return send(res, 404, { error: 'not-found' });
|
|
660
788
|
};
|
|
661
789
|
}
|
package/host-cp/src/pr-nanny.mjs
CHANGED
|
@@ -13,10 +13,17 @@
|
|
|
13
13
|
* 2. wall-clock since first dispatch >= MAX_WALL_CLOCK_MIN (default 60)
|
|
14
14
|
* 3. same-root-cause loop detected (last 2 dispatch summaries identical)
|
|
15
15
|
* 4. operator manual pause
|
|
16
|
+
*
|
|
17
|
+
* Tier escalation (PR #938):
|
|
18
|
+
* On each retry, the nanny advances to the next tier in `escalationTiers`
|
|
19
|
+
* (stored per-world in nanny_current_tier) instead of repeating the same
|
|
20
|
+
* model. When the chain is exhausted, emits `dispatch.tier-exhausted` on
|
|
21
|
+
* the host-stream and falls back to existing operator escalation.
|
|
16
22
|
*/
|
|
17
23
|
|
|
18
24
|
import { execFile } from 'node:child_process';
|
|
19
25
|
import { promisify } from 'node:util';
|
|
26
|
+
import { pickNextTier } from './dispatch/tier-escalator.mjs';
|
|
20
27
|
|
|
21
28
|
const execFileAsync = promisify(execFile);
|
|
22
29
|
|
|
@@ -68,8 +75,9 @@ function parsePrUrl(prUrl) {
|
|
|
68
75
|
* @param {{
|
|
69
76
|
* prStateStore: ReturnType<import('./world-pr-state.mjs').createWorldPrStateStore>,
|
|
70
77
|
* getGhToken: () => Promise<string|null>,
|
|
71
|
-
* dispatchToWorld: (worldId: string, prompt: string) => Promise<void>,
|
|
78
|
+
* dispatchToWorld: (worldId: string, prompt: string, opts?: { tier?: string }) => Promise<void>,
|
|
72
79
|
* consultCodex: (ctx: string) => Promise<string>,
|
|
80
|
+
* broadcastTierEvent?: (eventType: string, payload: unknown) => void,
|
|
73
81
|
* pollIntervalMs?: number,
|
|
74
82
|
* maxDispatches?: number,
|
|
75
83
|
* maxWallClockMin?: number,
|
|
@@ -80,6 +88,7 @@ export function createPrNanny({
|
|
|
80
88
|
getGhToken,
|
|
81
89
|
dispatchToWorld,
|
|
82
90
|
consultCodex,
|
|
91
|
+
broadcastTierEvent = () => {},
|
|
83
92
|
pollIntervalMs = 60_000,
|
|
84
93
|
maxDispatches = parseInt(process.env.OLAM_PR_NANNY_MAX_DISPATCHES ?? '5', 10),
|
|
85
94
|
maxWallClockMin = parseInt(process.env.OLAM_PR_NANNY_MAX_WALL_CLOCK_MIN ?? '60', 10),
|
|
@@ -198,17 +207,60 @@ export function createPrNanny({
|
|
|
198
207
|
return;
|
|
199
208
|
}
|
|
200
209
|
|
|
210
|
+
// ── Tier escalation (PR #938) ───────────────────────────────────────────
|
|
211
|
+
//
|
|
212
|
+
// `nanny_escalation_tiers` is set by the olam_dispatch caller via the
|
|
213
|
+
// escalationTiers schema field and persisted here by server.mjs when the
|
|
214
|
+
// world is registered for nanny tracking. Defaults to ['sonnet'] when
|
|
215
|
+
// absent (no escalation, no cost surprise).
|
|
216
|
+
//
|
|
217
|
+
// `nanny_current_tier` tracks the model tier used by the LAST dispatch for
|
|
218
|
+
// this PR. On first dispatch (dispatchCount === 0) it is undefined, and we
|
|
219
|
+
// use escalationTiers[0] as the starting tier. On retries we advance the
|
|
220
|
+
// chain via pickNextTier. This is the pr-state store (option c from the
|
|
221
|
+
// design doc) — it persists across polls and matches the nanny_* field
|
|
222
|
+
// pattern already established by nanny_dispatch_count et al.
|
|
223
|
+
const escalationTiers = entry.nanny_escalation_tiers ?? ['sonnet'];
|
|
224
|
+
const currentTier = entry.nanny_current_tier ?? escalationTiers[0] ?? 'sonnet';
|
|
225
|
+
let tierForThisDispatch = currentTier;
|
|
226
|
+
|
|
227
|
+
if (dispatchCount > 0) {
|
|
228
|
+
// This is a retry — try to escalate the tier.
|
|
229
|
+
const nextTier = pickNextTier(currentTier, escalationTiers);
|
|
230
|
+
if (nextTier !== null) {
|
|
231
|
+
tierForThisDispatch = nextTier;
|
|
232
|
+
broadcastTierEvent('dispatch.escalated', {
|
|
233
|
+
worldId,
|
|
234
|
+
fromTier: currentTier,
|
|
235
|
+
toTier: nextTier,
|
|
236
|
+
reason: 'retry-after-failure',
|
|
237
|
+
});
|
|
238
|
+
console.log(`[pr-nanny] tier escalated for ${worldId}: ${currentTier} → ${nextTier}`);
|
|
239
|
+
} else {
|
|
240
|
+
// Chain exhausted — emit tier-exhausted and fall back to operator escalation.
|
|
241
|
+
broadcastTierEvent('dispatch.tier-exhausted', {
|
|
242
|
+
worldId,
|
|
243
|
+
exhaustedTier: currentTier,
|
|
244
|
+
escalationTiers,
|
|
245
|
+
});
|
|
246
|
+
console.log(`[pr-nanny] tier chain exhausted for ${worldId} (last tier: ${currentTier}) — escalating to operator`);
|
|
247
|
+
prStateStore.set(worldId, { nanny_escalated: true, nanny_escalate_reason: 'tier_exhausted' });
|
|
248
|
+
return;
|
|
249
|
+
}
|
|
250
|
+
}
|
|
251
|
+
|
|
201
252
|
// Dispatch fix
|
|
202
253
|
try {
|
|
203
|
-
await dispatchToWorld(worldId, prompt);
|
|
254
|
+
await dispatchToWorld(worldId, prompt, { tier: tierForThisDispatch });
|
|
204
255
|
const now = new Date().toISOString();
|
|
205
256
|
prStateStore.set(worldId, {
|
|
206
257
|
nanny_dispatch_count: dispatchCount + 1,
|
|
207
258
|
nanny_first_dispatch_at: entry.nanny_first_dispatch_at ?? now,
|
|
208
259
|
nanny_last_dispatch_at: now,
|
|
209
260
|
nanny_last_dispatch_prompt: prompt,
|
|
261
|
+
nanny_current_tier: tierForThisDispatch,
|
|
210
262
|
});
|
|
211
|
-
console.log(`[pr-nanny] dispatched fix to ${worldId} (dispatch ${dispatchCount + 1}/${maxDispatches})`);
|
|
263
|
+
console.log(`[pr-nanny] dispatched fix to ${worldId} (dispatch ${dispatchCount + 1}/${maxDispatches}, tier: ${tierForThisDispatch})`);
|
|
212
264
|
} catch (err) {
|
|
213
265
|
console.error(`[pr-nanny] dispatch failed for ${worldId}: ${err.message}`);
|
|
214
266
|
}
|
package/host-cp/src/server.mjs
CHANGED
|
@@ -34,7 +34,19 @@ import { computeProgress } from './world-progress.mjs';
|
|
|
34
34
|
import { createPrCache } from './pr-cache.mjs';
|
|
35
35
|
import { fetchContainerSecret } from './container-secret-fetcher.mjs';
|
|
36
36
|
import { subscribeDockerEvents } from './docker-events.mjs';
|
|
37
|
+
import {
|
|
38
|
+
recordWorldLifecycle,
|
|
39
|
+
emptyEvidence,
|
|
40
|
+
WorldLifecyclePhase,
|
|
41
|
+
WorldStartupFailureKind,
|
|
42
|
+
} from '../lifecycle/index.mjs';
|
|
37
43
|
import { createHostStream, newStreamId } from './host-stream.mjs';
|
|
44
|
+
import {
|
|
45
|
+
createNdjsonSpanSink,
|
|
46
|
+
attachBetaResponseEvents,
|
|
47
|
+
} from '../observability/ndjson-span-sink.mjs';
|
|
48
|
+
import { betaResponseEmitter } from '@olam/auth-client';
|
|
49
|
+
import { attemptRecovery, findScenarioForKind } from '../recovery/index.mjs';
|
|
38
50
|
import { detectHaltChunk } from './halt-detect.mjs';
|
|
39
51
|
import { spawnUpgraderContainer } from './upgrade-spawner.mjs';
|
|
40
52
|
import { parseProxyPath, perWorldBase, proxyToWorld } from './proxy.mjs';
|
|
@@ -74,6 +86,8 @@ import {
|
|
|
74
86
|
handleServerBridges,
|
|
75
87
|
} from './routes/process-port.mjs';
|
|
76
88
|
import { instrumentHandler, renderMetrics } from './metrics.mjs';
|
|
89
|
+
import { handleDispatchFromEmail } from './lib/email-dispatch.mjs';
|
|
90
|
+
import { emitTierSuggestion } from '../dispatch/auto-tier-scheduler.mjs';
|
|
77
91
|
|
|
78
92
|
// ── Deployment-mode detection ─────────────────────────────────────
|
|
79
93
|
//
|
|
@@ -142,6 +156,20 @@ const OLAM_REPO_HOST_PATH = process.env.OLAM_REPO_HOST_PATH ?? '';
|
|
|
142
156
|
const OLAM_GH_CONFIG_HOST_PATH = process.env.OLAM_GH_CONFIG_HOST_PATH ?? '';
|
|
143
157
|
const OLAM_UPGRADER_IMAGE = process.env.OLAM_UPGRADER_IMAGE ?? 'ghcr.io/pleri/olam-host-cp:latest';
|
|
144
158
|
const WORKSPACES_DIR = process.env.OLAM_WORKSPACES_DIR ?? '/data/workspaces';
|
|
159
|
+
// Email-trigger surface (PR feat/email-as-world-trigger). The signing
|
|
160
|
+
// secret is the operator-shared key with the CF Email Worker — see
|
|
161
|
+
// docs/architecture/email-as-trigger.md. The allowlist is enforced
|
|
162
|
+
// defense-in-depth: the worker rejects at SMTP-time so bounces reach
|
|
163
|
+
// senders; we re-check at HTTP-time so a misrouted direct POST cannot
|
|
164
|
+
// bypass it. Both empty → endpoint stays mis-configured and returns
|
|
165
|
+
// 500/403 (fail-closed).
|
|
166
|
+
const OLAM_EMAIL_SIGNING_SECRET = process.env.OLAM_EMAIL_SIGNING_SECRET ?? '';
|
|
167
|
+
const OLAM_EMAIL_ALLOWED_SENDERS = process.env.OLAM_EMAIL_ALLOWED_SENDERS ?? '';
|
|
168
|
+
const OLAM_EMAIL_ATTACHMENTS_ROOT =
|
|
169
|
+
process.env.OLAM_EMAIL_ATTACHMENTS_ROOT ??
|
|
170
|
+
(HOST_CP_MODE === 'container'
|
|
171
|
+
? '/data/email-attachments'
|
|
172
|
+
: path.join(os.homedir(), '.olam', 'email-attachments'));
|
|
145
173
|
const WORLD_NAMES_PATH =
|
|
146
174
|
process.env.OLAM_WORLD_NAMES_PATH ??
|
|
147
175
|
(HOST_CP_MODE === 'container'
|
|
@@ -458,6 +486,29 @@ const sseGate = new SseGate({ maxConcurrent: SSE_CAP });
|
|
|
458
486
|
// poll-every-2s `useListeningServers` loop.
|
|
459
487
|
const hostStream = createHostStream({ log: (m) => console.log(`[host-stream] ${m}`) });
|
|
460
488
|
|
|
489
|
+
// Zero-config NDJSON span sink. Subscribes to host-stream `event: span`
|
|
490
|
+
// broadcasts and appends to ~/.olam/logs/host.trace.ndjson (override via
|
|
491
|
+
// OLAM_TRACE_LOG_PATH). Fail-open: a sink-bootstrap error logs a warning
|
|
492
|
+
// and proceeds without tracing rather than blocking host-cp boot.
|
|
493
|
+
const ndjsonSpanSink = await createNdjsonSpanSink({ hostStream }).catch((err) => {
|
|
494
|
+
console.warn(`[trace] NDJSON span sink unavailable: ${err?.message ?? err}`);
|
|
495
|
+
return null;
|
|
496
|
+
});
|
|
497
|
+
|
|
498
|
+
// Wire @olam/auth-client `beta-response` events (Anthropic SDK 0.96+ beta
|
|
499
|
+
// flags — thinking-token-count, cache-diagnostics, future passthrough) into
|
|
500
|
+
// the NDJSON trace as `withCredential.beta-response` spans. Opt-in via the
|
|
501
|
+
// caller's `withCredential('claude', fn, { betas: [...] })` options; when
|
|
502
|
+
// no caller opts in, the emitter never fires and this subscription is a
|
|
503
|
+
// no-op. See docs/decisions/047-anthropic-sdk-beta-flags.md.
|
|
504
|
+
if (ndjsonSpanSink) {
|
|
505
|
+
try {
|
|
506
|
+
attachBetaResponseEvents({ sink: ndjsonSpanSink, emitter: betaResponseEmitter });
|
|
507
|
+
} catch (err) {
|
|
508
|
+
console.warn(`[trace] beta-response wire unavailable: ${err?.message ?? err}`);
|
|
509
|
+
}
|
|
510
|
+
}
|
|
511
|
+
|
|
461
512
|
// A4: coalesce docker-event bursts into a single servers.snapshot. World
|
|
462
513
|
// boot fires `create` + `start` + healthcheck transitions in <100ms; we
|
|
463
514
|
// don't want a broadcast storm. Window matches plan-source.md P3 target.
|
|
@@ -485,6 +536,93 @@ const stopEvents = subscribeDockerEvents({
|
|
|
485
536
|
// this callback is by construction an olam world.
|
|
486
537
|
scheduleServersSnapshot();
|
|
487
538
|
},
|
|
539
|
+
// Killshot #2 — emit typed world.lifecycle events alongside the cache
|
|
540
|
+
// invalidate. Docker actions map onto phases as follows:
|
|
541
|
+
// start | restart → Spawning (container boot kicked off)
|
|
542
|
+
// stop → Finished (clean operator-initiated stop)
|
|
543
|
+
// die | kill → Failed (involuntary exit; carries exit code +
|
|
544
|
+
// classifier-derived failureKind)
|
|
545
|
+
// The lifecycle module's classifier runs against a synthetic evidence
|
|
546
|
+
// bundle so the trace records *why* the bucket was chosen. TrustRequired,
|
|
547
|
+
// ReadyForPrompt, and Running emissions are not observable from
|
|
548
|
+
// host-cp's docker-events surface — those transitions happen inside
|
|
549
|
+
// container-cp and are wired in a follow-up (see ADR 033 § Open
|
|
550
|
+
// questions for the planned container-cp → host-cp emission seam).
|
|
551
|
+
onWorldLifecycleEvent: ({ worldId, action, exitCode }) => {
|
|
552
|
+
const now = Date.now();
|
|
553
|
+
if (action === 'start' || action === 'restart') {
|
|
554
|
+
recordWorldLifecycle(hostStream, {
|
|
555
|
+
worldId,
|
|
556
|
+
phase: WorldLifecyclePhase.Spawning,
|
|
557
|
+
at: now,
|
|
558
|
+
});
|
|
559
|
+
return;
|
|
560
|
+
}
|
|
561
|
+
if (action === 'stop') {
|
|
562
|
+
recordWorldLifecycle(hostStream, {
|
|
563
|
+
worldId,
|
|
564
|
+
phase: WorldLifecyclePhase.Finished,
|
|
565
|
+
at: now,
|
|
566
|
+
});
|
|
567
|
+
return;
|
|
568
|
+
}
|
|
569
|
+
if (action === 'die' || action === 'kill') {
|
|
570
|
+
const ev = emptyEvidence(worldId, now);
|
|
571
|
+
ev.lastPhase = WorldLifecyclePhase.Running;
|
|
572
|
+
ev.lastPhaseAt = now;
|
|
573
|
+
if (exitCode !== undefined) ev.processExitCode = exitCode;
|
|
574
|
+
// For involuntary exit with a code we know the bucket up front;
|
|
575
|
+
// skip the classifier inference and pass it through explicitly so
|
|
576
|
+
// the trace records the exact docker-derived signal.
|
|
577
|
+
const failureKind =
|
|
578
|
+
exitCode !== undefined ? WorldStartupFailureKind.ProviderProcessGone : undefined;
|
|
579
|
+
const lifecycleEvent = recordWorldLifecycle(hostStream, {
|
|
580
|
+
worldId,
|
|
581
|
+
phase: WorldLifecyclePhase.Failed,
|
|
582
|
+
at: now,
|
|
583
|
+
evidence: ev,
|
|
584
|
+
failureKind,
|
|
585
|
+
});
|
|
586
|
+
|
|
587
|
+
|
|
588
|
+
// Killshot #3 — bounded auto-recovery. Attempt once per
|
|
589
|
+
// (worldId, failureKind) pair; the engine enforces idempotency.
|
|
590
|
+
// Emit recovery.* events on the host-stream so the NDJSON trace
|
|
591
|
+
// sink captures the full attempt trail.
|
|
592
|
+
const resolvedKind = lifecycleEvent.failureKind ?? null;
|
|
593
|
+
const scenario = findScenarioForKind(resolvedKind);
|
|
594
|
+
if (scenario !== undefined) {
|
|
595
|
+
hostStream.broadcast('recovery.attempt-started', {
|
|
596
|
+
worldId,
|
|
597
|
+
scenario: scenario?.name ?? 'unmatched',
|
|
598
|
+
recipe: scenario?.recipe ?? null,
|
|
599
|
+
});
|
|
600
|
+
attemptRecovery(worldId, ev, resolvedKind)
|
|
601
|
+
.then((entry) => {
|
|
602
|
+
if (entry.outcome === 'escalated') {
|
|
603
|
+
hostStream.broadcast('recovery.escalated', {
|
|
604
|
+
worldId,
|
|
605
|
+
ledgerEntry: entry,
|
|
606
|
+
});
|
|
607
|
+
} else if (entry.outcome === 'success') {
|
|
608
|
+
hostStream.broadcast('recovery.attempt-succeeded', {
|
|
609
|
+
worldId,
|
|
610
|
+
ledgerEntry: entry,
|
|
611
|
+
});
|
|
612
|
+
} else {
|
|
613
|
+
hostStream.broadcast('recovery.attempt-failed', {
|
|
614
|
+
worldId,
|
|
615
|
+
ledgerEntry: entry,
|
|
616
|
+
});
|
|
617
|
+
}
|
|
618
|
+
})
|
|
619
|
+
.catch((err) => {
|
|
620
|
+
// Recovery engine always resolves — this path is a safety net.
|
|
621
|
+
console.error(`[recovery] unexpected engine rejection for ${worldId}: ${err?.message}`);
|
|
622
|
+
});
|
|
623
|
+
}
|
|
624
|
+
}
|
|
625
|
+
},
|
|
488
626
|
});
|
|
489
627
|
|
|
490
628
|
// Initial servers.snapshot so subscribers connecting before any docker
|
|
@@ -803,6 +941,58 @@ const server = http.createServer(instrumentHandler('host-cp', async (req, res) =
|
|
|
803
941
|
if (handled) return;
|
|
804
942
|
}
|
|
805
943
|
|
|
944
|
+
// /api/telemetry/planning-sessions — B9: aggregate planning_sessions by
|
|
945
|
+
// session_source for the canonical-surface bet's adoption signal. Per
|
|
946
|
+
// plan-chat-spa-canonical-surface plan § Operator workflow seam falsification
|
|
947
|
+
// trigger: if plan-chat-spa weekly-active sessions < 60% of control-plane/app
|
|
948
|
+
// by 2026-Q3, freeze plan-chat-spa feature work. This endpoint is the
|
|
949
|
+
// data source for that measurement.
|
|
950
|
+
//
|
|
951
|
+
// Query param: ?since=YYYY-MM-DD (required; rejects with 400 otherwise).
|
|
952
|
+
// Response: { plan_chat_spa: N, control_plane_app: M, unknown: K, ratio: pct }
|
|
953
|
+
// where ratio = plan_chat_spa / (plan_chat_spa + control_plane_app) * 100,
|
|
954
|
+
// null if denominator is 0.
|
|
955
|
+
if (url.pathname === '/api/telemetry/planning-sessions' && req.method === 'GET') {
|
|
956
|
+
const since = url.searchParams.get('since');
|
|
957
|
+
if (!since || !/^\d{4}-\d{2}-\d{2}$/.test(since)) {
|
|
958
|
+
res.writeHead(400, { 'Content-Type': 'application/json' });
|
|
959
|
+
return res.end(JSON.stringify({
|
|
960
|
+
error: 'bad_request',
|
|
961
|
+
message: 'Missing or malformed `since` query param. Expected YYYY-MM-DD.',
|
|
962
|
+
}));
|
|
963
|
+
}
|
|
964
|
+
// B9 ships the endpoint CONTRACT + the session_source schema column.
|
|
965
|
+
// The query implementation goes through plan-chat-service.mjs (which
|
|
966
|
+
// owns the pg pool); this host-cp handler currently emits a 503 with
|
|
967
|
+
// a structured "not_implemented" marker so callers can verify the
|
|
968
|
+
// endpoint shape + auth + query-param parsing without the data path.
|
|
969
|
+
//
|
|
970
|
+
// Phase G of this epic adds the plan-chat-service handler that this
|
|
971
|
+
// endpoint will proxy to. Until then operators can run the SQL
|
|
972
|
+
// directly:
|
|
973
|
+
// SELECT COALESCE(session_source, 'unknown'), COUNT(*)
|
|
974
|
+
// FROM planning_sessions
|
|
975
|
+
// WHERE created_at >= $since
|
|
976
|
+
// GROUP BY 1;
|
|
977
|
+
//
|
|
978
|
+
// Notify-C: ship contract + schema; defer data path to Phase G.
|
|
979
|
+
res.writeHead(503, { 'Content-Type': 'application/json' });
|
|
980
|
+
return res.end(JSON.stringify({
|
|
981
|
+
error: 'not_implemented',
|
|
982
|
+
message: 'B9 ships the endpoint contract + session_source schema column. ' +
|
|
983
|
+
'Aggregation handler scaffolded in plan-chat-service.mjs lands in Phase G.',
|
|
984
|
+
since,
|
|
985
|
+
contractShape: {
|
|
986
|
+
plan_chat_spa: 0,
|
|
987
|
+
control_plane_app: 0,
|
|
988
|
+
unknown: 0,
|
|
989
|
+
ratio: null,
|
|
990
|
+
since: '<YYYY-MM-DD>',
|
|
991
|
+
asOf: '<ISO 8601>',
|
|
992
|
+
},
|
|
993
|
+
}));
|
|
994
|
+
}
|
|
995
|
+
|
|
806
996
|
// /api/version/status: returns the current version snapshot (baked SHA
|
|
807
997
|
// vs operator's local HEAD). No auth required beyond the existing gate
|
|
808
998
|
// (already applied above). Phase 1 only — detection, no auto-upgrade.
|
|
@@ -2070,6 +2260,76 @@ const server = http.createServer(instrumentHandler('host-cp', async (req, res) =
|
|
|
2070
2260
|
// B5's CLI uses).
|
|
2071
2261
|
// When unset, returns 503 with a clear setup hint instead of failing
|
|
2072
2262
|
// silently — operators wire when they're ready for cloud-mode dogfood.
|
|
2263
|
+
// POST /v1/dispatch-from-email — see docs/architecture/email-as-trigger.md.
|
|
2264
|
+
//
|
|
2265
|
+
// The CF Email Worker (packages/email-worker-cloudflare) HMAC-signs the
|
|
2266
|
+
// canonical payload (Decision 022) and POSTs it here. The host re-validates
|
|
2267
|
+
// the signature, re-checks the sender allowlist (defense in depth), persists
|
|
2268
|
+
// attachments under OLAM_EMAIL_ATTACHMENTS_ROOT/<worldId>/<timestampMs>/,
|
|
2269
|
+
// and either routes the dispatch to a known world or persists a
|
|
2270
|
+
// spawn-pending request for the MCP/CLI layer to drain.
|
|
2271
|
+
//
|
|
2272
|
+
// The body cap here is 30 MiB — 25 MiB attachment ceiling + 5 MiB margin
|
|
2273
|
+
// for the JSON envelope. Larger payloads are rejected at 413.
|
|
2274
|
+
if (url.pathname === '/v1/dispatch-from-email' && req.method === 'POST') {
|
|
2275
|
+
const chunks = [];
|
|
2276
|
+
let size = 0;
|
|
2277
|
+
const MAX_BODY = 30 * 1024 * 1024;
|
|
2278
|
+
let aborted = false;
|
|
2279
|
+
req.on('data', (chunk) => {
|
|
2280
|
+
size += chunk.length;
|
|
2281
|
+
if (size > MAX_BODY) {
|
|
2282
|
+
aborted = true;
|
|
2283
|
+
jsonReply(res, 413, { error: 'body_too_large', maxBytes: MAX_BODY });
|
|
2284
|
+
req.destroy();
|
|
2285
|
+
return;
|
|
2286
|
+
}
|
|
2287
|
+
chunks.push(chunk);
|
|
2288
|
+
});
|
|
2289
|
+
req.on('end', async () => {
|
|
2290
|
+
if (aborted) return;
|
|
2291
|
+
let dispatch;
|
|
2292
|
+
try {
|
|
2293
|
+
dispatch = JSON.parse(Buffer.concat(chunks).toString('utf8') || '{}');
|
|
2294
|
+
} catch (err) {
|
|
2295
|
+
return jsonReply(res, 400, { error: 'invalid_json', message: err.message });
|
|
2296
|
+
}
|
|
2297
|
+
try {
|
|
2298
|
+
// Auto-tier-scheduler v1 (ADR 042): emit an informational
|
|
2299
|
+
// `dispatch.tier-suggestion` event BEFORE handing off to the
|
|
2300
|
+
// dispatch handler. Pure-informational — never changes which
|
|
2301
|
+
// provider actually runs. The dispatch payload's optional
|
|
2302
|
+
// `tierSpec` ({ kind?, expectedDurationMs?, explicitTier? })
|
|
2303
|
+
// carries the shape; absent it, the heuristic falls through to
|
|
2304
|
+
// its default (`cloudflare-sandbox`).
|
|
2305
|
+
if (dispatch && typeof dispatch.worldId === 'string') {
|
|
2306
|
+
try {
|
|
2307
|
+
emitTierSuggestion({
|
|
2308
|
+
worldId: dispatch.worldId,
|
|
2309
|
+
dispatchSpec: dispatch.tierSpec ?? {},
|
|
2310
|
+
currentTier: null,
|
|
2311
|
+
hostStream,
|
|
2312
|
+
});
|
|
2313
|
+
} catch { /* never let a hint surface break dispatch */ }
|
|
2314
|
+
}
|
|
2315
|
+
const result = await handleDispatchFromEmail({
|
|
2316
|
+
dispatch,
|
|
2317
|
+
worlds: WORLDS,
|
|
2318
|
+
secret: OLAM_EMAIL_SIGNING_SECRET,
|
|
2319
|
+
attachmentsRoot: OLAM_EMAIL_ATTACHMENTS_ROOT,
|
|
2320
|
+
allowlist: OLAM_EMAIL_ALLOWED_SENDERS,
|
|
2321
|
+
});
|
|
2322
|
+
return jsonReply(res, result.status, result.body);
|
|
2323
|
+
} catch (err) {
|
|
2324
|
+
return jsonReply(res, 500, {
|
|
2325
|
+
error: 'dispatch_failed',
|
|
2326
|
+
message: err instanceof Error ? err.message : String(err),
|
|
2327
|
+
});
|
|
2328
|
+
}
|
|
2329
|
+
});
|
|
2330
|
+
return;
|
|
2331
|
+
}
|
|
2332
|
+
|
|
2073
2333
|
if (url.pathname === '/api/cloud-dispatch' && req.method === 'POST') {
|
|
2074
2334
|
const cloudUrl = process.env.OLAM_CLOUD_URL;
|
|
2075
2335
|
const showcasePw = process.env.OLAM_SHOWCASE_PASSWORD;
|
|
@@ -3078,6 +3338,7 @@ for (const sig of ['SIGTERM', 'SIGINT']) {
|
|
|
3078
3338
|
stopListeningSnapshotLoop();
|
|
3079
3339
|
if (serversSnapshotTimer) { clearTimeout(serversSnapshotTimer); serversSnapshotTimer = null; }
|
|
3080
3340
|
hostStream.close();
|
|
3341
|
+
if (ndjsonSpanSink) ndjsonSpanSink.close().catch(() => {});
|
|
3081
3342
|
clearInterval(versionPollTimer);
|
|
3082
3343
|
cache.clear();
|
|
3083
3344
|
server.close(() => process.exit(0));
|