@pleri/olam-cli 0.1.169 → 0.1.173

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/README.md +38 -0
  2. package/dist/agent-stream/driver-runner.js +13 -0
  3. package/dist/commands/auth-status.d.ts +1 -0
  4. package/dist/commands/auth-status.d.ts.map +1 -1
  5. package/dist/commands/auth-status.js +45 -4
  6. package/dist/commands/auth-status.js.map +1 -1
  7. package/dist/commands/create.d.ts.map +1 -1
  8. package/dist/commands/create.js +26 -0
  9. package/dist/commands/create.js.map +1 -1
  10. package/dist/commands/enter.d.ts.map +1 -1
  11. package/dist/commands/enter.js +5 -0
  12. package/dist/commands/enter.js.map +1 -1
  13. package/dist/commands/resume.d.ts +63 -0
  14. package/dist/commands/resume.d.ts.map +1 -0
  15. package/dist/commands/resume.js +174 -0
  16. package/dist/commands/resume.js.map +1 -0
  17. package/dist/commands/setup.d.ts +19 -0
  18. package/dist/commands/setup.d.ts.map +1 -1
  19. package/dist/commands/setup.js +157 -19
  20. package/dist/commands/setup.js.map +1 -1
  21. package/dist/image-digests.json +8 -8
  22. package/dist/index.js +1025 -577
  23. package/dist/index.js.map +1 -1
  24. package/dist/lib/health-probes.d.ts +28 -0
  25. package/dist/lib/health-probes.d.ts.map +1 -1
  26. package/dist/lib/health-probes.js +75 -0
  27. package/dist/lib/health-probes.js.map +1 -1
  28. package/dist/lib/k8s-context-discovery.d.ts +80 -0
  29. package/dist/lib/k8s-context-discovery.d.ts.map +1 -0
  30. package/dist/lib/k8s-context-discovery.js +102 -0
  31. package/dist/lib/k8s-context-discovery.js.map +1 -0
  32. package/dist/mcp-server.js +2417 -1060
  33. package/dist/spawn/home-override.d.ts +82 -0
  34. package/dist/spawn/home-override.d.ts.map +1 -0
  35. package/dist/spawn/home-override.js +107 -0
  36. package/dist/spawn/home-override.js.map +1 -0
  37. package/hermes-bundle/version.json +1 -1
  38. package/host-cp/k8s/manifests/30-configmap.yaml +5 -0
  39. package/host-cp/k8s/manifests/50-deployment.yaml +9 -2
  40. package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
  41. package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
  42. package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
  43. package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
  44. package/host-cp/lifecycle/classify.mjs +110 -0
  45. package/host-cp/lifecycle/emit.mjs +119 -0
  46. package/host-cp/lifecycle/evidence.mjs +45 -0
  47. package/host-cp/lifecycle/failure-kinds.mjs +56 -0
  48. package/host-cp/lifecycle/index.mjs +22 -0
  49. package/host-cp/lifecycle/phases.mjs +52 -0
  50. package/host-cp/observability/grafana-port-forward.sh +1 -1
  51. package/host-cp/observability/kyverno-cardinality-mutate.sh +2 -2
  52. package/host-cp/observability/loki-ingest.sh +1 -1
  53. package/host-cp/observability/ndjson-span-sink.mjs +183 -0
  54. package/host-cp/observability/prom-no-double-grafana.sh +4 -4
  55. package/host-cp/observability/redactor.mjs +72 -0
  56. package/host-cp/recovery/engine.mjs +148 -0
  57. package/host-cp/recovery/index.mjs +16 -0
  58. package/host-cp/recovery/ledger.mjs +105 -0
  59. package/host-cp/recovery/recipes.mjs +46 -0
  60. package/host-cp/recovery/scenarios.mjs +124 -0
  61. package/host-cp/recovery/step-runners.mjs +263 -0
  62. package/host-cp/src/docker-events.mjs +30 -6
  63. package/host-cp/src/linear-sync.mjs +43 -0
  64. package/host-cp/src/plan-chat-service.mjs +129 -1
  65. package/host-cp/src/pr-nanny.mjs +55 -3
  66. package/host-cp/src/server.mjs +261 -0
  67. package/package.json +1 -1
@@ -68,7 +68,7 @@ const SCOPE_ID_RE = /^[A-Za-z0-9_.-]+$/;
68
68
  // /v1/shape. Only these tables have server-side where-rewrite support; any
69
69
  // other table=... param gets a 400. Guards against a client enumerating
70
70
  // tables the service doesn't own.
71
- const ALLOWED_SHAPE_TABLES = new Set(['chunks', 'message_usage']);
71
+ const ALLOWED_SHAPE_TABLES = new Set(['chunks', 'message_usage', 'planning_artifacts']);
72
72
 
73
73
  // B6 (plan-chat-context-window-display Phase B): context-window caps per
74
74
  // model. Mirrors CONTEXT_CAPS from @olam/intelligence/src/llm-router/providers/claude.ts.
@@ -399,6 +399,44 @@ export function createHandler({
399
399
  }
400
400
  }
401
401
  }
402
+ // H2 (plan-chat-spa-canonical-surface Phase G) — extract commit_plan /
403
+ // propose_plan / component_scaffold / design_jam tool_use chunks into the planning_artifacts mutable
404
+ // table. The chunk's content carries JSON-stringified {name, input}
405
+ // per the substrate contract; we parse it once + write a single row.
406
+ // Failure to extract is logged + swallowed — the chunk itself is
407
+ // already persisted; artifact row absence is recoverable via
408
+ // re-extraction from chunks in a follow-up batch job.
409
+ if (body.chunk_type === 'tool_use') {
410
+ try {
411
+ const parsed = JSON.parse(body.chunk);
412
+ const toolName = typeof parsed?.name === 'string' ? parsed.name : '';
413
+ const artifactType = (toolName === 'commit_plan' || toolName === 'propose_plan')
414
+ ? 'commit_plan'
415
+ : toolName === 'component_scaffold'
416
+ ? 'component_scaffold'
417
+ : toolName === 'design_jam'
418
+ ? 'design_jam'
419
+ : null;
420
+ if (artifactType && parsed.input && typeof parsed.input === 'object') {
421
+ const input = parsed.input;
422
+ const title = typeof input.title === 'string' && input.title.length > 0
423
+ ? input.title.slice(0, 200)
424
+ : `Untitled ${artifactType}`;
425
+ const artifactId = `${body.message_id}:${body.seq}`;
426
+ await pool.query(
427
+ `INSERT INTO planning_artifacts
428
+ (id, world_id, session_id, type, title, body)
429
+ VALUES ($1, $2, $3, $4, $5, $6)
430
+ ON CONFLICT (id) DO NOTHING`,
431
+ [artifactId, body.world_id, body.session_id, artifactType, title, input],
432
+ );
433
+ }
434
+ } catch (artifactErr) {
435
+ console.warn(
436
+ `[plan-chat-service] planning_artifacts extraction failed for message=${body.message_id} seq=${body.seq}: ${artifactErr?.message ?? artifactErr}`,
437
+ );
438
+ }
439
+ }
402
440
  } catch (err) {
403
441
  if (err && typeof err === 'object' && 'code' in err && err.code === '23505') {
404
442
  return send(res, 409, { error: 'duplicate', message: '(message_id, seq) already exists' });
@@ -634,6 +672,26 @@ export function createHandler({
634
672
  createWorld,
635
673
  destroyWorld,
636
674
  });
675
+ // H7 (Phase G) — back-fill crystallized_world_id to all planning_artifacts
676
+ // rows for this session. The status pill state machine (Phase E E4) reads
677
+ // this field; the editor view + diagrams viewer use it for the
678
+ // "View world →" CTA. Failure here is logged + swallowed — the
679
+ // crystallize itself already succeeded; back-fill is a best-effort
680
+ // sync that an operator can re-run via a CLI helper.
681
+ if (result.worldId) {
682
+ try {
683
+ await pool.query(
684
+ `UPDATE planning_artifacts
685
+ SET crystallized_world_id = $1, status = 'crystallized'
686
+ WHERE session_id = $2 AND status = 'open'`,
687
+ [result.worldId, body.session_id],
688
+ );
689
+ } catch (backfillErr) {
690
+ console.warn(
691
+ `[plan-chat-service] crystallize back-fill failed for session=${body.session_id}: ${backfillErr?.message ?? backfillErr}`,
692
+ );
693
+ }
694
+ }
637
695
  return send(res, 200, {
638
696
  ok: true,
639
697
  created_world_id: result.worldId,
@@ -649,6 +707,68 @@ export function createHandler({
649
707
  }
650
708
  }
651
709
 
710
+ // H4 (Phase G) — GET + PATCH /v1/artifacts/:id endpoint pair backing
711
+ // the SPA's editor-view round-trip. GET returns the artifact row;
712
+ // PATCH updates body (the JSON payload) + bumps updated_at via trigger.
713
+ // SCOPE_ID_RE on :id; bearer auth identical to the chunks endpoints.
714
/**
 * GET /v1/artifacts/:id — return a single planning_artifacts row.
 *
 * Flow: auth check → id validation → parameterised SELECT → 404 when no
 * row matches, 200 with the row otherwise, 500 (`query-failed`) when the
 * query throws.
 *
 * @param {import('node:http').IncomingMessage} req incoming request (used for auth only)
 * @param {import('node:http').ServerResponse} res response to write to
 * @param {string} id URL-decoded artifact id taken from the request path
 * @returns {Promise<unknown>} whatever the response helper returns
 */
async function handleGetArtifact(req, res, id) {
  if (!checkAuth(req)) return unauthorized(res);
  // Artifact ids are minted by the chunk-ingest path as
  // `${message_id}:${seq}`, so ':' has to be accepted here. SCOPE_ID_RE
  // has no ':' and would reject every extracted artifact. This pattern is
  // a strict superset of SCOPE_ID_RE; the id is only ever bound as a query
  // parameter, never interpolated into SQL.
  if (!/^[A-Za-z0-9_.:-]+$/.test(id)) return badRequest(res, 'invalid artifact id');
  try {
    const result = await pool.query(
      `SELECT id, world_id, session_id, type, title, body, status,
              linear_issue_url, crystallized_world_id,
              created_at, updated_at
       FROM planning_artifacts WHERE id = $1`,
      [id],
    );
    const row = result.rows[0];
    if (!row) return send(res, 404, { error: 'not-found' });
    return send(res, 200, row);
  } catch (err) {
    return send(res, 500, { error: 'query-failed', message: String(err?.message ?? err) });
  }
}
732
+
733
/**
 * PATCH /v1/artifacts/:id — update the patchable fields of one
 * planning_artifacts row and return the updated row.
 *
 * Patchable fields (all optional, at least one required):
 *   - body   — any JSON value, stored as jsonb
 *   - title  — string, truncated to 200 characters
 *   - status — one of 'open' | 'crystallized' | 'failed' | 'archived';
 *              any other value is ignored rather than rejected
 *
 * @param {import('node:http').IncomingMessage} req incoming request (auth + JSON body)
 * @param {import('node:http').ServerResponse} res response to write to
 * @param {string} id URL-decoded artifact id taken from the request path
 * @returns {Promise<unknown>} whatever the response helper returns
 */
async function handlePatchArtifact(req, res, id) {
  if (!checkAuth(req)) return unauthorized(res);
  // Artifact ids are minted as `${message_id}:${seq}`; SCOPE_ID_RE has no
  // ':' and would reject all of them. This pattern is a superset of it.
  if (!/^[A-Za-z0-9_.:-]+$/.test(id)) return badRequest(res, 'invalid artifact id');

  let body;
  try {
    body = await readJson(req);
  } catch {
    return badRequest(res, 'malformed JSON body');
  }
  // A syntactically valid payload can still be `null`, an array or a
  // scalar. Reject those up front instead of throwing on `body.body`.
  if (body === null || typeof body !== 'object' || Array.isArray(body)) {
    return badRequest(res, 'JSON body must be an object');
  }

  const sets = [];
  const values = [id];
  if (body.body !== undefined) {
    // node-postgres serialises JS arrays as Postgres array literals, which
    // are not valid JSON for the ::jsonb cast. Stringify objects and arrays
    // explicitly; plain objects produce the same text the driver would
    // have sent, so only the array case changes.
    const payload = body.body !== null && typeof body.body === 'object'
      ? JSON.stringify(body.body)
      : body.body;
    values.push(payload);
    sets.push(`body = $${values.length}::jsonb`);
  }
  if (typeof body.title === 'string') {
    values.push(body.title.slice(0, 200));
    sets.push(`title = $${values.length}`);
  }
  if (typeof body.status === 'string' && ['open', 'crystallized', 'failed', 'archived'].includes(body.status)) {
    values.push(body.status);
    sets.push(`status = $${values.length}`);
  }
  if (sets.length === 0) {
    return badRequest(res, 'no patchable fields supplied (body | title | status)');
  }

  try {
    // `sets` holds only fixed column fragments with positional
    // placeholders; every caller-supplied value travels in `values`.
    const result = await pool.query(
      `UPDATE planning_artifacts SET ${sets.join(', ')} WHERE id = $1 RETURNING *`,
      values,
    );
    const row = result.rows[0];
    if (!row) return send(res, 404, { error: 'not-found' });
    return send(res, 200, row);
  } catch (err) {
    return send(res, 500, { error: 'update-failed', message: String(err?.message ?? err) });
  }
}
771
+
652
772
  return async function handler(req, res) {
653
773
  const url = new URL(req.url ?? '/', `http://${req.headers.host}`);
654
774
  if (req.method === 'GET' && url.pathname === '/livez') return send(res, 200, { ok: true });
@@ -656,6 +776,14 @@ export function createHandler({
656
776
  if (req.method === 'GET' && url.pathname === '/v1/shape') return handleGetShape(req, res, url);
657
777
  if (req.method === 'GET' && url.pathname === '/v1/planning-sessions') return handleGetPlanningSessions(req, res, url);
658
778
  if (req.method === 'POST' && url.pathname === '/v1/crystallize') return handlePostCrystallize(req, res);
779
+ // H4 — /v1/artifacts/:id pair
780
+ const artifactMatch = /^\/v1\/artifacts\/([^/]+)$/.exec(url.pathname);
781
+ if (artifactMatch) {
782
+ const id = decodeURIComponent(artifactMatch[1]);
783
+ if (req.method === 'GET') return handleGetArtifact(req, res, id);
784
+ if (req.method === 'PATCH') return handlePatchArtifact(req, res, id);
785
+ return send(res, 405, { error: 'method-not-allowed' });
786
+ }
659
787
  return send(res, 404, { error: 'not-found' });
660
788
  };
661
789
  }
@@ -13,10 +13,17 @@
13
13
  * 2. wall-clock since first dispatch >= MAX_WALL_CLOCK_MIN (default 60)
14
14
  * 3. same-root-cause loop detected (last 2 dispatch summaries identical)
15
15
  * 4. operator manual pause
16
+ *
17
+ * Tier escalation (PR #938):
18
+ * On each retry, the nanny advances to the next tier in `escalationTiers`
19
+ * (stored per-world in nanny_current_tier) instead of repeating the same
20
+ * model. When the chain is exhausted, emits `dispatch.tier-exhausted` on
21
+ * the host-stream and falls back to existing operator escalation.
16
22
  */
17
23
 
18
24
  import { execFile } from 'node:child_process';
19
25
  import { promisify } from 'node:util';
26
+ import { pickNextTier } from './dispatch/tier-escalator.mjs';
20
27
 
21
28
  const execFileAsync = promisify(execFile);
22
29
 
@@ -68,8 +75,9 @@ function parsePrUrl(prUrl) {
68
75
  * @param {{
69
76
  * prStateStore: ReturnType<import('./world-pr-state.mjs').createWorldPrStateStore>,
70
77
  * getGhToken: () => Promise<string|null>,
71
- * dispatchToWorld: (worldId: string, prompt: string) => Promise<void>,
78
+ * dispatchToWorld: (worldId: string, prompt: string, opts?: { tier?: string }) => Promise<void>,
72
79
  * consultCodex: (ctx: string) => Promise<string>,
80
+ * broadcastTierEvent?: (eventType: string, payload: unknown) => void,
73
81
  * pollIntervalMs?: number,
74
82
  * maxDispatches?: number,
75
83
  * maxWallClockMin?: number,
@@ -80,6 +88,7 @@ export function createPrNanny({
80
88
  getGhToken,
81
89
  dispatchToWorld,
82
90
  consultCodex,
91
+ broadcastTierEvent = () => {},
83
92
  pollIntervalMs = 60_000,
84
93
  maxDispatches = parseInt(process.env.OLAM_PR_NANNY_MAX_DISPATCHES ?? '5', 10),
85
94
  maxWallClockMin = parseInt(process.env.OLAM_PR_NANNY_MAX_WALL_CLOCK_MIN ?? '60', 10),
@@ -198,17 +207,60 @@ export function createPrNanny({
198
207
  return;
199
208
  }
200
209
 
210
+ // ── Tier escalation (PR #938) ───────────────────────────────────────────
211
+ //
212
+ // `nanny_escalation_tiers` is set by the olam_dispatch caller via the
213
+ // escalationTiers schema field and persisted here by server.mjs when the
214
+ // world is registered for nanny tracking. Defaults to ['sonnet'] when
215
+ // absent (no escalation, no cost surprise).
216
+ //
217
+ // `nanny_current_tier` tracks the model tier used by the LAST dispatch for
218
+ // this PR. On first dispatch (dispatchCount === 0) it is undefined, and we
219
+ // use escalationTiers[0] as the starting tier. On retries we advance the
220
+ // chain via pickNextTier. This is the pr-state store (option c from the
221
+ // design doc) — it persists across polls and matches the nanny_* field
222
+ // pattern already established by nanny_dispatch_count et al.
223
+ const escalationTiers = entry.nanny_escalation_tiers ?? ['sonnet'];
224
+ const currentTier = entry.nanny_current_tier ?? escalationTiers[0] ?? 'sonnet';
225
+ let tierForThisDispatch = currentTier;
226
+
227
+ if (dispatchCount > 0) {
228
+ // This is a retry — try to escalate the tier.
229
+ const nextTier = pickNextTier(currentTier, escalationTiers);
230
+ if (nextTier !== null) {
231
+ tierForThisDispatch = nextTier;
232
+ broadcastTierEvent('dispatch.escalated', {
233
+ worldId,
234
+ fromTier: currentTier,
235
+ toTier: nextTier,
236
+ reason: 'retry-after-failure',
237
+ });
238
+ console.log(`[pr-nanny] tier escalated for ${worldId}: ${currentTier} → ${nextTier}`);
239
+ } else {
240
+ // Chain exhausted — emit tier-exhausted and fall back to operator escalation.
241
+ broadcastTierEvent('dispatch.tier-exhausted', {
242
+ worldId,
243
+ exhaustedTier: currentTier,
244
+ escalationTiers,
245
+ });
246
+ console.log(`[pr-nanny] tier chain exhausted for ${worldId} (last tier: ${currentTier}) — escalating to operator`);
247
+ prStateStore.set(worldId, { nanny_escalated: true, nanny_escalate_reason: 'tier_exhausted' });
248
+ return;
249
+ }
250
+ }
251
+
201
252
  // Dispatch fix
202
253
  try {
203
- await dispatchToWorld(worldId, prompt);
254
+ await dispatchToWorld(worldId, prompt, { tier: tierForThisDispatch });
204
255
  const now = new Date().toISOString();
205
256
  prStateStore.set(worldId, {
206
257
  nanny_dispatch_count: dispatchCount + 1,
207
258
  nanny_first_dispatch_at: entry.nanny_first_dispatch_at ?? now,
208
259
  nanny_last_dispatch_at: now,
209
260
  nanny_last_dispatch_prompt: prompt,
261
+ nanny_current_tier: tierForThisDispatch,
210
262
  });
211
- console.log(`[pr-nanny] dispatched fix to ${worldId} (dispatch ${dispatchCount + 1}/${maxDispatches})`);
263
+ console.log(`[pr-nanny] dispatched fix to ${worldId} (dispatch ${dispatchCount + 1}/${maxDispatches}, tier: ${tierForThisDispatch})`);
212
264
  } catch (err) {
213
265
  console.error(`[pr-nanny] dispatch failed for ${worldId}: ${err.message}`);
214
266
  }
@@ -34,7 +34,19 @@ import { computeProgress } from './world-progress.mjs';
34
34
  import { createPrCache } from './pr-cache.mjs';
35
35
  import { fetchContainerSecret } from './container-secret-fetcher.mjs';
36
36
  import { subscribeDockerEvents } from './docker-events.mjs';
37
+ import {
38
+ recordWorldLifecycle,
39
+ emptyEvidence,
40
+ WorldLifecyclePhase,
41
+ WorldStartupFailureKind,
42
+ } from '../lifecycle/index.mjs';
37
43
  import { createHostStream, newStreamId } from './host-stream.mjs';
44
+ import {
45
+ createNdjsonSpanSink,
46
+ attachBetaResponseEvents,
47
+ } from '../observability/ndjson-span-sink.mjs';
48
+ import { betaResponseEmitter } from '@olam/auth-client';
49
+ import { attemptRecovery, findScenarioForKind } from '../recovery/index.mjs';
38
50
  import { detectHaltChunk } from './halt-detect.mjs';
39
51
  import { spawnUpgraderContainer } from './upgrade-spawner.mjs';
40
52
  import { parseProxyPath, perWorldBase, proxyToWorld } from './proxy.mjs';
@@ -74,6 +86,8 @@ import {
74
86
  handleServerBridges,
75
87
  } from './routes/process-port.mjs';
76
88
  import { instrumentHandler, renderMetrics } from './metrics.mjs';
89
+ import { handleDispatchFromEmail } from './lib/email-dispatch.mjs';
90
+ import { emitTierSuggestion } from '../dispatch/auto-tier-scheduler.mjs';
77
91
 
78
92
  // ── Deployment-mode detection ─────────────────────────────────────
79
93
  //
@@ -142,6 +156,20 @@ const OLAM_REPO_HOST_PATH = process.env.OLAM_REPO_HOST_PATH ?? '';
142
156
  const OLAM_GH_CONFIG_HOST_PATH = process.env.OLAM_GH_CONFIG_HOST_PATH ?? '';
143
157
  const OLAM_UPGRADER_IMAGE = process.env.OLAM_UPGRADER_IMAGE ?? 'ghcr.io/pleri/olam-host-cp:latest';
144
158
  const WORKSPACES_DIR = process.env.OLAM_WORKSPACES_DIR ?? '/data/workspaces';
159
+ // Email-trigger surface (PR feat/email-as-world-trigger). The signing
160
+ // secret is the operator-shared key with the CF Email Worker — see
161
+ // docs/architecture/email-as-trigger.md. The allowlist is enforced
162
+ // defense-in-depth: the worker rejects at SMTP-time so bounces reach
163
+ // senders; we re-check at HTTP-time so a misrouted direct POST cannot
164
+ // bypass it. Both empty → endpoint stays mis-configured and returns
165
+ // 500/403 (fail-closed).
166
+ const OLAM_EMAIL_SIGNING_SECRET = process.env.OLAM_EMAIL_SIGNING_SECRET ?? '';
167
+ const OLAM_EMAIL_ALLOWED_SENDERS = process.env.OLAM_EMAIL_ALLOWED_SENDERS ?? '';
168
+ const OLAM_EMAIL_ATTACHMENTS_ROOT =
169
+ process.env.OLAM_EMAIL_ATTACHMENTS_ROOT ??
170
+ (HOST_CP_MODE === 'container'
171
+ ? '/data/email-attachments'
172
+ : path.join(os.homedir(), '.olam', 'email-attachments'));
145
173
  const WORLD_NAMES_PATH =
146
174
  process.env.OLAM_WORLD_NAMES_PATH ??
147
175
  (HOST_CP_MODE === 'container'
@@ -458,6 +486,29 @@ const sseGate = new SseGate({ maxConcurrent: SSE_CAP });
458
486
  // poll-every-2s `useListeningServers` loop.
459
487
  const hostStream = createHostStream({ log: (m) => console.log(`[host-stream] ${m}`) });
460
488
 
489
+ // Zero-config NDJSON span sink. Subscribes to host-stream `event: span`
490
+ // broadcasts and appends to ~/.olam/logs/host.trace.ndjson (override via
491
+ // OLAM_TRACE_LOG_PATH). Fail-open: a sink-bootstrap error logs a warning
492
+ // and proceeds without tracing rather than blocking host-cp boot.
493
+ const ndjsonSpanSink = await createNdjsonSpanSink({ hostStream }).catch((err) => {
494
+ console.warn(`[trace] NDJSON span sink unavailable: ${err?.message ?? err}`);
495
+ return null;
496
+ });
497
+
498
+ // Wire @olam/auth-client `beta-response` events (Anthropic SDK 0.96+ beta
499
+ // flags — thinking-token-count, cache-diagnostics, future passthrough) into
500
+ // the NDJSON trace as `withCredential.beta-response` spans. Opt-in via the
501
+ // caller's `withCredential('claude', fn, { betas: [...] })` options; when
502
+ // no caller opts in, the emitter never fires and this subscription is a
503
+ // no-op. See docs/decisions/047-anthropic-sdk-beta-flags.md.
504
+ if (ndjsonSpanSink) {
505
+ try {
506
+ attachBetaResponseEvents({ sink: ndjsonSpanSink, emitter: betaResponseEmitter });
507
+ } catch (err) {
508
+ console.warn(`[trace] beta-response wire unavailable: ${err?.message ?? err}`);
509
+ }
510
+ }
511
+
461
512
  // A4: coalesce docker-event bursts into a single servers.snapshot. World
462
513
  // boot fires `create` + `start` + healthcheck transitions in <100ms; we
463
514
  // don't want a broadcast storm. Window matches plan-source.md P3 target.
@@ -485,6 +536,93 @@ const stopEvents = subscribeDockerEvents({
485
536
  // this callback is by construction an olam world.
486
537
  scheduleServersSnapshot();
487
538
  },
539
+ // Killshot #2 — emit typed world.lifecycle events alongside the cache
540
+ // invalidate. Docker actions map onto phases as follows:
541
+ // start | restart → Spawning (container boot kicked off)
542
+ // stop → Finished (clean operator-initiated stop)
543
+ // die | kill → Failed (involuntary exit; carries exit code +
544
+ // classifier-derived failureKind)
545
+ // The lifecycle module's classifier runs against a synthetic evidence
546
+ // bundle so the trace records *why* the bucket was chosen. TrustRequired,
547
+ // ReadyForPrompt, and Running emissions are not observable from
548
+ // host-cp's docker-events surface — those transitions happen inside
549
+ // container-cp and are wired in a follow-up (see ADR 033 § Open
550
+ // questions for the planned container-cp → host-cp emission seam).
551
+ onWorldLifecycleEvent: ({ worldId, action, exitCode }) => {
552
+ const now = Date.now();
553
+ if (action === 'start' || action === 'restart') {
554
+ recordWorldLifecycle(hostStream, {
555
+ worldId,
556
+ phase: WorldLifecyclePhase.Spawning,
557
+ at: now,
558
+ });
559
+ return;
560
+ }
561
+ if (action === 'stop') {
562
+ recordWorldLifecycle(hostStream, {
563
+ worldId,
564
+ phase: WorldLifecyclePhase.Finished,
565
+ at: now,
566
+ });
567
+ return;
568
+ }
569
+ if (action === 'die' || action === 'kill') {
570
+ const ev = emptyEvidence(worldId, now);
571
+ ev.lastPhase = WorldLifecyclePhase.Running;
572
+ ev.lastPhaseAt = now;
573
+ if (exitCode !== undefined) ev.processExitCode = exitCode;
574
+ // For involuntary exit with a code we know the bucket up front;
575
+ // skip the classifier inference and pass it through explicitly so
576
+ // the trace records the exact docker-derived signal.
577
+ const failureKind =
578
+ exitCode !== undefined ? WorldStartupFailureKind.ProviderProcessGone : undefined;
579
+ const lifecycleEvent = recordWorldLifecycle(hostStream, {
580
+ worldId,
581
+ phase: WorldLifecyclePhase.Failed,
582
+ at: now,
583
+ evidence: ev,
584
+ failureKind,
585
+ });
586
+
587
+
588
+ // Killshot #3 — bounded auto-recovery. Attempt once per
589
+ // (worldId, failureKind) pair; the engine enforces idempotency.
590
+ // Emit recovery.* events on the host-stream so the NDJSON trace
591
+ // sink captures the full attempt trail.
592
+ const resolvedKind = lifecycleEvent.failureKind ?? null;
593
+ const scenario = findScenarioForKind(resolvedKind);
594
+ if (scenario !== undefined) {
595
+ hostStream.broadcast('recovery.attempt-started', {
596
+ worldId,
597
+ scenario: scenario?.name ?? 'unmatched',
598
+ recipe: scenario?.recipe ?? null,
599
+ });
600
+ attemptRecovery(worldId, ev, resolvedKind)
601
+ .then((entry) => {
602
+ if (entry.outcome === 'escalated') {
603
+ hostStream.broadcast('recovery.escalated', {
604
+ worldId,
605
+ ledgerEntry: entry,
606
+ });
607
+ } else if (entry.outcome === 'success') {
608
+ hostStream.broadcast('recovery.attempt-succeeded', {
609
+ worldId,
610
+ ledgerEntry: entry,
611
+ });
612
+ } else {
613
+ hostStream.broadcast('recovery.attempt-failed', {
614
+ worldId,
615
+ ledgerEntry: entry,
616
+ });
617
+ }
618
+ })
619
+ .catch((err) => {
620
+ // Recovery engine always resolves — this path is a safety net.
621
+ console.error(`[recovery] unexpected engine rejection for ${worldId}: ${err?.message}`);
622
+ });
623
+ }
624
+ }
625
+ },
488
626
  });
489
627
 
490
628
  // Initial servers.snapshot so subscribers connecting before any docker
@@ -803,6 +941,58 @@ const server = http.createServer(instrumentHandler('host-cp', async (req, res) =
803
941
  if (handled) return;
804
942
  }
805
943
 
944
+ // /api/telemetry/planning-sessions — B9: aggregate planning_sessions by
945
+ // session_source for the canonical-surface bet's adoption signal. Per
946
+ // plan-chat-spa-canonical-surface plan § Operator workflow seam falsification
947
+ // trigger: if plan-chat-spa weekly-active sessions < 60% of control-plane/app
948
+ // by 2026-Q3, freeze plan-chat-spa feature work. This endpoint is the
949
+ // data source for that measurement.
950
+ //
951
+ // Query param: ?since=YYYY-MM-DD (required; rejects with 400 otherwise).
952
+ // Response: { plan_chat_spa: N, control_plane_app: M, unknown: K, ratio: pct }
953
+ // where ratio = plan_chat_spa / (plan_chat_spa + control_plane_app) * 100,
954
+ // null if denominator is 0.
955
+ if (url.pathname === '/api/telemetry/planning-sessions' && req.method === 'GET') {
956
+ const since = url.searchParams.get('since');
957
+ if (!since || !/^\d{4}-\d{2}-\d{2}$/.test(since)) {
958
+ res.writeHead(400, { 'Content-Type': 'application/json' });
959
+ return res.end(JSON.stringify({
960
+ error: 'bad_request',
961
+ message: 'Missing or malformed `since` query param. Expected YYYY-MM-DD.',
962
+ }));
963
+ }
964
+ // B9 ships the endpoint CONTRACT + the session_source schema column.
965
+ // The query implementation goes through plan-chat-service.mjs (which
966
+ // owns the pg pool); this host-cp handler currently emits a 503 with
967
+ // a structured "not_implemented" marker so callers can verify the
968
+ // endpoint shape + auth + query-param parsing without the data path.
969
+ //
970
+ // Phase G of this epic adds the plan-chat-service handler that this
971
+ // endpoint will proxy to. Until then operators can run the SQL
972
+ // directly:
973
+ // SELECT COALESCE(session_source, 'unknown'), COUNT(*)
974
+ // FROM planning_sessions
975
+ // WHERE created_at >= $since
976
+ // GROUP BY 1;
977
+ //
978
+ // Notify-C: ship contract + schema; defer data path to Phase G.
979
+ res.writeHead(503, { 'Content-Type': 'application/json' });
980
+ return res.end(JSON.stringify({
981
+ error: 'not_implemented',
982
+ message: 'B9 ships the endpoint contract + session_source schema column. ' +
983
+ 'Aggregation handler scaffolded in plan-chat-service.mjs lands in Phase G.',
984
+ since,
985
+ contractShape: {
986
+ plan_chat_spa: 0,
987
+ control_plane_app: 0,
988
+ unknown: 0,
989
+ ratio: null,
990
+ since: '<YYYY-MM-DD>',
991
+ asOf: '<ISO 8601>',
992
+ },
993
+ }));
994
+ }
995
+
806
996
  // /api/version/status: returns the current version snapshot (baked SHA
807
997
  // vs operator's local HEAD). No auth required beyond the existing gate
808
998
  // (already applied above). Phase 1 only — detection, no auto-upgrade.
@@ -2070,6 +2260,76 @@ const server = http.createServer(instrumentHandler('host-cp', async (req, res) =
2070
2260
  // B5's CLI uses).
2071
2261
  // When unset, returns 503 with a clear setup hint instead of failing
2072
2262
  // silently — operators wire when they're ready for cloud-mode dogfood.
2263
+ // POST /v1/dispatch-from-email — see docs/architecture/email-as-trigger.md.
2264
+ //
2265
+ // The CF Email Worker (packages/email-worker-cloudflare) HMAC-signs the
2266
+ // canonical payload (Decision 022) and POSTs it here. The host re-validates
2267
+ // the signature, re-checks the sender allowlist (defense in depth), persists
2268
+ // attachments under OLAM_EMAIL_ATTACHMENTS_ROOT/<worldId>/<timestampMs>/,
2269
+ // and either routes the dispatch to a known world or persists a
2270
+ // spawn-pending request for the MCP/CLI layer to drain.
2271
+ //
2272
+ // The body cap here is 30 MiB — 25 MiB attachment ceiling + 5 MiB margin
2273
+ // for the JSON envelope. Larger payloads are rejected at 413.
2274
+ if (url.pathname === '/v1/dispatch-from-email' && req.method === 'POST') {
2275
+ const chunks = [];
2276
+ let size = 0;
2277
+ const MAX_BODY = 30 * 1024 * 1024;
2278
+ let aborted = false;
2279
+ req.on('data', (chunk) => {
2280
+ size += chunk.length;
2281
+ if (size > MAX_BODY) {
2282
+ aborted = true;
2283
+ jsonReply(res, 413, { error: 'body_too_large', maxBytes: MAX_BODY });
2284
+ req.destroy();
2285
+ return;
2286
+ }
2287
+ chunks.push(chunk);
2288
+ });
2289
+ req.on('end', async () => {
2290
+ if (aborted) return;
2291
+ let dispatch;
2292
+ try {
2293
+ dispatch = JSON.parse(Buffer.concat(chunks).toString('utf8') || '{}');
2294
+ } catch (err) {
2295
+ return jsonReply(res, 400, { error: 'invalid_json', message: err.message });
2296
+ }
2297
+ try {
2298
+ // Auto-tier-scheduler v1 (ADR 042): emit an informational
2299
+ // `dispatch.tier-suggestion` event BEFORE handing off to the
2300
+ // dispatch handler. Pure-informational — never changes which
2301
+ // provider actually runs. The dispatch payload's optional
2302
+ // `tierSpec` ({ kind?, expectedDurationMs?, explicitTier? })
2303
+ // carries the shape; absent it, the heuristic falls through to
2304
+ // its default (`cloudflare-sandbox`).
2305
+ if (dispatch && typeof dispatch.worldId === 'string') {
2306
+ try {
2307
+ emitTierSuggestion({
2308
+ worldId: dispatch.worldId,
2309
+ dispatchSpec: dispatch.tierSpec ?? {},
2310
+ currentTier: null,
2311
+ hostStream,
2312
+ });
2313
+ } catch { /* never let a hint surface break dispatch */ }
2314
+ }
2315
+ const result = await handleDispatchFromEmail({
2316
+ dispatch,
2317
+ worlds: WORLDS,
2318
+ secret: OLAM_EMAIL_SIGNING_SECRET,
2319
+ attachmentsRoot: OLAM_EMAIL_ATTACHMENTS_ROOT,
2320
+ allowlist: OLAM_EMAIL_ALLOWED_SENDERS,
2321
+ });
2322
+ return jsonReply(res, result.status, result.body);
2323
+ } catch (err) {
2324
+ return jsonReply(res, 500, {
2325
+ error: 'dispatch_failed',
2326
+ message: err instanceof Error ? err.message : String(err),
2327
+ });
2328
+ }
2329
+ });
2330
+ return;
2331
+ }
2332
+
2073
2333
  if (url.pathname === '/api/cloud-dispatch' && req.method === 'POST') {
2074
2334
  const cloudUrl = process.env.OLAM_CLOUD_URL;
2075
2335
  const showcasePw = process.env.OLAM_SHOWCASE_PASSWORD;
@@ -3078,6 +3338,7 @@ for (const sig of ['SIGTERM', 'SIGINT']) {
3078
3338
  stopListeningSnapshotLoop();
3079
3339
  if (serversSnapshotTimer) { clearTimeout(serversSnapshotTimer); serversSnapshotTimer = null; }
3080
3340
  hostStream.close();
3341
+ if (ndjsonSpanSink) ndjsonSpanSink.close().catch(() => {});
3081
3342
  clearInterval(versionPollTimer);
3082
3343
  cache.clear();
3083
3344
  server.close(() => process.exit(0));
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pleri/olam-cli",
3
- "version": "0.1.169",
3
+ "version": "0.1.173",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "olam": "./bin/olam.cjs"