@boardwalk-labs/engine 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. package/LICENSE +202 -0
  2. package/README.md +69 -0
  3. package/bin/boardwalk-server.js +16 -0
  4. package/dist/agent/conversation.d.ts +42 -0
  5. package/dist/agent/conversation.js +4 -0
  6. package/dist/agent/leaf.d.ts +81 -0
  7. package/dist/agent/leaf.js +190 -0
  8. package/dist/agent/providers.d.ts +23 -0
  9. package/dist/agent/providers.js +347 -0
  10. package/dist/agent/rates.d.ts +13 -0
  11. package/dist/agent/rates.js +35 -0
  12. package/dist/agent/redact.d.ts +9 -0
  13. package/dist/agent/redact.js +27 -0
  14. package/dist/agent/resolve.d.ts +58 -0
  15. package/dist/agent/resolve.js +153 -0
  16. package/dist/agent/sse.d.ts +2 -0
  17. package/dist/agent/sse.js +30 -0
  18. package/dist/agent/tools.d.ts +57 -0
  19. package/dist/agent/tools.js +324 -0
  20. package/dist/clock.d.ts +8 -0
  21. package/dist/clock.js +32 -0
  22. package/dist/cron/cron.d.ts +34 -0
  23. package/dist/cron/cron.js +331 -0
  24. package/dist/engine.d.ts +106 -0
  25. package/dist/engine.js +183 -0
  26. package/dist/errors.d.ts +15 -0
  27. package/dist/errors.js +40 -0
  28. package/dist/ids.d.ts +7 -0
  29. package/dist/ids.js +42 -0
  30. package/dist/index.d.ts +6 -0
  31. package/dist/index.js +8 -0
  32. package/dist/json_value.d.ts +7 -0
  33. package/dist/json_value.js +29 -0
  34. package/dist/mcp/client.d.ts +39 -0
  35. package/dist/mcp/client.js +112 -0
  36. package/dist/mcp/jsonrpc.d.ts +57 -0
  37. package/dist/mcp/jsonrpc.js +117 -0
  38. package/dist/mcp/oauth.d.ts +72 -0
  39. package/dist/mcp/oauth.js +337 -0
  40. package/dist/mcp/token_store.d.ts +30 -0
  41. package/dist/mcp/token_store.js +101 -0
  42. package/dist/mcp/transport_http.d.ts +38 -0
  43. package/dist/mcp/transport_http.js +143 -0
  44. package/dist/mcp/transport_stdio.d.ts +27 -0
  45. package/dist/mcp/transport_stdio.js +94 -0
  46. package/dist/run/child.d.ts +1 -0
  47. package/dist/run/child.js +139 -0
  48. package/dist/run/child_host.d.ts +26 -0
  49. package/dist/run/child_host.js +124 -0
  50. package/dist/run/idempotency.d.ts +5 -0
  51. package/dist/run/idempotency.js +31 -0
  52. package/dist/run/ipc.d.ts +159 -0
  53. package/dist/run/ipc.js +150 -0
  54. package/dist/run/run_dir.d.ts +31 -0
  55. package/dist/run/run_dir.js +106 -0
  56. package/dist/run/supervisor.d.ts +107 -0
  57. package/dist/run/supervisor.js +676 -0
  58. package/dist/scheduler/scheduler.d.ts +54 -0
  59. package/dist/scheduler/scheduler.js +215 -0
  60. package/dist/server/http.d.ts +42 -0
  61. package/dist/server/http.js +183 -0
  62. package/dist/server/routes/api.d.ts +17 -0
  63. package/dist/server/routes/api.js +107 -0
  64. package/dist/server/routes/hooks.d.ts +2 -0
  65. package/dist/server/routes/hooks.js +88 -0
  66. package/dist/server/routes/router.d.ts +15 -0
  67. package/dist/server/routes/router.js +75 -0
  68. package/dist/server/routes/stream.d.ts +2 -0
  69. package/dist/server/routes/stream.js +79 -0
  70. package/dist/server/routes/ui.d.ts +2 -0
  71. package/dist/server/routes/ui.js +120 -0
  72. package/dist/server/server.d.ts +25 -0
  73. package/dist/server/server.js +67 -0
  74. package/dist/server_main.d.ts +46 -0
  75. package/dist/server_main.js +203 -0
  76. package/dist/store/migrations.d.ts +21 -0
  77. package/dist/store/migrations.js +159 -0
  78. package/dist/store/store.d.ts +194 -0
  79. package/dist/store/store.js +567 -0
  80. package/package.json +57 -0
@@ -0,0 +1,676 @@
1
+ // The run supervisor — owns run-lifecycle state transitions and process supervision
2
+ // (SPEC §2.2). Layering (CODE_QUALITY §7.2): knows nothing about HTTP or the CLI; persistence
3
+ // goes through the Store; what workflows *do* lives in the child process.
4
+ //
5
+ // Semantics implemented here, identical in every engine (MASTER_SPEC §2.4):
6
+ // - one run = one spawned process, isolated working directory
7
+ // - hold-and-pay: sleep holds the child; nothing here checkpoints
8
+ // - restart-on-crash: child death without a done/failed report restarts the run from the
9
+ // top, bounded by maxRestarts, then `failed` with code CRASHED
10
+ // - cancellation: cooperative SIGTERM, then SIGKILL after a grace window
11
+ // - budgets terminate: max_duration_seconds is a supervisor deadline spanning restarts
12
+ // - crash-safe: every transition is persisted before/with its event; a recovery sweep on
13
+ // boot re-dispatches whatever a dead engine left behind
14
+ //
15
+ // Envelope authority: the child sends event BODIES; this is the only place envelopes are
16
+ // stamped and cursors allocated. On restart the cursor resumes past maxCursor so a filtered
17
+ // SSE consumer's resume position stays valid across crashes.
18
+ import { spawn } from "node:child_process";
19
+ import { pathToFileURL } from "node:url";
20
+ import { join } from "node:path";
21
+ import { existsSync, writeFileSync } from "node:fs";
22
+ import { makeCursor, runEventSchema, TURN_CURSOR_STRIDE, } from "@boardwalk-labs/workflow";
23
+ import { usageUsdMicros } from "../agent/rates.js";
24
+ import { resolveModel } from "../agent/resolve.js";
25
+ import { MEMORY_PATH_RE } from "../agent/tools.js";
26
+ import { systemClock } from "../clock.js";
27
+ import { EngineError, toErrorShape } from "../errors.js";
28
+ import { asJsonValue } from "../json_value.js";
29
+ import { refreshAccessToken } from "../mcp/oauth.js";
30
+ import { McpTokenStore, MCP_TOKENS_FILENAME } from "../mcp/token_store.js";
31
+ import { defaultIdempotencyKey } from "./idempotency.js";
32
+ import { callWorkflowArgsSchema, childToParentSchema, getSecretArgsSchema, mcpTokenArgsSchema, resolveModelArgsSchema, writeArtifactArgsSchema, } from "./ipc.js";
33
+ import { hydrateWorkspace, persistRoot, persistWorkspace, prepareRunDir, } from "./run_dir.js";
34
/** Run statuses from which no further transition is possible. */
const TERMINAL_STATUSES = new Set(["completed", "failed", "cancelled"]);
/**
 * Safety margin for MCP access-token expiry: a token this close to expiring is treated as
 * already expired, because one that dies between the broker reply and the server call would
 * burn the child's whole 401 retry.
 */
const MCP_TOKEN_EXPIRY_SKEW_MS = 30_000;
/**
 * Report whether a run status is terminal, i.e. the run can no longer change state.
 *
 * @param status - the status value to classify
 * @returns true for "completed", "failed" or "cancelled"; false for anything else
 */
export function isTerminal(status) {
    const terminal = TERMINAL_STATUSES.has(status);
    return terminal;
}
42
+ export class RunSupervisor {
43
+ store;
44
+ dataDir;
45
+ childEntryPath;
46
+ env;
47
+ envLabel;
48
+ clock;
49
+ maxRestarts;
50
+ cancelGraceMs;
51
+ inference;
52
+ mcpTokens;
53
+ active = new Map();
54
+ listeners = new Set();
55
+ constructor(opts) {
56
+ this.store = opts.store;
57
+ this.dataDir = opts.dataDir;
58
+ this.childEntryPath = opts.childEntryPath;
59
+ this.env = opts.env;
60
+ this.envLabel = opts.envLabel;
61
+ this.clock = opts.clock ?? systemClock;
62
+ this.maxRestarts = opts.maxRestarts ?? 2;
63
+ this.cancelGraceMs = opts.cancelGraceMs ?? 10_000;
64
+ this.inference = opts.inference ?? {};
65
+ // Same path Engine.authorizeMcpServer writes — the interactive grant lands where runs read.
66
+ this.mcpTokens = new McpTokenStore(join(opts.dataDir, MCP_TOKENS_FILENAME));
67
+ }
68
+ /** Subscribe to every stamped run event (the local feed for SSE/log UIs). */
69
+ onEvent(listener) {
70
+ this.listeners.add(listener);
71
+ return () => this.listeners.delete(listener);
72
+ }
73
+ /**
74
+ * Drive a run to terminal status; idempotent per run id (a second call while active returns
75
+ * the same promise; a call on a terminal run resolves immediately). Never rejects for run
76
+ * failures — failure is a status, not an exception. Rejects only on caller bugs (unknown id).
77
+ */
78
+ supervise(runId) {
79
+ const existing = this.active.get(runId);
80
+ if (existing !== undefined)
81
+ return existing.promise;
82
+ const run = this.store.getRun(runId);
83
+ if (run === null) {
84
+ return Promise.reject(new EngineError("NOT_FOUND", `Unknown run: ${runId}`));
85
+ }
86
+ if (isTerminal(run.status))
87
+ return Promise.resolve(run);
88
+ if (run.status === "cancelling") {
89
+ // An interrupted cancellation (the killer died mid-kill). The process is gone — the
90
+ // orphan exits on IPC disconnect — so finalize instead of re-executing.
91
+ this.store.updateRunStatus(runId, "cancelled", { endedAt: this.clock.now() });
92
+ this.stampAndStore(runId, this.resumeEnvelope(runId), {
93
+ kind: "run_status",
94
+ status: "cancelled",
95
+ });
96
+ return Promise.resolve(this.mustGetRun(runId));
97
+ }
98
+ const entry = {
99
+ promise: Promise.resolve(run), // replaced below
100
+ child: null,
101
+ cancelRequested: false,
102
+ budgetReason: null,
103
+ memoryDirs: new Set(),
104
+ envelope: this.resumeEnvelope(runId),
105
+ };
106
+ entry.promise = this.execute(run, entry)
107
+ .catch((err) => {
108
+ // Engine-internal failure (store error, spawn impossible). Record it; never throw
109
+ // out of supervision — a run must always land on a terminal row. The recovery write
110
+ // itself can throw if the store closed under a shutdown race, so it is guarded: a
111
+ // void-dispatched supervise() must never surface an unhandled rejection.
112
+ try {
113
+ this.finishRun(runId, entry, "failed", { error: toErrorShape(err) });
114
+ return this.mustGetRun(runId);
115
+ }
116
+ catch (recoveryErr) {
117
+ console.error(`run ${runId}: failed to record terminal state`, recoveryErr);
118
+ return run;
119
+ }
120
+ })
121
+ .finally(() => this.active.delete(runId));
122
+ this.active.set(runId, entry);
123
+ return entry.promise;
124
+ }
125
+ /** Emit the `queued` lifecycle event for a freshly created run (the creator calls this once). */
126
+ emitQueued(runId) {
127
+ const envelope = this.resumeEnvelope(runId);
128
+ this.stampAndStore(runId, envelope, { kind: "run_status", status: "queued" });
129
+ }
130
+ /**
131
+ * Cancel a run: cooperative SIGTERM, SIGKILL after the grace window. A queued/unsupervised
132
+ * run is cancelled directly; a terminal run is a no-op.
133
+ */
134
+ async cancel(runId) {
135
+ const run = this.store.getRun(runId);
136
+ if (run === null)
137
+ throw new EngineError("NOT_FOUND", `Unknown run: ${runId}`);
138
+ if (isTerminal(run.status))
139
+ return;
140
+ const entry = this.active.get(runId);
141
+ if (entry === undefined) {
142
+ // Nothing is executing it right now (queued, or left over from a dead engine).
143
+ const envelope = this.resumeEnvelope(runId);
144
+ this.store.updateRunStatus(runId, "cancelled", { endedAt: this.clock.now() });
145
+ this.stampAndStore(runId, envelope, { kind: "run_status", status: "cancelled" });
146
+ return;
147
+ }
148
+ if (entry.cancelRequested)
149
+ return;
150
+ entry.cancelRequested = true;
151
+ this.store.updateRunStatus(runId, "cancelling");
152
+ this.stampAndStore(runId, entry.envelope, { kind: "run_status", status: "cancelling" });
153
+ const child = entry.child;
154
+ if (child !== null && child.exitCode === null && !child.killed) {
155
+ child.kill("SIGTERM");
156
+ await this.clock.sleep(this.cancelGraceMs);
157
+ if (entry.child !== null && entry.child.exitCode === null) {
158
+ entry.child.kill("SIGKILL");
159
+ }
160
+ }
161
+ await entry.promise;
162
+ }
163
+ /**
164
+ * Boot recovery sweep (SPEC §2.2): runs a dead engine left active are re-dispatched
165
+ * (restart-from-the-top — the child died with the engine); interrupted cancellations are
166
+ * finalized (the orphan child exits on IPC disconnect, so the kill already happened).
167
+ * Engine restarts do not consume the run's crash-restart budget.
168
+ */
169
+ recoverOnBoot() {
170
+ const resumed = [];
171
+ const cancelled = [];
172
+ for (const run of this.store.listRuns({ statuses: ["cancelling"] })) {
173
+ this.store.updateRunStatus(run.id, "cancelled", { endedAt: this.clock.now() });
174
+ this.stampAndStore(run.id, this.resumeEnvelope(run.id), {
175
+ kind: "run_status",
176
+ status: "cancelled",
177
+ });
178
+ cancelled.push(run.id);
179
+ }
180
+ for (const run of this.store.listRuns({ statuses: ["queued", "pending", "running"] })) {
181
+ resumed.push(run.id);
182
+ void this.supervise(run.id);
183
+ }
184
+ return { resumed, cancelled };
185
+ }
186
+ /** SIGTERM all children and stop. In-flight runs are recovered by the next boot's sweep. */
187
+ shutdown() {
188
+ for (const entry of this.active.values()) {
189
+ entry.child?.kill("SIGTERM");
190
+ }
191
+ }
192
+ // --------------------------------------------------------------------------
193
+ // Execution
194
+ // --------------------------------------------------------------------------
195
+ async execute(run, entry) {
196
+ const workflow = this.store.getWorkflowById(run.workflowId);
197
+ if (workflow === null) {
198
+ throw new EngineError("INTERNAL", `Run ${run.id} references a missing workflow.`);
199
+ }
200
+ const manifest = workflow.manifest;
201
+ this.setStatus(run.id, entry, "pending");
202
+ let firstStartedAt = run.startedAt;
203
+ // Hydrate persistent dirs only into a NEVER-started workspace: a crash-restart (and an
204
+ // engine-restart resume) must keep the workspace as the crashed pass left it.
205
+ let hydrated = run.startedAt !== null;
206
+ for (;;) {
207
+ if (entry.cancelRequested) {
208
+ return this.finishRun(run.id, entry, "cancelled", {});
209
+ }
210
+ const dirs = prepareRunDir(this.dataDir, run.id, workflow.program);
211
+ if (!hydrated) {
212
+ hydrated = true;
213
+ hydrateWorkspace(persistRoot(this.dataDir, workflow.id), dirs.workspaceDir);
214
+ }
215
+ const startedAt = firstStartedAt ?? this.clock.now();
216
+ this.store.updateRunStatus(run.id, "running", { startedAt });
217
+ this.stampAndStore(run.id, entry.envelope, { kind: "run_status", status: "running" });
218
+ firstStartedAt = startedAt;
219
+ const maxSeconds = manifest.budget?.max_duration_seconds;
220
+ const deadline = maxSeconds === undefined ? null : startedAt + maxSeconds * 1000;
221
+ const result = await this.spawnOnce(run, entry, workflow, dirs, deadline);
222
+ switch (result.kind) {
223
+ case "done": {
224
+ if (result.outputDeclared) {
225
+ this.stampAndStore(run.id, entry.envelope, { kind: "output", value: result.output });
226
+ }
227
+ // A completion racing a cancel request coerces to cancelled — `cancelling` must
228
+ // never land on `completed` (the output event above is still preserved).
229
+ if (entry.cancelRequested)
230
+ return this.finishRun(run.id, entry, "cancelled", {});
231
+ // Persist-back happens at SUCCESSFUL run end only (failed/cancelled runs must not
232
+ // overwrite the durable state with a half-finished workspace). Per-agent memory
233
+ // dirs used this run are persisted alongside the manifest's selection.
234
+ persistWorkspace(persistRoot(this.dataDir, workflow.id), manifest.workspace?.persist, entry.memoryDirs, dirs.workspaceDir);
235
+ return this.finishRun(run.id, entry, "completed", {
236
+ output: result.outputDeclared ? result.output : null,
237
+ });
238
+ }
239
+ case "failed": {
240
+ // A verdict output() before the throw is emitted (before the failed status) and kept
241
+ // on the row — same as the completed path, so failed runs aren't silently output-less.
242
+ if (result.outputDeclared) {
243
+ this.stampAndStore(run.id, entry.envelope, { kind: "output", value: result.output });
244
+ }
245
+ return this.finishRun(run.id, entry, "failed", {
246
+ error: result.error,
247
+ ...(result.outputDeclared ? { output: result.output } : {}),
248
+ });
249
+ }
250
+ case "cancelled":
251
+ return this.finishRun(run.id, entry, "cancelled", {});
252
+ case "budget":
253
+ return this.finishRun(run.id, entry, "failed", {
254
+ error: {
255
+ code: "BUDGET_EXCEEDED",
256
+ message: entry.budgetReason ??
257
+ `Run exceeded budget.max_duration_seconds (${String(maxSeconds)}s) and was terminated.`,
258
+ },
259
+ });
260
+ case "crashed": {
261
+ const restarts = this.store.incrementRestarts(run.id);
262
+ if (restarts > this.maxRestarts) {
263
+ return this.finishRun(run.id, entry, "failed", {
264
+ error: {
265
+ code: "CRASHED",
266
+ message: `Run process died ${String(restarts)} times; restart budget exhausted.`,
267
+ },
268
+ });
269
+ }
270
+ // Restart from the top — the documented crash semantics. Durable sub-work behind
271
+ // workflows.call re-attaches via idempotency on the next pass.
272
+ this.setStatus(run.id, entry, "pending");
273
+ }
274
+ }
275
+ }
276
+ }
277
+ spawnOnce(run, entry, workflow, dirs, deadline) {
278
+ return new Promise((resolve) => {
279
+ let settled = false;
280
+ let budgetTimer = null;
281
+ const settle = (result) => {
282
+ if (settled)
283
+ return;
284
+ settled = true;
285
+ if (budgetTimer !== null)
286
+ clearTimeout(budgetTimer);
287
+ resolve(result);
288
+ };
289
+ if (deadline !== null && deadline - this.clock.now() <= 0) {
290
+ entry.budgetReason ??= durationBudgetMessage(workflow.manifest);
291
+ settle({ kind: "budget" });
292
+ return;
293
+ }
294
+ let child;
295
+ try {
296
+ child = spawn(process.execPath, [this.childEntryPath], {
297
+ cwd: dirs.workspaceDir,
298
+ env: this.childEnv(workflow.manifest),
299
+ stdio: ["ignore", "pipe", "pipe", "ipc"],
300
+ serialization: "json",
301
+ });
302
+ }
303
+ catch {
304
+ settle({ kind: "crashed" });
305
+ return;
306
+ }
307
+ entry.child = child;
308
+ if (deadline !== null) {
309
+ budgetTimer = setTimeout(() => {
310
+ entry.budgetReason ??= durationBudgetMessage(workflow.manifest);
311
+ // Budget breach terminates immediately — enforced, not advisory (CODE_QUALITY §4.3).
312
+ child.kill("SIGKILL");
313
+ }, deadline - this.clock.now());
314
+ }
315
+ child.stdout?.on("data", (chunk) => {
316
+ this.emitBody(run.id, entry, {
317
+ kind: "program_output",
318
+ stream: "stdout",
319
+ text: chunk.toString(),
320
+ });
321
+ });
322
+ child.stderr?.on("data", (chunk) => {
323
+ this.emitBody(run.id, entry, {
324
+ kind: "program_output",
325
+ stream: "stderr",
326
+ text: chunk.toString(),
327
+ });
328
+ });
329
+ child.on("message", (raw) => {
330
+ const parsed = childToParentSchema.safeParse(raw);
331
+ if (!parsed.success)
332
+ return; // Only protocol messages are expected on this channel.
333
+ const msg = parsed.data;
334
+ switch (msg.type) {
335
+ case "host_call":
336
+ void this.handleHostCall(run, workflow, dirs, msg.method, msg.args)
337
+ .then((value) => {
338
+ if (child.connected) {
339
+ child.send({
340
+ type: "host_result",
341
+ callId: msg.callId,
342
+ result: { ok: true, value },
343
+ });
344
+ }
345
+ })
346
+ .catch((err) => {
347
+ const shape = toErrorShape(err);
348
+ const hint = err instanceof EngineError ? err.hint : undefined;
349
+ if (child.connected) {
350
+ child.send({
351
+ type: "host_result",
352
+ callId: msg.callId,
353
+ result: {
354
+ ok: false,
355
+ error: { ...shape, ...(hint !== undefined ? { hint } : {}) },
356
+ },
357
+ });
358
+ }
359
+ });
360
+ break;
361
+ case "emit":
362
+ this.emitBody(run.id, entry, msg.body, msg.turnId);
363
+ break;
364
+ case "turn_started":
365
+ // A new agent turn: bump the cursor stride block, then emit its opening frame —
366
+ // naming the leaf (agentId + optional agentName) so consumers can attribute it.
367
+ entry.envelope.turn += 1;
368
+ entry.envelope.seq = 0;
369
+ this.emitBody(run.id, entry, {
370
+ kind: "turn_started",
371
+ agentId: msg.agentId,
372
+ ...(msg.agentName !== undefined ? { agentName: msg.agentName } : {}),
373
+ }, msg.turnId);
374
+ break;
375
+ case "report_usage":
376
+ this.recordUsage(run.id, entry, workflow, msg.modelRef, msg.usage);
377
+ break;
378
+ case "memory_used":
379
+ // The child validated the path, but the parent persists it — re-check the shape
380
+ // before it can ever reach a filesystem copy (CODE_QUALITY §2.1).
381
+ if (MEMORY_PATH_RE.test(msg.dir) && !msg.dir.includes("\\")) {
382
+ entry.memoryDirs.add(msg.dir);
383
+ }
384
+ else {
385
+ console.error(`run ${run.id}: ignored malformed memory dir from child`);
386
+ }
387
+ break;
388
+ case "done":
389
+ settle({ kind: "done", output: msg.output, outputDeclared: msg.outputDeclared });
390
+ break;
391
+ case "failed":
392
+ settle({
393
+ kind: "failed",
394
+ error: msg.error,
395
+ output: msg.output,
396
+ outputDeclared: msg.outputDeclared,
397
+ });
398
+ break;
399
+ }
400
+ });
401
+ child.on("error", () => settle({ kind: "crashed" }));
402
+ child.on("exit", () => {
403
+ entry.child = null;
404
+ if (entry.budgetReason !== null)
405
+ settle({ kind: "budget" });
406
+ else if (entry.cancelRequested)
407
+ settle({ kind: "cancelled" });
408
+ else
409
+ settle({ kind: "crashed" });
410
+ });
411
+ const init = {
412
+ type: "init",
413
+ runId: run.id,
414
+ programPath: dirs.programPath,
415
+ workspaceDir: dirs.workspaceDir,
416
+ skillsDir: this.skillsDirFor(workflow.id),
417
+ input: run.input,
418
+ config: workflow.config,
419
+ manifest: workflow.manifest,
420
+ };
421
+ if (child.connected)
422
+ child.send(init);
423
+ });
424
+ }
425
+ // --------------------------------------------------------------------------
426
+ // Host calls (the engine side of the SDK bridge)
427
+ // --------------------------------------------------------------------------
428
+ /**
429
+ * Accumulate a leaf's usage into the run row and enforce token/USD budgets — the supervisor
430
+ * is the single budget authority, so a multi-leaf run can't out-run its caps by parallelism.
431
+ */
432
+ recordUsage(runId, entry, workflow, modelRef, usage) {
433
+ this.store.addRunUsage(runId, {
434
+ tokensIn: usage.inputTokens ?? 0,
435
+ tokensOut: usage.outputTokens ?? 0,
436
+ usdMicros: usageUsdMicros(modelRef, usage),
437
+ });
438
+ const budget = workflow.manifest.budget;
439
+ if (budget === undefined)
440
+ return;
441
+ const totals = this.store.getRunUsage(runId);
442
+ let reason = null;
443
+ if (budget.max_tokens !== undefined && totals.tokensIn + totals.tokensOut > budget.max_tokens) {
444
+ reason = `Run exceeded budget.max_tokens (${String(budget.max_tokens)}) and was terminated.`;
445
+ }
446
+ else if (budget.max_usd !== undefined && totals.usdMicros > budget.max_usd * 1_000_000) {
447
+ reason = `Run exceeded budget.max_usd ($${String(budget.max_usd)}, approximate rates) and was terminated.`;
448
+ }
449
+ if (reason !== null && entry.budgetReason === null) {
450
+ entry.budgetReason = reason;
451
+ entry.child?.kill("SIGKILL");
452
+ }
453
+ }
454
+ async handleHostCall(run, workflow, dirs, method, args) {
455
+ switch (method) {
456
+ case "get_secret":
457
+ return this.resolveSecret(workflow.manifest, getSecretArgsSchema.parse(args).name);
458
+ case "resolve_model": {
459
+ const a = resolveModelArgsSchema.parse(args);
460
+ return resolveModel({
461
+ model: a.model,
462
+ provider: a.provider,
463
+ config: this.inference,
464
+ getEnv: (name) => this.env.get(name) ?? process.env[name],
465
+ });
466
+ }
467
+ case "call_workflow": {
468
+ const a = callWorkflowArgsSchema.parse(args);
469
+ const child = this.startChildRun(run.id, a.slug, a.input, a.idempotencyKey);
470
+ const terminal = await this.supervise(child.id);
471
+ if (terminal.status === "completed")
472
+ return terminal.output;
473
+ if (terminal.status === "cancelled") {
474
+ throw new EngineError("CANCELLED", `Child workflow "${a.slug}" was cancelled.`);
475
+ }
476
+ throw new EngineError("PROGRAM_ERROR", `Child workflow "${a.slug}" failed: ${terminal.error?.message ?? "unknown error"}`);
477
+ }
478
+ case "run_workflow": {
479
+ const a = callWorkflowArgsSchema.parse(args);
480
+ const child = this.startChildRun(run.id, a.slug, a.input, a.idempotencyKey);
481
+ void this.supervise(child.id);
482
+ return child.id;
483
+ }
484
+ case "mcp_token": {
485
+ const a = mcpTokenArgsSchema.parse(args);
486
+ return await this.resolveMcpToken(a.serverUrl, a.invalidateToken);
487
+ }
488
+ case "write_artifact": {
489
+ const a = writeArtifactArgsSchema.parse(args);
490
+ if (a.name.includes("/") || a.name.includes("\\") || a.name.includes("..")) {
491
+ throw new EngineError("VALIDATION", `Artifact name "${a.name}" must be a plain file name.`);
492
+ }
493
+ const bytes = Buffer.from(a.bodyBase64, "base64");
494
+ const path = join(dirs.artifactsDir, a.name);
495
+ writeFileSync(path, bytes);
496
+ const row = this.store.createArtifact({
497
+ runId: run.id,
498
+ name: a.name,
499
+ contentType: a.contentType,
500
+ path,
501
+ size: bytes.length,
502
+ ...(a.metadata !== undefined ? { metadata: a.metadata } : {}),
503
+ });
504
+ return { id: row.id, name: row.name, url: pathToFileURL(path).href };
505
+ }
506
+ }
507
+ }
508
+ /** Find-or-create the durable child run for workflows.call/run (idempotent re-attach). */
509
+ startChildRun(parentRunId, slug, input, idempotencyKey) {
510
+ const target = this.store.getWorkflow(slug);
511
+ if (target === null) {
512
+ throw new EngineError("NOT_FOUND", `workflows.call target "${slug}" is not deployed on this engine.`, `Deploy it first — the engine only runs workflows it knows by name.`);
513
+ }
514
+ // Crossed the JSON IPC channel, but narrow instead of assuming (CODE_QUALITY §2.1) — and
515
+ // the canonical default key requires a JSON tree anyway.
516
+ const jsonInput = input === undefined ? null : asJsonValue(input, "workflows.call input");
517
+ const key = idempotencyKey ?? defaultIdempotencyKey(parentRunId, slug, jsonInput);
518
+ const { run, created } = this.store.createRun({
519
+ workflowId: target.id,
520
+ triggerKind: "manual",
521
+ input: jsonInput,
522
+ parentRunId,
523
+ idempotencyKey: key,
524
+ });
525
+ if (created)
526
+ this.emitQueued(run.id);
527
+ return run;
528
+ }
529
+ /**
530
+ * The engine side of MCP OAuth: hand the child a usable access token, refreshing SILENTLY
531
+ * when the stored one is expired (clock + skew) or the child reports the server rejected it
532
+ * (`invalidateToken` — the child retries at most once, so a second rejection lands back here
533
+ * as a failure). When only a human could fix it, answer null + a hint naming
534
+ * engine.authorizeMcpServer — a headless run must fail loudly, never prompt.
535
+ */
536
+ async resolveMcpToken(serverUrl, invalidateToken) {
537
+ const hint = `No usable OAuth token for this MCP server — authorize it once with ` +
538
+ `engine.authorizeMcpServer("${serverUrl}") (boardwalk dev / the server UI expose this), ` +
539
+ `then re-run.`;
540
+ const entry = this.mcpTokens.get(serverUrl);
541
+ if (entry === null)
542
+ return { accessToken: null, hint };
543
+ const invalidated = invalidateToken !== undefined && invalidateToken === entry.accessToken;
544
+ const expired = entry.expiresAt !== undefined && entry.expiresAt - MCP_TOKEN_EXPIRY_SKEW_MS <= Date.now();
545
+ if (!invalidated && !expired)
546
+ return { accessToken: entry.accessToken };
547
+ if (entry.refreshToken === undefined || entry.tokenEndpoint === undefined) {
548
+ return { accessToken: null, hint };
549
+ }
550
+ try {
551
+ const grant = await refreshAccessToken({
552
+ tokenEndpoint: entry.tokenEndpoint,
553
+ clientId: entry.clientId,
554
+ refreshToken: entry.refreshToken,
555
+ resource: entry.resource,
556
+ });
557
+ this.mcpTokens.set(serverUrl, {
558
+ ...entry,
559
+ accessToken: grant.accessToken,
560
+ // An AS may rotate the refresh token on use (OAuth 2.1 encourages it) — keep the new one.
561
+ ...(grant.refreshToken !== undefined ? { refreshToken: grant.refreshToken } : {}),
562
+ ...(grant.expiresAt !== undefined ? { expiresAt: grant.expiresAt } : {}),
563
+ });
564
+ return { accessToken: grant.accessToken };
565
+ }
566
+ catch {
567
+ // The entry stays: a transient AS outage must not destroy a working grant. The run
568
+ // fails with the authorize hint; a later run retries the refresh.
569
+ return { accessToken: null, hint };
570
+ }
571
+ }
572
+ resolveSecret(manifest, name) {
573
+ const declared = (manifest.secrets ?? []).some((s) => s.name === name);
574
+ if (!declared) {
575
+ throw new EngineError("SECRET_UNDECLARED", `Secret "${name}" is not declared in meta.secrets.`, `Add { name: "${name}" } to meta.secrets — secret access is fail-closed everywhere.`);
576
+ }
577
+ const value = this.env.get(name) ?? process.env[name];
578
+ if (value === undefined || value.length === 0) {
579
+ throw new EngineError("SECRET_MISSING", `Secret "${name}" has no value on this engine.`, `Set ${name}=… in ${this.envLabel}.`);
580
+ }
581
+ return value;
582
+ }
583
+ /**
584
+ * The child's environment: the parent env plus manifest.env with whole-value
585
+ * `${{ secrets.NAME }}` interpolation resolved (fail-closed against meta.secrets).
586
+ */
587
+ childEnv(manifest) {
588
+ const out = { ...process.env };
589
+ for (const [key, value] of Object.entries(manifest.env ?? {})) {
590
+ const secretName = /^\$\{\{\s*secrets\.([A-Za-z0-9_-]+)\s*\}\}$/.exec(value)?.[1];
591
+ out[key] = secretName === undefined ? value : this.resolveSecret(manifest, secretName);
592
+ }
593
+ return out;
594
+ }
595
+ // --------------------------------------------------------------------------
596
+ // Events + transitions
597
+ // --------------------------------------------------------------------------
598
+ setStatus(runId, entry, status) {
599
+ this.store.updateRunStatus(runId, status);
600
+ this.stampAndStore(runId, entry.envelope, { kind: "run_status", status });
601
+ }
602
+ /** Terminal transition: persist status + output/error, emit the lifecycle event, return the row. */
603
+ finishRun(runId, entry, status, opts) {
604
+ this.store.updateRunStatus(runId, status, {
605
+ endedAt: this.clock.now(),
606
+ ...(opts.output !== undefined
607
+ ? { output: asJsonValue(opts.output, "The run's declared output") }
608
+ : {}),
609
+ ...(opts.error !== undefined
610
+ ? { error: { code: opts.error.code, message: opts.error.message } }
611
+ : {}),
612
+ });
613
+ this.stampAndStore(runId, entry.envelope, {
614
+ kind: "run_status",
615
+ status,
616
+ ...(opts.error !== undefined
617
+ ? { error: { code: opts.error.code, message: opts.error.message } }
618
+ : {}),
619
+ });
620
+ return this.mustGetRun(runId);
621
+ }
622
+ /** Stamp a child-emitted body. A malformed body is dropped with a diagnostic, never fatal. */
623
+ emitBody(runId, entry, body, turnId) {
624
+ try {
625
+ this.stampAndStore(runId, entry.envelope, body, turnId);
626
+ }
627
+ catch {
628
+ // A malformed body from the child is a protocol bug, not a reason to kill the run.
629
+ // runEventSchema.parse inside stampAndStore is what threw.
630
+ console.error(`run ${runId}: dropped malformed child event (kind=${body.kind})`);
631
+ }
632
+ }
633
+ /**
634
+ * The single envelope-stamping path: allocate cursor, validate, persist, fan out. The body
635
+ * is typed loosely because child-emitted bodies are untrusted — runEventSchema.parse below
636
+ * is the validation, not the type. Run-level frames carry the run id as turnId; agent-leaf
637
+ * frames carry their turn's id.
638
+ */
639
+ stampAndStore(runId, envelope, body, turnId) {
640
+ envelope.seq += 1;
641
+ const cursor = makeCursor(envelope.turn, envelope.seq);
642
+ const event = runEventSchema.parse({
643
+ ...body,
644
+ runId,
645
+ turnId: turnId ?? runId,
646
+ seq: envelope.seq,
647
+ t: this.clock.now(),
648
+ });
649
+ this.store.appendEvents(runId, [{ cursor, event }]);
650
+ const row = { runId, cursor, event };
651
+ for (const listener of this.listeners)
652
+ listener(row);
653
+ }
654
+ /** Resume the envelope past everything already persisted (crash/boot safe). */
655
+ resumeEnvelope(runId) {
656
+ const max = this.store.maxCursor(runId);
657
+ return max === 0
658
+ ? { turn: 0, seq: 0 }
659
+ : { turn: Math.floor(max / TURN_CURSOR_STRIDE) + 1, seq: 0 };
660
+ }
661
+ mustGetRun(runId) {
662
+ const run = this.store.getRun(runId);
663
+ if (run === null)
664
+ throw new EngineError("INTERNAL", `Run ${runId} vanished from the store.`);
665
+ return run;
666
+ }
667
+ /** Deployed skills live at <dataDir>/skills/<workflowId>/<name>.md (written at deploy). */
668
+ skillsDirFor(workflowId) {
669
+ const dir = join(this.dataDir, "skills", workflowId);
670
+ return existsSync(dir) ? dir : null;
671
+ }
672
+ }
673
/**
 * Format the failure message for a run terminated by its duration budget.
 *
 * @param manifest - workflow manifest; `budget.max_duration_seconds` is quoted in the text
 * @returns the human-readable message
 */
function durationBudgetMessage(manifest) {
    const { budget } = manifest;
    const limitSeconds = String(budget?.max_duration_seconds);
    return `Run exceeded budget.max_duration_seconds (${limitSeconds}s) and was terminated.`;
}