@holoscript/holoscript-agent 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/runner.js CHANGED
@@ -37,7 +37,18 @@ function brainClassOf(brain) {
37
37
  return "unknown";
38
38
  }
39
39
  function buildCaelRecord(input) {
40
- const { identity, brain, task, messages, finalText, usage, costUsd, spentUsd, prevChain, runtimeVersion } = input;
40
+ const {
41
+ identity,
42
+ brain,
43
+ task,
44
+ messages,
45
+ finalText,
46
+ usage,
47
+ costUsd,
48
+ spentUsd,
49
+ prevChain,
50
+ runtimeVersion
51
+ } = input;
41
52
  const l0 = sha(brain.systemPrompt);
42
53
  const l1 = sha(`${task.id}|${task.title}|${task.description ?? ""}`);
43
54
  const l2 = sha(JSON.stringify(messages));
@@ -53,15 +64,16 @@ function buildCaelRecord(input) {
53
64
  prev_hash: prevChain,
54
65
  fnv1a_chain,
55
66
  version_vector_fingerprint: `agent@${runtimeVersion}|brain@${brainClassOf(brain)}|provider@${identity.llmProvider}|model@${identity.llmModel}`,
56
- brain_class: brainClassOf(brain)
67
+ brain_class: brainClassOf(brain),
68
+ trust_epoch: "post-w107"
57
69
  };
58
70
  }
59
71
 
60
72
  // src/tools.ts
61
73
  import { readFile, writeFile, readdir, mkdir, stat } from "fs/promises";
62
- import { resolve, dirname } from "path";
74
+ import { resolve, dirname, delimiter, isAbsolute, sep } from "path";
63
75
  import { spawn } from "child_process";
64
- var ALLOWED_READ_ROOTS = [
76
+ var FLEET_READ_ROOTS = [
65
77
  "/root/msc-paper-22",
66
78
  // Paper 22 mechanization inputs (scp'd by deploy)
67
79
  "/root/holoscript-mesh",
@@ -69,15 +81,24 @@ var ALLOWED_READ_ROOTS = [
69
81
  "/root/agent-output"
70
82
  // Read back what we wrote
71
83
  ];
72
- var ALLOWED_WRITE_ROOTS = [
84
+ var FLEET_WRITE_ROOTS = [
73
85
  "/root/agent-output"
74
86
  // Single write sink — keeps deliverables in one place
75
87
  ];
76
- var BASH_WHITELIST = [
77
- "lake build",
78
- "lake env",
79
- "lake clean",
80
- "lean ",
88
+ function parseRootsEnv(raw, fallback) {
89
+ if (!raw) return fallback;
90
+ const roots = raw.split(delimiter).map((r) => r.trim()).filter((r) => r.length > 0 && isAbsolute(r));
91
+ return roots.length > 0 ? roots : fallback;
92
+ }
93
+ var ALLOWED_READ_ROOTS = parseRootsEnv(
94
+ process.env.HOLOSCRIPT_AGENT_READ_ROOTS,
95
+ FLEET_READ_ROOTS
96
+ );
97
+ var ALLOWED_WRITE_ROOTS = parseRootsEnv(
98
+ process.env.HOLOSCRIPT_AGENT_WRITE_ROOTS,
99
+ FLEET_WRITE_ROOTS
100
+ );
101
+ var BASH_READ_ONLY_PREFIXES = [
81
102
  "ls ",
82
103
  "ls\n",
83
104
  "ls$",
@@ -92,16 +113,36 @@ var BASH_WHITELIST = [
92
113
  "git log",
93
114
  "git diff",
94
115
  "git show",
116
+ "pwd",
117
+ "echo ",
118
+ "lake env"
119
+ ];
120
+ var BASH_PRODUCTIVE_PREFIXES = [
121
+ "lake build",
122
+ "lake clean",
123
+ "lean ",
95
124
  "pnpm --filter",
96
125
  "pnpm vitest",
97
126
  "vitest run",
98
- "pwd",
99
- "echo "
127
+ // Robotics / edge-node (Jetson) productive commands — without these, every
128
+ // ros2/colcon/tegrastats task fails the W.107 artifact gate and is abandoned
129
+ // as no-artifact. (jetson-orin-01 lane.)
130
+ "ros2 launch",
131
+ "ros2 topic pub",
132
+ "ros2 service call",
133
+ "colcon build",
134
+ "tegrastats"
100
135
  ];
136
+ var BASH_WHITELIST = [...BASH_READ_ONLY_PREFIXES, ...BASH_PRODUCTIVE_PREFIXES];
137
+ function isProductiveBashCommand(cmd) {
138
+ const trimmed = String(cmd ?? "").trim();
139
+ if (!trimmed) return false;
140
+ return BASH_PRODUCTIVE_PREFIXES.some((prefix) => trimmed.startsWith(prefix.trim()));
141
+ }
101
142
  var MESH_TOOLS = [
102
143
  {
103
144
  name: "read_file",
104
- description: "Read a file from the agent sandbox. Allowed roots: /root/msc-paper-22, /root/holoscript-mesh, /root/agent-output. Returns the file content as text. Use this to inspect inputs scp'd to the instance (e.g. MSC/Invariants.lean).",
145
+ description: `Read a file from the agent sandbox. Allowed roots: ${ALLOWED_READ_ROOTS.join(", ")}. Returns the file content as text. Use this to inspect task inputs and the read-only repo view.`,
105
146
  input_schema: {
106
147
  type: "object",
107
148
  properties: {
@@ -123,11 +164,11 @@ var MESH_TOOLS = [
123
164
  },
124
165
  {
125
166
  name: "write_file",
126
- description: "Write a file to /root/agent-output/. This is the deliverable sink \u2014 anything you want to emit as task output (a Lean proof, a markdown report, a JSON dataset) goes here. Creates parent directories. Will refuse paths outside the write root.",
167
+ description: `Write a file to the deliverable sink (write roots: ${ALLOWED_WRITE_ROOTS.join(", ")}). Anything you want to emit as task output (a Lean proof, a markdown report, a JSON dataset, a .holo scene) goes here. Creates parent directories. Will refuse paths outside the write root(s).`,
127
168
  input_schema: {
128
169
  type: "object",
129
170
  properties: {
130
- path: { type: "string", description: "Absolute path under /root/agent-output/" },
171
+ path: { type: "string", description: `Absolute path under a write root: ${ALLOWED_WRITE_ROOTS.join(", ")}` },
131
172
  content: { type: "string", description: "File content to write (UTF-8)" }
132
173
  },
133
174
  required: ["path", "content"]
@@ -135,7 +176,7 @@ var MESH_TOOLS = [
135
176
  },
136
177
  {
137
178
  name: "bash",
138
- description: "Run a shell command. Whitelisted prefixes only: lake build, lean, ls, cat, grep, find, wc, head, tail, git status/log/diff/show, pnpm --filter, vitest run, pwd, echo. Hard 60s wall timeout, 1MB stdout cap. Use for lake build / lean kernel-checks, git inspection, repo greps. Refuses rm, curl, ssh, sudo, eval.",
179
+ description: "Run a shell command. Whitelisted prefixes only: lake build, lean, ls, cat, grep, find, wc, head, tail, git status/log/diff/show, pnpm --filter, vitest run, pwd, echo, ros2 launch/topic/service, colcon build, tegrastats. Hard 60s wall timeout, 1MB stdout cap. Use for builds, tests, hardware probes. Refuses rm, curl, ssh, sudo, eval.",
139
180
  input_schema: {
140
181
  type: "object",
141
182
  properties: {
@@ -144,22 +185,52 @@ var MESH_TOOLS = [
144
185
  },
145
186
  required: ["cmd"]
146
187
  }
188
+ },
189
+ {
190
+ name: "emit_hardware_receipt",
191
+ description: "Emit a portable hardware receipt (PortableHardwareReceiptMetadata v1) capturing device identity, runtime, and measured performance. Writes a JSON receipt to the agent output dir. Use after running tegrastats or colcon build to record hardware evidence for the CAEL audit chain. Accepts either pre-parsed measurements or raw tegrastats output (the tool parses it automatically).",
192
+ input_schema: {
193
+ type: "object",
194
+ properties: {
195
+ device_kind: {
196
+ type: "string",
197
+ description: 'Device identifier, e.g. "jetson-orin-nano-super", "raspberry-pi-5"'
198
+ },
199
+ accelerator: {
200
+ description: 'Accelerator string, e.g. "NVIDIA CUDA 8.7", or null for CPU-only'
201
+ },
202
+ runtime_name: { type: "string", description: 'Inference runtime, e.g. "Ollama", "llama.cpp"' },
203
+ runtime_version: { type: "string", description: 'Runtime version, e.g. "0.30.8"' },
204
+ host_os: { type: "string", description: 'OS + firmware, e.g. "JetPack 6.2.1 / Ubuntu 22.04"' },
205
+ composition_id: { type: "string", description: 'Brain composition reference, e.g. "jetson-orin-brain"' },
206
+ measurements: {
207
+ type: "array",
208
+ description: "Pre-parsed measurements. Each item: {metric: string, value: number, unit: string}",
209
+ items: { type: "object" }
210
+ },
211
+ tegrastats_output: {
212
+ type: "string",
213
+ description: "Raw tegrastats output line(s) \u2014 tool auto-parses GPU%, RAM, temp, power"
214
+ }
215
+ },
216
+ required: ["device_kind", "runtime_name", "runtime_version", "host_os"]
217
+ }
147
218
  }
148
219
  ];
149
220
  function isUnderRoot(absPath, root) {
150
221
  const resolved = resolve(absPath);
151
222
  const rootResolved = resolve(root);
152
- return resolved === rootResolved || resolved.startsWith(rootResolved + "/");
223
+ return resolved === rootResolved || resolved.startsWith(rootResolved + sep);
153
224
  }
154
225
  function checkReadAllowed(path) {
155
- if (!path.startsWith("/")) return `path must be absolute, got "${path}"`;
226
+ if (!isAbsolute(path)) return `path must be absolute, got "${path}"`;
156
227
  for (const root of ALLOWED_READ_ROOTS) {
157
228
  if (isUnderRoot(path, root)) return null;
158
229
  }
159
230
  return `read denied \u2014 path "${path}" not under allowed roots: ${ALLOWED_READ_ROOTS.join(", ")}`;
160
231
  }
161
232
  function checkWriteAllowed(path) {
162
- if (!path.startsWith("/")) return `path must be absolute, got "${path}"`;
233
+ if (!isAbsolute(path)) return `path must be absolute, got "${path}"`;
163
234
  for (const root of ALLOWED_WRITE_ROOTS) {
164
235
  if (isUnderRoot(path, root)) return null;
165
236
  }
@@ -214,12 +285,113 @@ async function runTool(use) {
214
285
  return result.code === 0 ? okResult(use.id, result.stdout) : errResult(use.id, `exit=${result.code}
215
286
  ${result.stderr || result.stdout}`);
216
287
  }
288
+ if (use.name === "emit_hardware_receipt") {
289
+ const deviceKind = String(use.input.device_kind ?? "unknown-device");
290
+ const accelerator = use.input.accelerator === null || use.input.accelerator === "null" ? null : String(use.input.accelerator ?? "").trim() || null;
291
+ const runtimeName = String(use.input.runtime_name ?? "Ollama");
292
+ const runtimeVersion = String(use.input.runtime_version ?? "unknown");
293
+ const hostOs = String(use.input.host_os ?? "unknown");
294
+ const compositionId = String(use.input.composition_id ?? "unknown");
295
+ let measurements = [];
296
+ if (Array.isArray(use.input.measurements)) {
297
+ for (const m of use.input.measurements) {
298
+ const metric = String(m.metric ?? "");
299
+ const value = Number(m.value ?? 0);
300
+ const unit = String(m.unit ?? "");
301
+ if (metric && Number.isFinite(value)) {
302
+ measurements.push({ metric, value, unit, method: "measured" });
303
+ }
304
+ }
305
+ }
306
+ if (typeof use.input.tegrastats_output === "string" && use.input.tegrastats_output.length > 0) {
307
+ measurements = [...measurements, ...parseTegrastats(use.input.tegrastats_output)];
308
+ }
309
+ if (measurements.length === 0) {
310
+ measurements.push({ metric: "agent-tick", value: 1, unit: "count", method: "presence" });
311
+ }
312
+ const capturedAt = (/* @__PURE__ */ new Date()).toISOString();
313
+ const receipt = {
314
+ schemaVersion: "holoscript.hardware-receipt-metadata.v1",
315
+ target: {
316
+ id: `${deviceKind}-${Date.now()}`,
317
+ kind: deviceKind,
318
+ architecture: /jetson|orin|nano|agx|xavier/i.test(deviceKind) ? "arm64" : "unknown",
319
+ artifactKind: "measurement-trace"
320
+ },
321
+ device: {
322
+ vendor: /jetson|orin|nvidia/i.test(deviceKind) ? "nvidia" : "unknown",
323
+ model: deviceKind,
324
+ accelerator
325
+ },
326
+ runtime: { name: runtimeName, version: runtimeVersion, hostOS: hostOs },
327
+ compilerVersion: "holoscript-agent-1.0.0",
328
+ constraints: [],
329
+ measuredResults: measurements,
330
+ replayInputs: [
331
+ { kind: "composition-ref", uri: `compositions/${compositionId}`, sha256: "unknown" }
332
+ ],
333
+ provenance: {
334
+ capturedAt,
335
+ sourceCompositionHash: compositionId
336
+ },
337
+ owner: {
338
+ agent: process.env.HOLOSCRIPT_AGENT_HANDLE ?? "unknown",
339
+ ...process.env.HOLOMESH_TEAM_ID ? { team: process.env.HOLOMESH_TEAM_ID } : {}
340
+ }
341
+ };
342
+ const ts = capturedAt.replace(/[:.]/g, "-");
343
+ const outPath = resolve(ALLOWED_WRITE_ROOTS[0], `hardware-receipt-${ts}.json`);
344
+ const denied = checkWriteAllowed(outPath);
345
+ if (denied) return errResult(use.id, `Cannot write receipt: ${denied}`);
346
+ await mkdir(dirname(outPath), { recursive: true });
347
+ await writeFile(outPath, JSON.stringify(receipt, null, 2), "utf8");
348
+ return okResult(
349
+ use.id,
350
+ `Hardware receipt written to ${outPath} \u2014 ${measurements.length} measurements, accelerator=${accelerator ?? "none"}`
351
+ );
352
+ }
217
353
  return errResult(use.id, `unknown tool: ${use.name}`);
218
354
  } catch (err) {
219
355
  return errResult(use.id, err instanceof Error ? err.message : String(err));
220
356
  }
221
357
  }
358
+ function parseTegrastats(raw) {
359
+ const results = [];
360
+ const m = (pattern, metric, unit, transform) => {
361
+ const match = raw.match(pattern);
362
+ if (match?.[1]) {
363
+ const value = transform ? transform(match[1]) : Number(match[1]);
364
+ if (Number.isFinite(value)) results.push({ metric, value, unit, method: "tegrastats" });
365
+ }
366
+ };
367
+ const ram = raw.match(/RAM\s+(\d+)\/(\d+)MB/);
368
+ if (ram) {
369
+ const used = Number(ram[1]);
370
+ const total = Number(ram[2]);
371
+ results.push({ metric: "ram-used", value: used, unit: "MB", method: "tegrastats" });
372
+ results.push({ metric: "ram-total", value: total, unit: "MB", method: "tegrastats" });
373
+ if (total > 0)
374
+ results.push({ metric: "ram-pct", value: Math.round(used / total * 100), unit: "%", method: "tegrastats" });
375
+ }
376
+ m(/GR3D_FREQ\s+(\d+)%/, "gpu-util", "%");
377
+ m(/EMC_FREQ\s+(\d+)%/, "emc-freq-pct", "%");
378
+ m(/tj@([\d.]+)C/, "temp-tj", "C", parseFloat);
379
+ m(/cpu@([\d.]+)C/, "temp-cpu", "C", parseFloat);
380
+ m(/gpu@([\d.]+)C/, "temp-gpu", "C", parseFloat);
381
+ m(/VDD_SOC\s+(\d+)mW/, "power-soc", "mW");
382
+ m(/VDD_CPU_CV\s+(\d+)mW/, "power-cpu-cv", "mW");
383
+ m(/VDD_IN\s+(\d+)mW/, "power-total", "mW");
384
+ m(/CPU\s+\[(\d+)%/, "cpu-util-core0", "%");
385
+ return results;
386
+ }
222
387
  function runBash(cmd, cwd) {
388
+ if (process.env.VITEST === "true" || process.env.NODE_ENV === "test") {
389
+ return Promise.resolve({
390
+ code: 0,
391
+ stdout: `[mock-bash under vitest] cmd="${cmd}" cwd="${cwd}"`,
392
+ stderr: ""
393
+ });
394
+ }
223
395
  return new Promise((resolveProm) => {
224
396
  const child = spawn("bash", ["-c", cmd], { cwd, env: process.env });
225
397
  let stdout = "";
@@ -288,6 +460,35 @@ var AgentRunner = class {
288
460
  const { identity, brain, mesh, costGuard, provider, logger } = this.opts;
289
461
  const log = logger ?? (() => void 0);
290
462
  await this.heartbeatWithAutoRejoin();
463
+ if (this.opts.messageHandler) {
464
+ try {
465
+ const receipts = await this.opts.messageHandler.processMessages();
466
+ if (receipts.length > 0) {
467
+ log({
468
+ ev: "messages-processed",
469
+ count: receipts.length,
470
+ statuses: receipts.map((r) => r.status)
471
+ });
472
+ if (brain.capabilityTags.length === 0 || brain.capabilityTags.every((t) => t.startsWith("delegated"))) {
473
+ return {
474
+ action: "messages-processed",
475
+ spentUsd: costGuard.getState().spentUsd,
476
+ remainingUsd: costGuard.getRemainingUsd(),
477
+ receipts: receipts.map((r) => ({
478
+ status: r.status,
479
+ action: r.action,
480
+ reason: r.reason
481
+ }))
482
+ };
483
+ }
484
+ }
485
+ } catch (err) {
486
+ log({
487
+ ev: "message-handler-error",
488
+ message: err instanceof Error ? err.message : String(err)
489
+ });
490
+ }
491
+ }
291
492
  if (costGuard.isOverBudget()) {
292
493
  const state = costGuard.getState();
293
494
  log({ ev: "over-budget", spentUsd: state.spentUsd, budget: identity.budgetUsdPerDay });
@@ -321,6 +522,8 @@ var AgentRunner = class {
321
522
  const MAX_TOOL_ITERS = 30;
322
523
  let lastResponse;
323
524
  const toolsCalled = /* @__PURE__ */ new Set();
525
+ let productiveCallCount = 0;
526
+ let lastCommitHash;
324
527
  while (true) {
325
528
  iters++;
326
529
  if (iters > MAX_TOOL_ITERS) {
@@ -328,12 +531,16 @@ var AgentRunner = class {
328
531
  finalText = finalText || `[tool-loop hit ${MAX_TOOL_ITERS}-iter cap before final text]`;
329
532
  break;
330
533
  }
534
+ const activeTools = brain.requires.includes("local-llm") ? MESH_TOOLS.filter((t) => t.name === "write_file") : MESH_TOOLS;
331
535
  const resp = await provider.complete(
332
536
  {
333
537
  messages,
334
- maxTokens: 4096,
538
+ // 8192 for local thinking models (qwen3:4b uses ~3800 tokens on thinking
539
+ // before the tool-call JSON; 4096 cuts off mid-generation). Frontier
540
+ // models ignore this ceiling and stop naturally earlier.
541
+ maxTokens: 8192,
335
542
  temperature: 0.4,
336
- tools: MESH_TOOLS
543
+ tools: activeTools
337
544
  },
338
545
  identity.llmModel
339
546
  );
@@ -344,13 +551,39 @@ var AgentRunner = class {
344
551
  totalTokens: aggUsage.totalTokens + resp.usage.totalTokens
345
552
  };
346
553
  if (resp.finishReason === "tool_use" && resp.toolUses && resp.toolUses.length > 0) {
347
- log({ ev: "tool-call", taskId: target.id, iter: iters, tools: resp.toolUses.map((t) => t.name) });
348
- for (const u of resp.toolUses) toolsCalled.add(u.name);
554
+ log({
555
+ ev: "tool-call",
556
+ taskId: target.id,
557
+ iter: iters,
558
+ tools: resp.toolUses.map((t) => t.name)
559
+ });
560
+ for (const u of resp.toolUses) {
561
+ toolsCalled.add(u.name);
562
+ if (u.name === "write_file") {
563
+ const content = String(u.input?.content ?? "");
564
+ if (content.length > 0) productiveCallCount++;
565
+ } else if (u.name === "bash") {
566
+ const cmd = String(u.input?.cmd ?? "");
567
+ if (isProductiveBashCommand(cmd)) productiveCallCount++;
568
+ } else if (u.name === "emit_hardware_receipt") {
569
+ productiveCallCount++;
570
+ }
571
+ }
349
572
  messages.push({
350
573
  role: "assistant",
351
574
  content: resp.assistantBlocks ?? []
352
575
  });
353
576
  const toolResults = await Promise.all(resp.toolUses.map((u) => runTool(u)));
577
+ for (let ti = 0; ti < resp.toolUses.length; ti++) {
578
+ const tu = resp.toolUses[ti];
579
+ if (tu.name === "bash") {
580
+ const tr = toolResults[ti];
581
+ if (tr && !tr.is_error) {
582
+ const shaMatch = tr.content.match(/\b([0-9a-f]{7,40})\b/);
583
+ if (shaMatch) lastCommitHash = shaMatch[1];
584
+ }
585
+ }
586
+ }
354
587
  messages.push({
355
588
  role: "user",
356
589
  content: toolResults
@@ -361,24 +594,75 @@ var AgentRunner = class {
361
594
  break;
362
595
  }
363
596
  const durationMs = Date.now() - start;
364
- const SIDE_EFFECTING_TOOLS = /* @__PURE__ */ new Set(["write_file", "bash"]);
365
- const sideEffectingCalled = [...toolsCalled].some((t) => SIDE_EFFECTING_TOOLS.has(t));
366
- if (!sideEffectingCalled) {
597
+ if (productiveCallCount === 0) {
367
598
  log({
368
599
  ev: "no-artifact",
369
600
  taskId: target.id,
370
601
  tool_iters: iters,
371
602
  toolsCalled: [...toolsCalled],
372
- message: "task execution called no side-effecting tool (write_file/bash) \u2014 refusing to mark executed. Likely a pure-text or read-only-inspection response. Task remains open for a grounded attempt."
603
+ productiveCallCount,
604
+ message: "task execution did not produce a real artifact \u2014 refusing to mark executed. Required: write_file with non-empty content OR bash with a productive prefix (lake build / pnpm --filter / vitest run / lean / pnpm vitest). Pure-text, read-only inspection, and trivial-bash-bypass (`echo`, `cat`, etc.) do not satisfy the gate."
373
605
  });
374
606
  return {
375
607
  action: "no-artifact",
376
608
  taskId: target.id,
377
609
  spentUsd: costGuard.getState().spentUsd,
378
610
  remainingUsd: costGuard.getRemainingUsd(),
379
- message: `no side-effecting tool called (toolsCalled=[${[...toolsCalled].join(",")}], iters=${iters})`
611
+ message: `no productive tool call observed (toolsCalled=[${[...toolsCalled].join(",")}], productiveCallCount=${productiveCallCount}, iters=${iters})`
380
612
  };
381
613
  }
614
+ let reflectVerdict;
615
+ if (brain.reflect) {
616
+ try {
617
+ const reflectResp = await provider.complete(
618
+ {
619
+ messages: [
620
+ {
621
+ role: "system",
622
+ content: "You are a strict reviewer. Evaluate the work against the criteria; do not rewrite it."
623
+ },
624
+ {
625
+ role: "user",
626
+ content: `Reflect on the artifact produced for this task. Evaluate it for: ${brain.reflect.criteria}.
627
+
628
+ --- artifact / final response ---
629
+ ${finalText.slice(0, 4e3)}
630
+ --- end ---
631
+
632
+ Give a one-line reason, then end with exactly "VERDICT: PASS" or "VERDICT: FAIL".`
633
+ }
634
+ ],
635
+ maxTokens: 512,
636
+ temperature: 0.1
637
+ },
638
+ identity.llmModel
639
+ );
640
+ aggUsage = {
641
+ promptTokens: aggUsage.promptTokens + reflectResp.usage.promptTokens,
642
+ completionTokens: aggUsage.completionTokens + reflectResp.usage.completionTokens,
643
+ totalTokens: aggUsage.totalTokens + reflectResp.usage.totalTokens
644
+ };
645
+ const verdictMatch = /VERDICT:\s*(PASS|FAIL)/i.exec(reflectResp.content);
646
+ const pass = verdictMatch ? verdictMatch[1].toUpperCase() === "PASS" : true;
647
+ reflectVerdict = {
648
+ pass,
649
+ reason: reflectResp.content.replace(/VERDICT:\s*(PASS|FAIL)/i, "").trim().slice(0, 300)
650
+ };
651
+ log({
652
+ ev: "reflect",
653
+ taskId: target.id,
654
+ pass,
655
+ escalateOnFail: brain.reflect.escalateOnFail,
656
+ reason: reflectVerdict.reason.slice(0, 120)
657
+ });
658
+ } catch (err) {
659
+ log({
660
+ ev: "reflect-error",
661
+ taskId: target.id,
662
+ message: err instanceof Error ? err.message : String(err)
663
+ });
664
+ }
665
+ }
382
666
  const cost = costGuard.recordUsage(identity.llmModel, aggUsage);
383
667
  log({
384
668
  ev: "executed",
@@ -388,7 +672,11 @@ var AgentRunner = class {
388
672
  tokens: aggUsage.totalTokens,
389
673
  tool_iters: iters
390
674
  });
391
- const response = { ...lastResponse ?? { content: finalText, usage: aggUsage }, content: finalText, usage: aggUsage };
675
+ const response = {
676
+ ...lastResponse ?? { content: finalText, usage: aggUsage },
677
+ content: finalText,
678
+ usage: aggUsage
679
+ };
392
680
  const execResult = {
393
681
  taskId: target.id,
394
682
  responseText: response.content,
@@ -422,10 +710,32 @@ var AgentRunner = class {
422
710
  });
423
711
  const posted = await mesh.postAuditRecords(identity.handle, [caelRecord]);
424
712
  this.prevCaelChain = caelRecord.fnv1a_chain;
425
- log({ ev: "cael-posted", taskId: target.id, appended: posted.appended, rejected: posted.rejected });
713
+ log({
714
+ ev: "cael-posted",
715
+ taskId: target.id,
716
+ appended: posted.appended,
717
+ rejected: posted.rejected
718
+ });
426
719
  } catch (err) {
427
720
  log({ ev: "cael-post-error", message: err instanceof Error ? err.message : String(err) });
428
721
  }
722
+ if (reflectVerdict && !reflectVerdict.pass && brain.reflect?.escalateOnFail) {
723
+ try {
724
+ await mesh.sendMessageOnTask(
725
+ target.id,
726
+ `[${identity.handle}] reflect gate FAILED \u2014 escalating to the fleet instead of marking done. Reason: ${reflectVerdict.reason}`
727
+ );
728
+ } catch {
729
+ }
730
+ log({ ev: "reflect-escalate", taskId: target.id, reason: reflectVerdict.reason.slice(0, 120) });
731
+ return {
732
+ action: "reflect-escalate",
733
+ taskId: target.id,
734
+ spentUsd: costGuard.getState().spentUsd,
735
+ remainingUsd: costGuard.getRemainingUsd(),
736
+ message: `reflect self-evaluation failed; escalated to fleet (reason: ${reflectVerdict.reason.slice(0, 120)})`
737
+ };
738
+ }
429
739
  if (this.opts.onTaskExecuted) {
430
740
  await this.opts.onTaskExecuted(execResult, target);
431
741
  } else {
@@ -436,6 +746,16 @@ var AgentRunner = class {
436
746
  ${response.content}`
437
747
  );
438
748
  }
749
+ try {
750
+ await mesh.markDone(target.id, finalText.slice(0, 500), lastCommitHash);
751
+ log({ ev: "mark-done", taskId: target.id, commitHash: lastCommitHash });
752
+ } catch (err) {
753
+ log({
754
+ ev: "mark-done-error",
755
+ taskId: target.id,
756
+ message: err instanceof Error ? err.message : String(err)
757
+ });
758
+ }
439
759
  return {
440
760
  action: "executed",
441
761
  taskId: target.id,
@@ -528,7 +848,7 @@ function buildTaskPrompt(task) {
528
848
  "Description:",
529
849
  task.description ?? "(no description)",
530
850
  "",
531
- "Produce the deliverable described in the task. Apply your brain composition rules \u2014 anti-patterns, decision loop, and scope tier all bind. Return the response as plain text suitable for posting to /room as a message on this task."
851
+ "Produce the deliverable: call write_file (or bash with a build command) to create all required output files FIRST. Apply your brain composition rules \u2014 anti-patterns, decision loop, and scope tier all bind. After calling the tool(s), return a short plain-text summary of what you did for posting to /room."
532
852
  ].join("\n");
533
853
  }
534
854
  function sleep(ms) {