@robzilla1738/agentswarm 0.3.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/README.md +51 -11
  2. package/dist/agent.js +18 -2
  3. package/dist/cli.js +39 -8
  4. package/dist/config.js +62 -6
  5. package/dist/crawltools.js +247 -0
  6. package/dist/deepseek.js +125 -10
  7. package/dist/executor.js +993 -144
  8. package/dist/hub.js +85 -6
  9. package/dist/journal.js +61 -11
  10. package/dist/memory.js +84 -0
  11. package/dist/pdftext.js +211 -0
  12. package/dist/prompts.js +124 -23
  13. package/dist/report.js +289 -0
  14. package/dist/run.js +15 -2
  15. package/dist/sandbox.js +11 -0
  16. package/dist/searchcore.js +244 -0
  17. package/dist/state.js +85 -3
  18. package/dist/tools.js +392 -25
  19. package/dist/util.js +85 -0
  20. package/dist/webtools.js +327 -66
  21. package/package.json +3 -2
  22. package/ui/out/404/index.html +1 -1
  23. package/ui/out/404.html +1 -1
  24. package/ui/out/_next/static/chunks/532-35122e93f37719b9.js +1 -0
  25. package/ui/out/_next/static/chunks/677-721ce1c8b7a6a317.js +1 -0
  26. package/ui/out/_next/static/chunks/app/page-dc9f6744d203e76c.js +1 -0
  27. package/ui/out/_next/static/chunks/app/run/page-3674e103981703a2.js +1 -0
  28. package/ui/out/_next/static/chunks/app/settings/page-41a5d8ba43ecfd4a.js +1 -0
  29. package/ui/out/_next/static/css/d95c2ba395730031.css +3 -0
  30. package/ui/out/fonts/PlanetKosmos.ttf +0 -0
  31. package/ui/out/index.html +1 -1
  32. package/ui/out/index.txt +3 -3
  33. package/ui/out/run/index.html +1 -1
  34. package/ui/out/run/index.txt +3 -3
  35. package/ui/out/settings/index.html +1 -1
  36. package/ui/out/settings/index.txt +3 -3
  37. package/ui/out/_next/static/chunks/383-289a866b246b41cc.js +0 -1
  38. package/ui/out/_next/static/chunks/619-ba102abea3e3d0e4.js +0 -1
  39. package/ui/out/_next/static/chunks/677-7ab85a6f38c3a235.js +0 -1
  40. package/ui/out/_next/static/chunks/app/page-0fda5b8e77d90b84.js +0 -1
  41. package/ui/out/_next/static/chunks/app/run/page-07aab6b1224c3c8c.js +0 -1
  42. package/ui/out/_next/static/chunks/app/settings/page-528482d468d84cfa.js +0 -1
  43. package/ui/out/_next/static/css/e2c82b53bf4519e8.css +0 -3
  44. /package/ui/out/_next/static/{Rm5Fhkds2-wIOnVlME55J → 7_pihFubDGD40BCy2ynlr}/_buildManifest.js +0 -0
  45. /package/ui/out/_next/static/{Rm5Fhkds2-wIOnVlME55J → 7_pihFubDGD40BCy2ynlr}/_ssgManifest.js +0 -0
package/dist/prompts.js CHANGED
@@ -39,10 +39,13 @@ exports.conductorInitialUpdate = conductorInitialUpdate;
39
39
  exports.conductorUpdate = conductorUpdate;
40
40
  exports.taskTable = taskTable;
41
41
  exports.reportBlock = reportBlock;
42
+ exports.depReportBlock = depReportBlock;
42
43
  exports.workerSystem = workerSystem;
43
44
  exports.forcedFinal = forcedFinal;
44
45
  exports.verifierSystem = verifierSystem;
45
46
  exports.synthSystem = synthSystem;
47
+ exports.completenessPrompt = completenessPrompt;
48
+ exports.synthCheckPrompt = synthCheckPrompt;
46
49
  exports.compactorPrompt = compactorPrompt;
47
50
  exports.budgetLine = budgetLine;
48
51
  const os = __importStar(require("os"));
@@ -72,15 +75,20 @@ DOCTRINE
72
75
  2. Make every task self-contained: crisp objective, explicit success criteria ("Done when …"), and every fact/path/URL the worker needs inlined in context. Workers know nothing you don't tell them.
73
76
  3. Invent the right specialist role per task (researcher, coder, analyst, data-wrangler, reviewer, writer, …). One concern per task, roughly 5–25 tool steps of work. Bigger → split it. Trivial → batch it.
74
77
  4. Software missions: scaffold first (one task), then parallel tasks on DISJOINT files/modules — never two writers on the same file — then an integration + test task that deps on all of them with verify:true.
75
- 5. Research missions: parallel scouts with distinct angles and sources, then a consolidation/analysis task that deps on the scouts.
78
+ 5. Research missions: go WIDE. Spawn many parallel scouts (10+ for a broad topic), each owning a distinct sub-question, angle, source type, time period, or entity — so collectively they pull hundreds of sources, not dozens. Tell each scout to use deep web_search (high count) and to record findings with exact URLs/quotes on the blackboard and in artifact files. Then spawn analysis/consolidation tasks that dep on the scouts, and a final synthesis. When one scout's area is itself broad, spawn it with team:true so it fans out further.
76
79
  6. Set verify:true on tasks whose failure would poison the mission (builds, integrations, data pipelines, final deliverables). A verification agent will adversarially check them and can fail them back for retry.
77
80
  7. React to evidence. Failed/blocked task → diagnose from its report and spawn a corrected or alternative approach (never re-run a failed approach verbatim). Surprising findings → adapt the plan.
78
81
  8. Watch the budget shown in every update. As it tightens, cut scope to what the mission truly needs — always deliver value before the cap, never run out mid-flight.
79
82
  9. Operator messages override everything. Adjust the plan immediately when one appears.
80
83
  10. finish only when the mission's success criteria are demonstrably met, or budget/feasibility forces it. Your finish notes steer the synthesizer that writes the final report.
84
+ 11. Model tiers: set model:"cheap" on scouts and bulk extraction, model:"strong" on leads, integration, and verified deliverables. Default tier for everything in between.
85
+ 12. Big subsystems: spawn with team:true to run the task as a sub-swarm — its own lead decomposes it into parallel sub-tasks and reports one consolidated result. Use for coherent multi-task chunks ("build the backend", "research all 12 competitors"), not for single jobs.
86
+ 13. Beyond ~20 tasks, maintain a living plan with update_plan (mission-plan.md): approach, what's done, what's next, open risks. Rewrite it at phase boundaries — it is pinned into your updates and survives restarts.
87
+ 14. Long missions: structure the work into phases with set_phase (e.g. discovery → build → integrate → polish). The current phase and its exit criteria are pinned into every update, so the plan survives even when old history is trimmed.
88
+ 15. DELIVERABLES SHIP IN THE FORMAT THE MISSION ACTUALLY NEEDS — a markdown report is the fallback, not the default. Software → running code with build/run instructions; data work → .csv/.json/.sqlite plus a summary; comparisons and datasets → tables in CSV as well as prose; polished documents → styled self-contained .html (the operator reads HTML, not raw markdown); scripts/configs → the runnable files themselves. Spell the expected format and exact filename(s) out in the deliverable task's objective and have it save them with save_artifact.
81
89
 
82
90
  RULES
83
- - Respond ONLY by calling your tools (spawn_tasks / wait / finish). Plain-text replies are ignored.
91
+ - Respond ONLY by calling your tools (spawn_tasks / set_phase / wait / finish). Plain-text replies are ignored. set_phase alone is not a decision — pair it with spawn_tasks, wait, or finish.
84
92
  - Never spawn a task whose deps are not yet all created.
85
93
  - Keep the total task count within budget (max ${o.maxTasks} per run); make every task earn its place.`;
86
94
  }
@@ -98,6 +106,10 @@ function conductorUpdate(p) {
98
106
  sections.push(`NEW REPORTS\n${p.reports.join("\n\n")}`);
99
107
  if (p.blackboard)
100
108
  sections.push(`BLACKBOARD (shared notes digest)\n${p.blackboard}`);
109
+ if (p.phase)
110
+ sections.push(p.phase);
111
+ if (p.plan)
112
+ sections.push(p.plan);
101
113
  sections.push(`SWARM STATE\n${p.taskTable}`);
102
114
  sections.push(p.budgetLine);
103
115
  if (p.extra)
@@ -108,27 +120,71 @@ function conductorUpdate(p) {
108
120
  function taskTable(tasks) {
109
121
  if (!tasks.length)
110
122
  return "(no tasks yet)";
111
- return tasks
112
- .map((t) => {
123
+ const line = (t) => {
113
124
  const deps = t.deps.length ? ` deps:[${t.deps.join(",")}]` : "";
114
- const extra = t.status === "failed" && t.error ? ` — ${(0, util_1.clip)(t.error, 80)}` : "";
125
+ const extra = (t.status === "failed" || t.status === "blocked") && t.error ? ` — ${(0, util_1.clip)(t.error, 120)}` : "";
115
126
  return `${t.id} [${t.status}${t.attempt > 1 ? ` a${t.attempt}` : ""}] (${t.role})${deps} ${(0, util_1.clip)(t.title, 70)}${extra}`;
116
- })
117
- .join("\n");
127
+ };
128
+ const settled = tasks.filter((t) => ["done", "failed", "blocked"].includes(t.status));
129
+ if (settled.length <= 30)
130
+ return tasks.map(line).join("\n");
131
+ // Hundreds of tasks must not flood the conductor's prompt: collapse DONE
132
+ // tasks in older waves to one line per wave. Failures/blocks stay full-line
133
+ // forever (they're what the conductor plans around), as do active tasks and
134
+ // the two most recent waves.
135
+ const maxWave = Math.max(...tasks.map((t) => t.wave));
136
+ const out = [];
137
+ const waves = [...new Set(tasks.map((t) => t.wave))].sort((a, b) => a - b);
138
+ for (const w of waves) {
139
+ const ws = tasks.filter((t) => t.wave === w);
140
+ const collapsible = w < maxWave - 1 ? ws.filter((t) => t.status === "done") : [];
141
+ const fullLines = ws.filter((t) => !collapsible.includes(t));
142
+ if (collapsible.length) {
143
+ out.push(`wave ${w}: ${collapsible.length} done (${collapsible.map((t) => t.id).join(",")})`);
144
+ }
145
+ out.push(...fullLines.map(line));
146
+ }
147
+ return out.join("\n");
148
+ }
149
/**
 * Build the one-line "sources:" suffix for a task report block.
 *
 * @param {object} t Task; `t.sources` (optional) is a list of objects with a `url`.
 * @param {number} [max=6] How many URLs to show before summarising the rest.
 * @returns {string} "" when the task has no sources, otherwise a line that
 *   starts with a newline and lists up to `max` URLs plus a "(+N more)" tail.
 */
function sourcesLine(t, max = 6) {
  const all = t.sources;
  if (!all?.length)
    return "";
  const urls = [];
  for (const entry of all.slice(0, max))
    urls.push(entry.url);
  const hidden = all.length - max;
  const tail = hidden > 0 ? ` (+${hidden} more)` : "";
  return `\nsources: ${urls.join(" · ")}${tail}`;
}
119
156
  function reportBlock(t) {
120
157
  const head = `── ${t.id} (${t.role}) "${(0, util_1.clip)(t.title, 60)}" → ${t.status.toUpperCase()}${t.attempt > 1 ? ` (attempt ${t.attempt})` : ""}`;
121
158
  const body = t.report ? (0, util_1.clip)(t.report, 1600) : t.error ? `error: ${(0, util_1.clip)(t.error, 400)}` : "(no report)";
159
+ const facts = t.keyFacts?.length ? `\nkey facts:\n${t.keyFacts.map((f) => ` • ${(0, util_1.clip)(f, 200)}`).join("\n")}` : "";
160
+ const open = t.openQuestions?.length ? `\nopen questions: ${t.openQuestions.map((q) => (0, util_1.clip)(q, 150)).join(" | ")}` : "";
161
+ const files = t.filesTouched?.length ? `\nfiles touched: ${t.filesTouched.join(", ")}` : "";
122
162
  const arts = t.artifacts.length ? `\nartifacts: ${t.artifacts.join(", ")}` : "";
123
163
  const fb = t.feedback ? `\nverifier: ${(0, util_1.clip)(t.feedback, 300)}` : "";
124
- return `${head}\n${body}${arts}${fb}`;
164
+ return `${head}\n${body}${facts}${open}${files}${arts}${sourcesLine(t)}${fb}`;
165
+ }
166
/**
 * Compact dependency context for a downstream worker: structured handoff
 * fields in full, prose report as an excerpt — read_report(taskId) has the
 * rest. Keeps fan-in tasks from inheriting megabytes of ancestor prose.
 *
 * Sections, in order: header, report excerpt (or error / "(no report)"),
 * key facts, open questions, files touched, artifacts, sources, and — when
 * the report was longer than the 1200-character excerpt — a pointer to
 * read_report for the full text.
 *
 * @param {object} t Settled dependency task; `t.artifacts` must be an array.
 * @returns {string} The assembled block.
 */
function depReportBlock(t) {
  const head = `── dep ${t.id} (${t.role}) "${(0, util_1.clip)(t.title, 60)}" → ${t.status.toUpperCase()}`;
  const body = t.report ? (0, util_1.clip)(t.report, 1200) : t.error ? `error: ${(0, util_1.clip)(t.error, 400)}` : "(no report)";
  const facts = t.keyFacts?.length ? `\nkey facts:\n${t.keyFacts.map((f) => ` • ${(0, util_1.clip)(f, 200)}`).join("\n")}` : "";
  // The worker protocol tells agents that open_questions are handed to
  // dependent tasks alongside key_facts and files_touched, so include them
  // here as well (same formatting as reportBlock).
  const open = t.openQuestions?.length ? `\nopen questions: ${t.openQuestions.map((q) => (0, util_1.clip)(q, 150)).join(" | ")}` : "";
  const files = t.filesTouched?.length ? `\nfiles touched: ${t.filesTouched.join(", ")}` : "";
  const arts = t.artifacts.length ? `\nartifacts: ${t.artifacts.join(", ")}` : "";
  // Only advertise read_report when the excerpt actually cut something off.
  const full = (t.report ?? "").length > 1200 ? `\n(excerpt — full text: read_report("${t.id}"))` : "";
  return `${head}\n${body}${facts}${open}${files}${arts}${sourcesLine(t)}${full}`;
}
126
180
  // ============================================================ workers
127
181
  const ROLE_HINTS = {
128
- researcher: "Research craft: triangulate across independent sources; prefer primary docs over blog spam; capture exact figures, dates, URLs. Search several distinct phrasings before concluding something is unfindable.",
182
+ researcher: "Research craft: be exhaustive. Run deep web_search (deep=true, high count) across several distinct phrasings — pull DOZENS of sources for your sub-question, not three. Triangulate across independent sources; prefer primary docs and official sources over blog spam; capture exact figures, dates, and URLs, and keep the quotable passages the search returns. Record key findings as blackboard notes (with url=<source>) and save a structured markdown file of your sources+findings as an artifact so the synthesizer can build on it. " +
183
+ "A finding without a source is an opinion: list EVERY source your findings rest on in report(...)'s `sources` field (url + what it supports) — only sources reported there can be cited in the final deliverable. When independent sources disagree on a material fact, post note(kind:'conflict') naming both sources and the discrepancy — never silently pick one. For scientific or technical questions, also run academic_search (arXiv + Crossref) — peer-reviewed beats blog posts. " +
184
+ "If a crawl_site tool is available, use it to ingest whole documentation sites or multi-page sources into local markdown files, then read the saved files — far cheaper and broader than fetching pages one by one.",
129
185
  coder: "Engineering craft: read existing code before changing it; match its conventions; build/run/test after every meaningful change and include the command + result in your report. Leave the tree compiling.",
130
186
  analyst: "Analysis craft: quantify wherever possible; state assumptions explicitly; separate observation from interpretation; sanity-check numbers twice.",
131
- writer: "Writing craft: structure before prose; concrete over abstract; cut filler. Match the audience and purpose given in the objective.",
187
+ writer: "Writing craft: structure before prose; concrete over abstract; cut filler. Match the audience and purpose given in the objective. Deliver in the format the objective calls for — for polished documents prefer a styled, self-contained .html file (inline CSS, readable typography, real tables) over raw markdown; ship data tables as .csv alongside the prose.",
132
188
  reviewer: "Review craft: be adversarial; try to break it; check edge cases and the unhappy path; verify claims against the actual files, not the description.",
133
189
  "data-wrangler": "Data craft: validate schema and row counts at every step; spot-check samples; never silently drop rows — report anomalies.",
134
190
  };
@@ -140,13 +196,16 @@ function workerSystem(opts) {
140
196
  : task.attempt > 1 && task.error
141
197
  ? `\nPREVIOUS ATTEMPT FAILED: ${task.error}\nTake a different approach.\n`
142
198
  : "";
199
+ const checkpoint = task.lastCheckpoint
200
+ ? `\nPROGRESS CHECKPOINT FROM A PREVIOUS ATTEMPT (the run was interrupted or retried — do not redo completed work blindly):\n${task.lastCheckpoint}\nRe-verify the state it describes (files, commands) before re-creating anything, then continue from where it left off.\n`
201
+ : "";
143
202
  return `You are ${opts.agentId}, a ${opts.role} agent in a swarm pursuing this mission:
144
203
  ${meta.mission}
145
204
 
146
205
  YOUR TASK — ${task.id} (attempt ${task.attempt})
147
206
  ${task.title}
148
207
  Objective: ${task.objective}
149
- ${task.context ? `Context from the conductor:\n${task.context}\n` : ""}${retry}
208
+ ${task.context ? `Context from the conductor:\n${task.context}\n` : ""}${retry}${checkpoint}
150
209
  CONTEXT FROM THE SWARM
151
210
  ${opts.depReports || "(no dependency reports)"}
152
211
  ${opts.blackboard ? `Blackboard digest:\n${opts.blackboard}` : ""}
@@ -158,12 +217,15 @@ OPERATING PROTOCOL
158
217
  - You are fully autonomous. Never ask questions; decide and act.
159
218
  - Plan briefly, then act in small verified steps: after changing anything, prove it worked (run it, read it back, test it).
160
219
  - Evidence over assumption: read before you edit; check outputs; cite concrete paths, commands and numbers.
161
- - Be token-lean: targeted reads (line ranges, grep via shell) over wholesale dumps; don't re-read unchanged files.
220
+ - Be token-lean: targeted reads (line ranges, grep_files) over wholesale dumps; don't re-read unchanged files. Several edits to one file → one replace_in_file call with edits[].
162
221
  - Post durable discoveries other agents will need to the blackboard with note(...) — facts only, used sparingly.
163
- - Save deliverable files with save_artifact so the operator sees them.
222
+ - Editing files other tasks might also touch? First search_notes for claims, then post note(kind:"claim", key:"<path>") before editing. Claims are advisory — coordinate, don't fight.
223
+ - Save deliverable files with save_artifact so the operator sees them. Pick the format that genuinely fits the deliverable — structured data as .csv/.json, polished documents as self-contained .html, code as runnable files — not everything is a markdown report.
224
+ - On long tasks, call checkpoint(...) after each major chunk so an interrupted run resumes warm instead of from scratch.
164
225
  - Genuinely impossible / missing prerequisite → report(status:"blocked", …) early instead of thrashing.
165
226
  - You have at most ${opts.maxSteps} tool steps. Budget them.
166
- - ALWAYS end by calling report(...). The conductor sees ONLY that report it is the entire value of your work. Specific beats vague: what you did, what you verified, key findings, exact paths.
227
+ - Dependency reports above are excerpts; use read_report(task_id) for full text, and search_notes(query) to find facts posted earlier in the run.
228
+ - ALWAYS end by calling report(...). The conductor sees ONLY that report — it is the entire value of your work. Specific beats vague: what you did, what you verified, key findings, exact paths. Fill key_facts (standalone facts downstream tasks need), open_questions, and files_touched — they are handed verbatim to dependent tasks. If your work drew on the web, fill sources (url + what it supports): only sources reported there can be cited in the final deliverable.
167
229
  ${roleHint ? "\n" + roleHint : ""}`;
168
230
  }
169
231
  exports.WORKER_KICKOFF = "Begin now. Work the task to completion, then call report(...).";
@@ -173,7 +235,7 @@ function forcedFinal(reason) {
173
235
  return `${reason} Stop working and call your terminal tool RIGHT NOW with your best honest account: what you completed, what you verified, what remains.`;
174
236
  }
175
237
  // ============================================================ verifier
176
- function verifierSystem(meta, task) {
238
+ function verifierSystem(meta, task, depReports = "") {
177
239
  return `You are an adversarial verification agent. A worker claims it completed this task — your job is to try to falsify that claim with evidence.
178
240
 
179
241
  MISSION (for context): ${(0, util_1.clip)(meta.mission, 400)}
@@ -184,14 +246,18 @@ ${task.context ? `Context: ${(0, util_1.clip)(task.context, 600)}` : ""}
184
246
  Worker's report:
185
247
  ${(0, util_1.clip)(task.report ?? "", 2400)}
186
248
  ${task.artifacts.length ? `Claimed artifacts: ${task.artifacts.join(", ")}` : ""}
187
-
249
+ ${depReports ? `\nUPSTREAM INPUTS (settled dependency reports — what this task had to build on; judge completeness against them):\n${depReports}\n` : ""}
188
250
  Working directory: ${meta.cwd}
189
251
 
190
252
  PROTOCOL
191
- - Do NOT trust the report. Verify concretely with tools: read the files it claims to have written, run the build/tests/commands, fetch the URLs, check the numbers.
192
- - Check: objective met? success criteria satisfied? deliverables exist and are non-trivial (not stubs/placeholders)?
253
+ - Do NOT trust the report. Verify concretely with tools: read the files it claims to have written, run the build/tests/commands, fetch the URLs, check the numbers. You see only the worker's CLAIMS — gather your own evidence; do not assume shared context.
254
+ - RUBRIC fail unless all hold:
255
+ 1. Completeness: every part of the objective and its "Done when" criteria is addressed${depReports ? " (including everything the upstream inputs handed over)" : ""}.
256
+ 2. Evidence: each substantive claim in the report is backed by something you verified yourself.
257
+ 3. Deliverables: claimed files/artifacts exist, are non-trivial (not stubs/placeholders), and match what the report says about them.
258
+ 4. Correctness: commands/builds/tests the task implies actually succeed when you run them.
193
259
  - Spot-check depth over exhaustive breadth; ~5-12 tool steps.
194
- - Then call verdict(pass, feedback). On fail, feedback must be actionable: exactly what is wrong and where. On pass, one line citing the evidence you checked.`;
260
+ - Then call verdict(pass, feedback, issues). On fail, ALSO fill issues one entry per concrete problem with the evidence you gathered and the exact change needed; the worker's retry sees them verbatim. On pass, feedback is one line citing the evidence you checked.`;
195
261
  }
196
262
  exports.VERIFIER_KICKOFF = "Verify now, then call verdict(...).";
197
263
  // ============================================================ synthesizer
@@ -207,17 +273,47 @@ Conductor's closing notes: ${opts.finishNotes || "(none)"}
207
273
  ALL TASK REPORTS
208
274
  ${opts.reports}
209
275
 
210
- ${opts.blackboard ? `BLACKBOARD\n${opts.blackboard}\n` : ""}${opts.artifactList ? `ARTIFACTS ON DISK\n${opts.artifactList}\n` : ""}
276
+ ${opts.sources ? `SOURCES (numbered, deduplicated from the task reports — the only sources that exist)\n${opts.sources}\n\n` : ""}${opts.blackboard ? `BLACKBOARD\n${opts.blackboard}\n` : ""}${opts.artifactList ? `ARTIFACTS ON DISK\n${opts.artifactList}\n` : ""}
211
277
  Working directory: ${opts.meta.cwd}
212
278
 
213
279
  PROTOCOL
214
280
  - You may read files (read_file / list_dir) to confirm specifics before writing — verify key claims you repeat.
215
- - Then call submit_final with:
216
- report_markdown the deliverable document. Structure: # title; **Outcome** first (did the mission succeed, headline results); then What was built/found with evidence and exact paths; How to use/run it (if applicable); Open issues & recommended next steps. Write for the operator: complete, concrete, zero filler.
281
+ - The mission's PRIMARY deliverable should exist in the format that serves it best, not only as prose. If the task reports produced data, comparisons, or rankings that the artifacts don't already capture in a structured form, save them now with save_artifact (e.g. data/results.csv, data/findings.json) before submitting. Don't duplicate artifacts that already exist — point to them.
282
+ ${opts.sources ? `- CITE YOUR SOURCES: where a claim rests on a numbered source, cite it inline as [n]. End report_markdown with a \`## Sources\` section listing each number you actually cited as a markdown link ([n] [title](url)). Never invent a source or cite a number not in the list. Where sources conflict, present both positions with their citations — do not silently pick one.\n` : ""}- Then call submit_final with:
283
+ • report_markdown — the deliverable document. Structure: # title; **Outcome** first (did the mission succeed, headline results); then What was built/found with evidence and exact paths; How to use/run it (if applicable); Open issues & recommended next steps. Write for the operator: complete, concrete, zero filler. Use real markdown tables for tabular findings. (A styled HTML rendering is generated automatically — do not hand-write one.)
217
284
  • summary — ≤8 sentences for the console.
218
285
  - The report stands alone: a reader who saw nothing else must understand what happened and where everything is.`;
219
286
  }
220
287
  exports.SYNTH_KICKOFF = "Compose and submit the final deliverable now via submit_final(...).";
288
+ // ============================================================ completeness / synthesis checks
289
/**
 * Build the prompt for the end-of-run completeness critic.
 *
 * @param {string} mission Mission statement.
 * @param {string} taskTableStr Pre-rendered task table.
 * @param {string} reports Pre-rendered task reports.
 * @returns {string} Prompt asking for exactly "COMPLETE" or a short numbered
 *   list of concrete gaps.
 */
function completenessPrompt(mission, taskTableStr, reports) {
  const sections = [
    "You are a completeness critic for an agent-swarm run that is about to finish. Given the mission and what was actually delivered, list any REAL gaps: parts of the mission not addressed, claims with no supporting task, or deliverables that were promised but never produced.",
    "",
    "MISSION",
    `${mission}`,
    "",
    "TASKS",
    `${taskTableStr}`,
    "",
    "TASK REPORTS",
    `${reports}`,
    "",
    "Reply with EXACTLY \"COMPLETE\" if the mission's requirements are genuinely covered. Otherwise reply with a short numbered list of concrete gaps (max 5), each one actionable enough to become a task. Do not invent nice-to-haves — only true gaps against the stated mission.",
  ];
  return sections.join("\n");
}
303
/**
 * Build the prompt that checks a final report for faithfulness against the
 * task reports it was written from.
 *
 * @param {string} mission Mission statement.
 * @param {string} reports Pre-rendered task reports (treated as ground truth).
 * @param {string} finalReport The final report text to check.
 * @param {string} [sources] Pre-rendered numbered source list; when truthy the
 *   prompt also includes the list and asks for citation checks.
 * @returns {string} Prompt asking for exactly "OK" or a list of discrepancies.
 */
function synthCheckPrompt(mission, reports, finalReport, sources) {
  let sourceSection = "";
  let citationClause = "";
  if (sources) {
    sourceSection = `SOURCE LIST (the only citable sources)\n${sources}\n\n`;
    citationClause = ", its inline [n] citations all reference numbers that exist in the source list, and no key web-derived factual claim is left uncited";
  }
  const sections = [
    "You are checking a final mission report for faithfulness before delivery. Compare it against the underlying task reports.",
    "",
    "MISSION",
    `${mission}`,
    "",
    "TASK REPORTS (ground truth)",
    `${reports}`,
    "",
    `${sourceSection}FINAL REPORT (to check)`,
    `${finalReport}`,
    "",
    `Reply with EXACTLY "OK" if the final report's claims are supported by the task reports and nothing material is misrepresented or fabricated${citationClause}. Otherwise list the specific discrepancies (max 5), each citing what the final report says vs what the task reports support.`,
  ];
  return sections.join("\n");
}
221
317
  // ============================================================ compaction
222
318
  function compactorPrompt(serialized) {
223
319
  return `Compress this agent conversation segment into a dense progress summary the agent can rely on to continue working. Preserve: decisions made, files created/modified (exact paths), commands run and their outcomes, key findings/numbers/URLs, errors hit and how they were resolved, current state of the work, and anything still pending. Omit pleasantries and dead ends unless they prevent repeating a mistake. Output the summary only.
@@ -228,5 +324,10 @@ ${serialized}`;
228
324
  // ============================================================ misc
229
325
  function budgetLine(spent, cap) {
230
326
  const pct = cap > 0 ? Math.round((spent.total / cap) * 100) : 0;
231
- return `BUDGET: ${(0, util_1.fmtTokens)(spent.total)} of ${(0, util_1.fmtTokens)(cap)} tokens used (${pct}%) · est. cost so far $${spent.cost.toFixed(2)}`;
327
+ const urgency = pct >= 90
328
+ ? " ⚠ WIND DOWN NOW: stop spawning new work, consolidate what exists, and finish before the cap."
329
+ : pct >= 75
330
+ ? " Note: budget is tightening — prefer consolidation over new exploration."
331
+ : "";
332
+ return `BUDGET: ${(0, util_1.fmtTokens)(spent.total)} of ${(0, util_1.fmtTokens)(cap)} tokens used (${pct}%) · est. cost so far $${spent.cost.toFixed(2)}${urgency}`;
232
333
  }
package/dist/report.js ADDED
@@ -0,0 +1,289 @@
1
+ "use strict";
2
+ /**
3
+ * Dependency-free markdown → styled HTML rendering for final reports.
4
+ *
5
+ * Every run writes artifacts/final-report.html next to final-report.md so the
6
+ * operator always gets a readable, shareable document — even for fallback and
7
+ * failure reports. This is NOT a full CommonMark implementation; it covers the
8
+ * subset models actually emit in reports: headings, paragraphs, lists (nested),
9
+ * fenced code, inline code, bold/italic, links, images, tables, blockquotes,
10
+ * and horizontal rules. Unknown constructs degrade to escaped text — never to
11
+ * broken markup.
12
+ */
13
+ Object.defineProperty(exports, "__esModule", { value: true });
14
+ exports.aggregateSources = aggregateSources;
15
+ exports.sourcesBlock = sourcesBlock;
16
+ exports.mdToHtml = mdToHtml;
17
+ exports.renderFinalHtml = renderFinalHtml;
18
+ const searchcore_1 = require("./searchcore");
19
/**
 * Merge the sources reported by every task into a single numbered list,
 * deduplicated by canonical URL.
 *
 * The first task to report a URL fixes its number and its stored fields; later
 * reports of the same URL only add their task id and fill in a title, date or
 * note that is still missing.
 *
 * @param {Array<object>} tasks Tasks with an `id` and an optional `sources` list.
 * @returns {Array<object>} One entry per distinct source, in first-seen order,
 *   each carrying `n` (1-based number) and `taskIds`.
 */
function aggregateSources(tasks) {
  const bibliography = new Map();
  for (const task of tasks) {
    if (task.sources == null)
      continue;
    for (const src of task.sources) {
      const key = (0, searchcore_1.canonicalizeUrl)(src.url);
      const known = bibliography.get(key);
      if (!known) {
        // First sighting: copy the source and assign the next number.
        bibliography.set(key, { ...src, n: bibliography.size + 1, taskIds: [task.id] });
        continue;
      }
      if (!known.taskIds.includes(task.id))
        known.taskIds.push(task.id);
      // Back-fill only fields the earlier report left empty.
      for (const field of ["title", "date", "note"]) {
        if (!known[field] && src[field])
          known[field] = src[field];
      }
    }
  }
  return Array.from(bibliography.values());
}
47
/**
 * Render the numbered source list for prompts, one line per source:
 * "[n] title — url (date) — note [cited by ids]", omitting the title, date
 * and note parts when they are empty.
 *
 * @param {Array<object>} sources Entries with n, url, taskIds and optional
 *   title / date / note.
 * @returns {string} Newline-joined list ("" for an empty input).
 */
function sourcesBlock(sources) {
  const rendered = [];
  for (const src of sources) {
    let entry = `[${src.n}] `;
    if (src.title)
      entry += `${src.title} — `;
    entry += `${src.url}`;
    if (src.date)
      entry += ` (${src.date})`;
    if (src.note)
      entry += ` — ${src.note}`;
    entry += ` [cited by ${src.taskIds.join(",")}]`;
    rendered.push(entry);
  }
  return rendered.join("\n");
}
53
/**
 * Escape the four HTML-significant characters &, <, > and " as entities.
 *
 * @param {string} s Raw text.
 * @returns {string} Text safe to place in element content or inside a
 *   double-quoted attribute value.
 */
function esc(s) {
  const entities = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;" };
  // One pass over the input: an ampersand produced by a replacement is never
  // re-escaped.
  return s.replace(/[&<>"]/g, (ch) => entities[ch]);
}
60
/**
 * Apply inline markdown (images, links, bold/italic, bare URLs) to a string
 * that has already been HTML-escaped. Backtick code spans are emitted as
 * <code> and no other rule is applied inside them.
 *
 * @param {string} s Already-escaped text.
 * @returns {string} HTML fragment.
 */
function inline(s) {
  // Ordered rewrite rules for everything outside code spans. Order matters:
  // images before links (same bracket syntax), *** before ** before *, and
  // bare URLs last.
  const rules = [
    [/!\[([^\]]*)\]\((https?:[^()\s]+)\)/g, '<img src="$2" alt="$1" loading="lazy">'],
    [/\[([^\]]+)\]\(((?:https?:|#|\.{0,2}\/)[^()\s]*)\)/g, '<a href="$2" target="_blank" rel="noopener">$1</a>'],
    [/\*\*\*([^*]+)\*\*\*/g, "<strong><em>$1</em></strong>"],
    [/\*\*([^*]+)\*\*/g, "<strong>$1</strong>"],
    [/(^|[\s(])\*([^*\s][^*]*)\*/g, "$1<em>$2</em>"],
    [/(^|[\s(])_([^_\s][^_]*)_(?=[\s.,;:!?)]|$)/g, "$1<em>$2</em>"],
    // Bare URLs become links (escaped text, so no quotes can appear inside).
    [/(^|[\s(])(https?:\/\/[^\s<)]+[^\s<).,;:!?])/g, '$1<a href="$2" target="_blank" rel="noopener">$2</a>'],
  ];
  // Split on code spans first so the rules never fire inside them.
  const segments = s.split(/(`+[^`]*`+)/g);
  const rendered = segments.map((segment) => {
    const span = /^(`+)([^`]*)\1$/.exec(segment);
    if (span) {
      // An empty span renders as a literal backtick.
      return "<code>" + (span[2].trim() || "`") + "</code>";
    }
    return rules.reduce((text, [pattern, replacement]) => text.replace(pattern, replacement), segment);
  });
  return rendered.join("");
}
85
/**
 * Convert a Markdown string into an HTML fragment.
 *
 * Supported block constructs: fenced code blocks (``` or ~~~), blockquotes
 * (grouped and rendered recursively), ATX headings, horizontal rules, pipe
 * tables (header row + separator row), nested ordered/unordered lists, list
 * continuation lines and plain paragraphs. All text is passed through `esc`
 * and then `inline` before being emitted; code-fence bodies are only escaped.
 *
 * Ordered lists keep the number of their first item via the `start`
 * attribute. Any block that interrupts a numbered list (code fence, table,
 * heading, paragraph) closes it, so without `start` the list that resumes
 * afterwards would restart at 1.
 *
 * @param {string} md Markdown source (CRLF line endings are normalised).
 * @returns {string} HTML fragment; emitted chunks are joined with "\n".
 */
function mdToHtml(md) {
    const lines = md.replace(/\r\n/g, "\n").split("\n");
    const html = [];
    // Stack of currently open lists, innermost last: { indent, tag }.
    const lists = [];
    let para = [];
    let quote = [];
    // Close every open list indented deeper than `toIndent` (default: all).
    const closeLists = (toIndent = -1) => {
        while (lists.length && lists[lists.length - 1].indent > toIndent) {
            html.push(`</li></${lists.pop().tag}>`);
        }
    };
    // Open a new list and its first item. For ordered lists whose first
    // marker is not 1, carry the number over with `start`; absurdly long
    // digit runs (not a safe integer) are ignored rather than emitted.
    const openList = (indent, tag, marker, body) => {
        let start = "";
        if (tag === "ol") {
            const first = Number.parseInt(marker, 10);
            if (Number.isSafeInteger(first) && first !== 1) {
                start = ` start="${first}"`;
            }
        }
        lists.push({ indent, tag });
        html.push(`<${tag}${start}><li>${inline(esc(body))}`);
    };
    const flushPara = () => {
        if (para.length) {
            html.push(`<p>${inline(esc(para.join(" ")))}</p>`);
            para = [];
        }
    };
    const flushQuote = () => {
        if (quote.length) {
            html.push(`<blockquote>${mdToHtml(quote.join("\n"))}</blockquote>`);
            quote = [];
        }
    };
    const flushAll = () => {
        flushPara();
        flushQuote();
        closeLists();
    };
    for (let i = 0; i < lines.length; i++) {
        const line = lines[i];
        // Fenced code block.
        const fence = /^\s*(```|~~~)\s*(\S*)/.exec(line);
        if (fence) {
            flushAll();
            const buf = [];
            // Consume up to (not including) the closing fence; an unclosed
            // fence swallows the rest of the document.
            for (i++; i < lines.length && !lines[i].trim().startsWith(fence[1]); i++)
                buf.push(lines[i]);
            const lang = fence[2] ? ` class="lang-${esc(fence[2])}"` : "";
            html.push(`<pre><code${lang}>${esc(buf.join("\n"))}</code></pre>`);
            continue;
        }
        // Blockquote (grouped, recursively rendered).
        const q = /^\s*>\s?(.*)$/.exec(line);
        if (q) {
            flushPara();
            closeLists();
            quote.push(q[1]);
            continue;
        }
        flushQuote();
        // Blank line ends the current paragraph / list run (unless the next
        // non-blank line continues the list).
        if (!line.trim()) {
            flushPara();
            if (lists.length) {
                let j = i + 1;
                while (j < lines.length && !lines[j].trim())
                    j++;
                if (j >= lines.length || !/^(\s*)([-*+]|\d+[.)])\s+/.test(lines[j]))
                    closeLists();
            }
            continue;
        }
        // Heading.
        const h = /^(#{1,6})\s+(.*)$/.exec(line.trim());
        if (h) {
            flushAll();
            const level = h[1].length;
            // Drop an optional closing run of #s ("## Title ##").
            const text = h[2].replace(/\s+#+\s*$/, "");
            html.push(`<h${level}>${inline(esc(text))}</h${level}>`);
            continue;
        }
        // Horizontal rule.
        if (/^\s*([-*_])\s*(\1\s*){2,}$/.test(line)) {
            flushAll();
            html.push("<hr>");
            continue;
        }
        // Table: header row + |---| separator.
        if (line.includes("|") && i + 1 < lines.length && /^\s*\|?[\s:|-]+\|[\s:|-]*$/.test(lines[i + 1]) && lines[i + 1].includes("-")) {
            flushAll();
            const cells = (row) => row.trim().replace(/^\|/, "").replace(/\|$/, "").split("|").map((c) => inline(esc(c.trim())));
            const head = cells(line);
            const rows = [];
            for (i += 2; i < lines.length && lines[i].includes("|") && lines[i].trim(); i++)
                rows.push(cells(lines[i]));
            // Step back so the loop's i++ lands on the first non-table line.
            i--;
            html.push("<table><thead><tr>" +
                head.map((c) => `<th>${c}</th>`).join("") +
                "</tr></thead><tbody>" +
                rows.map((r) => `<tr>${r.map((c) => `<td>${c}</td>`).join("")}</tr>`).join("") +
                "</tbody></table>");
            continue;
        }
        // List item (unordered or ordered, nested by indentation).
        const li = /^(\s*)([-*+]|\d+[.)])\s+(.*)$/.exec(line);
        if (li) {
            flushPara();
            const indent = li[1].length;
            const tag = /\d/.test(li[2]) ? "ol" : "ul";
            const top = lists[lists.length - 1];
            if (!top || indent > top.indent) {
                openList(indent, tag, li[2], li[3]);
            }
            else {
                closeLists(indent);
                const cur = lists[lists.length - 1];
                // Same indent but a different list type: end the old list.
                if (cur && cur.indent === indent && cur.tag !== tag) {
                    html.push(`</li></${lists.pop().tag}>`);
                }
                if (lists.length && lists[lists.length - 1].indent === indent) {
                    html.push(`</li><li>${inline(esc(li[3]))}`);
                }
                else {
                    openList(indent, tag, li[2], li[3]);
                }
            }
            continue;
        }
        // Continuation line inside a list item.
        if (lists.length && /^\s{2,}\S/.test(line)) {
            html.push(` ${inline(esc(line.trim()))}`);
            continue;
        }
        closeLists();
        para.push(line.trim());
    }
    flushAll();
    return html.join("\n");
}
217
/** Inline stylesheet embedded in the <style> element of the report page built by renderFinalHtml (light/dark aware; styles the status badge, headings, code, tables and lists). */
+ const CSS = `
218
+ :root { color-scheme: light dark; }
219
+ * { box-sizing: border-box; }
220
+ body {
221
+ margin: 0; padding: 48px 24px 96px;
222
+ font: 16px/1.65 -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", sans-serif;
223
+ background: #fcfcfa; color: #1c1c1a;
224
+ }
225
+ @media (prefers-color-scheme: dark) { body { background: #131312; color: #e8e6e1; } }
226
+ main { max-width: 860px; margin: 0 auto; }
227
+ header.run-meta {
228
+ max-width: 860px; margin: 0 auto 36px; padding-bottom: 20px;
229
+ border-bottom: 1px solid rgba(128,128,128,.25);
230
+ font-size: 13px; color: #6e6e68; display: flex; flex-wrap: wrap; gap: 8px 18px; align-items: center;
231
+ }
232
+ .badge { padding: 2px 10px; border-radius: 999px; font-weight: 600; font-size: 12px; letter-spacing: .02em; }
233
+ .badge.done { background: rgba(34,160,84,.14); color: #1d8a4c; }
234
+ .badge.failed { background: rgba(214,60,60,.14); color: #c23b3b; }
235
+ .badge.cancelled { background: rgba(150,150,150,.18); color: #77756f; }
236
+ h1, h2, h3, h4 { line-height: 1.25; letter-spacing: -0.012em; }
237
+ h1 { font-size: 30px; margin: 0 0 18px; }
238
+ h2 { font-size: 22px; margin: 36px 0 12px; }
239
+ h3 { font-size: 18px; margin: 28px 0 10px; }
240
+ a { color: #2563c4; text-decoration: none; }
241
+ a:hover { text-decoration: underline; }
242
+ @media (prefers-color-scheme: dark) { a { color: #7aa7e8; } }
243
+ code {
244
+ font: 13.5px/1.5 ui-monospace, "SF Mono", Menlo, Consolas, monospace;
245
+ background: rgba(128,128,128,.13); padding: 1.5px 5px; border-radius: 4px;
246
+ }
247
+ pre {
248
+ background: rgba(128,128,128,.09); border: 1px solid rgba(128,128,128,.18);
249
+ border-radius: 8px; padding: 14px 16px; overflow-x: auto;
250
+ }
251
+ pre code { background: none; padding: 0; }
252
+ blockquote {
253
+ margin: 16px 0; padding: 2px 18px; border-left: 3px solid rgba(128,128,128,.35);
254
+ color: #6e6e68;
255
+ }
256
+ table { border-collapse: collapse; margin: 18px 0; width: 100%; font-size: 14.5px; }
257
+ th, td { border: 1px solid rgba(128,128,128,.25); padding: 7px 12px; text-align: left; vertical-align: top; }
258
+ th { background: rgba(128,128,128,.08); }
259
+ img { max-width: 100%; border-radius: 6px; }
260
+ hr { border: none; border-top: 1px solid rgba(128,128,128,.25); margin: 32px 0; }
261
+ ul, ol { padding-left: 26px; }
262
+ li { margin: 3px 0; }
263
+ `;
264
/**
 * Self-contained HTML document (inline CSS, no scripts, no external fetches).
 *
 * The page title is the first level-1 Markdown heading, falling back to the
 * mission text. Every interpolated value, including the run status, goes
 * through `esc`, because the status lands in both a class attribute and
 * element text.
 *
 * @param {object} o
 * @param {string} o.markdown   Final report in Markdown.
 * @param {string} o.mission    Mission text (title fallback and header line).
 * @param {string} o.status     Run status; also used as the badge CSS class.
 * @param {string} o.runId      Run identifier shown in the header.
 * @param {number|string|Date} o.finishedAt  Completion time, passed to `new Date`.
 * @returns {string} Complete HTML document.
 */
function renderFinalHtml(o) {
    const title = /^#\s+(.+)$/m.exec(o.markdown)?.[1] ?? o.mission;
    const finished = new Date(o.finishedAt);
    // toISOString() throws RangeError on an invalid date; a missing or bad
    // timestamp must not prevent the report itself from being rendered.
    const date = Number.isNaN(finished.getTime())
        ? "unknown time"
        : `${finished.toISOString().replace("T", " ").slice(0, 16)} UTC`;
    const status = esc(String(o.status));
    return `<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>${esc(title.slice(0, 120))}</title>
<style>${CSS}</style>
</head>
<body>
<header class="run-meta">
<span class="badge ${status}">${status}</span>
<span>run ${esc(o.runId)}</span>
<span>${esc(date)}</span>
<span title="${esc(o.mission.slice(0, 600))}">mission: ${esc(o.mission.length > 90 ? o.mission.slice(0, 90) + "…" : o.mission)}</span>
</header>
<main>
${mdToHtml(o.markdown)}
</main>
</body>
</html>
`;
}
package/dist/run.js CHANGED
@@ -126,8 +126,13 @@ const summaryCache = new Map();
126
126
  */
127
127
  const liveCache = new Map();
128
128
  const TERMINAL_STATUSES = ["done", "failed", "cancelled"];
129
- /** Grace before a silent, pid-less run is presumed dead (engine startup, fs lag). */
130
- const STALE_AFTER_MS = 20_000;
129
+ /**
130
+ * Grace before a silent, pid-less run is presumed dead. The pid file is the
131
+ * primary live signal; this window only covers engine startup (before
132
+ * writePid) and filesystem lag — generous enough that slow disks and slow
133
+ * provider preflights never flag a healthy run as interrupted.
134
+ */
135
+ const STALE_AFTER_MS = 45_000;
131
136
  /**
132
137
  * A run whose engine process vanished without writing a terminal status
133
138
  * (kill -9, reboot) would otherwise show "running" forever. Presentation-level
@@ -193,6 +198,14 @@ function listRuns(pricing) {
193
198
  s.pid = readPid(id);
194
199
  out.push(applyLiveness(s));
195
200
  }
201
+ // Deleted runs must not pin their reduced state in a long-lived hub forever.
202
+ const live = new Set(ids);
203
+ for (const key of summaryCache.keys())
204
+ if (!live.has(key))
205
+ summaryCache.delete(key);
206
+ for (const key of liveCache.keys())
207
+ if (!live.has(key))
208
+ liveCache.delete(key);
196
209
  out.sort((a, b) => b.createdAt - a.createdAt);
197
210
  return out;
198
211
  }
package/dist/sandbox.js CHANGED
@@ -302,7 +302,17 @@ class RemoteRuntime {
302
302
  throw new Error(`${what} failed (exit ${r.code}): ${r.out.slice(0, 300)}`);
303
303
  return r.out;
304
304
  }
305
+ /** base64-over-shell transfers buffer the whole file — refuse the huge ones. */
306
+ async checkSize(abs, capBytes, what) {
307
+ const out = await this.execOk(`wc -c < ${shq(abs)}`, `stat ${abs}`);
308
+ const size = Number(out.trim());
309
+ if (Number.isFinite(size) && size > capBytes) {
310
+ throw new Error(`${what}: file is ${Math.round(size / 1e6)}MB (cap ${Math.round(capBytes / 1e6)}MB) — ` +
311
+ `compress it or extract the relevant part in the sandbox first`);
312
+ }
313
+ }
305
314
  async readFile(abs) {
315
+ await this.checkSize(abs, 4_000_000, `read ${abs}`);
306
316
  const out = await this.execOk(`base64 < ${shq(abs)}`, `read ${abs}`);
307
317
  return Buffer.from(out.replace(/\s+/g, ""), "base64").toString("utf8");
308
318
  }
@@ -319,6 +329,7 @@ class RemoteRuntime {
319
329
  }
320
330
  }
321
331
  async pull(remoteAbs, localAbs) {
332
+ await this.checkSize(remoteAbs, 32_000_000, `pull ${remoteAbs}`);
322
333
  const out = await this.execOk(`base64 < ${shq(remoteAbs)}`, `pull ${remoteAbs}`);
323
334
  (0, util_1.ensureDir)(path.dirname(localAbs));
324
335
  fs.writeFileSync(localAbs, Buffer.from(out.replace(/\s+/g, ""), "base64"));