npm - @robzilla1738/agentswarm - Versions diffs - 0.3.0 → 0.6.0 - Mend

@robzilla1738/agentswarm 0.3.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

package/README.md +51 -11
package/dist/agent.js +18 -2
package/dist/cli.js +39 -8
package/dist/config.js +62 -6
package/dist/crawltools.js +247 -0
package/dist/deepseek.js +125 -10
package/dist/executor.js +993 -144
package/dist/hub.js +85 -6
package/dist/journal.js +61 -11
package/dist/memory.js +84 -0
package/dist/pdftext.js +211 -0
package/dist/prompts.js +124 -23
package/dist/report.js +289 -0
package/dist/run.js +15 -2
package/dist/sandbox.js +11 -0
package/dist/searchcore.js +244 -0
package/dist/state.js +85 -3
package/dist/tools.js +392 -25
package/dist/util.js +85 -0
package/dist/webtools.js +327 -66
package/package.json +3 -2
package/ui/out/404/index.html +1 -1
package/ui/out/404.html +1 -1
package/ui/out/_next/static/chunks/532-35122e93f37719b9.js +1 -0
package/ui/out/_next/static/chunks/677-721ce1c8b7a6a317.js +1 -0
package/ui/out/_next/static/chunks/app/page-dc9f6744d203e76c.js +1 -0
package/ui/out/_next/static/chunks/app/run/page-3674e103981703a2.js +1 -0
package/ui/out/_next/static/chunks/app/settings/page-41a5d8ba43ecfd4a.js +1 -0
package/ui/out/_next/static/css/d95c2ba395730031.css +3 -0
package/ui/out/fonts/PlanetKosmos.ttf +0 -0
package/ui/out/index.html +1 -1
package/ui/out/index.txt +3 -3
package/ui/out/run/index.html +1 -1
package/ui/out/run/index.txt +3 -3
package/ui/out/settings/index.html +1 -1
package/ui/out/settings/index.txt +3 -3
package/ui/out/_next/static/chunks/383-289a866b246b41cc.js +0 -1
package/ui/out/_next/static/chunks/619-ba102abea3e3d0e4.js +0 -1
package/ui/out/_next/static/chunks/677-7ab85a6f38c3a235.js +0 -1
package/ui/out/_next/static/chunks/app/page-0fda5b8e77d90b84.js +0 -1
package/ui/out/_next/static/chunks/app/run/page-07aab6b1224c3c8c.js +0 -1
package/ui/out/_next/static/chunks/app/settings/page-528482d468d84cfa.js +0 -1
package/ui/out/_next/static/css/e2c82b53bf4519e8.css +0 -3
/package/ui/out/_next/static/{Rm5Fhkds2-wIOnVlME55J → 7_pihFubDGD40BCy2ynlr}/_buildManifest.js +0 -0
/package/ui/out/_next/static/{Rm5Fhkds2-wIOnVlME55J → 7_pihFubDGD40BCy2ynlr}/_ssgManifest.js +0 -0

package/dist/prompts.js CHANGED Viewed

@@ -39,10 +39,13 @@ exports.conductorInitialUpdate = conductorInitialUpdate;
 exports.conductorUpdate = conductorUpdate;
 exports.taskTable = taskTable;
 exports.reportBlock = reportBlock;
+exports.depReportBlock = depReportBlock;
 exports.workerSystem = workerSystem;
 exports.forcedFinal = forcedFinal;
 exports.verifierSystem = verifierSystem;
 exports.synthSystem = synthSystem;
+exports.completenessPrompt = completenessPrompt;
+exports.synthCheckPrompt = synthCheckPrompt;
 exports.compactorPrompt = compactorPrompt;
 exports.budgetLine = budgetLine;
 const os = __importStar(require("os"));
@@ -72,15 +75,20 @@ DOCTRINE
 2. Make every task self-contained: crisp objective, explicit success criteria ("Done when …"), and every fact/path/URL the worker needs inlined in context. Workers know nothing you don't tell them.
 3. Invent the right specialist role per task (researcher, coder, analyst, data-wrangler, reviewer, writer, …). One concern per task, roughly 5–25 tool steps of work. Bigger → split it. Trivial → batch it.
 4. Software missions: scaffold first (one task), then parallel tasks on DISJOINT files/modules — never two writers on the same file — then an integration + test task that deps on all of them with verify:true.
-5. Research missions: parallel scouts with distinct angles and sources, then a consolidation/analysis task that deps on the scouts.
+5. Research missions: go WIDE. Spawn many parallel scouts (10+ for a broad topic), each owning a distinct sub-question, angle, source type, time period, or entity — so collectively they pull hundreds of sources, not dozens. Tell each scout to use deep web_search (high count) and to record findings with exact URLs/quotes on the blackboard and in artifact files. Then spawn analysis/consolidation tasks that dep on the scouts, and a final synthesis. When one scout's area is itself broad, spawn it with team:true so it fans out further.
 6. Set verify:true on tasks whose failure would poison the mission (builds, integrations, data pipelines, final deliverables). A verification agent will adversarially check them and can fail them back for retry.
 7. React to evidence. Failed/blocked task → diagnose from its report and spawn a corrected or alternative approach (never re-run a failed approach verbatim). Surprising findings → adapt the plan.
 8. Watch the budget shown in every update. As it tightens, cut scope to what the mission truly needs — always deliver value before the cap, never run out mid-flight.
 9. Operator messages override everything. Adjust the plan immediately when one appears.
 10. finish only when the mission's success criteria are demonstrably met, or budget/feasibility forces it. Your finish notes steer the synthesizer that writes the final report.
+11. Model tiers: set model:"cheap" on scouts and bulk extraction, model:"strong" on leads, integration, and verified deliverables. Default tier for everything in between.
+12. Big subsystems: spawn with team:true to run the task as a sub-swarm — its own lead decomposes it into parallel sub-tasks and reports one consolidated result. Use for coherent multi-task chunks ("build the backend", "research all 12 competitors"), not for single jobs.
+13. Beyond ~20 tasks, maintain a living plan with update_plan (mission-plan.md): approach, what's done, what's next, open risks. Rewrite it at phase boundaries — it is pinned into your updates and survives restarts.
+14. Long missions: structure the work into phases with set_phase (e.g. discovery → build → integrate → polish). The current phase and its exit criteria are pinned into every update, so the plan survives even when old history is trimmed.
+15. DELIVERABLES SHIP IN THE FORMAT THE MISSION ACTUALLY NEEDS — a markdown report is the fallback, not the default. Software → running code with build/run instructions; data work → .csv/.json/.sqlite plus a summary; comparisons and datasets → tables in CSV as well as prose; polished documents → styled self-contained .html (the operator reads HTML, not raw markdown); scripts/configs → the runnable files themselves. Spell the expected format and exact filename(s) out in the deliverable task's objective and have it save them with save_artifact.
 RULES
-- Respond ONLY by calling your tools (spawn_tasks / wait / finish). Plain-text replies are ignored.
+- Respond ONLY by calling your tools (spawn_tasks / set_phase / wait / finish). Plain-text replies are ignored. set_phase alone is not a decision — pair it with spawn_tasks, wait, or finish.
 - Never spawn a task whose deps are not yet all created.
 - Keep the total task count within budget (max ${o.maxTasks} per run); make every task earn its place.`;
 }
@@ -98,6 +106,10 @@ function conductorUpdate(p) {
         sections.push(`NEW REPORTS\n${p.reports.join("\n\n")}`);
     if (p.blackboard)
         sections.push(`BLACKBOARD (shared notes digest)\n${p.blackboard}`);
+    if (p.phase)
+        sections.push(p.phase);
+    if (p.plan)
+        sections.push(p.plan);
     sections.push(`SWARM STATE\n${p.taskTable}`);
     sections.push(p.budgetLine);
     if (p.extra)
@@ -108,27 +120,71 @@ function conductorUpdate(p) {
 function taskTable(tasks) { // Renders the task list as text: one full line per task, with old finished work collapsed once the run grows large.
     if (!tasks.length)
         return "(no tasks yet)";
-    return tasks
-        .map((t) => {
+    const line = (t) => { // full-detail line: id, status (+ attempt when > 1), role, deps, clipped title, clipped error
         const deps = t.deps.length ? ` deps:[${t.deps.join(",")}]` : "";
-        const extra = t.status === "failed" && t.error ? ` — ${(0, util_1.clip)(t.error, 80)}` : "";
+        const extra = (t.status === "failed" || t.status === "blocked") && t.error ? ` — ${(0, util_1.clip)(t.error, 120)}` : ""; // error text is appended only for failed/blocked tasks that carry one
         return `${t.id} [${t.status}${t.attempt > 1 ? ` a${t.attempt}` : ""}] (${t.role})${deps} ${(0, util_1.clip)(t.title, 70)}${extra}`;
-    })
-        .join("\n");
+    };
+    const settled = tasks.filter((t) => ["done", "failed", "blocked"].includes(t.status)); // tasks whose status is done, failed, or blocked
+    if (settled.length <= 30) // 30 or fewer settled tasks: no collapsing, every task gets a full line
+        return tasks.map(line).join("\n");
+    // Hundreds of tasks must not flood the conductor's prompt: collapse DONE
+    // tasks in older waves to one line per wave. Failures/blocks stay full-line
+    // forever (they're what the conductor plans around), as do active tasks and
+    // the two most recent waves.
+    const maxWave = Math.max(...tasks.map((t) => t.wave));
+    const out = [];
+    const waves = [...new Set(tasks.map((t) => t.wave))].sort((a, b) => a - b); // distinct wave numbers, ascending
+    for (const w of waves) {
+        const ws = tasks.filter((t) => t.wave === w);
+        const collapsible = w < maxWave - 1 ? ws.filter((t) => t.status === "done") : []; // waves maxWave and maxWave - 1 never collapse
+        const fullLines = ws.filter((t) => !collapsible.includes(t)); // everything in this wave that keeps its own line
+        if (collapsible.length) {
+            out.push(`wave ${w}: ${collapsible.length} done (${collapsible.map((t) => t.id).join(",")})`); // the summary line precedes the wave's full lines
+        }
+        out.push(...fullLines.map(line));
+    }
+    return out.join("\n");
+}
+function sourcesLine(t, max = 6) { // "\nsources: …" suffix listing up to `max` source URLs of task t, or "" when t has no sources
+    if (!t.sources?.length)
+        return "";
+    const shown = t.sources.slice(0, max).map((s) => s.url); // only the url of each source is shown
+    const more = t.sources.length > max ? ` (+${t.sources.length - max} more)` : ""; // how many URLs were left out of the list
+    return `\nsources: ${shown.join(" · ")}${more}`; // leading newline lets reportBlock/depReportBlock append this directly
 }
 function reportBlock(t) { // Formats one task's outcome as a text block: header line, report body, then optional handoff sections.
     const head = `── ${t.id} (${t.role}) "${(0, util_1.clip)(t.title, 60)}" → ${t.status.toUpperCase()}${t.attempt > 1 ? ` (attempt ${t.attempt})` : ""}`;
     const body = t.report ? (0, util_1.clip)(t.report, 1600) : t.error ? `error: ${(0, util_1.clip)(t.error, 400)}` : "(no report)"; // report wins over error; both are passed through clip
+    const facts = t.keyFacts?.length ? `\nkey facts:\n${t.keyFacts.map((f) => `  • ${(0, util_1.clip)(f, 200)}`).join("\n")}` : ""; // one bullet per fact
+    const open = t.openQuestions?.length ? `\nopen questions: ${t.openQuestions.map((q) => (0, util_1.clip)(q, 150)).join(" | ")}` : ""; // single line, " | "-separated
+    const files = t.filesTouched?.length ? `\nfiles touched: ${t.filesTouched.join(", ")}` : "";
     const arts = t.artifacts.length ? `\nartifacts: ${t.artifacts.join(", ")}` : "";
     const fb = t.feedback ? `\nverifier: ${(0, util_1.clip)(t.feedback, 300)}` : "";
-    return `${head}\n${body}${arts}${fb}`;
+    return `${head}\n${body}${facts}${open}${files}${arts}${sourcesLine(t)}${fb}`; // every optional section is "" when absent, so no blank lines appear
+}
+/**
+ * Compact dependency context for a downstream worker: structured handoff
+ * fields in full, prose report as an excerpt — read_report(taskId) has the
+ * rest. Keeps fan-in tasks from inheriting megabytes of ancestor prose.
+ *
+ * Layout: a "── dep …" header line, then the report passed through
+ * clip(…, 1200) (or the error through clip(…, 400), or "(no report)"),
+ * followed by key facts, files touched, artifacts and the sources line.
+ * A pointer to read_report("<id>") is appended last, and only when the
+ * report is longer than 1200 characters.
+ *
+ * Compared with reportBlock, the header omits the attempt number and the
+ * body omits open questions and verifier feedback.
+ */
+function depReportBlock(t) {
+    const head = `── dep ${t.id} (${t.role}) "${(0, util_1.clip)(t.title, 60)}" → ${t.status.toUpperCase()}`;
+    const facts = t.keyFacts?.length ? `\nkey facts:\n${t.keyFacts.map((f) => `  • ${(0, util_1.clip)(f, 200)}`).join("\n")}` : "";
+    const files = t.filesTouched?.length ? `\nfiles touched: ${t.filesTouched.join(", ")}` : "";
+    const arts = t.artifacts.length ? `\nartifacts: ${t.artifacts.join(", ")}` : "";
+    const full = (t.report ?? "").length > 1200 ? `\n(excerpt — full text: read_report("${t.id}"))` : ""; // same 1200 threshold as the clip below
+    const body = t.report ? (0, util_1.clip)(t.report, 1200) : t.error ? `error: ${(0, util_1.clip)(t.error, 400)}` : "(no report)";
+    return `${head}\n${body}${facts}${files}${arts}${sourcesLine(t)}${full}`;
 }
 // ============================================================ workers
 const ROLE_HINTS = { // role name → craft guidance text; presumably looked up by the worker's role when the worker prompt is built (lookup not visible in this excerpt — confirm)
-    researcher: "Research craft: triangulate across independent sources; prefer primary docs over blog spam; capture exact figures, dates, URLs. Search several distinct phrasings before concluding something is unfindable.",
+    researcher: "Research craft: be exhaustive. Run deep web_search (deep=true, high count) across several distinct phrasings — pull DOZENS of sources for your sub-question, not three. Triangulate across independent sources; prefer primary docs and official sources over blog spam; capture exact figures, dates, and URLs, and keep the quotable passages the search returns. Record key findings as blackboard notes (with url=<source>) and save a structured markdown file of your sources+findings as an artifact so the synthesizer can build on it. " +
+        "A finding without a source is an opinion: list EVERY source your findings rest on in report(...)'s `sources` field (url + what it supports) — only sources reported there can be cited in the final deliverable. When independent sources disagree on a material fact, post note(kind:'conflict') naming both sources and the discrepancy — never silently pick one. For scientific or technical questions, also run academic_search (arXiv + Crossref) — peer-reviewed beats blog posts. " +
+        "If a crawl_site tool is available, use it to ingest whole documentation sites or multi-page sources into local markdown files, then read the saved files — far cheaper and broader than fetching pages one by one.",
     coder: "Engineering craft: read existing code before changing it; match its conventions; build/run/test after every meaningful change and include the command + result in your report. Leave the tree compiling.",
     analyst: "Analysis craft: quantify wherever possible; state assumptions explicitly; separate observation from interpretation; sanity-check numbers twice.",
-    writer: "Writing craft: structure before prose; concrete over abstract; cut filler. Match the audience and purpose given in the objective.",
+    writer: "Writing craft: structure before prose; concrete over abstract; cut filler. Match the audience and purpose given in the objective. Deliver in the format the objective calls for — for polished documents prefer a styled, self-contained .html file (inline CSS, readable typography, real tables) over raw markdown; ship data tables as .csv alongside the prose.",
     reviewer: "Review craft: be adversarial; try to break it; check edge cases and the unhappy path; verify claims against the actual files, not the description.",
     "data-wrangler": "Data craft: validate schema and row counts at every step; spot-check samples; never silently drop rows — report anomalies.",
 };
@@ -140,13 +196,16 @@ function workerSystem(opts) {
         : task.attempt > 1 && task.error
             ? `\nPREVIOUS ATTEMPT FAILED: ${task.error}\nTake a different approach.\n`
             : "";
+    const checkpoint = task.lastCheckpoint
+        ? `\nPROGRESS CHECKPOINT FROM A PREVIOUS ATTEMPT (the run was interrupted or retried — do not redo completed work blindly):\n${task.lastCheckpoint}\nRe-verify the state it describes (files, commands) before re-creating anything, then continue from where it left off.\n`
+        : "";
     return `You are ${opts.agentId}, a ${opts.role} agent in a swarm pursuing this mission:
 ${meta.mission}
 YOUR TASK — ${task.id} (attempt ${task.attempt})
 ${task.title}
 Objective: ${task.objective}
-${task.context ? `Context from the conductor:\n${task.context}\n` : ""}${retry}
+${task.context ? `Context from the conductor:\n${task.context}\n` : ""}${retry}${checkpoint}
 CONTEXT FROM THE SWARM
 ${opts.depReports || "(no dependency reports)"}
 ${opts.blackboard ? `Blackboard digest:\n${opts.blackboard}` : ""}
@@ -158,12 +217,15 @@ OPERATING PROTOCOL
 - You are fully autonomous. Never ask questions; decide and act.
 - Plan briefly, then act in small verified steps: after changing anything, prove it worked (run it, read it back, test it).
 - Evidence over assumption: read before you edit; check outputs; cite concrete paths, commands and numbers.
-- Be token-lean: targeted reads (line ranges, grep via shell) over wholesale dumps; don't re-read unchanged files.
+- Be token-lean: targeted reads (line ranges, grep_files) over wholesale dumps; don't re-read unchanged files. Several edits to one file → one replace_in_file call with edits[].
 - Post durable discoveries other agents will need to the blackboard with note(...) — facts only, used sparingly.
-- Save deliverable files with save_artifact so the operator sees them.
+- Editing files other tasks might also touch? First search_notes for claims, then post note(kind:"claim", key:"<path>") before editing. Claims are advisory — coordinate, don't fight.
+- Save deliverable files with save_artifact so the operator sees them. Pick the format that genuinely fits the deliverable — structured data as .csv/.json, polished documents as self-contained .html, code as runnable files — not everything is a markdown report.
+- On long tasks, call checkpoint(...) after each major chunk so an interrupted run resumes warm instead of from scratch.
 - Genuinely impossible / missing prerequisite → report(status:"blocked", …) early instead of thrashing.
 - You have at most ${opts.maxSteps} tool steps. Budget them.
-- ALWAYS end by calling report(...). The conductor sees ONLY that report — it is the entire value of your work. Specific beats vague: what you did, what you verified, key findings, exact paths.
+- Dependency reports above are excerpts; use read_report(task_id) for full text, and search_notes(query) to find facts posted earlier in the run.
+- ALWAYS end by calling report(...). The conductor sees ONLY that report — it is the entire value of your work. Specific beats vague: what you did, what you verified, key findings, exact paths. Fill key_facts (standalone facts downstream tasks need), open_questions, and files_touched — they are handed verbatim to dependent tasks. If your work drew on the web, fill sources (url + what it supports): only sources reported there can be cited in the final deliverable.
 ${roleHint ? "\n" + roleHint : ""}`;
 }
 exports.WORKER_KICKOFF = "Begin now. Work the task to completion, then call report(...).";
@@ -173,7 +235,7 @@ function forcedFinal(reason) {
     return `${reason} Stop working and call your terminal tool RIGHT NOW with your best honest account: what you completed, what you verified, what remains.`;
 }
 // ============================================================ verifier
-function verifierSystem(meta, task) {
+function verifierSystem(meta, task, depReports = "") {
     return `You are an adversarial verification agent. A worker claims it completed this task — your job is to try to falsify that claim with evidence.
 MISSION (for context): ${(0, util_1.clip)(meta.mission, 400)}
@@ -184,14 +246,18 @@ ${task.context ? `Context: ${(0, util_1.clip)(task.context, 600)}` : ""}
 Worker's report:
 ${(0, util_1.clip)(task.report ?? "", 2400)}
 ${task.artifacts.length ? `Claimed artifacts: ${task.artifacts.join(", ")}` : ""}
+${depReports ? `\nUPSTREAM INPUTS (settled dependency reports — what this task had to build on; judge completeness against them):\n${depReports}\n` : ""}
 Working directory: ${meta.cwd}
 PROTOCOL
-- Do NOT trust the report. Verify concretely with tools: read the files it claims to have written, run the build/tests/commands, fetch the URLs, check the numbers.
-- Check: objective met? success criteria satisfied? deliverables exist and are non-trivial (not stubs/placeholders)?
+- Do NOT trust the report. Verify concretely with tools: read the files it claims to have written, run the build/tests/commands, fetch the URLs, check the numbers. You see only the worker's CLAIMS — gather your own evidence; do not assume shared context.
+- RUBRIC — fail unless all hold:
+  1. Completeness: every part of the objective and its "Done when" criteria is addressed${depReports ? " (including everything the upstream inputs handed over)" : ""}.
+  2. Evidence: each substantive claim in the report is backed by something you verified yourself.
+  3. Deliverables: claimed files/artifacts exist, are non-trivial (not stubs/placeholders), and match what the report says about them.
+  4. Correctness: commands/builds/tests the task implies actually succeed when you run them.
 - Spot-check depth over exhaustive breadth; ~5-12 tool steps.
-- Then call verdict(pass, feedback). On fail, feedback must be actionable: exactly what is wrong and where. On pass, one line citing the evidence you checked.`;
+- Then call verdict(pass, feedback, issues). On fail, ALSO fill issues — one entry per concrete problem with the evidence you gathered and the exact change needed; the worker's retry sees them verbatim. On pass, feedback is one line citing the evidence you checked.`;
 }
 exports.VERIFIER_KICKOFF = "Verify now, then call verdict(...).";
 // ============================================================ synthesizer
@@ -207,17 +273,47 @@ Conductor's closing notes: ${opts.finishNotes || "(none)"}
 ALL TASK REPORTS
 ${opts.reports}
-${opts.blackboard ? `BLACKBOARD\n${opts.blackboard}\n` : ""}${opts.artifactList ? `ARTIFACTS ON DISK\n${opts.artifactList}\n` : ""}
+${opts.sources ? `SOURCES (numbered, deduplicated from the task reports — the only sources that exist)\n${opts.sources}\n\n` : ""}${opts.blackboard ? `BLACKBOARD\n${opts.blackboard}\n` : ""}${opts.artifactList ? `ARTIFACTS ON DISK\n${opts.artifactList}\n` : ""}
 Working directory: ${opts.meta.cwd}
 PROTOCOL
 - You may read files (read_file / list_dir) to confirm specifics before writing — verify key claims you repeat.
-- Then call submit_final with:
-  • report_markdown — the deliverable document. Structure: # title; **Outcome** first (did the mission succeed, headline results); then What was built/found with evidence and exact paths; How to use/run it (if applicable); Open issues & recommended next steps. Write for the operator: complete, concrete, zero filler.
+- The mission's PRIMARY deliverable should exist in the format that serves it best, not only as prose. If the task reports produced data, comparisons, or rankings that the artifacts don't already capture in a structured form, save them now with save_artifact (e.g. data/results.csv, data/findings.json) before submitting. Don't duplicate artifacts that already exist — point to them.
+${opts.sources ? `- CITE YOUR SOURCES: where a claim rests on a numbered source, cite it inline as [n]. End report_markdown with a \`## Sources\` section listing each number you actually cited as a markdown link ([n] [title](url)). Never invent a source or cite a number not in the list. Where sources conflict, present both positions with their citations — do not silently pick one.\n` : ""}- Then call submit_final with:
+  • report_markdown — the deliverable document. Structure: # title; **Outcome** first (did the mission succeed, headline results); then What was built/found with evidence and exact paths; How to use/run it (if applicable); Open issues & recommended next steps. Write for the operator: complete, concrete, zero filler. Use real markdown tables for tabular findings. (A styled HTML rendering is generated automatically — do not hand-write one.)
   • summary — ≤8 sentences for the console.
 - The report stands alone: a reader who saw nothing else must understand what happened and where everything is.`;
 }
 exports.SYNTH_KICKOFF = "Compose and submit the final deliverable now via submit_final(...).";
+// ============================================================ completeness / synthesis checks
+function completenessPrompt(mission, taskTableStr, reports) { // Builds the pre-finish gap-check prompt from three strings; the model must answer exactly "COMPLETE" or list at most 5 concrete gaps
+    return `You are a completeness critic for an agent-swarm run that is about to finish. Given the mission and what was actually delivered, list any REAL gaps: parts of the mission not addressed, claims with no supporting task, or deliverables that were promised but never produced.
+MISSION
+${mission}
+TASKS
+${taskTableStr}
+TASK REPORTS
+${reports}
+Reply with EXACTLY "COMPLETE" if the mission's requirements are genuinely covered. Otherwise reply with a short numbered list of concrete gaps (max 5), each one actionable enough to become a task. Do not invent nice-to-haves — only true gaps against the stated mission.`;
+}
+function synthCheckPrompt(mission, reports, finalReport, sources) { // Builds the faithfulness-check prompt for the final report (expected reply: exactly "OK" or up to 5 discrepancies); the source list and citation rules appear only when `sources` is truthy
+    return `You are checking a final mission report for faithfulness before delivery. Compare it against the underlying task reports.
+MISSION
+${mission}
+TASK REPORTS (ground truth)
+${reports}
+${sources ? `SOURCE LIST (the only citable sources)\n${sources}\n\n` : ""}FINAL REPORT (to check)
+${finalReport}
+Reply with EXACTLY "OK" if the final report's claims are supported by the task reports and nothing material is misrepresented or fabricated${sources ? ", its inline [n] citations all reference numbers that exist in the source list, and no key web-derived factual claim is left uncited" : ""}. Otherwise list the specific discrepancies (max 5), each citing what the final report says vs what the task reports support.`;
+}
 // ============================================================ compaction
 function compactorPrompt(serialized) {
     return `Compress this agent conversation segment into a dense progress summary the agent can rely on to continue working. Preserve: decisions made, files created/modified (exact paths), commands run and their outcomes, key findings/numbers/URLs, errors hit and how they were resolved, current state of the work, and anything still pending. Omit pleasantries and dead ends unless they prevent repeating a mistake. Output the summary only.
@@ -228,5 +324,10 @@ ${serialized}`;
 // ============================================================ misc
 function budgetLine(spent, cap) { // One-line budget status: tokens used vs cap, percentage, estimated cost; reads spent.total and spent.cost
     const pct = cap > 0 ? Math.round((spent.total / cap) * 100) : 0; // a non-positive cap reports 0% instead of dividing by it
-    return `BUDGET: ${(0, util_1.fmtTokens)(spent.total)} of ${(0, util_1.fmtTokens)(cap)} tokens used (${pct}%) · est. cost so far $${spent.cost.toFixed(2)}`;
+    const urgency = pct >= 90 // escalating suffix: hard wind-down from 90%, softer note from 75%, nothing below that
+        ? " ⚠ WIND DOWN NOW: stop spawning new work, consolidate what exists, and finish before the cap."
+        : pct >= 75
+            ? " Note: budget is tightening — prefer consolidation over new exploration."
+            : "";
+    return `BUDGET: ${(0, util_1.fmtTokens)(spent.total)} of ${(0, util_1.fmtTokens)(cap)} tokens used (${pct}%) · est. cost so far $${spent.cost.toFixed(2)}${urgency}`;
 }

package/dist/report.js ADDED Viewed

@@ -0,0 +1,289 @@
+"use strict";
+/**
+ * Dependency-free markdown → styled HTML rendering for final reports.
+ *
+ * Every run writes artifacts/final-report.html next to final-report.md so the
+ * operator always gets a readable, shareable document — even for fallback and
+ * failure reports. This is NOT a full CommonMark implementation; it covers the
+ * subset models actually emit in reports: headings, paragraphs, lists (nested),
+ * fenced code, inline code, bold/italic, links, images, tables, blockquotes,
+ * and horizontal rules. Unknown constructs degrade to escaped text — never to
+ * broken markup.
+ */
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.aggregateSources = aggregateSources;
+exports.sourcesBlock = sourcesBlock;
+exports.mdToHtml = mdToHtml;
+exports.renderFinalHtml = renderFinalHtml;
+const searchcore_1 = require("./searchcore");
+/**
+ * Dedupe every task's reported sources (by canonical URL) into one numbered
+ * bibliography for the synthesizer. First occurrence wins the number; later
+ * tasks fill in missing titles/dates (and notes).
+ *
+ * @param tasks Tasks to scan. Each is read for `id` and an optional `sources`
+ *   array (nullish is treated as empty) whose items carry `url` and optional
+ *   `title`, `date`, `note`.
+ * @returns Array of merged entries in first-seen order. Each entry is a copy
+ *   of the first-seen source's fields plus `n` (1-based number) and `taskIds`
+ *   (ids of the tasks that reported it, each listed once).
+ */
+function aggregateSources(tasks) {
+    const byKey = new Map(); // canonical URL → merged entry; Map iteration keeps insertion order
+    for (const t of tasks) {
+        for (const s of t.sources ?? []) {
+            const key = (0, searchcore_1.canonicalizeUrl)(s.url); // dedupe key; the normalization rules live in ./searchcore and are not shown here
+            const cur = byKey.get(key);
+            if (cur) {
+                if (!cur.taskIds.includes(t.id))
+                    cur.taskIds.push(t.id);
+                if (!cur.title && s.title) // existing values are never overwritten, only gaps are filled
+                    cur.title = s.title;
+                if (!cur.date && s.date)
+                    cur.date = s.date;
+                if (!cur.note && s.note)
+                    cur.note = s.note;
+            }
+            else {
+                byKey.set(key, { ...s, n: byKey.size + 1, taskIds: [t.id] }); // the entry keeps the first-seen (non-canonicalized) url from s
+            }
+        }
+    }
+    return [...byKey.values()];
+}
+/**
+ * Render the numbered source list for prompts (one line per source).
+ *
+ * Line shape: `[n] title — url (date) — note [cited by id1,id2]`, where the
+ * title, date and note parts are left out when the field is empty. Expects
+ * entries carrying `n`, `url` and `taskIds`, i.e. the shape that
+ * aggregateSources returns.
+ */
+function sourcesBlock(sources) {
+    return sources
+        .map((s) => `[${s.n}] ${s.title ? `${s.title} — ` : ""}${s.url}${s.date ? ` (${s.date})` : ""}${s.note ? ` — ${s.note}` : ""} [cited by ${s.taskIds.join(",")}]`)
+        .join("\n");
+}
+function esc(s) { // HTML-escapes &, <, > and " in s; single quotes are left as-is
+    return s
+        .replace(/&/g, "&amp;") // runs first so the entities produced below are not escaped a second time
+        .replace(/</g, "&lt;")
+        .replace(/>/g, "&gt;")
+        .replace(/"/g, "&quot;");
+}
+/**
+ * Inline markdown on an already-escaped string. Code spans are opaque.
+ *
+ * Converts, in this order: backtick code spans, images, links, bold-italic
+ * (three asterisks), bold (two asterisks), italic (one asterisk or
+ * underscores), and finally bare http(s) URLs. The input must already be
+ * HTML-escaped: captured text is inserted into the generated tags and
+ * attributes without further escaping.
+ *
+ * Images accept only http(s) targets; links accept http(s), `#` fragments
+ * and paths starting with `/`, `./` or `../`. Targets containing
+ * parentheses or whitespace are not matched and stay as plain text.
+ *
+ * NOTE(review): the bare-URL pass runs over the markup produced by the
+ * image/link passes, so a URL that follows a space or "(" inside link text
+ * or image alt text looks like it would be wrapped in a second <a> — confirm
+ * whether that case needs guarding.
+ *
+ * @param s HTML-escaped source text for one inline run.
+ * @returns HTML string.
+ */
+function inline(s) {
+    const out = [];
+    // Split on code spans first so no other rule fires inside them.
+    const parts = s.split(/(`+[^`]*`+)/g); // the capturing group keeps the backtick-delimited pieces in the result
+    for (const part of parts) {
+        const code = /^(`+)([^`]*)\1$/.exec(part); // a code span only when opening and closing backtick runs have equal length
+        if (code) {
+            out.push(`<code>${code[2].trim() || "`"}</code>`); // content that trims to empty renders as a single backtick
+            continue;
+        }
+        let t = part;
+        // Images before links (same bracket syntax).
+        t = t.replace(/!\[([^\]]*)\]\((https?:[^()\s]+)\)/g, '<img src="$2" alt="$1" loading="lazy">');
+        t = t.replace(/\[([^\]]+)\]\(((?:https?:|#|\.{0,2}\/)[^()\s]*)\)/g, '<a href="$2" target="_blank" rel="noopener">$1</a>');
+        t = t.replace(/\*\*\*([^*]+)\*\*\*/g, "<strong><em>$1</em></strong>"); // longest asterisk run first so it is not consumed by the bold/italic rules below
+        t = t.replace(/\*\*([^*]+)\*\*/g, "<strong>$1</strong>");
+        t = t.replace(/(^|[\s(])\*([^*\s][^*]*)\*/g, "$1<em>$2</em>"); // must follow start, whitespace or "(" and must not open with whitespace
+        t = t.replace(/(^|[\s(])_([^_\s][^_]*)_(?=[\s.,;:!?)]|$)/g, "$1<em>$2</em>"); // underscores additionally need whitespace, punctuation or end of string after the closing one
+        // Bare URLs become links (escaped text, so no quotes can appear inside).
+        t = t.replace(/(^|[\s(])(https?:\/\/[^\s<)]+[^\s<).,;:!?])/g, '$1<a href="$2" target="_blank" rel="noopener">$2</a>'); // the final character class keeps trailing punctuation out of the link
+        out.push(t);
+    }
+    return out.join("");
+}
+/**
+ * Render a Markdown string as an HTML fragment.
+ *
+ * Works line by line and recognises, in this order: fenced code blocks
+ * (``` or ~~~), blockquotes (collected and rendered recursively), blank
+ * lines, ATX headings (# .. ######), horizontal rules, pipe tables with a
+ * |---| separator row, list items (ordered / unordered, nested by
+ * indentation), indented continuation lines of a list item, and finally
+ * plain paragraph text. Text is passed through `esc` and then `inline`.
+ *
+ * @param md Markdown source; CRLF line endings are normalised to LF.
+ * @returns The generated HTML pieces joined with "\n".
+ */
+function mdToHtml(md) {
+    const LIST_MARKER = /^(\s*)([-*+]|\d+[.)])\s+/;
+    const LIST_ITEM = /^(\s*)([-*+]|\d+[.)])\s+(.*)$/;
+    const src = md.replace(/\r\n/g, "\n").split("\n");
+    const out = [];
+    // Stack of currently open lists, innermost last: { indent, tag }.
+    const open = [];
+    let paragraph = [];
+    let quoted = [];
+    const render = (text) => inline(esc(text));
+    const innermost = () => open[open.length - 1];
+    const popList = () => {
+        out.push(`</li></${open.pop().tag}>`);
+    };
+    // Close every open list indented deeper than `toIndent` (default: all).
+    const closeLists = (toIndent = -1) => {
+        while (open.length > 0 && innermost().indent > toIndent) {
+            popList();
+        }
+    };
+    const startList = (indent, tag, text) => {
+        open.push({ indent, tag });
+        out.push(`<${tag}><li>${render(text)}`);
+    };
+    const flushPara = () => {
+        if (paragraph.length === 0) {
+            return;
+        }
+        out.push(`<p>${render(paragraph.join(" "))}</p>`);
+        paragraph = [];
+    };
+    const flushQuote = () => {
+        if (quoted.length === 0) {
+            return;
+        }
+        out.push(`<blockquote>${mdToHtml(quoted.join("\n"))}</blockquote>`);
+        quoted = [];
+    };
+    const flushAll = () => {
+        flushPara();
+        flushQuote();
+        closeLists();
+    };
+    // Split one table row into rendered cells, ignoring the outer pipes.
+    const splitRow = (row) => row
+        .trim()
+        .replace(/^\|/, "")
+        .replace(/\|$/, "")
+        .split("|")
+        .map((cell) => render(cell.trim()));
+    for (let n = 0; n < src.length; n++) {
+        const line = src[n];
+        // Fenced code: swallow lines verbatim up to the closing fence (or EOF).
+        const fence = /^\s*(```|~~~)\s*(\S*)/.exec(line);
+        if (fence) {
+            flushAll();
+            const code = [];
+            n += 1;
+            while (n < src.length && !src[n].trim().startsWith(fence[1])) {
+                code.push(src[n]);
+                n += 1;
+            }
+            const langAttr = fence[2] ? ` class="lang-${esc(fence[2])}"` : "";
+            out.push(`<pre><code${langAttr}>${esc(code.join("\n"))}</code></pre>`);
+            continue;
+        }
+        // Blockquote lines are gathered and rendered as one nested document.
+        const quoteMatch = /^\s*>\s?(.*)$/.exec(line);
+        if (quoteMatch) {
+            flushPara();
+            closeLists();
+            quoted.push(quoteMatch[1]);
+            continue;
+        }
+        flushQuote();
+        const trimmed = line.trim();
+        // A blank line ends the paragraph; it ends the list run too unless
+        // the next non-blank line is another list item.
+        if (!trimmed) {
+            flushPara();
+            if (open.length > 0) {
+                let next = n + 1;
+                while (next < src.length && !src[next].trim()) {
+                    next += 1;
+                }
+                const listGoesOn = next < src.length && LIST_MARKER.test(src[next]);
+                if (!listGoesOn) {
+                    closeLists();
+                }
+            }
+            continue;
+        }
+        // ATX heading; optional trailing #'s are dropped.
+        const heading = /^(#{1,6})\s+(.*)$/.exec(trimmed);
+        if (heading) {
+            flushAll();
+            const [, hashes, rawTitle] = heading;
+            const depth = hashes.length;
+            const title = rawTitle.replace(/\s+#+\s*$/, "");
+            out.push(`<h${depth}>${render(title)}</h${depth}>`);
+            continue;
+        }
+        // Horizontal rule (checked before lists so "* * *" is not a bullet).
+        if (/^\s*([-*_])\s*(\1\s*){2,}$/.test(line)) {
+            flushAll();
+            out.push("<hr>");
+            continue;
+        }
+        // Table: a row containing "|" followed by a |---| separator row.
+        const divider = src[n + 1];
+        const isTable = line.includes("|") &&
+            n + 1 < src.length &&
+            /^\s*\|?[\s:|-]+\|[\s:|-]*$/.test(divider) &&
+            divider.includes("-");
+        if (isTable) {
+            flushAll();
+            const header = splitRow(line).map((cell) => `<th>${cell}</th>`).join("");
+            const body = [];
+            let r = n + 2;
+            while (r < src.length && src[r].includes("|") && src[r].trim()) {
+                const tds = splitRow(src[r]).map((cell) => `<td>${cell}</td>`).join("");
+                body.push(`<tr>${tds}</tr>`);
+                r += 1;
+            }
+            // Leave the index on the last table row; the loop increment moves on.
+            n = r - 1;
+            out.push(`<table><thead><tr>${header}</tr></thead><tbody>${body.join("")}</tbody></table>`);
+            continue;
+        }
+        // List item: deeper indent nests, equal indent continues (or swaps
+        // ul/ol), shallower indent closes the deeper lists first.
+        const item = LIST_ITEM.exec(line);
+        if (item) {
+            flushPara();
+            const [, lead, marker, text] = item;
+            const indent = lead.length;
+            const tag = /\d/.test(marker) ? "ol" : "ul";
+            const parent = innermost();
+            if (!parent || indent > parent.indent) {
+                startList(indent, tag, text);
+                continue;
+            }
+            closeLists(indent);
+            const level = innermost();
+            if (level && level.indent === indent && level.tag !== tag) {
+                popList();
+            }
+            if (open.length > 0 && innermost().indent === indent) {
+                out.push(`</li><li>${render(text)}`);
+            }
+            else {
+                startList(indent, tag, text);
+            }
+            continue;
+        }
+        // Indented text while a list is open extends the current item.
+        if (open.length > 0 && /^\s{2,}\S/.test(line)) {
+            out.push(` ${render(trimmed)}`);
+            continue;
+        }
+        // Anything else is paragraph text.
+        closeLists();
+        paragraph.push(trimmed);
+    }
+    flushAll();
+    return out.join("\n");
+}
+/**
+ * Stylesheet text that renderFinalHtml inlines into the document's <style>
+ * element. Includes prefers-color-scheme: dark overrides for body and links.
+ */
+const CSS = `
+:root { color-scheme: light dark; }
+* { box-sizing: border-box; }
+body {
+  margin: 0; padding: 48px 24px 96px;
+  font: 16px/1.65 -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", sans-serif;
+  background: #fcfcfa; color: #1c1c1a;
+}
+@media (prefers-color-scheme: dark) { body { background: #131312; color: #e8e6e1; } }
+main { max-width: 860px; margin: 0 auto; }
+header.run-meta {
+  max-width: 860px; margin: 0 auto 36px; padding-bottom: 20px;
+  border-bottom: 1px solid rgba(128,128,128,.25);
+  font-size: 13px; color: #6e6e68; display: flex; flex-wrap: wrap; gap: 8px 18px; align-items: center;
+}
+.badge { padding: 2px 10px; border-radius: 999px; font-weight: 600; font-size: 12px; letter-spacing: .02em; }
+.badge.done { background: rgba(34,160,84,.14); color: #1d8a4c; }
+.badge.failed { background: rgba(214,60,60,.14); color: #c23b3b; }
+.badge.cancelled { background: rgba(150,150,150,.18); color: #77756f; }
+h1, h2, h3, h4 { line-height: 1.25; letter-spacing: -0.012em; }
+h1 { font-size: 30px; margin: 0 0 18px; }
+h2 { font-size: 22px; margin: 36px 0 12px; }
+h3 { font-size: 18px; margin: 28px 0 10px; }
+a { color: #2563c4; text-decoration: none; }
+a:hover { text-decoration: underline; }
+@media (prefers-color-scheme: dark) { a { color: #7aa7e8; } }
+code {
+  font: 13.5px/1.5 ui-monospace, "SF Mono", Menlo, Consolas, monospace;
+  background: rgba(128,128,128,.13); padding: 1.5px 5px; border-radius: 4px;
+}
+pre {
+  background: rgba(128,128,128,.09); border: 1px solid rgba(128,128,128,.18);
+  border-radius: 8px; padding: 14px 16px; overflow-x: auto;
+}
+pre code { background: none; padding: 0; }
+blockquote {
+  margin: 16px 0; padding: 2px 18px; border-left: 3px solid rgba(128,128,128,.35);
+  color: #6e6e68;
+}
+table { border-collapse: collapse; margin: 18px 0; width: 100%; font-size: 14.5px; }
+th, td { border: 1px solid rgba(128,128,128,.25); padding: 7px 12px; text-align: left; vertical-align: top; }
+th { background: rgba(128,128,128,.08); }
+img { max-width: 100%; border-radius: 6px; }
+hr { border: none; border-top: 1px solid rgba(128,128,128,.25); margin: 32px 0; }
+ul, ol { padding-left: 26px; }
+li { margin: 3px 0; }
+`;
+/**
+ * Build the self-contained HTML document for a finished run (inline CSS, no
+ * scripts, no external fetches).
+ *
+ * The page title is the first level-1 Markdown heading in `o.markdown`, or
+ * `o.mission` when there is none, truncated to 120 characters.
+ *
+ * @param o Run data. Fields read here: `markdown` (rendered via mdToHtml),
+ *   `mission`, `finishedAt` (passed to `new Date`), `status` (used as badge
+ *   text and CSS class) and `runId`.
+ * @returns The complete HTML document as a string.
+ */
+function renderFinalHtml(o) {
+    const title = /^#\s+(.+)$/m.exec(o.markdown)?.[1] ?? o.mission;
+    // "YYYY-MM-DD HH:MM UTC": ISO timestamp cut off after the minutes.
+    const date = new Date(o.finishedAt).toISOString().replace("T", " ").slice(0, 16) + " UTC";
+    // Escape the status like every other interpolated field: it lands in both
+    // an attribute value and element text, so it must not be able to break
+    // out of either. String() keeps non-string values rendering as before.
+    const status = esc(String(o.status));
+    return `<!doctype html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<title>${esc(title.slice(0, 120))}</title>
+<style>${CSS}</style>
+</head>
+<body>
+<header class="run-meta">
+  <span class="badge ${status}">${status}</span>
+  <span>run ${esc(o.runId)}</span>
+  <span>${esc(date)}</span>
+  <span title="${esc(o.mission.slice(0, 600))}">mission: ${esc(o.mission.length > 90 ? o.mission.slice(0, 90) + "…" : o.mission)}</span>
+</header>
+<main>
+${mdToHtml(o.markdown)}
+</main>
+</body>
+</html>
+`;
+}

package/dist/run.js CHANGED Viewed

@@ -126,8 +126,13 @@ const summaryCache = new Map();
  */
 const liveCache = new Map();
 const TERMINAL_STATUSES = ["done", "failed", "cancelled"];
-/** Grace before a silent, pid-less run is presumed dead (engine startup, fs lag). */
-const STALE_AFTER_MS = 20_000;
+/**
+ * Grace period in milliseconds (45 s) before a silent, pid-less run is
+ * presumed dead. The pid file is the primary live signal; this window only
+ * covers engine startup (before writePid) and filesystem lag. It is generous
+ * enough that slow disks and slow provider preflights never flag a healthy
+ * run as interrupted.
+ */
+const STALE_AFTER_MS = 45_000;
 /**
  * A run whose engine process vanished without writing a terminal status
  * (kill -9, reboot) would otherwise show "running" forever. Presentation-level
@@ -193,6 +198,14 @@ function listRuns(pricing) {
         s.pid = readPid(id);
         out.push(applyLiveness(s));
     }
+    // Deleted runs must not pin their reduced state in a long-lived hub forever.
+    const live = new Set(ids);
+    for (const key of summaryCache.keys())
+        if (!live.has(key))
+            summaryCache.delete(key);
+    for (const key of liveCache.keys())
+        if (!live.has(key))
+            liveCache.delete(key);
     out.sort((a, b) => b.createdAt - a.createdAt);
     return out;
 }

package/dist/sandbox.js CHANGED Viewed

@@ -302,7 +302,17 @@ class RemoteRuntime {
             throw new Error(`${what} failed (exit ${r.code}): ${r.out.slice(0, 300)}`);
         return r.out;
     }
+    /**
+     * Refuse files that are too large for a base64-over-shell transfer,
+     * which buffers the whole file.
+     *
+     * The size is measured with `wc -c`; if the output is not a finite
+     * number the check passes silently.
+     *
+     * @param abs Path of the file to measure.
+     * @param capBytes Largest size, in bytes, that is still accepted.
+     * @param what Label that prefixes the error message.
+     * @throws Error when the measured size is greater than `capBytes`.
+     */
+    async checkSize(abs, capBytes, what) {
+        const raw = await this.execOk(`wc -c < ${shq(abs)}`, `stat ${abs}`);
+        const bytes = Number(raw.trim());
+        const overCap = Number.isFinite(bytes) && bytes > capBytes;
+        if (!overCap) {
+            return;
+        }
+        const toMb = (n) => Math.round(n / 1e6);
+        throw new Error(`${what}: file is ${toMb(bytes)}MB (cap ${toMb(capBytes)}MB) — ` +
+            `compress it or extract the relevant part in the sandbox first`);
+    }
+    /**
+     * Read the file at `abs` and return its contents as a UTF-8 string.
+     * Files larger than 4,000,000 bytes are rejected by checkSize before any
+     * data is transferred. The contents come back as base64 through execOk,
+     * have all whitespace removed, and are then decoded.
+     */
     async readFile(abs) {
+        await this.checkSize(abs, 4_000_000, `read ${abs}`);
         const out = await this.execOk(`base64 < ${shq(abs)}`, `read ${abs}`);
         return Buffer.from(out.replace(/\s+/g, ""), "base64").toString("utf8");
     }
@@ -319,6 +329,7 @@ class RemoteRuntime {
         }
     }
     async pull(remoteAbs, localAbs) {
+        await this.checkSize(remoteAbs, 32_000_000, `pull ${remoteAbs}`);
         const out = await this.execOk(`base64 < ${shq(remoteAbs)}`, `pull ${remoteAbs}`);
         (0, util_1.ensureDir)(path.dirname(localAbs));
         fs.writeFileSync(localAbs, Buffer.from(out.replace(/\s+/g, ""), "base64"));