@robzilla1738/agentswarm 0.2.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +36 -5
- package/dist/agent.js +64 -32
- package/dist/cli.js +18 -4
- package/dist/config.js +35 -5
- package/dist/crawltools.js +247 -0
- package/dist/deepseek.js +125 -10
- package/dist/executor.js +771 -122
- package/dist/hub.js +40 -3
- package/dist/journal.js +61 -11
- package/dist/memory.js +83 -0
- package/dist/prompts.js +109 -16
- package/dist/report.js +252 -0
- package/dist/run.js +7 -2
- package/dist/searchcore.js +191 -0
- package/dist/state.js +57 -3
- package/dist/tools.js +202 -12
- package/dist/webtools.js +191 -60
- package/package.json +3 -2
- package/ui/out/404/index.html +1 -1
- package/ui/out/404.html +1 -1
- package/ui/out/_next/static/chunks/532-35122e93f37719b9.js +1 -0
- package/ui/out/_next/static/chunks/677-859e8d42add1806b.js +1 -0
- package/ui/out/_next/static/chunks/app/page-dc9f6744d203e76c.js +1 -0
- package/ui/out/_next/static/chunks/app/run/page-2420c9e4c963d9b3.js +1 -0
- package/ui/out/_next/static/chunks/app/settings/page-092a6bf42dfde57d.js +1 -0
- package/ui/out/_next/static/css/9f7bd82b8e4c762c.css +3 -0
- package/ui/out/fonts/PlanetKosmos.ttf +0 -0
- package/ui/out/index.html +1 -1
- package/ui/out/index.txt +3 -3
- package/ui/out/run/index.html +1 -1
- package/ui/out/run/index.txt +3 -3
- package/ui/out/settings/index.html +1 -1
- package/ui/out/settings/index.txt +3 -3
- package/ui/out/_next/static/chunks/383-289a866b246b41cc.js +0 -1
- package/ui/out/_next/static/chunks/619-ba102abea3e3d0e4.js +0 -1
- package/ui/out/_next/static/chunks/677-b37981ba0eca75b2.js +0 -1
- package/ui/out/_next/static/chunks/app/page-0c9f35bd4aa8e370.js +0 -1
- package/ui/out/_next/static/chunks/app/run/page-13dc41a57e34da71.js +0 -1
- package/ui/out/_next/static/chunks/app/settings/page-a1763be7f6de888c.js +0 -1
- package/ui/out/_next/static/css/82edaa7a5942f894.css +0 -3
- /package/ui/out/_next/static/{eiQeDU9uBHNsBj0CFkp8M → errjtBR_bKoee8ogLp8xk}/_buildManifest.js +0 -0
- /package/ui/out/_next/static/{eiQeDU9uBHNsBj0CFkp8M → errjtBR_bKoee8ogLp8xk}/_ssgManifest.js +0 -0
package/README.md
CHANGED
|
@@ -1,5 +1,17 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<picture>
|
|
3
|
+
<source media="(prefers-color-scheme: dark)" srcset=".github/assets/swarm-mark-light.png">
|
|
4
|
+
<img src=".github/assets/swarm-mark-dark.png" alt="agentswarm" width="120">
|
|
5
|
+
</picture>
|
|
6
|
+
</p>
|
|
7
|
+
|
|
1
8
|
# agentswarm
|
|
2
9
|
|
|
10
|
+
[npm package](https://www.npmjs.com/package/@robzilla1738/agentswarm)
|
|
11
|
+
[License](LICENSE)
|
|
12
|
+
[package.json](package.json)
|
|
13
|
+
[Buy me a coffee](https://buymeacoffee.com/robcourson)
|
|
14
|
+
|
|
3
15
|
A local agent-swarm orchestrator with a terminal dashboard and a localhost web UI. Works with DeepSeek, OpenAI, Anthropic, xAI, MiniMax, OpenRouter, Ollama, LM Studio, or any OpenAI-compatible endpoint.
|
|
4
16
|
|
|
5
17
|
You give it a mission. A conductor model breaks the mission into tasks and hands them to worker agents that run in parallel, share findings on a blackboard, and get checked by an adversarial verifier. The run ends with a synthesized report plus whatever files the agents produced. Everything runs on your machine with your own API key, or fully offline against a local model.
|
|
@@ -17,7 +29,7 @@ You give it a mission. A conductor model breaks the mission into tasks and hands
|
|
|
17
29
|
│ T4 dep │◀─────│ verify │ adversarial verification
|
|
18
30
|
└────┬─────┘ └─────────┘
|
|
19
31
|
┌────▼─────┐
|
|
20
|
-
│Synthesize│ → final
|
|
32
|
+
│Synthesize│ → final report (.md + .html) + artifacts
|
|
21
33
|
└──────────┘
|
|
22
34
|
```
|
|
23
35
|
|
|
@@ -89,17 +101,32 @@ Run options (also on the UI launch form under Options): `--workers N` (paralleli
|
|
|
89
101
|
|
|
90
102
|
## How it works
|
|
91
103
|
|
|
92
|
-
The conductor is a model with
|
|
104
|
+
The conductor is a model with six tools: `spawn_tasks`, `set_phase`, `update_plan`, `read_report`, `wait`, and `finish`. It reads the mission, spawns self-contained tasks (each with an objective, success criteria, a role, optional dependencies, and an optional `verify` flag), then reacts as reports come back. On long missions it declares phases (`set_phase`) whose goals and exit criteria are pinned into every update — so the plan survives even when old history is trimmed and replaced by a mission ledger (settled tasks, decisions, current phase).
|
|
105
|
+
|
|
106
|
+
Each task becomes an autonomous agent with a tool budget. It works in small steps, posts durable findings to the blackboard (decisions are never trimmed from digests; `search_notes` searches the full history), journals progress checkpoints on long tasks, saves artifacts, and ends by reporting back with structured handoff fields (`key_facts`, `open_questions`, `files_touched`). Dependent tasks receive report excerpts plus those fields, and can pull full text with `read_report`.
|
|
107
|
+
|
|
108
|
+
**Scale.** A global AIMD limiter (`maxConcurrentCalls`) bounds concurrent model calls per endpoint — a 429 halves the ceiling, successes recover it, and conductor calls always jump the queue, so a 100-agent swarm degrades gracefully instead of melting down. Settles are debounced before waking the conductor; on big runs the task table collapses settled waves (failures stay itemized) and excess reports become one-liners the conductor can expand with `read_report`. Spawn specs take a `model` tier (`cheap` for scouts, `strong` for leads/verifiers via `cheapModel`/`strongModel` config) and `team:true` to run a task as a full sub-swarm — its own conductor decomposes it in parallel and reports one consolidated result, with all activity journaled under its `teamId`.
|
|
109
|
+
|
|
110
|
+
**Long horizon.** The conductor maintains a living `mission-plan.md` (`update_plan`) pinned into every update and restored on resume; every 25 settled tasks a progress snapshot lands in `artifacts/` so multi-day runs always have a partial deliverable; and real-directory runs leave a memory (`~/.agentswarm/memory/`) of missions, outcomes, and decisions that seeds the next swarm in the same workspace.
|
|
93
111
|
|
|
94
|
-
|
|
112
|
+
Verified tasks pass two gates: a free mechanical check (claimed artifacts must exist and be non-empty), then a blind LLM verifier that judges the deliverables against the objective with its own tools — it never sees the worker's blackboard. In `--verify strict` mode, a completeness critic reviews the whole run for gaps before synthesis (the conductor gets one round to fill them), and the final report is checked for faithfulness against the task reports.
|
|
95
113
|
|
|
96
114
|
The scheduler starts a task as soon as its dependencies are done, up to the parallelism cap. Tasks whose dependencies failed are blocked and surfaced to the conductor for re-planning.
|
|
97
115
|
|
|
98
|
-
When the conductor finishes (or the budget forces it), a synthesizer composes `final-report.md`
|
|
116
|
+
When the conductor finishes (or the budget forces it), a synthesizer composes the final deliverable from every task report. Deliverables ship in the format the mission calls for — code, `.csv`/`.json` data, styled documents — alongside `final-report.md` and a self-contained `final-report.html` rendering (open it with `swarm report <id> --open`).
|
|
99
117
|
|
|
100
118
|
The journal is the source of truth. Every run is an append-only `events.jsonl`; the terminal dashboard, the web UI, and `swarm ls` all reduce the same file. That's why runs survive crashes and can be resumed or replayed. Runs live under `~/.agentswarm/runs/<id>/`.
|
|
101
119
|
|
|
102
|
-
If the engine process dies without writing a terminal status (kill -9, reboot), the hub notices the missing process and shows the run as interrupted instead of leaving it "running" forever.
|
|
120
|
+
If the engine process dies without writing a terminal status (kill -9, reboot), the hub notices the missing process and shows the run as interrupted instead of leaving it "running" forever. `swarm resume <id>` continues it: settled tasks keep their results, and tasks that were mid-flight restart *warm* from their last journaled checkpoint instead of from scratch. SIGTERM flushes the journal synchronously and leaves the run resumable.
|
|
121
|
+
|
|
122
|
+
## Troubleshooting
|
|
123
|
+
|
|
124
|
+
- **"interrupted — the engine process is no longer running"** — the engine died without a terminal status (kill -9, reboot, crash). Check `~/.agentswarm/runs/<id>/exec.log` for the crash output, then `swarm resume <id>`.
|
|
125
|
+
- **Run ended with "conductor unavailable"** — five consecutive conductor API calls failed (after backoff). Usually a provider outage or a bad model name; check the run's activity log for the underlying error, fix, and resume.
|
|
126
|
+
- **"journal writes are failing"** — the engine could not append to `events.jsonl` (disk full, permissions). The run aborts deliberately rather than doing unrecorded work.
|
|
127
|
+
- **A verified task keeps failing with "Claimed artifact(s) do not exist"** — the worker reported files it never wrote. That's the mechanical pre-verifier doing its job; the retry prompt tells the worker to actually create them.
|
|
128
|
+
- **Docker sandbox fails to start** — confirm `docker info` works as your user, and that the configured `sandboxImage` can be pulled. `swarm sandbox test` checks the configured runtime end-to-end.
|
|
129
|
+
- **Hung or wedged run** — `swarm cancel <id>` aborts in-flight agents within ~1s; sandbox teardown is bounded by a 15s timeout so it can't hang shutdown.
|
|
103
130
|
|
|
104
131
|
## Architecture
|
|
105
132
|
|
|
@@ -137,6 +164,10 @@ Boots a mock model server and drives real missions through the engine, offline,
|
|
|
137
164
|
- Costs are estimates based on list prices and the token counts the API reports. Models without pricing data show $0. Set a `--budget` either way.
|
|
138
165
|
- Keys are stored in `~/.agentswarm/config.json` (chmod 600) and are only sent to the APIs you configured.
|
|
139
166
|
|
|
167
|
+
## Author
|
|
168
|
+
|
|
169
|
+
Built by [Robert Courson](https://robertcourson.com). If agentswarm saves you time, you can [buy me a coffee](https://buymeacoffee.com/robcourson).
|
|
170
|
+
|
|
140
171
|
## License
|
|
141
172
|
|
|
142
173
|
MIT
|
package/dist/agent.js
CHANGED
|
@@ -26,31 +26,52 @@ async function runAgent(p) {
|
|
|
26
26
|
let lastText = "";
|
|
27
27
|
let steps = 0;
|
|
28
28
|
hooks.onTranscript?.(messages);
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
29
|
+
// Issue one chat request for this agent.
//   opts.only         — name of the single tool to offer and force via tool_choice
//   opts.terminalOnly — offer only the terminal tools, with free tool choice
// Either option marks the call as a wrap-up call. Wrap-up calls run with
// thinking disabled: DeepSeek's thinking mode hard-400s on a forced function
// tool_choice, and the final "summarize into the terminal tool" turn needs no
// deep reasoning.
const callModel = (opts) => {
    const forcedTool = opts?.only;
    const wrapUp = Boolean(forcedTool || opts?.terminalOnly);
    let tools;
    if (forcedTool) {
        tools = allSchemas.filter((schema) => schema.name === opts.only);
    }
    else if (opts?.terminalOnly) {
        tools = p.terminal;
    }
    else {
        tools = allSchemas;
    }
    // Forward a streamed fragment to the hook only when it is non-empty.
    const forward = (channel, fragment) => {
        if (fragment)
            hooks.onDelta?.(channel, fragment);
    };
    return (0, deepseek_1.chat)(cfg, {
        model: p.model,
        messages,
        tools,
        toolChoice: forcedTool,
        thinking: wrapUp ? false : p.thinking,
        // Reasoning effort only applies when thinking is actually on for this call.
        reasoningEffort: wrapUp || !p.thinking ? undefined : p.reasoningEffort,
        maxTokens: p.maxTokensOut,
        signal: p.signal,
        onDelta: (d) => {
            forward("think", d.think);
            forward("text", d.text);
        },
    });
};
|
|
47
55
|
let stopReason = null;
|
|
48
56
|
while (steps < p.maxSteps) {
|
|
49
57
|
stopReason = p.stop?.() ?? null;
|
|
50
58
|
if (stopReason)
|
|
51
59
|
break;
|
|
52
60
|
steps++;
|
|
53
|
-
|
|
61
|
+
let res;
|
|
62
|
+
try {
|
|
63
|
+
res = await callModel();
|
|
64
|
+
}
|
|
65
|
+
catch (e) {
|
|
66
|
+
// The chat client already retries 429/5xx; this catches the rest of the
|
|
67
|
+
// transient class (connection resets, DNS blips) once per step so a
|
|
68
|
+
// single network hiccup doesn't burn a whole task attempt.
|
|
69
|
+
if (p.signal.aborted)
|
|
70
|
+
throw e;
|
|
71
|
+
hooks.onLog?.("warn", `${p.agentId}: model call failed (${(0, util_1.errMsg)(e)}); retrying once`);
|
|
72
|
+
await new Promise((r) => setTimeout(r, 1500));
|
|
73
|
+
res = await callModel();
|
|
74
|
+
}
|
|
54
75
|
hooks.onUsage?.(p.model, res.usage);
|
|
55
76
|
usage = (0, types_1.addUsage)(usage, res.usage);
|
|
56
77
|
if (res.toolCalls.length === 0) {
|
|
@@ -128,21 +149,30 @@ async function runAgent(p) {
|
|
|
128
149
|
}
|
|
129
150
|
}
|
|
130
151
|
// Step budget exhausted (or stopped early) — force one final terminal call.
|
|
152
|
+
// Two attempts: a forced tool_choice first, then terminal-only tools with
|
|
153
|
+
// free choice, since some providers reject or ignore forced choices. The
|
|
154
|
+
// agent's work must never be discarded because the wrap-up call failed.
|
|
131
155
|
messages.push({ role: "user", content: stopReason ? (0, prompts_1.forcedFinal)(stopReason) : prompts_1.STEP_LIMIT_FINAL });
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
156
|
+
for (const opts of [{ only: p.terminal[0].name }, { terminalOnly: true }]) {
|
|
157
|
+
try {
|
|
158
|
+
const res = await callModel(opts);
|
|
159
|
+
hooks.onUsage?.(p.model, res.usage);
|
|
160
|
+
usage = (0, types_1.addUsage)(usage, res.usage);
|
|
161
|
+
const call = res.toolCalls.find((c) => terminalNames.has(c.function.name));
|
|
162
|
+
if (call) {
|
|
163
|
+
const args = (0, util_1.safeJson)(call.function.arguments) ?? {};
|
|
164
|
+
return { terminal: { name: call.function.name, args }, finalText: lastText, steps, usage };
|
|
165
|
+
}
|
|
166
|
+
if (res.content) {
|
|
167
|
+
lastText = res.content;
|
|
168
|
+
// The model answered in prose; keep it and demand the tool call.
|
|
169
|
+
messages.push({ role: "assistant", content: res.content });
|
|
170
|
+
messages.push({ role: "user", content: `Call the ${p.terminal[0].name} tool now. Do not reply with text.` });
|
|
171
|
+
}
|
|
172
|
+
}
|
|
173
|
+
catch (e) {
|
|
174
|
+
hooks.onLog?.("warn", `${p.agentId}: final terminal call failed: ${(0, util_1.errMsg)(e)}`);
|
|
140
175
|
}
|
|
141
|
-
if (res.content)
|
|
142
|
-
lastText = res.content;
|
|
143
|
-
}
|
|
144
|
-
catch (e) {
|
|
145
|
-
hooks.onLog?.("warn", `${p.agentId}: forced final call failed: ${(0, util_1.errMsg)(e)}`);
|
|
146
176
|
}
|
|
147
177
|
return { terminal: null, finalText: lastText, steps, usage };
|
|
148
178
|
}
|
|
@@ -194,6 +224,8 @@ async function compact(p, messages) {
|
|
|
194
224
|
});
|
|
195
225
|
p.hooks.onUsage?.(p.model, res.usage);
|
|
196
226
|
summary = res.content || "(compaction produced no summary)";
|
|
227
|
+
if (res.content)
|
|
228
|
+
p.hooks.onCheckpoint?.(res.content);
|
|
197
229
|
}
|
|
198
230
|
catch (e) {
|
|
199
231
|
// Compaction is best-effort; fall back to hard truncation.
|
package/dist/cli.js
CHANGED
|
@@ -342,11 +342,24 @@ async function execForeground(cfg, meta, render, resume = false) {
|
|
|
342
342
|
};
|
|
343
343
|
process.on("uncaughtException", onFatal);
|
|
344
344
|
process.on("unhandledRejection", onFatal);
|
|
345
|
+
// SIGTERM handler (kill, system shutdown). Records a warning in the journal,
// flushes buffered journal lines synchronously, and exits WITHOUT writing a
// terminal status — the run stays resumable, and viewers show it as
// interrupted once the pid disappears.
const onTerm = () => {
    const notice = {
        level: "warn",
        msg: "engine received SIGTERM — exiting; resume with: swarm resume " + meta.id,
    };
    journal.append("log", notice);
    journal.flushSync();
    (0, run_1.clearPid)(meta.id);
    if (renderer) {
        renderer.stop();
    }
    // 143 = 128 + 15, the conventional exit code for termination by SIGTERM.
    process.exit(143);
};
|
|
356
|
+
process.on("SIGTERM", onTerm);
|
|
345
357
|
try {
|
|
346
358
|
await executor.run();
|
|
347
359
|
}
|
|
348
360
|
finally {
|
|
349
361
|
process.off("SIGINT", onSig);
|
|
362
|
+
process.off("SIGTERM", onTerm);
|
|
350
363
|
process.off("uncaughtException", onFatal);
|
|
351
364
|
process.off("unhandledRejection", onFatal);
|
|
352
365
|
(0, run_1.clearPid)(meta.id);
|
|
@@ -476,8 +489,10 @@ function cmdReport(id, flags) {
|
|
|
476
489
|
process.exit(1);
|
|
477
490
|
}
|
|
478
491
|
if (flags.open) {
|
|
479
|
-
|
|
480
|
-
|
|
492
|
+
const html = path.join((0, config_1.runDir)(id), "artifacts", "final-report.html");
|
|
493
|
+
const target = fs.existsSync(html) ? html : file;
|
|
494
|
+
openBrowser("file://" + target);
|
|
495
|
+
console.log(target);
|
|
481
496
|
return;
|
|
482
497
|
}
|
|
483
498
|
process.stdout.write(fs.readFileSync(file, "utf8") + "\n");
|
|
@@ -614,7 +629,7 @@ function printFinalLine(id) {
|
|
|
614
629
|
console.log("");
|
|
615
630
|
if (fs.existsSync(reportFile)) {
|
|
616
631
|
console.log(util_1.ansi.green("✓ final report: ") + reportFile);
|
|
617
|
-
console.log(util_1.ansi.gray(" view: ") + `swarm report ${id}`);
|
|
632
|
+
console.log(util_1.ansi.gray(" view: ") + `swarm report ${id}` + util_1.ansi.gray(" · open in browser: ") + `swarm report ${id} --open`);
|
|
618
633
|
}
|
|
619
634
|
else {
|
|
620
635
|
console.log(util_1.ansi.gray(`run ${id} ended without a final report (see: swarm watch ${id})`));
|
|
@@ -661,7 +676,6 @@ ${b("RUN OPTIONS")}
|
|
|
661
676
|
${b("FIRST RUN")}
|
|
662
677
|
swarm config set apiKey <key> # key for the active provider (default: DeepSeek)
|
|
663
678
|
swarm config set provider <id> # deepseek | openai | anthropic | xai | minimax | openrouter | ollama | lmstudio | custom
|
|
664
|
-
pip install searchkit # optional: local, citable web search for agents
|
|
665
679
|
swarm serve --open # open the web UI
|
|
666
680
|
`);
|
|
667
681
|
}
|
package/dist/config.js
CHANGED
|
@@ -70,17 +70,24 @@ exports.DEFAULTS = {
|
|
|
70
70
|
baseUrl: providers_1.PROVIDERS.deepseek.baseUrl,
|
|
71
71
|
model: "deepseek-v4-flash",
|
|
72
72
|
conductorModel: "deepseek-v4-flash",
|
|
73
|
+
cheapModel: "",
|
|
74
|
+
strongModel: "",
|
|
73
75
|
maxWorkers: 6,
|
|
74
76
|
maxStepsPerTask: 30,
|
|
75
|
-
maxTasks:
|
|
77
|
+
maxTasks: 200,
|
|
76
78
|
maxTokensPerRun: 12_000_000,
|
|
77
79
|
verification: "normal",
|
|
80
|
+
verifyMaxAttempts: 2,
|
|
78
81
|
thinking: true,
|
|
79
82
|
reasoningEffort: "high",
|
|
80
83
|
safeMode: true,
|
|
81
84
|
tinyfishApiKey: "",
|
|
82
85
|
searchBackend: "auto",
|
|
83
|
-
|
|
86
|
+
firecrawlApiKey: "",
|
|
87
|
+
contextdevApiKey: "",
|
|
88
|
+
deepcrawlApiKey: "",
|
|
89
|
+
deepcrawlBaseUrl: "",
|
|
90
|
+
crawlBackend: "auto",
|
|
84
91
|
sandboxRuntime: "host",
|
|
85
92
|
sandboxImage: "node:22-bookworm",
|
|
86
93
|
e2bApiKey: "",
|
|
@@ -90,10 +97,11 @@ exports.DEFAULTS = {
|
|
|
90
97
|
vercelToken: "",
|
|
91
98
|
vercelTeamId: "",
|
|
92
99
|
vercelProjectId: "",
|
|
100
|
+
maxConcurrentCalls: 16,
|
|
93
101
|
requestTimeoutMs: 900_000,
|
|
94
102
|
idleTimeoutMs: 180_000,
|
|
95
103
|
contextTokenLimit: 120_000,
|
|
96
|
-
maxToolResultChars:
|
|
104
|
+
maxToolResultChars: 20_000,
|
|
97
105
|
hubPort: 7777,
|
|
98
106
|
uiPort: 7780,
|
|
99
107
|
pricing: exports.DEFAULT_PRICING,
|
|
@@ -109,6 +117,9 @@ exports.SECRET_ENV_KEYS = [
|
|
|
109
117
|
.map((p) => p.keyEnv)
|
|
110
118
|
.filter((k) => Boolean(k)),
|
|
111
119
|
"TINYFISH_API_KEY",
|
|
120
|
+
"FIRECRAWL_API_KEY",
|
|
121
|
+
"CONTEXT_DEV_API_KEY",
|
|
122
|
+
"DEEPCRAWL_API_KEY",
|
|
112
123
|
"E2B_API_KEY",
|
|
113
124
|
"MODAL_TOKEN_ID",
|
|
114
125
|
"MODAL_TOKEN_SECRET",
|
|
@@ -160,6 +171,14 @@ function loadConfig() {
|
|
|
160
171
|
cfg.apiKey = process.env[info.keyEnv];
|
|
161
172
|
if (process.env.TINYFISH_API_KEY)
|
|
162
173
|
cfg.tinyfishApiKey = process.env.TINYFISH_API_KEY;
|
|
174
|
+
if (process.env.FIRECRAWL_API_KEY)
|
|
175
|
+
cfg.firecrawlApiKey = process.env.FIRECRAWL_API_KEY;
|
|
176
|
+
if (process.env.CONTEXT_DEV_API_KEY)
|
|
177
|
+
cfg.contextdevApiKey = process.env.CONTEXT_DEV_API_KEY;
|
|
178
|
+
if (process.env.DEEPCRAWL_API_KEY)
|
|
179
|
+
cfg.deepcrawlApiKey = process.env.DEEPCRAWL_API_KEY;
|
|
180
|
+
if (process.env.DEEPCRAWL_BASE_URL)
|
|
181
|
+
cfg.deepcrawlBaseUrl = process.env.DEEPCRAWL_BASE_URL;
|
|
163
182
|
if (process.env.E2B_API_KEY)
|
|
164
183
|
cfg.e2bApiKey = process.env.E2B_API_KEY;
|
|
165
184
|
if (process.env.MODAL_TOKEN_ID)
|
|
@@ -218,17 +237,24 @@ exports.SETTABLE_KEYS = [
|
|
|
218
237
|
"baseUrl",
|
|
219
238
|
"model",
|
|
220
239
|
"conductorModel",
|
|
240
|
+
"cheapModel",
|
|
241
|
+
"strongModel",
|
|
221
242
|
"maxWorkers",
|
|
222
243
|
"maxStepsPerTask",
|
|
223
244
|
"maxTasks",
|
|
224
245
|
"maxTokensPerRun",
|
|
225
246
|
"verification",
|
|
247
|
+
"verifyMaxAttempts",
|
|
226
248
|
"thinking",
|
|
227
249
|
"reasoningEffort",
|
|
228
250
|
"safeMode",
|
|
229
251
|
"tinyfishApiKey",
|
|
230
252
|
"searchBackend",
|
|
231
|
-
"
|
|
253
|
+
"firecrawlApiKey",
|
|
254
|
+
"contextdevApiKey",
|
|
255
|
+
"deepcrawlApiKey",
|
|
256
|
+
"deepcrawlBaseUrl",
|
|
257
|
+
"crawlBackend",
|
|
232
258
|
"sandboxRuntime",
|
|
233
259
|
"sandboxImage",
|
|
234
260
|
"e2bApiKey",
|
|
@@ -238,15 +264,18 @@ exports.SETTABLE_KEYS = [
|
|
|
238
264
|
"vercelToken",
|
|
239
265
|
"vercelTeamId",
|
|
240
266
|
"vercelProjectId",
|
|
267
|
+
"maxConcurrentCalls",
|
|
241
268
|
"contextTokenLimit",
|
|
242
269
|
"hubPort",
|
|
243
270
|
"uiPort",
|
|
244
271
|
];
|
|
245
272
|
/** Allowed ranges for numeric settings (values are clamped, not rejected). */
|
|
246
273
|
const NUM_RANGES = {
|
|
247
|
-
maxWorkers: [1,
|
|
274
|
+
maxWorkers: [1, 128],
|
|
275
|
+
maxConcurrentCalls: [1, 256],
|
|
248
276
|
maxStepsPerTask: [3, 200],
|
|
249
277
|
maxTasks: [1, 1000],
|
|
278
|
+
verifyMaxAttempts: [1, 5],
|
|
250
279
|
maxTokensPerRun: [50_000, 2_000_000_000],
|
|
251
280
|
contextTokenLimit: [8_000, 900_000],
|
|
252
281
|
hubPort: [0, 65535],
|
|
@@ -256,6 +285,7 @@ const ENUMS = {
|
|
|
256
285
|
verification: ["off", "normal", "strict"],
|
|
257
286
|
reasoningEffort: ["low", "medium", "high", "max"],
|
|
258
287
|
searchBackend: ["auto", "tinyfish", "ddg"],
|
|
288
|
+
crawlBackend: ["auto", "firecrawl", "contextdev", "deepcrawl", "off"],
|
|
259
289
|
sandboxRuntime: ["auto", "host", "docker", "e2b", "modal", "vercel"],
|
|
260
290
|
provider: Object.keys(providers_1.PROVIDERS),
|
|
261
291
|
};
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.resolveCrawlBackend = resolveCrawlBackend;
|
|
4
|
+
exports.hasScrapeBackend = hasScrapeBackend;
|
|
5
|
+
exports.crawlSite = crawlSite;
|
|
6
|
+
exports.scrapeUrl = scrapeUrl;
|
|
7
|
+
exports.slugForUrl = slugForUrl;
|
|
8
|
+
const util_1 = require("./util");
|
|
9
|
+
const PER_PAGE_CHAR_CAP = 200_000;
|
|
10
|
+
const TOTAL_CHAR_BUDGET = 8_000_000;
|
|
11
|
+
const CRAWL_DEADLINE_MS = 120_000;
|
|
12
|
+
/**
 * Decide which crawl backend this config selects.
 *
 * - "off" → null.
 * - "auto" (also used when crawlBackend is absent) → the first configured
 *   backend in the order Firecrawl → context.dev → deepcrawl.
 * - a specific backend id → that id, but only if its credentials are present.
 *
 * A backend counts as configured when its API key is set; deepcrawl also
 * needs a base URL.
 *
 * @param {object} cfg - config with crawlBackend and the per-backend keys.
 * @returns {"firecrawl"|"contextdev"|"deepcrawl"|null} backend id, or null
 *   when crawling is off, nothing is configured, or the id is unknown.
 */
function resolveCrawlBackend(cfg) {
    // A config that carries no crawlBackend behaves like the documented default.
    const requested = cfg.crawlBackend ?? "auto";
    if (requested === "off")
        return null;
    const configured = {
        firecrawl: Boolean(cfg.firecrawlApiKey),
        contextdev: Boolean(cfg.contextdevApiKey),
        deepcrawl: Boolean(cfg.deepcrawlApiKey && cfg.deepcrawlBaseUrl),
    };
    if (requested !== "auto") {
        // Own-property check: inherited keys such as "constructor" are truthy on
        // a plain object and must never be reported as a backend.
        return Object.hasOwn(configured, requested) && configured[requested] ? requested : null;
    }
    for (const id of ["firecrawl", "contextdev", "deepcrawl"]) {
        if (configured[id])
            return id;
    }
    return null;
}
|
|
29
|
+
/**
 * Whether the resolved crawl backend can also scrape a single page for
 * fetch_url. Only Firecrawl and context.dev qualify; the custom deepcrawl
 * contract has no scrape endpoint.
 */
function hasScrapeBackend(cfg) {
    const scrapeCapable = ["firecrawl", "contextdev"];
    return scrapeCapable.includes(resolveCrawlBackend(cfg));
}
|
|
34
|
+
/**
 * Crawl a site through the resolved backend and normalize the result.
 *
 * Pages whose markdown is empty or contains a NUL character are dropped, each
 * kept page is middle-truncated to PER_PAGE_CHAR_CAP, and collection stops at
 * opts.maxPages or once TOTAL_CHAR_BUDGET would be exceeded.
 *
 * @returns {Promise<{backend: string, pages: Array<{url, title, markdown}>, warnings: string[]}>}
 * @throws {Error} when no crawl backend is configured, or the backend call fails.
 */
async function crawlSite(cfg, opts) {
    const backend = resolveCrawlBackend(cfg);
    if (!backend) {
        throw new Error("no crawl backend configured — add a Firecrawl/context.dev/deepcrawl key in Settings");
    }
    const warnings = [];
    let fetched;
    switch (backend) {
        case "firecrawl":
            // Only the Firecrawl path reports its own warnings (partial results).
            fetched = await firecrawlCrawl(cfg, opts, warnings);
            break;
        case "contextdev":
            fetched = await contextdevCrawl(cfg, opts);
            break;
        default:
            fetched = await deepcrawlCrawl(cfg, opts);
    }
    const kept = [];
    let droppedCount = 0;
    let usedChars = 0;
    for (const page of fetched) {
        if (kept.length >= opts.maxPages) {
            break;
        }
        const text = (page.markdown || "").trim();
        if (text === "" || text.includes("\u0000")) {
            droppedCount += 1;
            continue;
        }
        const capped = (0, util_1.truncateMiddle)(text, PER_PAGE_CHAR_CAP, "chars");
        if (usedChars + capped.length > TOTAL_CHAR_BUDGET) {
            warnings.push(`stopped at ${kept.length} pages: total content budget reached`);
            break;
        }
        usedChars += capped.length;
        kept.push({ url: page.url, title: page.title, markdown: capped });
    }
    if (droppedCount > 0) {
        warnings.push(`${droppedCount} empty page${droppedCount > 1 ? "s" : ""} skipped`);
    }
    return { backend, pages: kept, warnings };
}
|
|
70
|
+
/**
 * Scrape one page as markdown through the resolved backend (Firecrawl or
 * context.dev). A page title, when the backend returns one, is prepended as a
 * level-1 heading.
 *
 * Throws on any failure — an empty result, an HTTP error, or no
 * scrape-capable backend — so callers can fall through to their own fetch path.
 */
async function scrapeUrl(cfg, url, signal) {
    const withTitle = (title, markdown) => (title ? `# ${title}\n\n${markdown}` : markdown);
    switch (resolveCrawlBackend(cfg)) {
        case "firecrawl": {
            const reply = await callJson("firecrawl", "https://api.firecrawl.dev/v1/scrape", cfg.firecrawlApiKey, { url, formats: ["markdown"] }, 30_000, signal);
            const markdown = String(reply?.data?.markdown ?? "");
            if (markdown.trim() === "") {
                throw new Error("firecrawl: empty scrape result");
            }
            return withTitle(reply?.data?.metadata?.title, markdown);
        }
        case "contextdev": {
            const reply = await callJson("context.dev", "https://api.context.dev/v1/web/scrape", cfg.contextdevApiKey, { url }, 30_000, signal);
            // The markdown is read from the top level first, then from results[0].
            const markdown = String(reply?.markdown ?? reply?.results?.[0]?.markdown ?? "");
            if (markdown.trim() === "") {
                throw new Error("context.dev: empty scrape result");
            }
            return withTitle(reply?.metadata?.title ?? reply?.results?.[0]?.metadata?.title, markdown);
        }
        default:
            throw new Error("no scrape-capable crawl backend configured");
    }
}
|
|
91
|
+
/**
 * Turn a URL into filesystem-safe name parts, e.g.
 * "https://docs.foo.com/a/b?x=1" → { host: "docs.foo.com", slug: "a-b" }.
 * Only the hostname and pathname are used. Neither part contains path
 * separators or traversal sequences. Input that does not parse as a URL
 * yields host "site" and a slug derived from the raw string.
 */
function slugForUrl(url) {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        return { host: "site", slug: sanitize(url) || "page" };
    }
    const pathWithoutTrailingSlash = parsed.pathname.replace(/\/+$/, "");
    return {
        host: sanitize(parsed.hostname) || "site",
        slug: sanitize(pathWithoutTrailingSlash) || "index",
    };
}
|
|
104
|
+
/**
 * Reduce an arbitrary string to a filesystem-safe token: lowercase, only
 * [a-z0-9._-], no runs of "." or "-", no leading or trailing separator, and at
 * most 120 characters.
 *
 * @param {string} s - raw text (hostname, path, or whole URL).
 * @returns {string} sanitized token; may be empty.
 */
function sanitize(s) {
    return s
        .toLowerCase()
        .replace(/[^a-z0-9._-]+/g, "-")
        .replace(/\.{2,}/g, ".")
        .replace(/-{2,}/g, "-")
        .replace(/^[-.]+|[-.]+$/g, "")
        .slice(0, 120)
        // The length cut can land on a separator; trim again so the token
        // never ends in "-" or ".".
        .replace(/[-.]+$/, "");
}
|
|
113
|
+
// ---------------------------------------------------------------- backends
|
|
114
|
+
/**
 * Run a Firecrawl crawl job: start it, poll its status until it completes,
 * then collect pages, following `next` pagination until opts.maxPages.
 *
 * If the job is still running after CRAWL_DEADLINE_MS, whatever pages the last
 * status response carried are returned and a warning is pushed onto
 * `warnings`; with no pages at all, it throws instead.
 *
 * @param {object} cfg - config providing firecrawlApiKey.
 * @param {object} opts - url, maxPages, optional includePaths, pollMs, signal.
 * @param {string[]} warnings - mutated: receives the partial-result warning.
 * @throws {Error} when the job does not start, fails, or times out empty.
 */
async function firecrawlCrawl(cfg, opts, warnings) {
    const request = {
        url: opts.url,
        limit: opts.maxPages,
        ...(opts.includePaths?.length ? { includePaths: opts.includePaths } : {}),
        scrapeOptions: { formats: ["markdown"] },
    };
    const started = await callJson("firecrawl", "https://api.firecrawl.dev/v1/crawl", cfg.firecrawlApiKey, request, 30_000, opts.signal);
    const jobId = started?.id;
    if (!jobId) {
        throw new Error(`firecrawl: crawl did not start (${started?.error || "no job id"})`);
    }
    const statusUrl = `https://api.firecrawl.dev/v1/crawl/${jobId}`;
    const interval = opts.pollMs ?? 3000;
    const giveUpAt = Date.now() + CRAWL_DEADLINE_MS;
    let status = null;
    while (true) {
        opts.signal?.throwIfAborted();
        status = await getJson("firecrawl", statusUrl, cfg.firecrawlApiKey, opts.signal);
        const state = status?.status;
        if (state === "completed") {
            break;
        }
        if (state === "failed") {
            throw new Error(`firecrawl: crawl failed (${status?.error || "unknown error"})`);
        }
        if (Date.now() > giveUpAt) {
            // Deadline passed: salvage the pages the status response already has.
            const partial = mapFirecrawlPages(status);
            if (!partial.length) {
                throw new Error("firecrawl: crawl still running after 120s with no pages yet — try fewer pages");
            }
            warnings.push(`crawl still running after 120s; returning ${partial.length} partial pages`);
            return partial;
        }
        await sleep(interval, opts.signal);
    }
    // Completed: gather the first batch, then follow `next` links.
    const collected = mapFirecrawlPages(status);
    let cursor = status?.next;
    while (cursor && collected.length < opts.maxPages) {
        const batch = await getJson("firecrawl", String(cursor), cfg.firecrawlApiKey, opts.signal);
        collected.push(...mapFirecrawlPages(batch));
        cursor = batch?.next;
    }
    return collected;
}
|
|
153
|
+
/**
 * Convert a Firecrawl response into plain { url, title, markdown } records.
 * A response without a `data` array yields []. Missing fields become empty
 * strings; the URL prefers metadata.sourceURL over metadata.url.
 */
function mapFirecrawlPages(res) {
    const entries = res?.data;
    if (!Array.isArray(entries)) {
        return [];
    }
    return entries.map((entry) => {
        const meta = entry?.metadata;
        return {
            url: String(meta?.sourceURL ?? meta?.url ?? ""),
            title: String(meta?.title ?? ""),
            markdown: String(entry?.markdown ?? ""),
        };
    });
}
|
|
161
|
+
/**
 * Crawl via context.dev in a single POST bounded by CRAWL_DEADLINE_MS and map
 * `results[]` to { url, title, markdown } records. A response without a
 * `results` array yields []. Each field prefers the metadata value and falls
 * back to the top-level one.
 */
async function contextdevCrawl(cfg, opts) {
    const payload = {
        url: opts.url,
        max_pages: opts.maxPages,
        ...(opts.includePaths?.length ? { include_paths: opts.includePaths } : {}),
    };
    const reply = await callJson("context.dev", "https://api.context.dev/v1/web/crawl", cfg.contextdevApiKey, payload, CRAWL_DEADLINE_MS, opts.signal);
    const entries = reply?.results;
    if (!Array.isArray(entries)) {
        return [];
    }
    return entries.map((entry) => {
        const meta = entry?.metadata;
        return {
            url: String(meta?.url ?? entry?.url ?? ""),
            title: String(meta?.title ?? entry?.title ?? ""),
            markdown: String(entry?.markdown ?? ""),
        };
    });
}
|
|
174
|
+
/**
 * Crawl via a custom deepcrawl service at `${deepcrawlBaseUrl}/crawl`
 * (trailing slashes on the base URL are ignored), bounded by
 * CRAWL_DEADLINE_MS.
 *
 * Two response shapes are accepted: the context.dev-compatible `results[]`,
 * or a flat `pages[]` list whose text is in `markdown` or `content`.
 *
 * @throws {Error} when the response has neither shape, or the call fails.
 */
async function deepcrawlCrawl(cfg, opts) {
    const endpoint = `${cfg.deepcrawlBaseUrl.replace(/\/+$/, "")}/crawl`;
    const payload = {
        url: opts.url,
        max_pages: opts.maxPages,
        ...(opts.includePaths?.length ? { include_paths: opts.includePaths } : {}),
    };
    const reply = await callJson("deepcrawl", endpoint, cfg.deepcrawlApiKey, payload, CRAWL_DEADLINE_MS, opts.signal);
    const nested = reply?.results;
    if (Array.isArray(nested)) {
        return nested.map((entry) => {
            const meta = entry?.metadata;
            return {
                url: String(meta?.url ?? entry?.url ?? ""),
                title: String(meta?.title ?? entry?.title ?? ""),
                markdown: String(entry?.markdown ?? ""),
            };
        });
    }
    const flat = reply?.pages;
    if (Array.isArray(flat)) {
        return flat.map((page) => ({
            url: String(page?.url ?? ""),
            title: String(page?.title ?? ""),
            markdown: String(page?.markdown ?? page?.content ?? ""),
        }));
    }
    throw new Error("deepcrawl: unrecognized response shape (expected results[] or pages[])");
}
|
|
198
|
+
// ---------------------------------------------------------------- plumbing
|
|
199
|
+
/**
 * Build (not throw) an Error describing a failed HTTP response from a crawl
 * service. 401/403, 402 and 429 get dedicated messages; any other status
 * reports the code plus the response body truncated to 300 characters.
 */
function friendlyHttpError(service, status, body) {
    switch (status) {
        case 401:
        case 403:
            return new Error(`${service} API key invalid or unauthorized (HTTP ${status}) — check Settings → Crawl integrations`);
        case 402:
            return new Error(`${service}: quota or credits exhausted (HTTP 402)`);
        case 429:
            return new Error(`${service}: rate limited (HTTP 429) — retry later`);
        default: {
            const excerpt = (0, util_1.truncateMiddle)(body, 300, "chars");
            return new Error(`${service}: HTTP ${status} ${excerpt}`);
        }
    }
}
|
|
209
|
+
/**
 * Produce a signal that aborts when either the timeout elapses or the
 * caller's signal aborts, whichever comes first.
 *
 * @param {number} timeoutMs - request timeout in milliseconds.
 * @param {AbortSignal} [signal] - optional caller cancellation signal.
 * @returns {AbortSignal} combined signal (the bare timeout signal when no
 *   caller signal is given).
 */
function mergeSignal(timeoutMs, signal) {
    const timeout = AbortSignal.timeout(timeoutMs);
    if (!signal)
        return timeout;
    if (typeof AbortSignal.any === "function")
        return AbortSignal.any([timeout, signal]);
    // Runtime without AbortSignal.any: combine by hand. Returning the caller's
    // signal alone would silently drop the timeout and let a request hang.
    const merged = new AbortController();
    if (signal.aborted) {
        merged.abort(signal.reason);
        return merged.signal;
    }
    const onTimeout = () => {
        // Detach from the (possibly long-lived) caller signal so repeated
        // calls do not pile up listeners on it.
        signal.removeEventListener("abort", onCaller);
        merged.abort(timeout.reason);
    };
    const onCaller = () => {
        timeout.removeEventListener("abort", onTimeout);
        merged.abort(signal.reason);
    };
    timeout.addEventListener("abort", onTimeout, { once: true });
    signal.addEventListener("abort", onCaller, { once: true });
    return merged.signal;
}
|
|
215
|
+
/**
 * POST a JSON body with bearer auth and return the parsed JSON response.
 * The request aborts on `timeoutMs` or on the caller's `signal`.
 *
 * @throws {Error} a friendlyHttpError for any non-2xx response.
 */
async function callJson(service, url, key, body, timeoutMs, signal) {
    const headers = {
        authorization: `Bearer ${key}`,
        "content-type": "application/json",
    };
    const response = await fetch(url, {
        method: "POST",
        headers,
        body: JSON.stringify(body),
        signal: mergeSignal(timeoutMs, signal),
    });
    if (response.ok) {
        return response.json();
    }
    // Reading the error body is best-effort; an unreadable body becomes "".
    const detail = await response.text().catch(() => "");
    throw friendlyHttpError(service, response.status, detail);
}
|
|
226
|
+
/**
 * GET a URL with bearer auth and return the parsed JSON response. The request
 * aborts after 30 seconds or on the caller's `signal`.
 *
 * @throws {Error} a friendlyHttpError for any non-2xx response.
 */
async function getJson(service, url, key, signal) {
    const headers = { authorization: `Bearer ${key}` };
    const response = await fetch(url, { headers, signal: mergeSignal(30_000, signal) });
    if (response.ok) {
        return response.json();
    }
    // Reading the error body is best-effort; an unreadable body becomes "".
    const detail = await response.text().catch(() => "");
    throw friendlyHttpError(service, response.status, detail);
}
|
|
235
|
+
/**
 * Wait `ms` milliseconds, ending early if `signal` aborts.
 *
 * @param {number} ms - delay in milliseconds.
 * @param {AbortSignal} [signal] - optional cancellation signal.
 * @returns {Promise<void>} resolves after the delay; rejects with
 *   Error("aborted") if the signal is already aborted or aborts during the wait.
 */
function sleep(ms, signal) {
    return new Promise((resolve, reject) => {
        // An already-aborted signal never dispatches "abort" again, so without
        // this check the full delay would run before anyone noticed.
        if (signal?.aborted) {
            reject(new Error("aborted"));
            return;
        }
        const t = setTimeout(() => {
            signal?.removeEventListener("abort", onAbort);
            resolve();
        }, ms);
        const onAbort = () => {
            clearTimeout(t);
            reject(new Error("aborted"));
        };
        signal?.addEventListener("abort", onAbort, { once: true });
    });
}
|