@robzilla1738/agentswarm 0.3.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. package/README.md +28 -5
  2. package/dist/agent.js +16 -1
  3. package/dist/cli.js +18 -4
  4. package/dist/config.js +35 -5
  5. package/dist/crawltools.js +247 -0
  6. package/dist/deepseek.js +125 -10
  7. package/dist/executor.js +771 -122
  8. package/dist/hub.js +16 -3
  9. package/dist/journal.js +61 -11
  10. package/dist/memory.js +83 -0
  11. package/dist/prompts.js +109 -16
  12. package/dist/report.js +252 -0
  13. package/dist/run.js +7 -2
  14. package/dist/searchcore.js +191 -0
  15. package/dist/state.js +57 -3
  16. package/dist/tools.js +202 -12
  17. package/dist/webtools.js +191 -60
  18. package/package.json +3 -2
  19. package/ui/out/404/index.html +1 -1
  20. package/ui/out/404.html +1 -1
  21. package/ui/out/_next/static/chunks/532-35122e93f37719b9.js +1 -0
  22. package/ui/out/_next/static/chunks/677-859e8d42add1806b.js +1 -0
  23. package/ui/out/_next/static/chunks/app/page-dc9f6744d203e76c.js +1 -0
  24. package/ui/out/_next/static/chunks/app/run/page-2420c9e4c963d9b3.js +1 -0
  25. package/ui/out/_next/static/chunks/app/settings/page-092a6bf42dfde57d.js +1 -0
  26. package/ui/out/_next/static/css/9f7bd82b8e4c762c.css +3 -0
  27. package/ui/out/fonts/PlanetKosmos.ttf +0 -0
  28. package/ui/out/index.html +1 -1
  29. package/ui/out/index.txt +3 -3
  30. package/ui/out/run/index.html +1 -1
  31. package/ui/out/run/index.txt +3 -3
  32. package/ui/out/settings/index.html +1 -1
  33. package/ui/out/settings/index.txt +3 -3
  34. package/ui/out/_next/static/chunks/383-289a866b246b41cc.js +0 -1
  35. package/ui/out/_next/static/chunks/619-ba102abea3e3d0e4.js +0 -1
  36. package/ui/out/_next/static/chunks/677-7ab85a6f38c3a235.js +0 -1
  37. package/ui/out/_next/static/chunks/app/page-0fda5b8e77d90b84.js +0 -1
  38. package/ui/out/_next/static/chunks/app/run/page-07aab6b1224c3c8c.js +0 -1
  39. package/ui/out/_next/static/chunks/app/settings/page-528482d468d84cfa.js +0 -1
  40. package/ui/out/_next/static/css/e2c82b53bf4519e8.css +0 -3
  41. /package/ui/out/_next/static/{Rm5Fhkds2-wIOnVlME55J → errjtBR_bKoee8ogLp8xk}/_buildManifest.js +0 -0
  42. /package/ui/out/_next/static/{Rm5Fhkds2-wIOnVlME55J → errjtBR_bKoee8ogLp8xk}/_ssgManifest.js +0 -0
package/README.md CHANGED
@@ -1,8 +1,16 @@
1
+ <p align="center">
2
+ <picture>
3
+ <source media="(prefers-color-scheme: dark)" srcset=".github/assets/swarm-mark-light.png">
4
+ <img src=".github/assets/swarm-mark-dark.png" alt="agentswarm" width="120">
5
+ </picture>
6
+ </p>
7
+
1
8
  # agentswarm
2
9
 
3
10
  [![npm](https://img.shields.io/npm/v/@robzilla1738/agentswarm)](https://www.npmjs.com/package/@robzilla1738/agentswarm)
4
11
  [![license](https://img.shields.io/badge/license-MIT-blue)](LICENSE)
5
12
  [![node](https://img.shields.io/badge/node-%E2%89%A520.10-brightgreen)](package.json)
13
+ [![support](https://img.shields.io/badge/support-buy%20me%20a%20coffee-yellow)](https://buymeacoffee.com/robcourson)
6
14
 
7
15
  A local agent-swarm orchestrator with a terminal dashboard and a localhost web UI. Works with DeepSeek, OpenAI, Anthropic, xAI, MiniMax, OpenRouter, Ollama, LM Studio, or any OpenAI-compatible endpoint.
8
16
 
@@ -21,7 +29,7 @@ You give it a mission. A conductor model breaks the mission into tasks and hands
21
29
  │ T4 dep │◀─────│ verify │ adversarial verification
22
30
  └────┬─────┘ └─────────┘
23
31
  ┌────▼─────┐
24
- │Synthesize│ → final-report.md + artifacts
32
+ │Synthesize│ → final report (.md + .html) + artifacts
25
33
  └──────────┘
26
34
  ```
27
35
 
@@ -93,17 +101,32 @@ Run options (also on the UI launch form under Options): `--workers N` (paralleli
93
101
 
94
102
  ## How it works
95
103
 
96
- The conductor is a model with three tools: `spawn_tasks`, `wait`, and `finish`. It reads the mission, spawns self-contained tasks (each with an objective, success criteria, a role, optional dependencies, and an optional `verify` flag), then reacts as reports come back.
104
+ The conductor is a model with six tools: `spawn_tasks`, `set_phase`, `update_plan`, `read_report`, `wait`, and `finish`. It reads the mission, spawns self-contained tasks (each with an objective, success criteria, a role, optional dependencies, and an optional `verify` flag), then reacts as reports come back. On long missions it declares phases (`set_phase`) whose goals and exit criteria are pinned into every update — so the plan survives even when old history is trimmed and replaced by a mission ledger (settled tasks, decisions, current phase).
105
+
106
+ Each task becomes an autonomous agent with a tool budget. It works in small steps, posts durable findings to the blackboard (decisions are never trimmed from digests; `search_notes` searches the full history), journals progress checkpoints on long tasks, saves artifacts, and ends by reporting back with structured handoff fields (`key_facts`, `open_questions`, `files_touched`). Dependent tasks receive report excerpts plus those fields, and can pull full text with `read_report`.
107
+
108
+ **Scale.** A global AIMD limiter (`maxConcurrentCalls`) bounds concurrent model calls per endpoint — a 429 halves the ceiling, successes recover it, and conductor calls always jump the queue, so a 100-agent swarm degrades gracefully instead of melting down. Settles are debounced before waking the conductor; on big runs the task table collapses settled waves (failures stay itemized) and excess reports become one-liners the conductor can expand with `read_report`. Spawn specs take a `model` tier (`cheap` for scouts, `strong` for leads/verifiers via `cheapModel`/`strongModel` config) and `team:true` to run a task as a full sub-swarm — its own conductor decomposes it in parallel and reports one consolidated result, with all activity journaled under its `teamId`.
97
109
 
98
- Each task becomes an autonomous agent with a tool budget. It works in small steps, posts durable findings to the blackboard, saves artifacts, and ends by reporting back. The report is the only thing the conductor sees, which keeps reports specific.
110
+ **Long horizon.** The conductor maintains a living `mission-plan.md` (`update_plan`) pinned into every update and restored on resume; every 25 settled tasks a progress snapshot lands in `artifacts/` so multi-day runs always have a partial deliverable; and real-directory runs leave a memory (`~/.agentswarm/memory/`) of missions, outcomes, and decisions that seeds the next swarm in the same workspace.
111
+
112
+ Verified tasks pass two gates: a free mechanical check (claimed artifacts must exist and be non-empty), then a blind LLM verifier that judges the deliverables against the objective with its own tools — it never sees the worker's blackboard. In `--verify strict` mode, a completeness critic reviews the whole run for gaps before synthesis (the conductor gets one round to fill them), and the final report is checked for faithfulness against the task reports.
99
113
 
100
114
  The scheduler starts a task as soon as its dependencies are done, up to the parallelism cap. Tasks whose dependencies failed are blocked and surfaced to the conductor for re-planning.
101
115
 
102
- When the conductor finishes (or the budget forces it), a synthesizer composes `final-report.md` from every task report.
116
+ When the conductor finishes (or the budget forces it), a synthesizer composes the final deliverable from every task report. Deliverables ship in the format the mission calls for — code, `.csv`/`.json` data, styled documents — alongside `final-report.md` and a self-contained `final-report.html` rendering (open it with `swarm report <id> --open`).
103
117
 
104
118
  The journal is the source of truth. Every run is an append-only `events.jsonl`; the terminal dashboard, the web UI, and `swarm ls` all reduce the same file. That's why runs survive crashes and can be resumed or replayed. Runs live under `~/.agentswarm/runs/<id>/`.
105
119
 
106
- If the engine process dies without writing a terminal status (kill -9, reboot), the hub notices the missing process and shows the run as interrupted instead of leaving it "running" forever.
120
+ If the engine process dies without writing a terminal status (kill -9, reboot), the hub notices the missing process and shows the run as interrupted instead of leaving it "running" forever. `swarm resume <id>` continues it: settled tasks keep their results, and tasks that were mid-flight restart *warm* from their last journaled checkpoint instead of from scratch. SIGTERM flushes the journal synchronously and leaves the run resumable.
121
+
122
+ ## Troubleshooting
123
+
124
+ - **"interrupted — the engine process is no longer running"** — the engine died without a terminal status (kill -9, reboot, crash). Check `~/.agentswarm/runs/<id>/exec.log` for the crash output, then `swarm resume <id>`.
125
+ - **Run ended with "conductor unavailable"** — five consecutive conductor API calls failed (after backoff). Usually a provider outage or a bad model name; check the run's activity log for the underlying error, fix, and resume.
126
+ - **"journal writes are failing"** — the engine could not append to `events.jsonl` (disk full, permissions). The run aborts deliberately rather than doing unrecorded work.
127
+ - **A verified task keeps failing with "Claimed artifact(s) do not exist"** — the worker reported files it never wrote. That's the mechanical pre-verifier doing its job; the retry prompt tells the worker to actually create them.
128
+ - **Docker sandbox fails to start** — confirm `docker info` works as your user, and that the configured `sandboxImage` can be pulled. `swarm sandbox test` checks the configured runtime end-to-end.
129
+ - **Hung or wedged run** — `swarm cancel <id>` aborts in-flight agents within ~1s; sandbox teardown is bounded by a 15s timeout so it can't hang shutdown.
107
130
 
108
131
  ## Architecture
109
132
 
package/dist/agent.js CHANGED
@@ -58,7 +58,20 @@ async function runAgent(p) {
58
58
  if (stopReason)
59
59
  break;
60
60
  steps++;
61
- const res = await callModel();
61
+ let res;
62
+ try {
63
+ res = await callModel();
64
+ }
65
+ catch (e) {
66
+ // The chat client already retries 429/5xx; this catches the rest of the
67
+ // transient class (connection resets, DNS blips) once per step so a
68
+ // single network hiccup doesn't burn a whole task attempt.
69
+ if (p.signal.aborted)
70
+ throw e;
71
+ hooks.onLog?.("warn", `${p.agentId}: model call failed (${(0, util_1.errMsg)(e)}); retrying once`);
72
+ await new Promise((r) => setTimeout(r, 1500));
73
+ res = await callModel();
74
+ }
62
75
  hooks.onUsage?.(p.model, res.usage);
63
76
  usage = (0, types_1.addUsage)(usage, res.usage);
64
77
  if (res.toolCalls.length === 0) {
@@ -211,6 +224,8 @@ async function compact(p, messages) {
211
224
  });
212
225
  p.hooks.onUsage?.(p.model, res.usage);
213
226
  summary = res.content || "(compaction produced no summary)";
227
+ if (res.content)
228
+ p.hooks.onCheckpoint?.(res.content);
214
229
  }
215
230
  catch (e) {
216
231
  // Compaction is best-effort; fall back to hard truncation.
package/dist/cli.js CHANGED
@@ -342,11 +342,24 @@ async function execForeground(cfg, meta, render, resume = false) {
342
342
  };
343
343
  process.on("uncaughtException", onFatal);
344
344
  process.on("unhandledRejection", onFatal);
345
+ // SIGTERM (kill, system shutdown): flush buffered journal lines synchronously
346
+ // and exit WITHOUT a terminal status — the run stays resumable, and viewers
347
+ // show it as interrupted once the pid disappears.
348
+ const onTerm = () => {
349
+ journal.append("log", { level: "warn", msg: "engine received SIGTERM — exiting; resume with: swarm resume " + meta.id });
350
+ journal.flushSync();
351
+ (0, run_1.clearPid)(meta.id);
352
+ if (renderer)
353
+ renderer.stop();
354
+ process.exit(143);
355
+ };
356
+ process.on("SIGTERM", onTerm);
345
357
  try {
346
358
  await executor.run();
347
359
  }
348
360
  finally {
349
361
  process.off("SIGINT", onSig);
362
+ process.off("SIGTERM", onTerm);
350
363
  process.off("uncaughtException", onFatal);
351
364
  process.off("unhandledRejection", onFatal);
352
365
  (0, run_1.clearPid)(meta.id);
@@ -476,8 +489,10 @@ function cmdReport(id, flags) {
476
489
  process.exit(1);
477
490
  }
478
491
  if (flags.open) {
479
- openBrowser("file://" + file);
480
- console.log(file);
492
+ const html = path.join((0, config_1.runDir)(id), "artifacts", "final-report.html");
493
+ const target = fs.existsSync(html) ? html : file;
494
+ openBrowser("file://" + target);
495
+ console.log(target);
481
496
  return;
482
497
  }
483
498
  process.stdout.write(fs.readFileSync(file, "utf8") + "\n");
@@ -614,7 +629,7 @@ function printFinalLine(id) {
614
629
  console.log("");
615
630
  if (fs.existsSync(reportFile)) {
616
631
  console.log(util_1.ansi.green("✓ final report: ") + reportFile);
617
- console.log(util_1.ansi.gray(" view: ") + `swarm report ${id}`);
632
+ console.log(util_1.ansi.gray(" view: ") + `swarm report ${id}` + util_1.ansi.gray(" · open in browser: ") + `swarm report ${id} --open`);
618
633
  }
619
634
  else {
620
635
  console.log(util_1.ansi.gray(`run ${id} ended without a final report (see: swarm watch ${id})`));
@@ -661,7 +676,6 @@ ${b("RUN OPTIONS")}
661
676
  ${b("FIRST RUN")}
662
677
  swarm config set apiKey <key> # key for the active provider (default: DeepSeek)
663
678
  swarm config set provider <id> # deepseek | openai | anthropic | xai | minimax | openrouter | ollama | lmstudio | custom
664
- pip install searchkit # optional: local, citable web search for agents
665
679
  swarm serve --open # open the web UI
666
680
  `);
667
681
  }
package/dist/config.js CHANGED
@@ -70,17 +70,24 @@ exports.DEFAULTS = {
70
70
  baseUrl: providers_1.PROVIDERS.deepseek.baseUrl,
71
71
  model: "deepseek-v4-flash",
72
72
  conductorModel: "deepseek-v4-flash",
73
+ cheapModel: "",
74
+ strongModel: "",
73
75
  maxWorkers: 6,
74
76
  maxStepsPerTask: 30,
75
- maxTasks: 48,
77
+ maxTasks: 200,
76
78
  maxTokensPerRun: 12_000_000,
77
79
  verification: "normal",
80
+ verifyMaxAttempts: 2,
78
81
  thinking: true,
79
82
  reasoningEffort: "high",
80
83
  safeMode: true,
81
84
  tinyfishApiKey: "",
82
85
  searchBackend: "auto",
83
- searchkitCmd: "searchkit",
86
+ firecrawlApiKey: "",
87
+ contextdevApiKey: "",
88
+ deepcrawlApiKey: "",
89
+ deepcrawlBaseUrl: "",
90
+ crawlBackend: "auto",
84
91
  sandboxRuntime: "host",
85
92
  sandboxImage: "node:22-bookworm",
86
93
  e2bApiKey: "",
@@ -90,10 +97,11 @@ exports.DEFAULTS = {
90
97
  vercelToken: "",
91
98
  vercelTeamId: "",
92
99
  vercelProjectId: "",
100
+ maxConcurrentCalls: 16,
93
101
  requestTimeoutMs: 900_000,
94
102
  idleTimeoutMs: 180_000,
95
103
  contextTokenLimit: 120_000,
96
- maxToolResultChars: 12_000,
104
+ maxToolResultChars: 20_000,
97
105
  hubPort: 7777,
98
106
  uiPort: 7780,
99
107
  pricing: exports.DEFAULT_PRICING,
@@ -109,6 +117,9 @@ exports.SECRET_ENV_KEYS = [
109
117
  .map((p) => p.keyEnv)
110
118
  .filter((k) => Boolean(k)),
111
119
  "TINYFISH_API_KEY",
120
+ "FIRECRAWL_API_KEY",
121
+ "CONTEXT_DEV_API_KEY",
122
+ "DEEPCRAWL_API_KEY",
112
123
  "E2B_API_KEY",
113
124
  "MODAL_TOKEN_ID",
114
125
  "MODAL_TOKEN_SECRET",
@@ -160,6 +171,14 @@ function loadConfig() {
160
171
  cfg.apiKey = process.env[info.keyEnv];
161
172
  if (process.env.TINYFISH_API_KEY)
162
173
  cfg.tinyfishApiKey = process.env.TINYFISH_API_KEY;
174
+ if (process.env.FIRECRAWL_API_KEY)
175
+ cfg.firecrawlApiKey = process.env.FIRECRAWL_API_KEY;
176
+ if (process.env.CONTEXT_DEV_API_KEY)
177
+ cfg.contextdevApiKey = process.env.CONTEXT_DEV_API_KEY;
178
+ if (process.env.DEEPCRAWL_API_KEY)
179
+ cfg.deepcrawlApiKey = process.env.DEEPCRAWL_API_KEY;
180
+ if (process.env.DEEPCRAWL_BASE_URL)
181
+ cfg.deepcrawlBaseUrl = process.env.DEEPCRAWL_BASE_URL;
163
182
  if (process.env.E2B_API_KEY)
164
183
  cfg.e2bApiKey = process.env.E2B_API_KEY;
165
184
  if (process.env.MODAL_TOKEN_ID)
@@ -218,17 +237,24 @@ exports.SETTABLE_KEYS = [
218
237
  "baseUrl",
219
238
  "model",
220
239
  "conductorModel",
240
+ "cheapModel",
241
+ "strongModel",
221
242
  "maxWorkers",
222
243
  "maxStepsPerTask",
223
244
  "maxTasks",
224
245
  "maxTokensPerRun",
225
246
  "verification",
247
+ "verifyMaxAttempts",
226
248
  "thinking",
227
249
  "reasoningEffort",
228
250
  "safeMode",
229
251
  "tinyfishApiKey",
230
252
  "searchBackend",
231
- "searchkitCmd",
253
+ "firecrawlApiKey",
254
+ "contextdevApiKey",
255
+ "deepcrawlApiKey",
256
+ "deepcrawlBaseUrl",
257
+ "crawlBackend",
232
258
  "sandboxRuntime",
233
259
  "sandboxImage",
234
260
  "e2bApiKey",
@@ -238,15 +264,18 @@ exports.SETTABLE_KEYS = [
238
264
  "vercelToken",
239
265
  "vercelTeamId",
240
266
  "vercelProjectId",
267
+ "maxConcurrentCalls",
241
268
  "contextTokenLimit",
242
269
  "hubPort",
243
270
  "uiPort",
244
271
  ];
245
272
  /** Allowed ranges for numeric settings (values are clamped, not rejected). */
246
273
  const NUM_RANGES = {
247
- maxWorkers: [1, 32],
274
+ maxWorkers: [1, 128],
275
+ maxConcurrentCalls: [1, 256],
248
276
  maxStepsPerTask: [3, 200],
249
277
  maxTasks: [1, 1000],
278
+ verifyMaxAttempts: [1, 5],
250
279
  maxTokensPerRun: [50_000, 2_000_000_000],
251
280
  contextTokenLimit: [8_000, 900_000],
252
281
  hubPort: [0, 65535],
@@ -256,6 +285,7 @@ const ENUMS = {
256
285
  verification: ["off", "normal", "strict"],
257
286
  reasoningEffort: ["low", "medium", "high", "max"],
258
287
  searchBackend: ["auto", "tinyfish", "ddg"],
288
+ crawlBackend: ["auto", "firecrawl", "contextdev", "deepcrawl", "off"],
259
289
  sandboxRuntime: ["auto", "host", "docker", "e2b", "modal", "vercel"],
260
290
  provider: Object.keys(providers_1.PROVIDERS),
261
291
  };
package/dist/crawltools.js ADDED
@@ -0,0 +1,247 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.resolveCrawlBackend = resolveCrawlBackend;
4
+ exports.hasScrapeBackend = hasScrapeBackend;
5
+ exports.crawlSite = crawlSite;
6
+ exports.scrapeUrl = scrapeUrl;
7
+ exports.slugForUrl = slugForUrl;
8
+ const util_1 = require("./util");
9
+ const PER_PAGE_CHAR_CAP = 200_000;
10
+ const TOTAL_CHAR_BUDGET = 8_000_000;
11
+ const CRAWL_DEADLINE_MS = 120_000;
12
+ /** auto = first configured: Firecrawl → context.dev → deepcrawl. "off" or nothing configured → null. */
13
+ function resolveCrawlBackend(cfg) {
14
+ if (cfg.crawlBackend === "off")
15
+ return null;
16
+ const configured = {
17
+ firecrawl: Boolean(cfg.firecrawlApiKey),
18
+ contextdev: Boolean(cfg.contextdevApiKey),
19
+ deepcrawl: Boolean(cfg.deepcrawlApiKey && cfg.deepcrawlBaseUrl),
20
+ };
21
+ if (cfg.crawlBackend !== "auto")
22
+ return configured[cfg.crawlBackend] ? cfg.crawlBackend : null;
23
+ for (const id of ["firecrawl", "contextdev", "deepcrawl"]) {
24
+ if (configured[id])
25
+ return id;
26
+ }
27
+ return null;
28
+ }
29
+ /** Backends usable for single-page scrape in fetch_url (the custom deepcrawl contract has no scrape endpoint). */
30
+ function hasScrapeBackend(cfg) {
31
+ const b = resolveCrawlBackend(cfg);
32
+ return b === "firecrawl" || b === "contextdev";
33
+ }
34
+ async function crawlSite(cfg, opts) {
35
+ const backend = resolveCrawlBackend(cfg);
36
+ if (!backend)
37
+ throw new Error("no crawl backend configured — add a Firecrawl/context.dev/deepcrawl key in Settings");
38
+ const warnings = [];
39
+ let pages;
40
+ if (backend === "firecrawl")
41
+ pages = await firecrawlCrawl(cfg, opts, warnings);
42
+ else if (backend === "contextdev")
43
+ pages = await contextdevCrawl(cfg, opts);
44
+ else
45
+ pages = await deepcrawlCrawl(cfg, opts);
46
+ // Normalize: drop empty/binary pages, cap per-page and total size.
47
+ const clean = [];
48
+ let skipped = 0;
49
+ let total = 0;
50
+ for (const p of pages) {
51
+ if (clean.length >= opts.maxPages)
52
+ break;
53
+ const md = (p.markdown || "").trim();
54
+ if (!md || md.includes("\u0000")) {
55
+ skipped++;
56
+ continue;
57
+ }
58
+ const body = (0, util_1.truncateMiddle)(md, PER_PAGE_CHAR_CAP, "chars");
59
+ if (total + body.length > TOTAL_CHAR_BUDGET) {
60
+ warnings.push(`stopped at ${clean.length} pages: total content budget reached`);
61
+ break;
62
+ }
63
+ total += body.length;
64
+ clean.push({ url: p.url, title: p.title, markdown: body });
65
+ }
66
+ if (skipped)
67
+ warnings.push(`${skipped} empty page${skipped > 1 ? "s" : ""} skipped`);
68
+ return { backend, pages: clean, warnings };
69
+ }
70
+ /** Single-page scrape via the configured backend. Throws on failure — callers fall through to their own fetch path. */
71
+ async function scrapeUrl(cfg, url, signal) {
72
+ const backend = resolveCrawlBackend(cfg);
73
+ if (backend === "firecrawl") {
74
+ const data = await callJson("firecrawl", "https://api.firecrawl.dev/v1/scrape", cfg.firecrawlApiKey, { url, formats: ["markdown"] }, 30_000, signal);
75
+ const md = String(data?.data?.markdown ?? "");
76
+ if (!md.trim())
77
+ throw new Error("firecrawl: empty scrape result");
78
+ const title = data?.data?.metadata?.title;
79
+ return title ? `# ${title}\n\n${md}` : md;
80
+ }
81
+ if (backend === "contextdev") {
82
+ const data = await callJson("context.dev", "https://api.context.dev/v1/web/scrape", cfg.contextdevApiKey, { url }, 30_000, signal);
83
+ const md = String(data?.markdown ?? data?.results?.[0]?.markdown ?? "");
84
+ if (!md.trim())
85
+ throw new Error("context.dev: empty scrape result");
86
+ const title = data?.metadata?.title ?? data?.results?.[0]?.metadata?.title;
87
+ return title ? `# ${title}\n\n${md}` : md;
88
+ }
89
+ throw new Error("no scrape-capable crawl backend configured");
90
+ }
91
+ /** "https://docs.foo.com/a/b?x=1" → filesystem-safe { host, slug } with no separators or traversal. */
92
+ function slugForUrl(url) {
93
+ let u;
94
+ try {
95
+ u = new URL(url);
96
+ }
97
+ catch {
98
+ return { host: "site", slug: sanitize(url) || "page" };
99
+ }
100
+ const host = sanitize(u.hostname) || "site";
101
+ const slug = sanitize(u.pathname.replace(/\/+$/, "")) || "index";
102
+ return { host, slug };
103
+ }
104
+ function sanitize(s) {
105
+ return s
106
+ .toLowerCase()
107
+ .replace(/[^a-z0-9._-]+/g, "-")
108
+ .replace(/\.{2,}/g, ".")
109
+ .replace(/-{2,}/g, "-")
110
+ .replace(/^[-.]+|[-.]+$/g, "")
111
+ .slice(0, 120);
112
+ }
113
+ // ---------------------------------------------------------------- backends
114
+ async function firecrawlCrawl(cfg, opts, warnings) {
115
+ const start = await callJson("firecrawl", "https://api.firecrawl.dev/v1/crawl", cfg.firecrawlApiKey, {
116
+ url: opts.url,
117
+ limit: opts.maxPages,
118
+ ...(opts.includePaths?.length ? { includePaths: opts.includePaths } : {}),
119
+ scrapeOptions: { formats: ["markdown"] },
120
+ }, 30_000, opts.signal);
121
+ const jobId = start?.id;
122
+ if (!jobId)
123
+ throw new Error(`firecrawl: crawl did not start (${start?.error || "no job id"})`);
124
+ const pollMs = opts.pollMs ?? 3000;
125
+ const deadline = Date.now() + CRAWL_DEADLINE_MS;
126
+ let last = null;
127
+ for (;;) {
128
+ opts.signal?.throwIfAborted();
129
+ last = await getJson("firecrawl", `https://api.firecrawl.dev/v1/crawl/${jobId}`, cfg.firecrawlApiKey, opts.signal);
130
+ if (last?.status === "completed")
131
+ break;
132
+ if (last?.status === "failed")
133
+ throw new Error(`firecrawl: crawl failed (${last?.error || "unknown error"})`);
134
+ if (Date.now() > deadline) {
135
+ const partial = mapFirecrawlPages(last);
136
+ if (!partial.length)
137
+ throw new Error("firecrawl: crawl still running after 120s with no pages yet — try fewer pages");
138
+ warnings.push(`crawl still running after 120s; returning ${partial.length} partial pages`);
139
+ return partial;
140
+ }
141
+ await sleep(pollMs, opts.signal);
142
+ }
143
+ // Completed: collect pages, following `next` pagination until maxPages.
144
+ const pages = mapFirecrawlPages(last);
145
+ let next = last?.next;
146
+ while (next && pages.length < opts.maxPages) {
147
+ const more = await getJson("firecrawl", String(next), cfg.firecrawlApiKey, opts.signal);
148
+ pages.push(...mapFirecrawlPages(more));
149
+ next = more?.next;
150
+ }
151
+ return pages;
152
+ }
153
+ function mapFirecrawlPages(res) {
154
+ const data = Array.isArray(res?.data) ? res.data : [];
155
+ return data.map((d) => ({
156
+ url: String(d?.metadata?.sourceURL ?? d?.metadata?.url ?? ""),
157
+ title: String(d?.metadata?.title ?? ""),
158
+ markdown: String(d?.markdown ?? ""),
159
+ }));
160
+ }
161
+ async function contextdevCrawl(cfg, opts) {
162
+ const data = await callJson("context.dev", "https://api.context.dev/v1/web/crawl", cfg.contextdevApiKey, {
163
+ url: opts.url,
164
+ max_pages: opts.maxPages,
165
+ ...(opts.includePaths?.length ? { include_paths: opts.includePaths } : {}),
166
+ }, CRAWL_DEADLINE_MS, opts.signal);
167
+ const results = Array.isArray(data?.results) ? data.results : [];
168
+ return results.map((r) => ({
169
+ url: String(r?.metadata?.url ?? r?.url ?? ""),
170
+ title: String(r?.metadata?.title ?? r?.title ?? ""),
171
+ markdown: String(r?.markdown ?? ""),
172
+ }));
173
+ }
174
+ async function deepcrawlCrawl(cfg, opts) {
175
+ const base = cfg.deepcrawlBaseUrl.replace(/\/+$/, "");
176
+ const data = await callJson("deepcrawl", `${base}/crawl`, cfg.deepcrawlApiKey, {
177
+ url: opts.url,
178
+ max_pages: opts.maxPages,
179
+ ...(opts.includePaths?.length ? { include_paths: opts.includePaths } : {}),
180
+ }, CRAWL_DEADLINE_MS, opts.signal);
181
+ // Accept either the context.dev-compatible shape or a flat pages[] list.
182
+ if (Array.isArray(data?.results)) {
183
+ return data.results.map((r) => ({
184
+ url: String(r?.metadata?.url ?? r?.url ?? ""),
185
+ title: String(r?.metadata?.title ?? r?.title ?? ""),
186
+ markdown: String(r?.markdown ?? ""),
187
+ }));
188
+ }
189
+ if (Array.isArray(data?.pages)) {
190
+ return data.pages.map((p) => ({
191
+ url: String(p?.url ?? ""),
192
+ title: String(p?.title ?? ""),
193
+ markdown: String(p?.markdown ?? p?.content ?? ""),
194
+ }));
195
+ }
196
+ throw new Error("deepcrawl: unrecognized response shape (expected results[] or pages[])");
197
+ }
198
+ // ---------------------------------------------------------------- plumbing
199
+ function friendlyHttpError(service, status, body) {
200
+ if (status === 401 || status === 403) {
201
+ return new Error(`${service} API key invalid or unauthorized (HTTP ${status}) — check Settings → Crawl integrations`);
202
+ }
203
+ if (status === 402)
204
+ return new Error(`${service}: quota or credits exhausted (HTTP 402)`);
205
+ if (status === 429)
206
+ return new Error(`${service}: rate limited (HTTP 429) — retry later`);
207
+ return new Error(`${service}: HTTP ${status} ${(0, util_1.truncateMiddle)(body, 300, "chars")}`);
208
+ }
209
+ function mergeSignal(timeoutMs, signal) {
210
+ const t = AbortSignal.timeout(timeoutMs);
211
+ if (!signal)
212
+ return t;
213
+ return typeof AbortSignal.any === "function" ? AbortSignal.any([t, signal]) : signal;
214
+ }
215
+ async function callJson(service, url, key, body, timeoutMs, signal) {
216
+ const res = await fetch(url, {
217
+ method: "POST",
218
+ headers: { authorization: `Bearer ${key}`, "content-type": "application/json" },
219
+ body: JSON.stringify(body),
220
+ signal: mergeSignal(timeoutMs, signal),
221
+ });
222
+ if (!res.ok)
223
+ throw friendlyHttpError(service, res.status, await res.text().catch(() => ""));
224
+ return res.json();
225
+ }
226
+ async function getJson(service, url, key, signal) {
227
+ const res = await fetch(url, {
228
+ headers: { authorization: `Bearer ${key}` },
229
+ signal: mergeSignal(30_000, signal),
230
+ });
231
+ if (!res.ok)
232
+ throw friendlyHttpError(service, res.status, await res.text().catch(() => ""));
233
+ return res.json();
234
+ }
235
+ function sleep(ms, signal) {
236
+ return new Promise((resolve, reject) => {
237
+ const t = setTimeout(() => {
238
+ signal?.removeEventListener("abort", onAbort);
239
+ resolve();
240
+ }, ms);
241
+ const onAbort = () => {
242
+ clearTimeout(t);
243
+ reject(new Error("aborted"));
244
+ };
245
+ signal?.addEventListener("abort", onAbort, { once: true });
246
+ });
247
+ }