@robzilla1738/agentswarm 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -12
- package/dist/agent.js +2 -1
- package/dist/cli.js +21 -4
- package/dist/config.js +27 -1
- package/dist/executor.js +243 -43
- package/dist/hub.js +69 -3
- package/dist/memory.js +5 -4
- package/dist/pdftext.js +211 -0
- package/dist/prompts.js +23 -15
- package/dist/report.js +37 -0
- package/dist/run.js +8 -0
- package/dist/sandbox.js +11 -0
- package/dist/searchcore.js +55 -2
- package/dist/state.js +34 -6
- package/dist/tools.js +196 -19
- package/dist/util.js +85 -0
- package/dist/webtools.js +145 -15
- package/package.json +1 -1
- package/ui/out/404/index.html +1 -1
- package/ui/out/404.html +1 -1
- package/ui/out/_next/static/chunks/677-721ce1c8b7a6a317.js +1 -0
- package/ui/out/_next/static/chunks/app/run/page-3674e103981703a2.js +1 -0
- package/ui/out/_next/static/chunks/app/settings/page-41a5d8ba43ecfd4a.js +1 -0
- package/ui/out/_next/static/css/{9f7bd82b8e4c762c.css → d95c2ba395730031.css} +1 -1
- package/ui/out/index.html +1 -1
- package/ui/out/index.txt +3 -3
- package/ui/out/run/index.html +1 -1
- package/ui/out/run/index.txt +3 -3
- package/ui/out/settings/index.html +1 -1
- package/ui/out/settings/index.txt +3 -3
- package/ui/out/_next/static/chunks/677-859e8d42add1806b.js +0 -1
- package/ui/out/_next/static/chunks/app/run/page-2420c9e4c963d9b3.js +0 -1
- package/ui/out/_next/static/chunks/app/settings/page-092a6bf42dfde57d.js +0 -1
- /package/ui/out/_next/static/{errjtBR_bKoee8ogLp8xk → 7_pihFubDGD40BCy2ynlr}/_buildManifest.js +0 -0
- /package/ui/out/_next/static/{errjtBR_bKoee8ogLp8xk → 7_pihFubDGD40BCy2ynlr}/_ssgManifest.js +0 -0
package/dist/tools.js
CHANGED
|
@@ -40,6 +40,7 @@ exports.synthToolset = synthToolset;
|
|
|
40
40
|
const fs = __importStar(require("fs"));
|
|
41
41
|
const path = __importStar(require("path"));
|
|
42
42
|
const crawltools_1 = require("./crawltools");
|
|
43
|
+
const searchcore_1 = require("./searchcore");
|
|
43
44
|
const util_1 = require("./util");
|
|
44
45
|
const webtools_1 = require("./webtools");
|
|
45
46
|
// ---------- safety ----------
|
|
@@ -62,9 +63,48 @@ function checkCommand(cmd, cfg) {
|
|
|
62
63
|
function resolveRead(p, ctx) {
|
|
63
64
|
return path.resolve(ctx.workdir, p);
|
|
64
65
|
}
|
|
66
|
+
/**
 * Wrap a string in single quotes so `sh` reads it as one literal word.
 *
 * @param {string} s - Raw text to quote.
 * @returns {string} The single-quoted form of `s`.
 */
function shq(s) {
    // A single quote cannot appear inside '...': close the quoted run, emit an
    // escaped quote, then reopen the quoted run.
    const escaped = s.split("'").join(`'\\''`);
    return "'" + escaped + "'";
}
|
|
70
|
+
/**
 * Where a write actually lands: realpath of the deepest existing ancestor plus
 * the not-yet-created remainder. Confinement checks must use this, or a
 * symlink inside the workdir smuggles writes anywhere on the host.
 *
 * Dangling symlinks are followed explicitly: `fs.existsSync` reports them as
 * missing, yet writing through one creates the file at the link's target.
 *
 * @param {string} abs - Absolute path the caller intends to write.
 * @returns {string} The path the write would really resolve to.
 * @throws {Error} When more than 40 dangling symlinks are chained (a loop).
 */
function realDestination(abs) {
    const MAX_LINK_HOPS = 40;
    let dir = abs;
    const tail = [];
    let hops = 0;
    while (!fs.existsSync(dir)) {
        // existsSync follows links, so "missing" may still be a symlink whose
        // target does not exist (or a symlink loop). lstat sees the link itself.
        let linkTarget = null;
        try {
            if (fs.lstatSync(dir).isSymbolicLink())
                linkTarget = fs.readlinkSync(dir);
        }
        catch {
            /* nothing at this path at all: treat as not-yet-created */
        }
        if (linkTarget !== null) {
            hops += 1;
            if (hops > MAX_LINK_HOPS)
                throw new Error(`too many levels of symbolic links: ${abs}`);
            // Relative link targets are relative to the link's real directory.
            let linkDir = path.dirname(dir);
            try {
                linkDir = fs.realpathSync(linkDir);
            }
            catch {
                /* races/permissions: keep the lexical directory */
            }
            dir = path.resolve(linkDir, linkTarget);
            continue;
        }
        tail.unshift(path.basename(dir));
        const parent = path.dirname(dir);
        if (parent === dir)
            break; // reached the filesystem root
        dir = parent;
    }
    try {
        dir = fs.realpathSync(dir);
    }
    catch {
        /* races/permissions: keep the lexical path */
    }
    return path.join(dir, ...tail);
}
|
|
93
|
+
/**
 * Canonical form of a confinement root.
 *
 * @param {string} base - Directory path to canonicalize.
 * @returns {string} `fs.realpathSync(base)`, or `base` unchanged when that call throws.
 */
function realBase(base) {
    let resolved = base;
    try {
        resolved = fs.realpathSync(base);
    }
    catch {
        /* realpath failed: fall back to the path as given */
    }
    return resolved;
}
|
|
65
101
|
function resolveWrite(p, ctx) {
|
|
66
102
|
const abs = path.resolve(ctx.workdir, p);
|
|
67
|
-
|
|
103
|
+
// Remote sandboxes own their filesystem — host-side realpath is meaningless there.
|
|
104
|
+
const real = ctx.sandbox.localFs ? realDestination(abs) : abs;
|
|
105
|
+
const ok = (0, util_1.pathInside)(realBase(ctx.workdir), real) ||
|
|
106
|
+
(0, util_1.pathInside)(realBase(ctx.runDirPath), real) ||
|
|
107
|
+
!ctx.cfg.safeMode;
|
|
68
108
|
if (!ok) {
|
|
69
109
|
throw new Error(`safeMode: writes are restricted to the working directory (${ctx.workdir}). ` +
|
|
70
110
|
`Use a relative path, or save deliverables with save_artifact.`);
|
|
@@ -171,7 +211,7 @@ function workerToolset(cfg) {
|
|
|
171
211
|
tools.replace_in_file = {
|
|
172
212
|
schema: {
|
|
173
213
|
name: "replace_in_file",
|
|
174
|
-
description: "Exact string replacement in a file. `find` must match exactly (including whitespace). Fails if not found, or if ambiguous when all=false.",
|
|
214
|
+
description: "Exact string replacement in a file. `find` must match exactly (including whitespace). Fails if not found, or if ambiguous when all=false. For several edits to the same file, pass `edits` — they apply in order, all-or-nothing, in one call.",
|
|
175
215
|
parameters: {
|
|
176
216
|
type: "object",
|
|
177
217
|
properties: {
|
|
@@ -179,25 +219,96 @@ function workerToolset(cfg) {
|
|
|
179
219
|
find: { type: "string" },
|
|
180
220
|
replace: { type: "string" },
|
|
181
221
|
all: { type: "boolean", description: "Replace every occurrence (default false)" },
|
|
222
|
+
edits: {
|
|
223
|
+
type: "array",
|
|
224
|
+
description: "Batch mode: multiple find/replace pairs applied in order, atomically (replaces top-level find/replace)",
|
|
225
|
+
items: {
|
|
226
|
+
type: "object",
|
|
227
|
+
properties: {
|
|
228
|
+
find: { type: "string" },
|
|
229
|
+
replace: { type: "string" },
|
|
230
|
+
all: { type: "boolean" },
|
|
231
|
+
},
|
|
232
|
+
required: ["find", "replace"],
|
|
233
|
+
},
|
|
234
|
+
},
|
|
182
235
|
},
|
|
183
|
-
required: ["path"
|
|
236
|
+
required: ["path"],
|
|
184
237
|
},
|
|
185
238
|
},
|
|
186
239
|
run: async (args, ctx) => {
|
|
187
240
|
const abs = resolveWrite(String(args.path), ctx);
|
|
188
241
|
const raw = await readFileVia(ctx, abs);
|
|
189
|
-
const
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
242
|
+
const edits = Array.isArray(args.edits) && args.edits.length
|
|
243
|
+
? args.edits.map((e) => ({
|
|
244
|
+
find: String(e.find ?? ""),
|
|
245
|
+
replace: String(e.replace ?? ""),
|
|
246
|
+
all: Boolean(e.all),
|
|
247
|
+
}))
|
|
248
|
+
: args.find !== undefined && args.replace !== undefined
|
|
249
|
+
? [{ find: String(args.find), replace: String(args.replace), all: Boolean(args.all) }]
|
|
250
|
+
: null;
|
|
251
|
+
if (!edits)
|
|
252
|
+
throw new Error("provide find+replace, or an edits array");
|
|
253
|
+
// Validate-then-apply against the progressively edited content:
|
|
254
|
+
// any failing edit aborts the whole batch with nothing written.
|
|
255
|
+
let next = raw;
|
|
256
|
+
let total = 0;
|
|
257
|
+
const at = (i) => (edits.length > 1 ? `edit ${i + 1}: ` : "");
|
|
258
|
+
for (let i = 0; i < edits.length; i++) {
|
|
259
|
+
const { find, replace, all } = edits[i];
|
|
260
|
+
if (!find)
|
|
261
|
+
throw new Error(`${at(i)}find must not be empty`);
|
|
262
|
+
const count = next.split(find).length - 1;
|
|
263
|
+
if (count === 0) {
|
|
264
|
+
throw new Error(`${at(i)}find string not found in file${edits.length > 1 ? " — no edits were applied" : ""}`);
|
|
265
|
+
}
|
|
266
|
+
if (count > 1 && !all) {
|
|
267
|
+
throw new Error(`${at(i)}find string matches ${count} times; provide more context or set all=true${edits.length > 1 ? " — no edits were applied" : ""}`);
|
|
268
|
+
}
|
|
269
|
+
next = all ? next.split(find).join(replace) : next.replace(find, replace);
|
|
270
|
+
total += all ? count : 1;
|
|
196
271
|
}
|
|
197
|
-
const next = args.all ? raw.split(find).join(replace) : raw.replace(find, replace);
|
|
198
272
|
await writeFileVia(ctx, abs, next);
|
|
199
273
|
const warn = ctx.checkClaim?.(String(args.path));
|
|
200
|
-
return `replaced ${
|
|
274
|
+
return `replaced ${total} occurrence(s) via ${edits.length} edit(s) in ${abs}${warn ? `\n${warn}` : ""}`;
|
|
275
|
+
},
|
|
276
|
+
};
|
|
277
|
+
tools.grep_files = {
    schema: {
        name: "grep_files",
        description: "Search file contents with a regex (grep -E syntax). Returns matching lines as path:line:text. Use this to locate code or text instead of shell grep pipelines — one round-trip, works identically in remote sandboxes, skips node_modules/.git/build output.",
        parameters: {
            type: "object",
            properties: {
                pattern: { type: "string", description: "Extended regex (grep -E)" },
                path: { type: "string", description: "Directory or file to search (default: working directory)" },
                glob: { type: "string", description: "Filename filter, e.g. *.ts" },
                ignore_case: { type: "boolean" },
                max_results: { type: "number", description: "Default 50, max 200" },
            },
            required: ["pattern"],
        },
    },
    /**
     * Run a recursive `grep -E` through the sandbox and return the matching lines.
     *
     * @param args - Tool arguments: `pattern` (required), optional `path`,
     *   `glob`, `ignore_case`, `max_results` (clamped to 1..200, default 50).
     * @param ctx - Tool context; reads `workdir`, `sandbox.exec` and `signal`.
     * @returns Matching lines joined by newlines, a truncation notice when more
     *   than `max_results` lines matched, or "no matches".
     * @throws {Error} When `pattern` is missing or blank.
     */
    run: async (args, ctx) => {
        const pattern = String(args.pattern ?? "");
        if (!pattern.trim())
            throw new Error("pattern is required");
        const root = args.path ? resolveRead(String(args.path), ctx) : ctx.workdir;
        const max = Math.min(Math.max(Number(args.max_results) || 50, 1), 200);
        // -H forces the filename prefix: without it grep drops the path when the
        // search root is a single regular file, breaking the path:line:text format.
        const flags = `-rnHE${args.ignore_case ? "i" : ""}`;
        const include = args.glob ? ` --include=${shq(String(args.glob))}` : "";
        const excludes = ["node_modules", ".git", "dist", ".next", "out", "build", "target", "__pycache__", ".venv"]
            .map((d) => ` --exclude-dir=${d}`)
            .join("");
        // Ask for one line beyond the limit so truncation can be detected below.
        const cmd = `grep ${flags}${include}${excludes} -e ${shq(pattern)} ${shq(root)} | head -n ${max + 1}`;
        const r = await ctx.sandbox.exec(cmd, { cwd: ctx.workdir, timeoutSec: 60, signal: ctx.signal });
        const lines = r.out.split("\n").filter(Boolean);
        if (!lines.length)
            return "no matches";
        const shown = lines.slice(0, max);
        const more = lines.length > max ? `\n…more matches truncated (raise max_results or narrow the pattern)` : "";
        return shown.join("\n") + more;
    },
};
|
|
203
314
|
tools.list_dir = {
|
|
@@ -295,6 +406,39 @@ function workerToolset(cfg) {
|
|
|
295
406
|
.join("\n");
|
|
296
407
|
},
|
|
297
408
|
};
|
|
409
|
+
tools.academic_search = {
    schema: {
        name: "academic_search",
        description: "Search scholarly sources: arXiv preprints and Crossref journal/conference metadata (keyless APIs). Returns papers with title, link (arXiv/DOI), abstract snippet, and date. Use for scientific or technical questions where peer-reviewed and preprint sources beat the open web.",
        parameters: {
            type: "object",
            properties: {
                query: { type: "string" },
                count: { type: "number", description: "Max results, default 8, max 20" },
            },
            required: ["query"],
        },
    },
    /**
     * Query arXiv and Crossref in parallel and return one merged, numbered list.
     *
     * @param args - Tool arguments: `query` (required) and optional `count`
     *   (clamped to 1..20, default 8).
     * @param ctx - Tool context; only `signal` is read here.
     * @returns A numbered text listing of the merged hits, or "no results".
     * @throws {Error} When `query` is missing or blank, or when no engine
     *   produced any hit and at least one of them rejected (its reason is rethrown).
     */
    run: async (args, ctx) => {
        const count = Math.min(Math.max(Number(args.count) || 8, 1), 20);
        const q = String(args.query ?? "");
        // Without this guard a missing query would be searched as the literal "undefined".
        if (!q.trim())
            throw new Error("query is required");
        const settled = await Promise.allSettled([
            (0, webtools_1.arxivSearch)(q, count, ctx.signal),
            (0, webtools_1.crossrefSearch)(q, count, ctx.signal),
        ]);
        const candidates = settled.flatMap((s) => (s.status === "fulfilled" ? s.value : []));
        if (!candidates.length) {
            // Nothing usable: surface the first engine failure rather than
            // reporting an empty result set.
            const err = settled.find((s) => s.status === "rejected");
            if (err)
                throw err.reason;
            return "no results";
        }
        const merged = (0, searchcore_1.mergeCandidates)(candidates, count);
        return merged
            .map((h, i) => `${i + 1}. ${h.title}${h.date ? ` (${h.date})` : ""} [${h.engine}]\n ${h.url}\n ${h.snippet}`)
            .join("\n");
    },
};
|
|
298
442
|
tools.fetch_url = {
|
|
299
443
|
schema: {
|
|
300
444
|
name: "fetch_url",
|
|
@@ -318,7 +462,7 @@ function workerToolset(cfg) {
|
|
|
318
462
|
tools.note = {
|
|
319
463
|
schema: {
|
|
320
464
|
name: "note",
|
|
321
|
-
description: "Post a durable fact/discovery to the swarm's shared blackboard so the conductor and other agents can see it. Use sparingly — facts other tasks need, not progress chatter. Mark kind='decision' for choices the rest of the mission must respect (
|
|
465
|
+
description: "Post a durable fact/discovery to the swarm's shared blackboard so the conductor and other agents can see it. Use sparingly — facts other tasks need, not progress chatter. Mark kind='decision' for choices the rest of the mission must respect, and kind='conflict' when independent sources disagree on a material fact (both are never trimmed from digests).",
|
|
322
466
|
parameters: {
|
|
323
467
|
type: "object",
|
|
324
468
|
properties: {
|
|
@@ -326,18 +470,20 @@ function workerToolset(cfg) {
|
|
|
326
470
|
key: { type: "string", description: "Optional short label" },
|
|
327
471
|
kind: {
|
|
328
472
|
type: "string",
|
|
329
|
-
enum: ["finding", "decision", "open-question", "handoff", "claim"],
|
|
330
|
-
description: "Category (default finding). kind='claim' with key=<file path> advertises you are editing that file",
|
|
473
|
+
enum: ["finding", "decision", "conflict", "open-question", "handoff", "claim"],
|
|
474
|
+
description: "Category (default finding). kind='conflict' flags sources that disagree — name both. kind='claim' with key=<file path> advertises you are editing that file",
|
|
331
475
|
},
|
|
476
|
+
url: { type: "string", description: "Source URL backing this note, when it came from the web" },
|
|
332
477
|
},
|
|
333
478
|
required: ["text"],
|
|
334
479
|
},
|
|
335
480
|
},
|
|
336
481
|
run: async (args, ctx) => {
|
|
337
|
-
const kind = ["finding", "decision", "open-question", "handoff", "claim"].includes(String(args.kind))
|
|
482
|
+
const kind = ["finding", "decision", "conflict", "open-question", "handoff", "claim"].includes(String(args.kind))
|
|
338
483
|
? String(args.kind)
|
|
339
484
|
: undefined;
|
|
340
|
-
|
|
485
|
+
const url = /^https?:\/\//.test(String(args.url ?? "")) ? String(args.url) : undefined;
|
|
486
|
+
ctx.addNote(String(args.text), args.key ? String(args.key) : undefined, kind, url);
|
|
341
487
|
return "noted on the blackboard";
|
|
342
488
|
},
|
|
343
489
|
};
|
|
@@ -416,8 +562,12 @@ function workerToolset(cfg) {
|
|
|
416
562
|
},
|
|
417
563
|
run: async (args, ctx) => {
|
|
418
564
|
const name = String(args.name).replace(/^\/+/, "");
|
|
419
|
-
const
|
|
420
|
-
|
|
565
|
+
const artifactsRoot = path.join(ctx.runDirPath, "artifacts");
|
|
566
|
+
(0, util_1.ensureDir)(artifactsRoot);
|
|
567
|
+
const dest = path.join(artifactsRoot, name);
|
|
568
|
+
// Realpath-based: neither ../ traversal nor a planted symlink may move
|
|
569
|
+
// the artifact outside the run's artifacts folder.
|
|
570
|
+
if (!(0, util_1.pathInside)(realBase(artifactsRoot), realDestination(dest))) {
|
|
421
571
|
throw new Error("artifact name must stay inside the artifacts folder");
|
|
422
572
|
}
|
|
423
573
|
(0, util_1.ensureDir)(path.dirname(dest));
|
|
@@ -546,6 +696,20 @@ exports.REPORT_TOOL = {
|
|
|
546
696
|
items: { type: "string" },
|
|
547
697
|
description: "Every file you created or modified (exact paths)",
|
|
548
698
|
},
|
|
699
|
+
sources: {
|
|
700
|
+
type: "array",
|
|
701
|
+
description: "Web sources your findings rely on — REQUIRED whenever your work drew on the web. They flow into the final report's bibliography; a web-sourced claim without an entry here cannot be cited.",
|
|
702
|
+
items: {
|
|
703
|
+
type: "object",
|
|
704
|
+
properties: {
|
|
705
|
+
url: { type: "string" },
|
|
706
|
+
title: { type: "string" },
|
|
707
|
+
date: { type: "string", description: "Publication date if known (ISO or year)" },
|
|
708
|
+
note: { type: "string", description: "What this source supports" },
|
|
709
|
+
},
|
|
710
|
+
required: ["url"],
|
|
711
|
+
},
|
|
712
|
+
},
|
|
549
713
|
},
|
|
550
714
|
required: ["status", "report"],
|
|
551
715
|
},
|
|
@@ -561,6 +725,19 @@ exports.VERDICT_TOOL = {
|
|
|
561
725
|
type: "string",
|
|
562
726
|
description: "If fail: exactly what is wrong and where. If pass: one-line confirmation of the evidence.",
|
|
563
727
|
},
|
|
728
|
+
issues: {
|
|
729
|
+
type: "array",
|
|
730
|
+
description: "On fail: one entry per concrete problem. The worker's retry sees these verbatim — make each actionable.",
|
|
731
|
+
items: {
|
|
732
|
+
type: "object",
|
|
733
|
+
properties: {
|
|
734
|
+
problem: { type: "string", description: "What is wrong" },
|
|
735
|
+
evidence: { type: "string", description: "What you observed that proves it (command output, file content, URL)" },
|
|
736
|
+
fix: { type: "string", description: "The exact change that would resolve it" },
|
|
737
|
+
},
|
|
738
|
+
required: ["problem"],
|
|
739
|
+
},
|
|
740
|
+
},
|
|
564
741
|
},
|
|
565
742
|
required: ["pass", "feedback"],
|
|
566
743
|
},
|
package/dist/util.js
CHANGED
|
@@ -48,6 +48,7 @@ exports.ensureDir = ensureDir;
|
|
|
48
48
|
exports.readJson = readJson;
|
|
49
49
|
exports.writeJson = writeJson;
|
|
50
50
|
exports.pathInside = pathInside;
|
|
51
|
+
exports.validateArtifactFormat = validateArtifactFormat;
|
|
51
52
|
exports.decodeEntities = decodeEntities;
|
|
52
53
|
exports.htmlToText = htmlToText;
|
|
53
54
|
const fs = __importStar(require("fs"));
|
|
@@ -147,6 +148,90 @@ function pathInside(parent, child) {
|
|
|
147
148
|
const rel = path.relative(path.resolve(parent), path.resolve(child));
|
|
148
149
|
return rel === "" || (!rel.startsWith("..") && !path.isAbsolute(rel));
|
|
149
150
|
}
|
|
151
|
+
// ---------- artifact validation ----------
|
|
152
|
+
/**
 * Lightweight structural sanity check for a deliverable, keyed on its file
 * extension. Only `.json`, `.csv`, `.html` and `.htm` are inspected.
 *
 * @param {string} absPath - Path of the file to inspect.
 * @returns {string|null} A description of the problem, or null when the file
 *   passes, has an unchecked extension, or cannot be read.
 */
function validateArtifactFormat(absPath) {
    const kind = path.extname(absPath).toLowerCase();
    const isJson = kind === ".json";
    const isCsv = kind === ".csv";
    const isHtml = kind === ".html" || kind === ".htm";
    if (!isJson && !isCsv && !isHtml)
        return null;
    let text;
    try {
        text = fs.readFileSync(absPath, "utf8");
    }
    catch {
        // Unreadable/missing files are reported by the caller, not here.
        return null;
    }
    if (isJson) {
        try {
            JSON.parse(text);
        }
        catch (e) {
            return `not valid JSON (${errMsg(e)})`;
        }
        return null;
    }
    if (isCsv) {
        // Only the first 50 records are compared against the first record's width.
        const perRecord = csvFieldCounts(text, 50);
        if (perRecord.length === 0)
            return "CSV has no records";
        const [first] = perRecord;
        const oddAt = perRecord.findIndex((n) => n !== first);
        return oddAt > 0
            ? `inconsistent CSV: record 1 has ${first} field(s), record ${oddAt + 1} has ${perRecord[oddAt]}`
            : null;
    }
    // HTML: reject very short files and files lacking an opening or a closing tag.
    const longEnough = text.length >= 200;
    const hasOpeningTag = /<[a-z!][^>]*>/i.test(text);
    const hasClosingTag = /<\/[a-z][a-z0-9]*>/i.test(text);
    if (longEnough && hasOpeningTag && hasClosingTag)
        return null;
    return "HTML looks like a stub (too short or no real markup)";
}
|
|
194
|
+
/**
 * Count the fields in each CSV record, honoring double-quoted fields (which
 * may contain commas, line breaks and doubled `""` quotes).
 *
 * @param {string} raw - CSV text.
 * @param {number} maxRecords - Stop once this many records have been counted.
 * @returns {number[]} One field count per non-blank record, in file order.
 */
function csvFieldCounts(raw, maxRecords) {
    const perRecord = [];
    let fieldCount = 1;
    let seen = 0; // characters consumed in the current record (0 => blank line)
    let quoted = false;
    let pos = 0;
    while (pos < raw.length && perRecord.length < maxRecords) {
        const c = raw[pos];
        pos += 1; // `pos` now points at the character after `c`
        if (quoted) {
            if (c === '"') {
                if (raw[pos] === '"')
                    pos += 1; // doubled quote: stays inside the quoted field
                else
                    quoted = false;
            }
            seen += 1;
            continue;
        }
        switch (c) {
            case '"':
                quoted = true;
                seen += 1;
                break;
            case ",":
                fieldCount += 1;
                seen += 1;
                break;
            case "\r":
            case "\n":
                if (c === "\r" && raw[pos] === "\n")
                    pos += 1; // CRLF is a single record terminator
                if (seen > 0)
                    perRecord.push(fieldCount); // blank lines are not records
                fieldCount = 1;
                seen = 0;
                break;
            default:
                seen += 1;
        }
    }
    // Final record without a trailing newline.
    if (seen > 0 && perRecord.length < maxRecords)
        perRecord.push(fieldCount);
    return perRecord;
}
|
|
150
235
|
// ---------- html ----------
|
|
151
236
|
const ENTITIES = {
|
|
152
237
|
amp: "&", lt: "<", gt: ">", quot: '"', apos: "'", nbsp: " ",
|
package/dist/webtools.js
CHANGED
|
@@ -1,9 +1,16 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.webSearch = webSearch;
|
|
4
|
+
exports._resetEngineCooldowns = _resetEngineCooldowns;
|
|
5
|
+
exports.tinyfishSearch = tinyfishSearch;
|
|
6
|
+
exports.ddgSearch = ddgSearch;
|
|
7
|
+
exports.bingSearch = bingSearch;
|
|
8
|
+
exports.arxivSearch = arxivSearch;
|
|
9
|
+
exports.crossrefSearch = crossrefSearch;
|
|
4
10
|
exports.parseBingHtml = parseBingHtml;
|
|
5
11
|
exports.fetchUrl = fetchUrl;
|
|
6
12
|
const crawltools_1 = require("./crawltools");
|
|
13
|
+
const pdftext_1 = require("./pdftext");
|
|
7
14
|
const searchcore_1 = require("./searchcore");
|
|
8
15
|
const util_1 = require("./util");
|
|
9
16
|
const UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0 Safari/537.36 agentswarm/0.1";
|
|
@@ -20,7 +27,7 @@ const DEEP_PASSAGES = 3;
|
|
|
20
27
|
* and re-ranks by content quality. Ranking/passage algorithms live in
|
|
21
28
|
* searchcore.ts.
|
|
22
29
|
*/
|
|
23
|
-
async function webSearch(cfg, query, count, signal, deep = false, warn) {
|
|
30
|
+
async function webSearch(cfg, query, count, signal, deep = false, warn, _retried = false) {
|
|
24
31
|
// Deep searches widen recall by issuing complementary phrasings; the fast
|
|
25
32
|
// path stays a single query so an agent's tool loop isn't slowed.
|
|
26
33
|
const queries = deep ? (0, searchcore_1.expandQueries)(query) : [query];
|
|
@@ -37,18 +44,30 @@ async function webSearch(cfg, query, count, signal, deep = false, warn) {
|
|
|
37
44
|
}
|
|
38
45
|
}
|
|
39
46
|
}
|
|
47
|
+
// Scholarly questions also sweep the keyless academic APIs (deep mode only).
|
|
48
|
+
if (deep && (0, searchcore_1.looksAcademic)(query)) {
|
|
49
|
+
engineCalls.push(arxivSearch(query, perEngine, signal), crossrefSearch(query, perEngine, signal));
|
|
50
|
+
}
|
|
40
51
|
const settled = await Promise.allSettled(engineCalls);
|
|
41
52
|
const candidates = settled.flatMap((s) => (s.status === "fulfilled" ? s.value : []));
|
|
42
53
|
if (!candidates.length) {
|
|
43
54
|
const firstErr = settled.find((s) => s.status === "rejected");
|
|
55
|
+
if (firstErr && settled.every((s) => s.status === "rejected"))
|
|
56
|
+
throw firstErr.reason;
|
|
57
|
+
// Engines answered but nothing parsed/matched: one retry with a
|
|
58
|
+
// simplified phrasing before giving up.
|
|
59
|
+
if (!_retried) {
|
|
60
|
+
const alt = (0, searchcore_1.reformulate)(query);
|
|
61
|
+
if (alt) {
|
|
62
|
+
warn?.(`no results for "${query}" — retrying as "${alt}"`);
|
|
63
|
+
return webSearch(cfg, alt, count, signal, deep, warn, true);
|
|
64
|
+
}
|
|
65
|
+
}
|
|
44
66
|
if (firstErr)
|
|
45
67
|
throw firstErr.reason;
|
|
46
68
|
return [];
|
|
47
69
|
}
|
|
48
70
|
const failures = settled.filter((s) => s.status === "rejected").length;
|
|
49
|
-
if (failures && failures === settled.length) {
|
|
50
|
-
throw (settled.find((s) => s.status === "rejected")).reason;
|
|
51
|
-
}
|
|
52
71
|
if (failures) {
|
|
53
72
|
warn?.(`${failures}/${settled.length} search engine calls failed; results come from the rest`);
|
|
54
73
|
}
|
|
@@ -111,16 +130,23 @@ async function fetchReadable(url, signal) {
|
|
|
111
130
|
}
|
|
112
131
|
}
|
|
113
132
|
const res = await fetch(url, {
|
|
114
|
-
headers: { "user-agent": UA, accept: "text/html,text/*;q=0.9,*/*;q=0.5" },
|
|
133
|
+
headers: { "user-agent": UA, accept: "text/html,application/pdf,text/*;q=0.9,*/*;q=0.5" },
|
|
115
134
|
signal: mergeSignal(20_000, signal),
|
|
116
135
|
redirect: "follow",
|
|
117
136
|
});
|
|
118
137
|
if (!res.ok)
|
|
119
138
|
throw new Error(`HTTP ${res.status}`);
|
|
120
139
|
const ctype = res.headers.get("content-type") || "";
|
|
140
|
+
if (/application\/pdf/i.test(ctype)) {
|
|
141
|
+
const buf = Buffer.from(await res.arrayBuffer());
|
|
142
|
+
const pdf = buf.length <= 20_000_000 ? (0, pdftext_1.extractPdfText)(buf) : null;
|
|
143
|
+
if (!pdf)
|
|
144
|
+
throw new Error("pdf with no extractable text");
|
|
145
|
+
return clip(pdf.text);
|
|
146
|
+
}
|
|
121
147
|
if (!/text\/|html|xml|json/i.test(ctype))
|
|
122
148
|
throw new Error(`not textual: ${ctype}`);
|
|
123
|
-
const body = await res.
|
|
149
|
+
const body = decodeBody(Buffer.from(await res.arrayBuffer()), ctype);
|
|
124
150
|
const text = /html/i.test(ctype) ? (0, util_1.htmlToText)(body) : body;
|
|
125
151
|
return clip(text);
|
|
126
152
|
}
|
|
@@ -135,6 +161,36 @@ function mergeSignal(timeoutMs, signal) {
|
|
|
135
161
|
return typeof AbortSignal.any === "function" ? AbortSignal.any([t, signal]) : signal;
|
|
136
162
|
}
|
|
137
163
|
// ---------------------------------------------------------------- engines
|
|
164
|
+
/**
|
|
165
|
+
* Per-engine rate-limit cooldowns: an engine that answers 429/403/503 sits
|
|
166
|
+
* out (60s, or the server's retry-after up to 120s) instead of getting
|
|
167
|
+
* hammered into a long block mid-research. A tiny retry-after (≤5s) is
|
|
168
|
+
* honored once in-call.
|
|
169
|
+
*/
|
|
170
|
+
const engineCooldown = new Map();
|
|
171
|
+
/**
 * Test hook: empties the per-engine cooldown table.
 *
 * @returns {void}
 */
function _resetEngineCooldowns() {
    engineCooldown.clear();
}
|
|
175
|
+
/**
 * Fetch on behalf of a named search engine, honoring and recording
 * rate-limit cooldowns.
 *
 * A 429/403/503 answer with a small `retry-after` (0 < n <= 5 seconds) is
 * retried once in-call. Any other 429/403/503 puts the engine on cooldown
 * (the `retry-after` value capped at 120s, else 60s) and throws.
 *
 * @param {string} engine - Cooldown key and label used in error messages.
 * @param {string} url - Request URL.
 * @param {object} init - `fetch` init options; `signal` is overridden.
 * @param {AbortSignal} [signal] - Caller's abort signal, passed to `mergeSignal`.
 * @returns {Promise<Response>} The first response whose status is not 429/403/503.
 * @throws {Error} When the engine is cooling down or has just been rate-limited.
 */
async function engineFetch(engine, url, init, signal) {
    const until = engineCooldown.get(engine) ?? 0;
    if (until > Date.now()) {
        throw new Error(`${engine} is cooling down after a rate limit (${Math.ceil((until - Date.now()) / 1000)}s left)`);
    }
    for (let attempt = 0;; attempt++) {
        const res = await fetch(url, { ...init, signal: mergeSignal(20_000, signal) });
        if (![429, 403, 503].includes(res.status))
            return res;
        // This response is being discarded: release its body so the underlying
        // connection is not held open by an unread stream.
        try {
            void res.body?.cancel().catch(() => { });
        }
        catch {
            /* body already consumed or locked: nothing to release */
        }
        // A missing or non-numeric header yields 0/NaN and fails the checks below.
        const retryAfter = Number(res.headers.get("retry-after"));
        if (attempt === 0 && Number.isFinite(retryAfter) && retryAfter > 0 && retryAfter <= 5) {
            await new Promise((r) => setTimeout(r, retryAfter * 1000));
            continue;
        }
        const ms = Number.isFinite(retryAfter) && retryAfter > 0 ? Math.min(retryAfter, 120) * 1000 : 60_000;
        engineCooldown.set(engine, Date.now() + ms);
        throw new Error(`${engine} rate-limited (HTTP ${res.status}); cooling down ${Math.round(ms / 1000)}s`);
    }
}
|
|
138
194
|
async function tinyfishSearch(cfg, query, count, signal) {
|
|
139
195
|
const url = `https://api.search.tinyfish.ai?query=${encodeURIComponent(query)}`;
|
|
140
196
|
const res = await fetch(url, {
|
|
@@ -173,10 +229,9 @@ async function ddgSearch(query, count, signal) {
|
|
|
173
229
|
let reachedAny = false;
|
|
174
230
|
for (const ep of DDG_ENDPOINTS) {
|
|
175
231
|
try {
|
|
176
|
-
const res = await
|
|
232
|
+
const res = await engineFetch("duckduckgo", ep.url + encodeURIComponent(query), {
|
|
177
233
|
headers: { "user-agent": UA },
|
|
178
|
-
|
|
179
|
-
});
|
|
234
|
+
}, signal);
|
|
180
235
|
if (!res.ok)
|
|
181
236
|
throw new Error(`search failed: HTTP ${res.status}`);
|
|
182
237
|
reachedAny = true;
|
|
@@ -221,14 +276,56 @@ function parseDdgHtml(html, count, linkRe) {
|
|
|
221
276
|
}
|
|
222
277
|
/** Bing's HTML results page: each hit is an <li class="b_algo"> with an <h2><a> link. */
|
|
223
278
|
async function bingSearch(query, count, signal) {
|
|
224
|
-
const res = await
|
|
279
|
+
const res = await engineFetch("bing", `https://www.bing.com/search?q=${encodeURIComponent(query)}`, {
|
|
225
280
|
headers: { "user-agent": UA, "accept-language": "en-US,en;q=0.9" },
|
|
226
|
-
|
|
227
|
-
});
|
|
281
|
+
}, signal);
|
|
228
282
|
if (!res.ok)
|
|
229
283
|
throw new Error(`bing search ${res.status}`);
|
|
230
284
|
return parseBingHtml(await res.text(), count);
|
|
231
285
|
}
|
|
286
|
+
// ---------------------------------------------------------------- academic engines (keyless)
|
|
287
|
+
/**
 * Query arXiv's Atom API (keyless) and turn each `<entry>` into a search hit.
 *
 * @param {string} query - Search text, sent as `all:<query>`.
 * @param {number} count - Maximum hits to return (the request asks for at most 15).
 * @param {AbortSignal} [signal] - Abort signal forwarded to the fetch helper.
 * @returns {Promise<object[]>} Hits shaped `{ title, url, snippet, rank, engine: "arxiv", date }`.
 * @throws {Error} When the response status is not OK.
 */
async function arxivSearch(query, count, signal) {
    const limit = Math.min(count, 15);
    const endpoint = `https://export.arxiv.org/api/query?search_query=all:${encodeURIComponent(query)}&max_results=${limit}`;
    const res = await engineFetch("arxiv", endpoint, { headers: { "user-agent": UA } }, signal);
    if (!res.ok)
        throw new Error(`arxiv search ${res.status}`);
    const feed = await res.text();
    // First capture group of `re` in `chunk`, or "" when absent/empty.
    const inner = (re, chunk) => re.exec(chunk)?.[1] || "";
    const hits = [];
    // Everything before the first <entry> is feed-level metadata.
    const entries = feed.split(/<entry>/).slice(1);
    for (const entry of entries) {
        if (hits.length >= count)
            break;
        const title = strip(inner(/<title>([\s\S]*?)<\/title>/, entry));
        const id = inner(/<id>([\s\S]*?)<\/id>/, entry).trim();
        const summary = strip(inner(/<summary>([\s\S]*?)<\/summary>/, entry));
        const published = /<published>(\d{4}-\d{2}-\d{2})/.exec(entry)?.[1];
        // The <id> doubles as the link, so it must be an http(s) URL.
        const usable = Boolean(id) && Boolean(title) && /^https?:\/\//.test(id);
        if (!usable)
            continue;
        hits.push({
            title,
            url: id,
            snippet: summary.slice(0, 300),
            rank: hits.length + 1,
            engine: "arxiv",
            date: published,
        });
    }
    return hits;
}
|
|
308
|
+
/**
 * Query Crossref's works API (keyless) for journal/conference metadata.
 *
 * @param {string} query - Free-text query.
 * @param {number} count - Maximum hits to return (the request asks for at most 15 rows).
 * @param {AbortSignal} [signal] - Abort signal forwarded to the fetch helper.
 * @returns {Promise<object[]>} Hits shaped `{ title, url, snippet, rank, engine: "crossref", date }`,
 *   where `url` is the DOI link and `date` is `YYYY`, `YYYY-MM` or `YYYY-MM-DD`
 *   (undefined when the issued date is absent).
 * @throws {Error} When the response status is not OK.
 */
async function crossrefSearch(query, count, signal) {
    const url = `https://api.crossref.org/works?query=${encodeURIComponent(query)}&rows=${Math.min(count, 15)}&select=title,DOI,abstract,issued,container-title`;
    const res = await engineFetch("crossref", url, { headers: { "user-agent": UA } }, signal);
    if (!res.ok)
        throw new Error(`crossref search ${res.status}`);
    const data = await res.json();
    // Turn Crossref "date-parts" ([year, month?, day?]) into a zero-padded
    // ISO-style date, stopping at the first missing/non-integer part so that
    // [null] or [2020, null] never yields "" or "2020-".
    const isoFromParts = (parts) => {
        const pieces = [];
        for (const raw of parts.slice(0, 3)) {
            const n = typeof raw === "string" && raw.trim() !== "" ? Number(raw) : raw;
            if (!Number.isInteger(n))
                break;
            pieces.push(String(n).padStart(pieces.length === 0 ? 4 : 2, "0"));
        }
        return pieces.length ? pieces.join("-") : undefined;
    };
    const out = [];
    for (const it of data?.message?.items ?? []) {
        if (out.length >= count)
            break;
        const title = strip(String(Array.isArray(it.title) ? it.title[0] ?? "" : it.title ?? ""));
        // A hit without a title or DOI cannot be shown or linked.
        if (!title || !it.DOI)
            continue;
        const parts = it.issued?.["date-parts"]?.[0];
        const date = Array.isArray(parts) ? isoFromParts(parts) : undefined;
        const venue = Array.isArray(it["container-title"]) ? it["container-title"][0] : "";
        // Prefer the abstract; fall back to the venue name when there is none.
        const snippet = (strip(String(it.abstract ?? "")) || venue || "").slice(0, 300);
        out.push({ title, url: `https://doi.org/${it.DOI}`, snippet, rank: out.length + 1, engine: "crossref", date });
    }
    return out;
}
|
|
232
329
|
function parseBingHtml(html, count) {
|
|
233
330
|
const hits = [];
|
|
234
331
|
const blocks = html.split(/<li class="b_algo[^"]*"/i).slice(1);
|
|
@@ -302,18 +399,51 @@ async function fetchUrl(cfg, url, raw, maxChars, signal) {
|
|
|
302
399
|
}
|
|
303
400
|
}
|
|
304
401
|
const res = await fetch(url, {
|
|
305
|
-
headers: { "user-agent": UA, accept: "text/html,application/json,text/*;q=0.9,*/*;q=0.5" },
|
|
402
|
+
headers: { "user-agent": UA, accept: "text/html,application/json,application/pdf,text/*;q=0.9,*/*;q=0.5" },
|
|
306
403
|
signal: signal ?? AbortSignal.timeout(25000),
|
|
307
404
|
redirect: "follow",
|
|
308
405
|
});
|
|
309
406
|
const ctype = res.headers.get("content-type") || "";
|
|
310
|
-
const body = await res.text();
|
|
311
407
|
if (!res.ok) {
|
|
312
|
-
|
|
408
|
+
// An error page is not content: returning it as a successful result lets
|
|
409
|
+
// "HTTP 403 ... subscribe to continue" become a "fact" in someone's report.
|
|
410
|
+
const body = await res.text().catch(() => "");
|
|
411
|
+
throw new Error(`HTTP ${res.status} ${res.statusText} — page is not usable as a source (paywall/login/blocked?). ` +
|
|
412
|
+
`Try web_search for an alternative source.${body ? ` Server said: ${(0, util_1.oneLine)((0, util_1.htmlToText)(body), 200)}` : ""}`);
|
|
413
|
+
}
|
|
414
|
+
const buf = Buffer.from(await res.arrayBuffer());
|
|
415
|
+
if (/application\/pdf/i.test(ctype) || buf.subarray(0, 5).toString("latin1") === "%PDF-") {
|
|
416
|
+
if (buf.length > 20_000_000)
|
|
417
|
+
throw new Error(`PDF is ${Math.round(buf.length / 1e6)}MB — too large to extract`);
|
|
418
|
+
const pdf = (0, pdftext_1.extractPdfText)(buf);
|
|
419
|
+
if (!pdf) {
|
|
420
|
+
throw new Error("PDF contains no extractable text (likely scanned or encrypted) — find an HTML version of this source.");
|
|
421
|
+
}
|
|
422
|
+
return (0, util_1.truncateMiddle)(`[PDF, ${pdf.pages} page${pdf.pages > 1 ? "s" : ""}]\n${pdf.text}`, maxChars, "chars");
|
|
313
423
|
}
|
|
424
|
+
const body = decodeBody(buf, ctype);
|
|
314
425
|
const text = !raw && /html/i.test(ctype) ? (0, util_1.htmlToText)(body) : body;
|
|
426
|
+
if (!raw && /html/i.test(ctype)) {
|
|
427
|
+
const trimmed = text.trim();
|
|
428
|
+
if (trimmed.length < 400 && /subscrib|sign.?in|log.?in|enable javascript|access denied|are you a (human|robot)|captcha/i.test(trimmed)) {
|
|
429
|
+
return `WARNING: this page returned only a paywall/anti-bot shell — the text below is probably not the real content. Try web_search for an alternative source.\n\n${trimmed}`;
|
|
430
|
+
}
|
|
431
|
+
}
|
|
315
432
|
return (0, util_1.truncateMiddle)(text, maxChars, "chars");
|
|
316
433
|
}
|
|
434
|
+
/**
 * Decode a response body honoring its content-type charset (UTF-8 fallback).
 *
 * The charset parameter may be bare or quoted (`charset="iso-8859-1"`).
 * UTF-8 decoding drops a leading byte-order mark.
 *
 * @param {Buffer} buf - Raw response bytes.
 * @param {string} ctype - Value of the content-type header ("" when absent).
 * @returns {string} The decoded text.
 */
function decodeBody(buf, ctype) {
    // Optional whitespace around "=" and an optional opening quote are tolerated.
    const charset = /charset\s*=\s*["']?([\w.:+-]+)/i.exec(ctype)?.[1]?.toLowerCase();
    if (charset && charset !== "utf-8" && charset !== "utf8") {
        try {
            return new TextDecoder(charset).decode(buf);
        }
        catch {
            /* unknown label — fall through to utf-8 */
        }
    }
    // TextDecoder (unlike Buffer#toString) strips a leading UTF-8 BOM.
    return new TextDecoder("utf-8").decode(buf);
}
|
|
317
447
|
async function tinyfishFetch(cfg, url, signal) {
|
|
318
448
|
const res = await fetch("https://api.fetch.tinyfish.ai", {
|
|
319
449
|
method: "POST",
|