pi-web-toolkit 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +33 -1
- package/README.md +151 -119
- package/docs/adr/0001-firecrawl-keyless-cloud-fallback.md +1 -1
- package/docs/adr/0002-toolkit-config-for-installer-selections.md +3 -0
- package/docs/adr/0003-conservative-installer-prerequisites.md +3 -0
- package/docs/adr/0004-searxng-endpoint-discovery.md +3 -0
- package/docs/guide.md +20 -4
- package/docs/tools.md +25 -7
- package/extensions/firecrawl_interact.ts +13 -14
- package/extensions/firecrawl_scrape.ts +13 -14
- package/extensions/firecrawl_search.ts +6 -6
- package/extensions/index.ts +25 -0
- package/extensions/utils/agent-browser.ts +4 -3
- package/extensions/utils/config.ts +170 -0
- package/extensions/utils/firecrawl.ts +27 -3
- package/extensions/utils/scrapling.ts +2 -1
- package/extensions/utils/web-search-core.ts +146 -0
- package/extensions/web_batch_fetch.ts +3 -7
- package/extensions/web_browse.ts +5 -9
- package/extensions/web_fetch.ts +5 -9
- package/extensions/web_search.ts +42 -118
- package/install.sh +801 -0
- package/package.json +6 -3
|
@@ -41,17 +41,17 @@ const firecrawlScrapeTool = defineTool({
|
|
|
41
41
|
name: "firecrawl_scrape",
|
|
42
42
|
label: "Firecrawl Scrape",
|
|
43
43
|
description: [
|
|
44
|
-
"
|
|
45
|
-
"
|
|
46
|
-
"
|
|
44
|
+
"Fallback-only cloud fetch via Firecrawl (keyless — no API key, no signup).",
|
|
45
|
+
"Do not use firecrawl_scrape as the first attempt for ordinary URL reading; use web_fetch first.",
|
|
46
|
+
"Use firecrawl_scrape only when web_fetch already failed on a hard target (anti-bot,",
|
|
47
|
+
"JavaScript-heavy pages, PDFs), or when the user explicitly asks for Firecrawl/cloud rendering.",
|
|
47
48
|
"Privacy: the URL and page content are sent to Firecrawl's cloud.",
|
|
48
49
|
`Output is truncated to ${DEFAULT_MAX_LINES} lines or ${formatSize(DEFAULT_MAX_BYTES)}; if truncated, full output is saved to a temp file.`,
|
|
49
50
|
].join(" "),
|
|
50
|
-
promptSnippet: "
|
|
51
|
+
promptSnippet: "Fallback-only Firecrawl scrape",
|
|
51
52
|
promptGuidelines: [
|
|
52
|
-
"
|
|
53
|
-
"firecrawl_scrape
|
|
54
|
-
"Always pass the full URL including https://.",
|
|
53
|
+
"Use firecrawl_scrape only after web_fetch fails or explicit cloud scraping/rendering.",
|
|
54
|
+
"Use firecrawl_scrape for anti-bot pages, heavy JS, and PDFs.",
|
|
55
55
|
],
|
|
56
56
|
parameters: FirecrawlScrapeParamsSchema,
|
|
57
57
|
|
|
@@ -97,13 +97,6 @@ const firecrawlScrapeTool = defineTool({
|
|
|
97
97
|
|
|
98
98
|
renderResult(result, { expanded, isPartial }, theme, context) {
|
|
99
99
|
const isError = context?.isError ?? false;
|
|
100
|
-
|
|
101
|
-
if (isPartial) {
|
|
102
|
-
const domain = details?.url ? getDomain(details.url) : "";
|
|
103
|
-
const label = domain ? `Scraping ${domain} via Firecrawl...` : "Scraping via Firecrawl...";
|
|
104
|
-
return new Text(theme.fg("warning", label), 0, 0);
|
|
105
|
-
}
|
|
106
|
-
|
|
107
100
|
const details = result.details as {
|
|
108
101
|
url?: string;
|
|
109
102
|
bytes?: number;
|
|
@@ -113,6 +106,12 @@ const firecrawlScrapeTool = defineTool({
|
|
|
113
106
|
creditsUsed?: number;
|
|
114
107
|
} | undefined;
|
|
115
108
|
|
|
109
|
+
if (isPartial) {
|
|
110
|
+
const domain = details?.url ? getDomain(details.url) : "";
|
|
111
|
+
const label = domain ? `Scraping ${domain} via Firecrawl...` : "Scraping via Firecrawl...";
|
|
112
|
+
return new Text(theme.fg("warning", label), 0, 0);
|
|
113
|
+
}
|
|
114
|
+
|
|
116
115
|
if (isError) {
|
|
117
116
|
const errText = getErrorText(result);
|
|
118
117
|
let text = theme.fg("error", "✗ Firecrawl scrape failed");
|
|
@@ -42,17 +42,17 @@ const firecrawlSearchTool = defineTool({
|
|
|
42
42
|
name: "firecrawl_search",
|
|
43
43
|
label: "Firecrawl Search",
|
|
44
44
|
description: [
|
|
45
|
-
"
|
|
45
|
+
"Fallback-only cloud search via Firecrawl (keyless — no API key, no signup).",
|
|
46
|
+
"Do not use firecrawl_search as the first attempt for ordinary web discovery; use web_search first.",
|
|
46
47
|
"Supports sources (web/images/news) and categories (github/research/pdf) that",
|
|
47
|
-
"SearXNG does not. Use as an escape hatch
|
|
48
|
+
"SearXNG does not. Use only as an escape hatch when web_search fails/returns nothing, or when the user explicitly asks for Firecrawl/cloud search.",
|
|
48
49
|
"Privacy: the query is sent to Firecrawl's cloud.",
|
|
49
50
|
`Output is truncated to ${DEFAULT_MAX_LINES} lines or ${formatSize(DEFAULT_MAX_BYTES)}; if truncated, full output is saved to a temp file.`,
|
|
50
51
|
].join(" "),
|
|
51
|
-
promptSnippet: "
|
|
52
|
+
promptSnippet: "Fallback-only Firecrawl search",
|
|
52
53
|
promptGuidelines: [
|
|
53
|
-
"
|
|
54
|
-
"Use categories=[\"github\"
|
|
55
|
-
"Use includeDomains/excludeDomains to scope results to specific sites.",
|
|
54
|
+
"Use firecrawl_search only after web_search fails/returns nothing, for Firecrawl-only categories, or explicit cloud search.",
|
|
55
|
+
"Use categories=[\"github\"|\"research\"|\"pdf\"] and includeDomains/excludeDomains when needed.",
|
|
56
56
|
],
|
|
57
57
|
parameters: FirecrawlSearchParamsSchema,
|
|
58
58
|
|
package/extensions/index.ts
CHANGED
|
@@ -17,6 +17,26 @@ import registerFirecrawlScrape from "./firecrawl_scrape";
|
|
|
17
17
|
import registerFirecrawlSearch from "./firecrawl_search";
|
|
18
18
|
import registerFirecrawlInteract from "./firecrawl_interact";
|
|
19
19
|
|
|
20
|
+
const WEB_TOOL_ROUTING_POLICY = [
|
|
21
|
+
"Web tools are local-first: web_search=discover, web_fetch=one static URL, web_batch_fetch=2–5 static URLs, web_browse=interaction.",
|
|
22
|
+
"Use firecrawl_* only after the matching local tool failed in this conversation, or when the user explicitly asks for Firecrawl/cloud.",
|
|
23
|
+
"web_search/web_fetch/web_browse already auto-fallback to Firecrawl; pass full URLs with scheme and selectors when useful.",
|
|
24
|
+
].join("\n");
|
|
25
|
+
|
|
26
|
+
const WEB_TOOL_NAMES = new Set([
|
|
27
|
+
"web_search",
|
|
28
|
+
"web_fetch",
|
|
29
|
+
"web_browse",
|
|
30
|
+
"web_batch_fetch",
|
|
31
|
+
"firecrawl_search",
|
|
32
|
+
"firecrawl_scrape",
|
|
33
|
+
"firecrawl_interact",
|
|
34
|
+
]);
|
|
35
|
+
|
|
36
|
+
function shouldInjectWebToolRoutingPolicy(selectedTools: readonly string[] | undefined): boolean {
|
|
37
|
+
return selectedTools?.some((tool) => WEB_TOOL_NAMES.has(tool)) ?? false;
|
|
38
|
+
}
|
|
39
|
+
|
|
20
40
|
export default function (pi: ExtensionAPI) {
|
|
21
41
|
registerWebSearch(pi);
|
|
22
42
|
registerWebFetch(pi);
|
|
@@ -25,4 +45,9 @@ export default function (pi: ExtensionAPI) {
|
|
|
25
45
|
registerFirecrawlScrape(pi);
|
|
26
46
|
registerFirecrawlSearch(pi);
|
|
27
47
|
registerFirecrawlInteract(pi);
|
|
48
|
+
|
|
49
|
+
pi.on("before_agent_start", (event) => {
|
|
50
|
+
if (!shouldInjectWebToolRoutingPolicy(event.systemPromptOptions.selectedTools)) return;
|
|
51
|
+
return { systemPrompt: `${event.systemPrompt}\n\n${WEB_TOOL_ROUTING_POLICY}` };
|
|
52
|
+
});
|
|
28
53
|
}
|
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
*/
|
|
7
7
|
|
|
8
8
|
import { runCLI } from "./cli-runner";
|
|
9
|
+
import { getToolkitCommand } from "./config";
|
|
9
10
|
|
|
10
11
|
export interface BrowseAction {
|
|
11
12
|
type: "click" | "fill" | "type" | "press" | "wait" | "wait_selector" | "scroll";
|
|
@@ -180,7 +181,7 @@ export async function runAgentBrowserBatch(
|
|
|
180
181
|
|
|
181
182
|
try {
|
|
182
183
|
const result = await runCLI({
|
|
183
|
-
command: "
|
|
184
|
+
command: getToolkitCommand("agentBrowser"),
|
|
184
185
|
args,
|
|
185
186
|
stdin: JSON.stringify(commands),
|
|
186
187
|
timeout: options.timeout,
|
|
@@ -199,7 +200,7 @@ export async function runAgentBrowserBatch(
|
|
|
199
200
|
);
|
|
200
201
|
}
|
|
201
202
|
} catch (err: any) {
|
|
202
|
-
if (err.message === "
|
|
203
|
+
if (typeof err.message === "string" && err.message.includes("is not installed")) {
|
|
203
204
|
throw new Error(
|
|
204
205
|
"agent-browser is not installed.\n\nInstall it with:\n npm i -g agent-browser && agent-browser install\n\nThen run: agent-browser doctor"
|
|
205
206
|
);
|
|
@@ -211,7 +212,7 @@ export async function runAgentBrowserBatch(
|
|
|
211
212
|
export async function closeAgentBrowserSession(session: string, signal?: AbortSignal): Promise<void> {
|
|
212
213
|
try {
|
|
213
214
|
await runCLI({
|
|
214
|
-
command: "
|
|
215
|
+
command: getToolkitCommand("agentBrowser"),
|
|
215
216
|
args: ["--session", session, "close"],
|
|
216
217
|
signal,
|
|
217
218
|
});
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pi-web-toolkit runtime configuration
|
|
3
|
+
*
|
|
4
|
+
* Reads user-level toolkit configuration without requiring users to modify
|
|
5
|
+
* shell profiles. Environment variables remain the highest-priority override.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
9
|
+
import * as os from "node:os";
|
|
10
|
+
import * as path from "node:path";
|
|
11
|
+
|
|
12
|
+
export const DEFAULT_SEARXNG_URL = "http://localhost:8080";
|
|
13
|
+
|
|
14
|
+
export interface ToolkitCommandsConfig {
|
|
15
|
+
scrapling?: string;
|
|
16
|
+
agentBrowser?: string;
|
|
17
|
+
firecrawl?: string;
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
export type FirecrawlRunner = "installed" | "npx" | "bunx";
|
|
21
|
+
|
|
22
|
+
export interface ToolkitConfig {
|
|
23
|
+
searxngUrl?: string;
|
|
24
|
+
firecrawlFallback?: boolean;
|
|
25
|
+
firecrawlRunner?: FirecrawlRunner;
|
|
26
|
+
commands?: ToolkitCommandsConfig;
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
export type ToolkitCommandName = "scrapling" | "agentBrowser" | "firecrawl";
|
|
30
|
+
|
|
31
|
+
const COMMAND_DEFAULTS: Record<ToolkitCommandName, string> = {
|
|
32
|
+
scrapling: "scrapling",
|
|
33
|
+
agentBrowser: "agent-browser",
|
|
34
|
+
firecrawl: "firecrawl",
|
|
35
|
+
};
|
|
36
|
+
|
|
37
|
+
const COMMAND_ENV_VARS: Record<ToolkitCommandName, string> = {
|
|
38
|
+
scrapling: "SCRAPLING_BIN",
|
|
39
|
+
agentBrowser: "AGENT_BROWSER_BIN",
|
|
40
|
+
firecrawl: "FIRECRAWL_BIN",
|
|
41
|
+
};
|
|
42
|
+
|
|
43
|
+
const FIRECRAWL_RUNNERS = ["installed", "npx", "bunx"] as const;
|
|
44
|
+
|
|
45
|
+
function isFirecrawlRunner(value: string): value is FirecrawlRunner {
|
|
46
|
+
return (FIRECRAWL_RUNNERS as readonly string[]).includes(value);
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
export function getDefaultToolkitConfigPath(): string {
|
|
50
|
+
const configHome = process.env.XDG_CONFIG_HOME?.trim() || path.join(os.homedir(), ".config");
|
|
51
|
+
return path.join(configHome, "pi-web-toolkit", "config.json");
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
export function getToolkitConfigPath(): string {
|
|
55
|
+
const configured = process.env.PI_WEB_TOOLKIT_CONFIG?.trim();
|
|
56
|
+
return configured || getDefaultToolkitConfigPath();
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
function parseConfigFile(filePath: string, required: boolean): ToolkitConfig {
|
|
60
|
+
if (!existsSync(filePath)) {
|
|
61
|
+
if (required) {
|
|
62
|
+
throw new Error(`Toolkit config file not found: ${filePath}`);
|
|
63
|
+
}
|
|
64
|
+
return {};
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
let raw: string;
|
|
68
|
+
try {
|
|
69
|
+
raw = readFileSync(filePath, "utf8");
|
|
70
|
+
} catch (err: any) {
|
|
71
|
+
throw new Error(`Unable to read toolkit config at ${filePath}: ${err.message ?? String(err)}`);
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
try {
|
|
75
|
+
const parsed = JSON.parse(raw) as unknown;
|
|
76
|
+
if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
|
|
77
|
+
throw new Error("expected a JSON object");
|
|
78
|
+
}
|
|
79
|
+
validateToolkitConfig(parsed as Record<string, unknown>);
|
|
80
|
+
return parsed as ToolkitConfig;
|
|
81
|
+
} catch (err: any) {
|
|
82
|
+
throw new Error(`Invalid toolkit config at ${filePath}: ${err.message ?? String(err)}`);
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
function validateOptionalString(value: unknown, key: string): void {
|
|
87
|
+
if (value !== undefined && typeof value !== "string") {
|
|
88
|
+
throw new Error(`${key} must be a string`);
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
function validateToolkitConfig(value: Record<string, unknown>): void {
|
|
93
|
+
validateOptionalString(value.searxngUrl, "searxngUrl");
|
|
94
|
+
|
|
95
|
+
if (value.firecrawlFallback !== undefined && typeof value.firecrawlFallback !== "boolean") {
|
|
96
|
+
throw new Error("firecrawlFallback must be a boolean");
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
if (value.firecrawlRunner !== undefined) {
|
|
100
|
+
if (typeof value.firecrawlRunner !== "string" || !isFirecrawlRunner(value.firecrawlRunner)) {
|
|
101
|
+
throw new Error("firecrawlRunner must be one of: installed, npx, bunx");
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
if (value.commands !== undefined) {
|
|
106
|
+
if (typeof value.commands !== "object" || value.commands === null || Array.isArray(value.commands)) {
|
|
107
|
+
throw new Error("commands must be an object");
|
|
108
|
+
}
|
|
109
|
+
const commands = value.commands as Record<string, unknown>;
|
|
110
|
+
validateOptionalString(commands.scrapling, "commands.scrapling");
|
|
111
|
+
validateOptionalString(commands.agentBrowser, "commands.agentBrowser");
|
|
112
|
+
validateOptionalString(commands.firecrawl, "commands.firecrawl");
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
export function readToolkitConfig(): ToolkitConfig {
|
|
117
|
+
const filePath = getToolkitConfigPath();
|
|
118
|
+
const required = Boolean(process.env.PI_WEB_TOOLKIT_CONFIG?.trim());
|
|
119
|
+
return parseConfigFile(filePath, required);
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
function normalizeUrl(url: string): string {
|
|
123
|
+
return url.replace(/\/+$/, "");
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
export function getSearxngUrl(): string {
|
|
127
|
+
const envUrl = process.env.SEARXNG_URL?.trim();
|
|
128
|
+
if (envUrl) return normalizeUrl(envUrl);
|
|
129
|
+
|
|
130
|
+
const cfgUrl = readToolkitConfig().searxngUrl?.trim();
|
|
131
|
+
if (cfgUrl) return normalizeUrl(cfgUrl);
|
|
132
|
+
|
|
133
|
+
return DEFAULT_SEARXNG_URL;
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
export function getToolkitCommand(name: ToolkitCommandName): string {
|
|
137
|
+
const envVar = COMMAND_ENV_VARS[name];
|
|
138
|
+
const envCommand = process.env[envVar]?.trim();
|
|
139
|
+
if (envCommand) return envCommand;
|
|
140
|
+
|
|
141
|
+
const cfgCommand = readToolkitConfig().commands?.[name]?.trim();
|
|
142
|
+
if (cfgCommand) return cfgCommand;
|
|
143
|
+
|
|
144
|
+
return COMMAND_DEFAULTS[name];
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
export function isFirecrawlFallbackEnabled(): boolean {
|
|
148
|
+
const envValue = process.env.PI_WEB_FIRECRAWL_FALLBACK;
|
|
149
|
+
if (envValue !== undefined) {
|
|
150
|
+
const v = envValue.trim().toLowerCase();
|
|
151
|
+
return !(v === "0" || v === "false" || v === "no" || v === "off");
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
const cfgValue = readToolkitConfig().firecrawlFallback;
|
|
155
|
+
if (cfgValue !== undefined) return cfgValue;
|
|
156
|
+
|
|
157
|
+
return true;
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
export function getFirecrawlRunner(): FirecrawlRunner {
|
|
161
|
+
const envValue = process.env.PI_WEB_FIRECRAWL_RUNNER?.trim().toLowerCase();
|
|
162
|
+
if (envValue) {
|
|
163
|
+
if (!isFirecrawlRunner(envValue)) {
|
|
164
|
+
throw new Error("PI_WEB_FIRECRAWL_RUNNER must be one of: installed, npx, bunx");
|
|
165
|
+
}
|
|
166
|
+
return envValue;
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
return readToolkitConfig().firecrawlRunner ?? "installed";
|
|
170
|
+
}
|
|
@@ -19,6 +19,7 @@ import { mkdtemp, rm } from "node:fs/promises";
|
|
|
19
19
|
import * as os from "node:os";
|
|
20
20
|
import * as path from "node:path";
|
|
21
21
|
import { runCLI } from "./cli-runner";
|
|
22
|
+
import { getFirecrawlRunner, getToolkitCommand, isFirecrawlFallbackEnabled, type FirecrawlRunner } from "./config";
|
|
22
23
|
|
|
23
24
|
// ---------------------------------------------------------------------------
|
|
24
25
|
// Shared types
|
|
@@ -33,8 +34,7 @@ export type FirecrawlFailureKind = "graceful-skip" | "hard-error";
|
|
|
33
34
|
* the single opt-out for a strict local-only / no-cloud-egress policy.
|
|
34
35
|
*/
|
|
35
36
|
export function isFirecrawlEnabled(): boolean {
|
|
36
|
-
|
|
37
|
-
return !(v === "0" || v === "false" || v === "no" || v === "off");
|
|
37
|
+
return isFirecrawlFallbackEnabled();
|
|
38
38
|
}
|
|
39
39
|
|
|
40
40
|
export interface FirecrawlFailure {
|
|
@@ -159,6 +159,29 @@ export interface FirecrawlCliResult {
|
|
|
159
159
|
exitCode: number;
|
|
160
160
|
}
|
|
161
161
|
|
|
162
|
+
export interface FirecrawlCliInvocation {
|
|
163
|
+
command: string;
|
|
164
|
+
args: string[];
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
/**
|
|
168
|
+
* Build the command used to invoke the official Firecrawl CLI. `npx` and
|
|
169
|
+
* `bunx` are opt-in runners because they may run or download packages at
|
|
170
|
+
* fallback time.
|
|
171
|
+
*/
|
|
172
|
+
export function buildFirecrawlCliInvocation(
|
|
173
|
+
args: string[],
|
|
174
|
+
runner: FirecrawlRunner = getFirecrawlRunner(),
|
|
175
|
+
): FirecrawlCliInvocation {
|
|
176
|
+
if (runner === "npx") {
|
|
177
|
+
return { command: "npx", args: ["-y", "firecrawl-cli", ...args] };
|
|
178
|
+
}
|
|
179
|
+
if (runner === "bunx") {
|
|
180
|
+
return { command: "bunx", args: ["firecrawl-cli", ...args] };
|
|
181
|
+
}
|
|
182
|
+
return { command: getToolkitCommand("firecrawl"), args };
|
|
183
|
+
}
|
|
184
|
+
|
|
162
185
|
/**
|
|
163
186
|
* Run the firecrawl CLI under an isolated temporary HOME with no key env, so
|
|
164
187
|
* it can only ever operate in keyless mode (no stored credentials, no
|
|
@@ -178,7 +201,8 @@ export async function runFirecrawlCli(
|
|
|
178
201
|
delete env.FIRECRAWL_OAUTH_TOKEN;
|
|
179
202
|
env.HOME = home;
|
|
180
203
|
env.XDG_CONFIG_HOME = path.join(home, ".config");
|
|
181
|
-
|
|
204
|
+
const invocation = buildFirecrawlCliInvocation(args);
|
|
205
|
+
return await runCLI({ command: invocation.command, args: invocation.args, env, signal, timeout });
|
|
182
206
|
} finally {
|
|
183
207
|
await rm(home, { recursive: true, force: true }).catch(() => { /* best-effort */ });
|
|
184
208
|
}
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { runCLI } from "./cli-runner";
|
|
2
|
+
import { getToolkitCommand } from "./config";
|
|
2
3
|
|
|
3
4
|
/**
|
|
4
5
|
* Run a scrapling CLI command with optional abort signal.
|
|
@@ -7,7 +8,7 @@ export function runScrapling(
|
|
|
7
8
|
args: string[],
|
|
8
9
|
signal?: AbortSignal,
|
|
9
10
|
): Promise<{ stdout: string; stderr: string; exitCode: number }> {
|
|
10
|
-
return runCLI({ command: "scrapling", args, signal });
|
|
11
|
+
return runCLI({ command: getToolkitCommand("scrapling"), args, signal });
|
|
11
12
|
}
|
|
12
13
|
|
|
13
14
|
/**
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* web_search execution core
|
|
3
|
+
*
|
|
4
|
+
* Keeps SearXNG-first search behavior behind a testable boundary. Firecrawl
|
|
5
|
+
* remains fallback-only and missing fallback runners never replace the primary
|
|
6
|
+
* SearXNG failure/no-result UX.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import type { FirecrawlSearchOutput } from "./firecrawl";
|
|
10
|
+
import { shouldFallbackSearch } from "./firecrawl";
|
|
11
|
+
|
|
12
|
+
export interface WebSearchCoreInput {
|
|
13
|
+
query: string;
|
|
14
|
+
language?: string;
|
|
15
|
+
results?: number;
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
export interface WebSearchResultItem {
|
|
19
|
+
title: string;
|
|
20
|
+
url: string;
|
|
21
|
+
content?: string;
|
|
22
|
+
engine?: string;
|
|
23
|
+
score?: number;
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
interface SearxResponse {
|
|
27
|
+
query: string;
|
|
28
|
+
results: WebSearchResultItem[];
|
|
29
|
+
suggestions?: string[];
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
export interface WebSearchCoreResult {
|
|
33
|
+
query: string;
|
|
34
|
+
totalResults: number;
|
|
35
|
+
results: WebSearchResultItem[];
|
|
36
|
+
suggestions?: string[];
|
|
37
|
+
viaFirecrawl: boolean;
|
|
38
|
+
creditsUsed?: number;
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
export interface WebSearchCoreDeps {
|
|
42
|
+
searxngUrl: string;
|
|
43
|
+
fetchImpl: typeof fetch;
|
|
44
|
+
firecrawlSearch: (query: string, options: { limit: number }, signal?: AbortSignal) => Promise<FirecrawlSearchOutput>;
|
|
45
|
+
signal?: AbortSignal;
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
function normalizeSearxngUrl(url: string): string {
|
|
49
|
+
return url.replace(/\/+$/, "");
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
export async function runWebSearchCore(
|
|
53
|
+
input: WebSearchCoreInput,
|
|
54
|
+
deps: WebSearchCoreDeps,
|
|
55
|
+
): Promise<WebSearchCoreResult> {
|
|
56
|
+
const searxngUrl = normalizeSearxngUrl(deps.searxngUrl);
|
|
57
|
+
const maxResults = Math.floor(Math.min(60, Math.max(1, input.results ?? 20)));
|
|
58
|
+
const language = input.language ?? "";
|
|
59
|
+
|
|
60
|
+
const allResults: WebSearchResultItem[] = [];
|
|
61
|
+
const seenUrls = new Set<string>();
|
|
62
|
+
let suggestions: string[] | undefined;
|
|
63
|
+
let finalQuery = input.query;
|
|
64
|
+
const MAX_PAGES = 3;
|
|
65
|
+
|
|
66
|
+
let localOk = true;
|
|
67
|
+
let localError: string | undefined;
|
|
68
|
+
|
|
69
|
+
try {
|
|
70
|
+
for (let page = 1; page <= MAX_PAGES; page++) {
|
|
71
|
+
const searchParams = new URLSearchParams({
|
|
72
|
+
q: input.query,
|
|
73
|
+
format: "json",
|
|
74
|
+
pageno: String(page),
|
|
75
|
+
});
|
|
76
|
+
if (language) searchParams.set("language", language);
|
|
77
|
+
|
|
78
|
+
const response = await deps.fetchImpl(`${searxngUrl}/search?${searchParams.toString()}`, {
|
|
79
|
+
method: "GET",
|
|
80
|
+
headers: { Accept: "application/json" },
|
|
81
|
+
signal: deps.signal,
|
|
82
|
+
});
|
|
83
|
+
|
|
84
|
+
if (!response.ok) {
|
|
85
|
+
const body = await response.text().catch(() => "");
|
|
86
|
+
throw new Error(`SearXNG error: ${response.status} ${response.statusText}\n${body}`);
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
const data = (await response.json()) as SearxResponse;
|
|
90
|
+
finalQuery = data.query;
|
|
91
|
+
|
|
92
|
+
if (data.suggestions && data.suggestions.length > 0 && !suggestions) {
|
|
93
|
+
suggestions = data.suggestions;
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
if (!data.results || data.results.length === 0) {
|
|
97
|
+
break;
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
for (const r of data.results) {
|
|
101
|
+
if (!seenUrls.has(r.url)) {
|
|
102
|
+
seenUrls.add(r.url);
|
|
103
|
+
allResults.push(r);
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
if (allResults.length >= maxResults) {
|
|
108
|
+
break;
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
} catch (err: any) {
|
|
112
|
+
localOk = false;
|
|
113
|
+
localError = err.message ?? String(err);
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
if (shouldFallbackSearch(localOk, allResults.length)) {
|
|
117
|
+
const fb = await deps.firecrawlSearch(input.query, { limit: Math.min(maxResults, 10) }, deps.signal);
|
|
118
|
+
if (fb.ok && fb.results.length > 0) {
|
|
119
|
+
const fbResults: WebSearchResultItem[] = fb.results.slice(0, maxResults).map((r) => ({
|
|
120
|
+
title: r.title ?? "(untitled)",
|
|
121
|
+
url: r.url,
|
|
122
|
+
content: r.description,
|
|
123
|
+
engine: "firecrawl",
|
|
124
|
+
}));
|
|
125
|
+
return {
|
|
126
|
+
query: input.query,
|
|
127
|
+
totalResults: fbResults.length,
|
|
128
|
+
results: fbResults,
|
|
129
|
+
viaFirecrawl: true,
|
|
130
|
+
creditsUsed: fb.creditsUsed,
|
|
131
|
+
};
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
if (!localOk) {
|
|
136
|
+
throw new Error(`Failed to query SearXNG at ${searxngUrl}: ${localError}`);
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
return {
|
|
140
|
+
query: finalQuery,
|
|
141
|
+
totalResults: allResults.length,
|
|
142
|
+
results: allResults.slice(0, maxResults),
|
|
143
|
+
suggestions,
|
|
144
|
+
viaFirecrawl: false,
|
|
145
|
+
};
|
|
146
|
+
}
|
|
@@ -113,14 +113,10 @@ const webBatchFetchTool = defineTool({
|
|
|
113
113
|
"For a single page, use web_fetch instead.",
|
|
114
114
|
`Output is truncated to ${DEFAULT_MAX_LINES} lines or ${formatSize(DEFAULT_MAX_BYTES)}; if truncated, full output is saved to a temp file.`,
|
|
115
115
|
].join(" "),
|
|
116
|
-
promptSnippet: "
|
|
116
|
+
promptSnippet: "Parallel fetch for 2–5 URLs",
|
|
117
117
|
promptGuidelines: [
|
|
118
|
-
"Use web_batch_fetch
|
|
119
|
-
"
|
|
120
|
-
"Use web_batch_fetch for cross-referencing sources, comparing implementations, or synthesizing research from multiple sites.",
|
|
121
|
-
"For a single URL, always use web_fetch — it supports per-URL selectors and stealthy mode.",
|
|
122
|
-
"If a page in the batch fails, the tool reports the error but continues with the others.",
|
|
123
|
-
"Keep batch sizes reasonable (≤8) to avoid overwhelming the browser and token budget.",
|
|
118
|
+
"Use web_batch_fetch for 2–5 pages to compare/cross-reference/synthesize; single URL → web_fetch.",
|
|
119
|
+
"Keep batches small (≤8; schema max 15); failed pages are reported without stopping the batch.",
|
|
124
120
|
],
|
|
125
121
|
parameters: WebBatchFetchParamsSchema,
|
|
126
122
|
|
package/extensions/web_browse.ts
CHANGED
|
@@ -106,22 +106,18 @@ const webBrowseTool = defineTool({
|
|
|
106
106
|
name: "web_browse",
|
|
107
107
|
label: "Web Browse",
|
|
108
108
|
description: [
|
|
109
|
-
"
|
|
109
|
+
"Primary local-first tool for interactive web pages: navigate, click, fill forms, scroll,",
|
|
110
110
|
"wait for content, and then extract text.",
|
|
111
|
-
"Uses the agent-browser CLI with batched JSON commands.",
|
|
111
|
+
"Uses the agent-browser CLI with batched JSON commands, then automatically tries Firecrawl keyless only if local browser automation fails.",
|
|
112
112
|
"Use web_browse when the target content requires interaction (clicking buttons,",
|
|
113
113
|
"scrolling, filling search boxes, waiting for JS to load) before it becomes available.",
|
|
114
114
|
"For pages that need no interaction, use web_fetch instead.",
|
|
115
115
|
`Output is truncated to ${DEFAULT_MAX_LINES} lines or ${formatSize(DEFAULT_MAX_BYTES)}; if truncated, full output is saved to a temp file.`,
|
|
116
116
|
].join(" "),
|
|
117
|
-
promptSnippet: "
|
|
117
|
+
promptSnippet: "Local browser interaction and extraction",
|
|
118
118
|
promptGuidelines: [
|
|
119
|
-
"Use web_browse when
|
|
120
|
-
"
|
|
121
|
-
"For static articles, docs, or blogs that load everything on first request, prefer web_fetch.",
|
|
122
|
-
"After web_search returns results, prefer web_fetch for reading individual articles.",
|
|
123
|
-
"Use web_browse directly when interaction is required; otherwise try web_fetch first.",
|
|
124
|
-
"Always provide a selector to extract only the relevant content area — avoid dumping full page text.",
|
|
119
|
+
"Use web_browse only when clicks/forms/scroll/wait are needed; otherwise use web_fetch.",
|
|
120
|
+
"Provide a selector to narrow extracted content when possible.",
|
|
125
121
|
],
|
|
126
122
|
parameters: WebBrowseParamsSchema,
|
|
127
123
|
|
package/extensions/web_fetch.ts
CHANGED
|
@@ -40,19 +40,15 @@ const webFetchTool = defineTool({
|
|
|
40
40
|
name: "web_fetch",
|
|
41
41
|
label: "Web Fetch",
|
|
42
42
|
description: [
|
|
43
|
-
"
|
|
44
|
-
"
|
|
45
|
-
"Use web_fetch to read the full content of a specific result or user-provided URL.",
|
|
43
|
+
"Primary local-first tool for reading a single web page URL.",
|
|
44
|
+
"Fetches and extracts readable content via scrapling, then automatically tries Firecrawl keyless only if the local fetcher fails.",
|
|
45
|
+
"Use web_fetch as the first attempt to read the full content of a specific result or user-provided URL.",
|
|
46
46
|
"Callers remain responsible for robots.txt and site terms; Scrapling extract commands do not enforce them automatically.",
|
|
47
47
|
`Output is truncated to ${DEFAULT_MAX_LINES} lines or ${formatSize(DEFAULT_MAX_BYTES)}; if truncated, full output is saved to a temp file.`,
|
|
48
48
|
].join(" "),
|
|
49
|
-
promptSnippet: "
|
|
49
|
+
promptSnippet: "Local-first fetch of one URL as markdown",
|
|
50
50
|
promptGuidelines: [
|
|
51
|
-
"Use web_fetch
|
|
52
|
-
"For a single URL, always use web_fetch instead of web_batch_fetch.",
|
|
53
|
-
"If the page is dynamic/JavaScript-heavy, the tool automatically uses browser automation.",
|
|
54
|
-
"When reading multiple (2–5) pages at once (e.g., after web_search), prefer web_batch_fetch over repeated web_fetch calls.",
|
|
55
|
-
"Always pass the full URL including https://.",
|
|
51
|
+
"Use web_fetch for one non-interactive URL; use web_batch_fetch for 2–5 URLs.",
|
|
56
52
|
],
|
|
57
53
|
parameters: WebFetchParamsSchema,
|
|
58
54
|
|