pi-web-toolkit 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Wade Huang
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,80 @@
1
+ # pi-web-toolkit
2
+
3
+ Web research toolkit for [pi](https://pi.dev) agents. Search, fetch, browse, and batch-read the web.
4
+
5
+ ## Features
6
+
7
+ | Tool | Purpose |
8
+ |------|---------|
9
+ | **`web_search`** | Search the web via SearXNG |
10
+ | **`web_fetch`** | Fetch a single static page as clean markdown |
11
+ | **`web_browse`** | Interact with a page (click, scroll, fill) then extract content |
12
+ | **`web_batch_fetch`** | Fetch 2–10 pages in parallel for research synthesis |
13
+
14
+ ## Installation
15
+
16
+ ### Option 1: From npm (recommended)
17
+
18
+ ```bash
19
+ pi install pi-web-toolkit
20
+ ```
21
+
22
+ ### Option 2: From GitHub
23
+
24
+ ```bash
25
+ pi install git:github.com/Wade11s/pi-web-toolkit
26
+ ```
27
+
28
+ ## Requirements
29
+
30
+ - **Node.js ≥ 20** — for running pi extensions
31
+ - **SearXNG** — for `web_search`
32
+ ```bash
33
+ # Set your SearXNG instance URL (default: http://localhost:8080)
34
+ export SEARXNG_URL="http://localhost:8080"
35
+
36
+ # Self-host with Docker
37
+ docker run -d -p 8080:8080 -v searxng:/etc/searxng searxng/searxng
38
+ ```
39
+ - **scrapling** — for `web_fetch` and `web_batch_fetch`
40
+ ```bash
41
+ # recommended: install scrapling via uv
42
+ uv tool install "scrapling[all]"
43
+ scrapling install
44
+ ```
45
+ - **agent-browser** — for `web_browse`
46
+ ```bash
47
+ npm i -g agent-browser && agent-browser install
48
+ ```
49
+ Verify installation:
50
+ ```bash
51
+ agent-browser doctor
52
+ ```
53
+
54
+ ## Project Structure
55
+
56
+ ```
57
+ pi-web-toolkit/
58
+ ├── extensions/
59
+ │ ├── utils/
60
+ │ │ └── scrapling.ts # scrapling CLI wrapper
61
+ │ ├── web_search.ts # web_search
62
+ │ ├── web_fetch.ts # web_fetch
63
+ │ ├── web_browse.ts # web_browse (agent-browser)
64
+ │ └── web_batch_fetch.ts # web_batch_fetch
65
+ ├── docs/
66
+ │ ├── tools.md
67
+ │ └── guide.md
68
+ ├── package.json
69
+ ├── README.md
70
+ └── LICENSE
71
+ ```
72
+
73
+ ## Reference
74
+
75
+ - [Tool Reference](docs/tools.md) — Full parameter specs and usage examples for each tool.
76
+ - [Usage Guide](docs/guide.md) — Decision tree and tool comparison.
77
+
78
+ ## License
79
+
80
+ MIT
package/docs/guide.md ADDED
@@ -0,0 +1,41 @@
1
+ # Usage Guide
2
+
3
+ ## Which Tool When? — Decision Tree
4
+
5
+ ```
6
+ User asks about something external / current
7
+
8
+ ├─→ web_search("...")
9
+ │ │
10
+ │ ├─→ 1 relevant result?
11
+ │ │ └─→ web_fetch(url) ← static page
12
+ │ │ OR
13
+ │ │ └─→ web_browse(url, actions) ← needs interaction
14
+ │ │
15
+ │ └─→ 2–5 relevant results?
16
+ │ ├─→ All static pages?
17
+ │ │ └─→ web_batch_fetch(urls[]) ← parallel fetch
18
+ │ └─→ Some need interaction?
19
+ │ └─→ web_fetch (static ones)
20
+ │ web_browse (interactive ones) ← sequential
21
+
22
+ └─→ User provides a URL directly
23
+ ├─→ Static / loads on first request?
24
+ │ └─→ web_fetch(url)
25
+ └─→ Needs clicking / scrolling / waiting?
26
+ └─→ web_browse(url, actions)
27
+ ```
28
+
29
+ ---
30
+
31
+ ## Tool Comparison
32
+
33
+ | | `web_fetch` | `web_browse` | `web_batch_fetch` |
34
+ |--|-------------|--------------|-------------------|
35
+ | **Pages** | 1 | 1 | 2–10 |
36
+ | **Browser** | Yes (scrapling) | Yes (agent-browser) | Yes (scrapling) |
37
+ | **Interaction** | ❌ No | ✅ Click, fill, scroll, wait | ❌ No |
38
+ | **Selector** | ✅ Per-URL | ✅ Final state | ✅ Applied to all |
39
+ | **Stealthy** | ✅ Yes | ❌ No (planned) | ✅ Yes |
40
+ | **Speed** | Fast | Slower (browser ops) | Medium (parallel) |
41
+ | **Best for** | Articles, docs, blogs | SPAs, forms, pagination | Research synthesis |
package/docs/tools.md ADDED
@@ -0,0 +1,151 @@
1
+ # Tool Reference
2
+
3
+ ## `web_search`
4
+
5
+ Search the web via SearXNG. Returns ranked results with title, URL, and snippet.
6
+
7
+ ```typescript
8
+ {
9
+ query: string, // Search query
10
+ language?: string, // Language code (en, de, fr...). Default: "auto"
11
+ results?: number, // Max results (1–50). Default: 10
12
+ }
13
+ ```
14
+
15
+ **When to use:** The user asks about current events, facts, or anything requiring up-to-date information. This is always the **first step** of web research.
16
+
17
+ ---
18
+
19
+ ## `web_fetch`
20
+
21
+ Fetch a single page and convert it to clean markdown. Uses scrapling's browser automation for JS-heavy sites.
22
+
23
+ ```typescript
24
+ {
25
+ url: string, // Full URL
26
+ selector?: string, // CSS selector to extract only a specific area
27
+ stealthy?: boolean, // Anti-bot mode for protected sites. Default: false
28
+ }
29
+ ```
30
+
31
+ **When to use:**
32
+ - After `web_search` finds a relevant result
33
+ - The page is static or loads its content on first request
34
+ - You need to read **one** article, doc, or blog post
35
+
36
+ **Example flow:**
37
+ ```
38
+ User: "What's the latest Rust release?"
39
+ → web_search("latest Rust programming language release")
40
+ → web_fetch("https://blog.rust-lang.org/2026/06/02/maintainers-fund/")
41
+ → Agent answers with full context
42
+ ```
43
+
44
+ ---
45
+
46
+ ## `web_browse`
47
+
48
+ Open a real browser, perform a chain of actions (click, fill, scroll, wait), then extract content.
49
+
50
+ Uses the [agent-browser](https://github.com/vercel-labs/agent-browser) CLI for native browser automation via Chrome CDP.
51
+
52
+ ```typescript
53
+ {
54
+ url: string,
55
+ actions: Array<
56
+ | { type: "click", selector: string }
57
+ | { type: "fill", selector: string, value: string }
58
+ | { type: "type", selector: string, value: string }
59
+ | { type: "press", key: string, selector?: string }
60
+ | { type: "wait", ms: number }
61
+ | { type: "wait_selector", selector: string, state?: "attached" | "visible" | "hidden" }
62
+ | { type: "scroll", direction: "down" | "up" | "bottom" | "top", amount?: number }
63
+ >,
64
+ selector?: string, // Extract content from final page state
65
+ headless?: boolean, // Default: true
66
+ timeout?: number, // Overall browser batch timeout (ms). Default: 30000
67
+ }
68
+ ```
69
+
70
+ **When to use:**
71
+ - The page requires **clicking** before showing target content (e.g. "Load more", pagination, tab switching)
72
+ - The page requires **filling a form** (e.g. search box, login)
73
+ - The page requires **scrolling** to load lazy content (infinite scroll)
74
+ - The page requires **waiting** for JS to render content (SPA)
75
+
76
+ **Example flows:**
77
+
78
+ ```
79
+ # Click "Load more" twice, then extract articles
80
+ → web_browse({
81
+ url: "https://news.example.com",
82
+ actions: [
83
+ { type: "click", selector: "button.load-more" },
84
+ { type: "wait", ms: 1000 },
85
+ { type: "click", selector: "button.load-more" },
86
+ { type: "wait", ms: 1000 },
87
+ ],
88
+ selector: "article"
89
+ })
90
+
91
+ # Fill a search form and press Enter
92
+ → web_browse({
93
+ url: "https://duckduckgo.com",
94
+ actions: [
95
+ { type: "fill", selector: "#searchbox_input", value: "async rust" },
96
+ { type: "press", key: "Enter" },
97
+ { type: "wait_selector", selector: "[data-result]", state: "visible" },
98
+ ],
99
+ selector: "[data-result]"
100
+ })
101
+
102
+ # Scroll to bottom of infinite-scroll page
103
+ → web_browse({
104
+ url: "https://social.example.com/user/posts",
105
+ actions: [
106
+ { type: "scroll", direction: "bottom" },
107
+ { type: "wait", ms: 1500 },
108
+ { type: "scroll", direction: "bottom" },
109
+ ],
110
+ selector: ".post"
111
+ })
112
+ ```
113
+
114
+ ---
115
+
116
+ ## `web_batch_fetch`
117
+
118
+ Fetch multiple pages in parallel and return aggregated content.
119
+
120
+ ```typescript
121
+ {
122
+ urls: string[], // 1–10 URLs
123
+ selector?: string, // CSS selector applied to ALL pages
124
+ stealthy?: boolean, // Default: false
125
+ max_concurrency?: number // Parallel fetches (1–5). Default: 3
126
+ }
127
+ ```
128
+
129
+ **When to use:**
130
+ - After `web_search` returns **2–5 relevant results** that you want to read simultaneously
131
+ - Cross-referencing multiple sources for the same topic
132
+ - Comparing implementations across different docs/pages
133
+ - Research synthesis requiring multiple sources
134
+
135
+ **NOT for:** Single pages (use `web_fetch` — simpler and supports per-URL stealthy mode).
136
+
137
+ **Example flow:**
138
+ ```
139
+ User: "Compare Python asyncio, Trio, and curio"
140
+ → web_search("Python asyncio vs Trio vs curio comparison")
141
+ → web_batch_fetch({
142
+ urls: [
143
+ "https://docs.python.org/3/library/asyncio.html",
144
+ "https://trio.readthedocs.io/",
145
+ "https://curio.readthedocs.io/",
146
+ ],
147
+ selector: "article, .section, main",
148
+ max_concurrency: 3,
149
+ })
150
+ → Agent synthesizes comparison from all 3 sources
151
+ ```
package/extensions/utils/scrapling.ts ADDED
@@ -0,0 +1,36 @@
1
+ import { spawn } from "node:child_process";
2
+
3
/**
 * Run the `scrapling` CLI with the given arguments and collect its output.
 *
 * The returned promise never rejects: a spawn failure or a signal termination
 * is reported through `stderr` together with a non-zero `exitCode`.
 *
 * @param args - Arguments passed verbatim to the `scrapling` executable (no shell).
 * @param signal - Optional abort signal; aborting sends SIGTERM to the child.
 * @returns Captured stdout/stderr and the exit code. The exit code is 1 when
 *   the process reported none (killed by a signal) or could not be spawned.
 */
export function runScrapling(
  args: string[],
  signal?: AbortSignal,
): Promise<{ stdout: string; stderr: string; exitCode: number }> {
  return new Promise((resolve) => {
    const proc = spawn("scrapling", args, { shell: false, stdio: ["ignore", "pipe", "pipe"] });
    let stdout = "";
    let stderr = "";
    let settled = false;

    const kill = (): void => {
      proc.kill("SIGTERM");
    };

    // Both "error" and "close" may fire for one child; only the first one
    // settles the promise, and it also detaches the abort listener so a
    // long-lived signal does not accumulate one listener per call.
    const settle = (result: { stdout: string; stderr: string; exitCode: number }): void => {
      if (settled) return;
      settled = true;
      signal?.removeEventListener("abort", kill);
      resolve(result);
    };

    proc.stdout.on("data", (data: Buffer) => {
      stdout += data.toString();
    });
    proc.stderr.on("data", (data: Buffer) => {
      stderr += data.toString();
    });
    proc.on("close", (code, closeSignal) => {
      const signalMessage = closeSignal ? `Process terminated by ${closeSignal}` : "";
      settle({ stdout, stderr: stderr || signalMessage, exitCode: code ?? 1 });
    });
    proc.on("error", (err) => settle({ stdout, stderr: err.message, exitCode: 1 }));

    if (signal) {
      if (signal.aborted) kill();
      else signal.addEventListener("abort", kill, { once: true });
    }
  });
}
package/extensions/web_batch_fetch.ts ADDED
@@ -0,0 +1,249 @@
1
+ /**
2
+ * Web Batch Fetch Extension — Concurrent multi-page fetching
3
+ *
4
+ * Provides a `web_batch_fetch` tool that fetches multiple URLs in parallel
5
+ * and returns their content as a single aggregated result.
6
+ *
7
+ * Use web_batch_fetch when the agent needs to read 2–5 pages at once
8
+ * (e.g., after web_search returns multiple relevant results).
9
+ *
10
+ * For a single page, use `web_fetch` instead — it has finer control
11
+ * (stealthy mode, per-URL selectors) and is simpler.
12
+ */
13
+
14
+ import {
15
+ defineTool,
16
+ type ExtensionAPI,
17
+ truncateHead,
18
+ formatSize,
19
+ DEFAULT_MAX_BYTES,
20
+ DEFAULT_MAX_LINES,
21
+ } from "@earendil-works/pi-coding-agent";
22
+ import { Text } from "@earendil-works/pi-tui";
23
+ import { Type, type Static } from "typebox";
24
+ import * as fs from "node:fs";
25
+ import * as os from "node:os";
26
+ import * as path from "node:path";
27
+ import { runScrapling } from "./utils/scrapling";
28
+
29
/** One URL to fetch, paired with the temp file scrapling writes its output to. */
interface FetchTask {
  url: string;
  tmpFile: string;
}

/**
 * Fetch a single page through the scrapling CLI and read back the extracted file.
 *
 * The primary attempt uses `extract fetch` (or `extract stealthy-fetch` when
 * `stealthy` is set). If that exits non-zero, a plain `extract get` is tried
 * with the same selector so the fallback returns the same slice of the page.
 * No fallback is attempted once `signal` has been aborted.
 *
 * @param task - URL and output path for this fetch.
 * @param selector - Optional CSS selector forwarded as `--css-selector`.
 * @param stealthy - Use `stealthy-fetch` instead of `fetch` for the primary attempt.
 * @param signal - Optional abort signal forwarded to `runScrapling`.
 * @returns A result object; `ok` is false and `error` is set when both attempts
 *   fail or the output file cannot be read. This function does not throw.
 */
async function fetchOne(
  task: FetchTask,
  selector: string | undefined,
  stealthy: boolean,
  signal?: AbortSignal,
): Promise<{ url: string; content: string; size: number; ok: boolean; error?: string }> {
  const buildArgs = (mode: string): string[] => {
    const args = ["extract", mode, task.url, task.tmpFile, "--ai-targeted"];
    if (selector) args.push("--css-selector", selector);
    return args;
  };
  const failure = (error: string) => ({ url: task.url, content: "", size: 0, ok: false, error });

  const primary = await runScrapling(buildArgs(stealthy ? "stealthy-fetch" : "fetch"), signal);

  if (primary.exitCode !== 0) {
    // An aborted request should stop here rather than spawn another process
    // that would be killed immediately.
    if (signal?.aborted) return failure(primary.stderr);

    // Fallback to GET, keeping the selector so the output shape is unchanged.
    const fallback = await runScrapling(buildArgs("get"), signal);
    if (fallback.exitCode !== 0) {
      return failure(primary.stderr || fallback.stderr);
    }
  }

  try {
    // Read once as bytes: the buffer length is the on-disk size, so no second
    // stat() call (and no window for the file to change in between) is needed.
    const raw = await fs.promises.readFile(task.tmpFile);
    return { url: task.url, content: raw.toString("utf-8"), size: raw.length, ok: true };
  } catch (err: unknown) {
    return failure(err instanceof Error ? err.message : String(err));
  }
}
62
+
63
/**
 * Map `items` through an async function with at most `concurrency` calls in
 * flight at once, preserving input order in the returned array.
 *
 * `concurrency` is floored and clamped to the range 1..items.length; NaN is
 * treated as 1. A rejection from `fn` rejects the returned promise.
 *
 * @param items - Inputs to process.
 * @param concurrency - Requested maximum number of simultaneous `fn` calls.
 * @param fn - Async mapper receiving the item and its index.
 * @returns Results positioned at the same index as their input.
 */
async function mapWithConcurrencyLimit<TIn, TOut>(
  items: TIn[],
  concurrency: number,
  fn: (item: TIn, index: number) => Promise<TOut>,
): Promise<TOut[]> {
  if (items.length === 0) return [];

  // A fractional or NaN worker count would be an invalid array length below.
  const requested = Number.isNaN(concurrency) ? 1 : Math.floor(concurrency);
  const limit = Math.max(1, Math.min(requested, items.length));
  const results = new Array<TOut>(items.length);
  let nextIndex = 0;

  // Each worker claims the next unprocessed index until none remain. Claiming
  // is synchronous, so two workers never take the same index.
  const worker = async (): Promise<void> => {
    while (nextIndex < items.length) {
      const current = nextIndex++;
      results[current] = await fn(items[current], current);
    }
  };

  await Promise.all(Array.from({ length: limit }, () => worker()));
  return results;
}
84
+
85
/**
 * Parameter schema for the `web_batch_fetch` tool.
 *
 * - `urls`: between 1 and 10 URLs.
 * - `selector`: optional CSS selector, applied to every page.
 * - `stealthy`: optional flag, applied to every request (default false).
 * - `max_concurrency`: optional integer from 1 to 5 (default 3).
 */
export const WebBatchFetchParamsSchema = Type.Object({
  urls: Type.Array(Type.String(), {
    description: "List of URLs to fetch (2–5 recommended)",
    minItems: 1,
    maxItems: 10,
  }),
  selector: Type.Optional(
    Type.String({
      description: "CSS selector applied to ALL pages to extract only relevant content",
    }),
  ),
  stealthy: Type.Optional(
    Type.Boolean({
      description: "Use stealthy mode for all requests. Default: false",
      default: false,
    }),
  ),
  max_concurrency: Type.Optional(
    Type.Integer({
      description: "Max parallel fetches (1–5). Default: 3",
      minimum: 1,
      maximum: 5,
      default: 3,
    }),
  ),
});

/** Static TypeScript type derived from {@link WebBatchFetchParamsSchema}. */
export type WebBatchFetchInput = Static<typeof WebBatchFetchParamsSchema>;
107
+
108
/**
 * Definition of the `web_batch_fetch` tool.
 *
 * `execute` fetches every URL into its own file inside a fresh temp directory
 * (bounded by `max_concurrency`), concatenates the per-page results into one
 * text report, and truncates that report with `truncateHead`. When truncation
 * happens the full report is written to a separate temp directory whose path
 * is returned in `details.fullOutputPath`; that directory is left in place.
 * The per-page temp directory is always removed before `execute` returns.
 *
 * A page that fails to fetch is reported inline as `ERROR: …`; it does not
 * fail the batch.
 */
const webBatchFetchTool = defineTool({
  name: "web_batch_fetch",
  label: "Web Batch Fetch",
  description: [
    "Fetch multiple web pages in parallel and return their content aggregated.",
    "Use web_batch_fetch AFTER web_search when there are 2–5 relevant results",
    "that the agent wants to read simultaneously for comparison or synthesis.",
    "For a single page, use web_fetch instead.",
    `Output is truncated to ${DEFAULT_MAX_LINES} lines or ${formatSize(DEFAULT_MAX_BYTES)}; if truncated, full output is saved to a temp file.`,
  ].join(" "),
  promptSnippet: "Fetch multiple URLs in parallel for research",
  promptGuidelines: [
    "Use web_batch_fetch when web_search returns multiple (2–5) relevant pages and the agent needs to read them all.",
    "Use web_batch_fetch for cross-referencing sources, comparing implementations, or synthesizing research from multiple sites.",
    "For a single URL, always use web_fetch — it supports per-URL selectors and stealthy mode.",
    "If a page in the batch fails, the tool reports the error but continues with the others.",
    "Keep batch sizes small (≤5) to avoid overwhelming the browser and token budget.",
  ],
  parameters: WebBatchFetchParamsSchema,

  async execute(_toolCallId, params, signal, onUpdate) {
    const tmpDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), "pi-batch-"));
    const tasks: FetchTask[] = params.urls.map((url, i) => ({
      url,
      tmpFile: path.join(tmpDir, `page-${i}.md`),
    }));
    let fullOutputPath: string | undefined;

    try {
      // Re-clamp here as well so the bound holds even if the value was not
      // normalised before reaching this handler.
      const concurrency = Math.floor(Math.min(5, Math.max(1, params.max_concurrency ?? 3)));
      onUpdate?.({ content: [{ type: "text", text: `Fetching ${tasks.length} pages with concurrency ${concurrency}...` }], details: {} });

      const results = await mapWithConcurrencyLimit(
        tasks,
        concurrency,
        (task, index) => {
          onUpdate?.({ content: [{ type: "text", text: `Fetching ${task.url} (${index + 1}/${tasks.length})...` }], details: {} });
          return fetchOne(task, params.selector, params.stealthy ?? false, signal);
        },
      );

      const successCount = results.filter((r) => r.ok).length;
      const lines: string[] = [
        `Batch fetch: ${successCount}/${results.length} succeeded`,
        "",
      ];

      for (let i = 0; i < results.length; i++) {
        const r = results[i];
        lines.push(`--- Page ${i + 1}: ${r.url} ---`);
        if (r.ok) {
          lines.push(`Size: ${r.size} bytes`);
          lines.push("");
          lines.push(r.content);
        } else {
          lines.push(`ERROR: ${r.error || "unknown error"}`);
        }
        lines.push("");
      }

      const rawText = lines.join("\n");
      const truncation = truncateHead(rawText, {
        maxLines: DEFAULT_MAX_LINES,
        maxBytes: DEFAULT_MAX_BYTES,
      });

      let finalText = truncation.content;
      if (truncation.truncated) {
        // Stored outside tmpDir on purpose: tmpDir is deleted in `finally`,
        // while this path is handed back to the caller.
        const fullOutputDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), "pi-web-batch-"));
        fullOutputPath = path.join(fullOutputDir, "output.txt");
        await fs.promises.writeFile(fullOutputPath, rawText, "utf-8");
        finalText += `\n\n[Output truncated: ${truncation.outputLines} of ${truncation.totalLines} lines (${formatSize(truncation.outputBytes)} of ${formatSize(truncation.totalBytes)}). Full output saved to: ${fullOutputPath}]`;
      }

      onUpdate?.({ content: [{ type: "text", text: `Batch complete: ${successCount}/${results.length} succeeded` }], details: {} });
      return {
        content: [{ type: "text", text: finalText }],
        details: {
          urls: params.urls,
          succeeded: successCount,
          failed: results.length - successCount,
          results: results.map((r) => ({ url: r.url, ok: r.ok, size: r.size })),
          fullOutputPath,
        },
      };
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err);
      throw new Error(`Batch fetch failed: ${reason}`);
    } finally {
      // Best-effort cleanup. A recursive remove also deletes every page-N.md,
      // and the async form avoids blocking the event loop.
      await fs.promises.rm(tmpDir, { recursive: true, force: true }).catch(() => {
        /* ignore: leftover temp files are harmless */
      });
    }
  },

  renderCall(args, theme) {
    let text = theme.fg("toolTitle", theme.bold("web_batch_fetch "));
    text += theme.fg("muted", `${args.urls?.length ?? 0} URLs`);
    if (args.selector) {
      text += theme.fg("dim", ` selector=${args.selector}`);
    }
    return new Text(text, 0, 0);
  },

  renderResult(result, { expanded, isPartial }, theme) {
    if (isPartial) {
      return new Text(theme.fg("warning", "Batch fetching..."), 0, 0);
    }
    const details = result.details as {
      succeeded?: number;
      failed?: number;
      urls?: string[];
      results?: Array<{ url: string; ok: boolean; size?: number }>;
      fullOutputPath?: string;
    } | undefined;
    const total = details?.urls?.length ?? 0;
    const ok = details?.succeeded ?? 0;
    let text = theme.fg("success", `✓ ${ok}/${total} fetched`);
    if (details?.failed) {
      text += theme.fg("error", ` (${details.failed} failed)`);
    }
    // The per-URL breakdown and the full-output path are shown only in the
    // expanded view.
    if (expanded && details?.results) {
      for (const r of details.results) {
        text += `\n  ${r.ok ? theme.fg("success", "✓") : theme.fg("error", "✗")} ${theme.fg("dim", r.url)}`;
        if (r.size) {
          text += theme.fg("muted", ` ${formatSize(r.size)}`);
        }
      }
    }
    if (expanded && details?.fullOutputPath) {
      text += `\n${theme.fg("dim", `Full output: ${details.fullOutputPath}`)}`;
    }
    return new Text(text, 0, 0);
  },
});
246
+
247
/** Extension entry point: registers the `web_batch_fetch` tool with pi. */
export default function registerWebBatchFetch(pi: ExtensionAPI) {
  pi.registerTool(webBatchFetchTool);
}
package/extensions/web_browse.ts ADDED
@@ -0,0 +1,415 @@
1
+ /**
2
+ * Web Browse Extension — Interactive browser automation via agent-browser
3
+ *
4
+ * Provides a `web_browse` tool for multi-step page interaction using the
5
+ * agent-browser CLI (https://github.com/vercel-labs/agent-browser).
6
+ *
7
+ * Use web_browse when a page requires interaction (clicking, scrolling,
8
+ * filling forms, waiting for dynamic content) BEFORE its target content
9
+ * becomes available.
10
+ *
11
+ * For static pages that need no interaction, use `web_fetch` instead.
12
+ */
13
+
14
+ import {
15
+ defineTool,
16
+ type ExtensionAPI,
17
+ truncateHead,
18
+ formatSize,
19
+ DEFAULT_MAX_BYTES,
20
+ DEFAULT_MAX_LINES,
21
+ } from "@earendil-works/pi-coding-agent";
22
+ import { StringEnum } from "@earendil-works/pi-ai";
23
+ import { Text } from "@earendil-works/pi-tui";
24
+ import { Type, type Static } from "typebox";
25
+ import { spawn } from "node:child_process";
26
+ import * as fs from "node:fs";
27
+ import * as os from "node:os";
28
+ import * as path from "node:path";
29
+
30
/**
 * One step of a `web_browse` action chain. Which optional fields are required
 * depends on `type`; that is enforced at command-build time, not by this type.
 */
interface BrowseAction {
  type:
    | "click"
    | "fill"
    | "type"
    | "press"
    | "wait"
    | "wait_selector"
    | "scroll";
  /** CSS selector of the element the action targets. */
  selector?: string;
  /** Text for `fill` / `type`. */
  value?: string;
  /** Key name for `press`. */
  key?: string;
  /** Duration in milliseconds for `wait`. */
  ms?: number;
  /** Scroll direction for `scroll`. */
  direction?: "down" | "up" | "bottom" | "top";
  /** Scroll distance for `scroll` in the `up` / `down` directions. */
  amount?: number;
  /** Target state for `wait_selector`. */
  state?: "attached" | "visible" | "hidden";
}

/** One entry of the JSON array that `agent-browser batch --json` is parsed into. */
interface AgentBrowserBatchItem {
  success: boolean;
  command: string[];
  result?: any;
  error?: string | null;
}
47
+
48
/**
 * Read a string field from an action, insisting that it is present and non-empty.
 *
 * @param action - The action being validated.
 * @param field - Which string field to read.
 * @returns The field's value.
 * @throws Error when the field is missing, not a string, or the empty string.
 */
function requireString(action: BrowseAction, field: "selector" | "value" | "key"): string {
  const candidate = action[field];
  if (typeof candidate === "string" && candidate.length > 0) {
    return candidate;
  }
  throw new Error(`Action "${action.type}" requires non-empty ${field}`);
}
55
+
56
/**
 * Read a numeric field from an action, insisting that it is a non-negative integer.
 *
 * @param action - The action being validated.
 * @param field - Which numeric field to read.
 * @returns The field's value.
 * @throws Error when the field is missing, not a number, fractional, NaN,
 *   infinite, or negative.
 */
function requireInteger(action: BrowseAction, field: "ms" | "amount"): number {
  const value = action[field];
  // `Number.isInteger` is not a type guard, so the explicit `typeof` check is
  // what narrows `number | undefined` to `number` for the comparison and return.
  if (typeof value !== "number" || !Number.isInteger(value) || value < 0) {
    throw new Error(`Action "${action.type}" requires non-negative integer ${field}`);
  }
  return value;
}
63
+
64
/**
 * Build the JavaScript source of a poller that waits for a selector to reach
 * the given state.
 *
 * The generated script polls `document.querySelector` every 100 ms and gives
 * up after 30 seconds. `selector` and `state` are embedded only as JSON string
 * literals, so quotes, backticks or `${` sequences in a selector cannot break
 * out of the generated code.
 *
 * @param selector - CSS selector to poll for.
 * @param state - `"attached"` (element exists), `"visible"` (element has a
 *   layout box) or `"hidden"` (element is absent or has no layout box).
 * @returns Script text consisting of a single `await new Promise(...)` expression.
 */
function waitForSelectorScript(selector: string, state: "attached" | "visible" | "hidden"): string {
  const selectorLiteral = JSON.stringify(selector);
  const stateLiteral = JSON.stringify(state);
  // The timeout message is assembled inside the generated script from its own
  // `state` / `selector` variables rather than interpolated here.
  return `await new Promise((resolve, reject) => {
  const selector = ${selectorLiteral};
  const state = ${stateLiteral};
  const deadline = Date.now() + 30000;
  const isVisible = (el) => !!(el && (el.offsetWidth || el.offsetHeight || el.getClientRects().length));
  const check = () => {
    const el = document.querySelector(selector);
    const ok = state === "attached" ? !!el : state === "hidden" ? !isVisible(el) : isVisible(el);
    if (ok) return resolve(true);
    if (Date.now() > deadline) return reject(new Error("Timed out waiting for " + state + " selector: " + selector));
    setTimeout(check, 100);
  };
  check();
})`;
}
82
+
83
/**
 * Translate a URL, an action chain and an optional extraction selector into
 * the command list sent to `agent-browser batch`.
 *
 * The list always starts with `open <url>`, continues with one or two commands
 * per action, then one content-extraction command (`get text <selector>` when
 * a selector is given, otherwise `snapshot -i`), and ends with `get title` and
 * `get url`.
 *
 * @param url - Page to open first.
 * @param actions - Ordered actions to perform.
 * @param selector - Optional CSS selector to extract text from at the end.
 * @returns The commands, each as an argv-style string array.
 * @throws Error when an action lacks a field its type requires, when a numeric
 *   field is not a non-negative integer, or when the action type is unknown.
 */
function buildBatchCommands(
  url: string,
  actions: BrowseAction[],
  selector?: string,
): string[][] {
  const commands: string[][] = [["open", url]];

  for (const action of actions) {
    switch (action.type) {
      case "click":
        commands.push(["click", requireString(action, "selector")]);
        break;
      case "fill":
        commands.push(["fill", requireString(action, "selector"), requireString(action, "value")]);
        break;
      case "type":
        commands.push(["type", requireString(action, "selector"), requireString(action, "value")]);
        break;
      case "press": {
        // An optional selector is focused first so the key press goes to it.
        if (action.selector) {
          commands.push(["focus", action.selector]);
        }
        commands.push(["press", requireString(action, "key")]);
        break;
      }
      case "wait":
        commands.push(["wait", String(requireInteger(action, "ms"))]);
        break;
      case "wait_selector": {
        const state = action.state ?? "visible";
        const waitSelector = requireString(action, "selector");
        if (state === "visible") {
          commands.push(["wait", waitSelector]);
        } else {
          // The other states are handled by a polling script run through `eval`.
          commands.push(["eval", waitForSelectorScript(waitSelector, state)]);
        }
        break;
      }
      case "scroll": {
        const dir = action.direction ?? "down";
        if (dir === "top") {
          commands.push(["eval", "window.scrollTo(0, 0)"]);
        } else if (dir === "bottom") {
          commands.push(["eval", "window.scrollTo(0, document.body.scrollHeight)"]);
        } else {
          // Validate a caller-supplied amount the same way `ms` is validated;
          // fall back to 500 only when none was given.
          const amount = action.amount == null ? 500 : requireInteger(action, "amount");
          commands.push(["scroll", dir, String(amount)]);
        }
        break;
      }
      default:
        throw new Error(`Unsupported browser action: ${(action as BrowseAction).type}`);
    }
  }

  // Extract content
  if (selector) {
    commands.push(["get", "text", selector, "--json"]);
  } else {
    commands.push(["snapshot", "-i", "--json"]);
  }

  // Metadata
  commands.push(["get", "title", "--json"]);
  commands.push(["get", "url", "--json"]);

  return commands;
}
150
+
151
/**
 * Run `agent-browser batch --bail --json` for one session, feeding the command
 * list as JSON on stdin and parsing the JSON array printed on stdout.
 *
 * @param commands - Commands to execute, each as an argv-style string array.
 * @param options - `session` names the browser session; `headless: false` adds
 *   `--headed`; `signal` sends SIGTERM to the child on abort; `timeout` (ms),
 *   when truthy, sends SIGTERM and rejects once it elapses.
 * @returns The parsed batch results.
 * @throws Error (as a rejection) when the executable is missing, the timeout
 *   elapses, the process exits non-zero without printing anything, or stdout
 *   is not a JSON array.
 */
function runAgentBrowserBatch(
  commands: string[][],
  options: { session: string; headless: boolean; signal?: AbortSignal; timeout?: number },
): Promise<AgentBrowserBatchItem[]> {
  const args = ["--session", options.session];
  if (!options.headless) args.push("--headed");
  args.push("batch", "--bail", "--json");

  return new Promise((resolve, reject) => {
    const proc = spawn("agent-browser", args, {
      shell: false,
      stdio: ["pipe", "pipe", "pipe"],
    });

    let stdout = "";
    let stderr = "";
    let timeoutId: NodeJS.Timeout | undefined;
    let settled = false;

    const kill = (): void => {
      proc.kill("SIGTERM");
    };

    const cleanup = (): void => {
      if (timeoutId) clearTimeout(timeoutId);
      options.signal?.removeEventListener("abort", kill);
    };

    // The promise settles exactly once; later events are ignored.
    const settleReject = (err: Error): void => {
      if (settled) return;
      settled = true;
      cleanup();
      reject(err);
    };

    const settleResolve = (items: AgentBrowserBatchItem[]): void => {
      if (settled) return;
      settled = true;
      cleanup();
      resolve(items);
    };

    proc.stdout.on("data", (data: Buffer) => {
      stdout += data.toString();
    });

    proc.stderr.on("data", (data: Buffer) => {
      stderr += data.toString();
    });

    // If the child goes away before reading its input, the write below can
    // emit an error on stdin. Without a listener that would be an unhandled
    // "error" event; the real failure is reported through "error"/"close".
    proc.stdin.on("error", () => {
      /* reported via the process-level handlers */
    });

    if (options.timeout) {
      timeoutId = setTimeout(() => {
        kill();
        settleReject(new Error(`agent-browser timed out after ${options.timeout}ms`));
      }, options.timeout);
    }

    proc.on("close", (code) => {
      if (settled) return;

      // A non-zero exit that still printed output is parsed below, so that
      // per-command failures in the output can be inspected by the caller.
      if (code !== 0 && !stdout.trim()) {
        settleReject(new Error(`agent-browser failed (exit ${code}):\n${stderr || "unknown error"}`));
        return;
      }

      let parsed: unknown;
      try {
        parsed = JSON.parse(stdout);
      } catch (err: unknown) {
        const reason = err instanceof Error ? err.message : String(err);
        settleReject(new Error(
          `Failed to parse agent-browser output: ${reason}\nstdout: ${stdout}\nstderr: ${stderr}`
        ));
        return;
      }

      // Callers iterate the result, so anything but an array is a failure.
      if (!Array.isArray(parsed)) {
        settleReject(new Error(
          `Failed to parse agent-browser output: expected a JSON array\nstdout: ${stdout}\nstderr: ${stderr}`
        ));
        return;
      }

      settleResolve(parsed as AgentBrowserBatchItem[]);
    });

    proc.on("error", (err: NodeJS.ErrnoException) => {
      if (err.code === "ENOENT") {
        settleReject(new Error(
          "agent-browser is not installed.\n\nInstall it with:\n  npm i -g agent-browser && agent-browser install\n\nThen run: agent-browser doctor"
        ));
      } else {
        settleReject(err);
      }
    });

    if (options.signal) {
      if (options.signal.aborted) kill();
      else options.signal.addEventListener("abort", kill, { once: true });
    }

    proc.stdin.write(JSON.stringify(commands));
    proc.stdin.end();
  });
}
238
+
239
/**
 * Run `agent-browser --session <session> close` and wait for it to finish.
 *
 * This is best-effort: the returned promise resolves whether the command
 * succeeds, fails, or cannot be spawned, and it never rejects.
 *
 * @param session - Name of the browser session to close.
 * @param signal - Optional abort signal; aborting sends SIGTERM to the close
 *   command itself.
 */
function closeAgentBrowserSession(session: string, signal?: AbortSignal): Promise<void> {
  return new Promise((resolve) => {
    const proc = spawn("agent-browser", ["--session", session, "close"], {
      shell: false,
      stdio: ["ignore", "ignore", "ignore"],
    });

    const kill = (): void => {
      proc.kill("SIGTERM");
    };

    // "error" and "close" may both fire; finish once and detach the abort
    // listener so it does not outlive the child process.
    let finished = false;
    const done = (): void => {
      if (finished) return;
      finished = true;
      signal?.removeEventListener("abort", kill);
      resolve();
    };

    proc.on("close", done);
    proc.on("error", done);

    if (signal) {
      if (signal.aborted) kill();
      else signal.addEventListener("abort", kill, { once: true });
    }
  });
}
255
+
256
/**
 * Schema for a single `web_browse` action. Every field except `type` is
 * optional here; which ones a given action type needs is not expressed by
 * this schema.
 */
export const WebBrowseActionSchema = Type.Object({
  type: StringEnum(["click", "fill", "type", "press", "wait", "wait_selector", "scroll"] as const),
  selector: Type.Optional(
    Type.String({ description: "CSS selector for actions that target an element" }),
  ),
  value: Type.Optional(
    Type.String({ description: "Value for fill/type actions" }),
  ),
  key: Type.Optional(
    Type.String({ description: "Key for press actions, e.g. Enter or Tab" }),
  ),
  ms: Type.Optional(
    Type.Integer({ description: "Milliseconds for wait actions", minimum: 0 }),
  ),
  direction: Type.Optional(StringEnum(["down", "up", "bottom", "top"] as const)),
  amount: Type.Optional(
    Type.Integer({ description: "Pixels for scroll up/down actions", minimum: 0 }),
  ),
  state: Type.Optional(StringEnum(["attached", "visible", "hidden"] as const)),
});

/**
 * Parameter schema for the `web_browse` tool: a starting URL, up to 25
 * actions, and optional extraction selector, headless flag and timeout.
 */
export const WebBrowseParamsSchema = Type.Object({
  url: Type.String({ description: "Starting URL to open in the browser" }),
  actions: Type.Array(WebBrowseActionSchema, {
    description: "Ordered list of actions to perform on the page. Required fields depend on action type.",
    maxItems: 25,
  }),
  selector: Type.Optional(
    Type.String({ description: "CSS selector to extract content from the final page state" }),
  ),
  headless: Type.Optional(
    Type.Boolean({ description: "Run browser headlessly. Default: true", default: true }),
  ),
  timeout: Type.Optional(
    Type.Integer({
      description: "Overall browser batch timeout in milliseconds. Default: 30000",
      minimum: 1,
      default: 30000,
    }),
  ),
});

/** Static TypeScript type derived from {@link WebBrowseParamsSchema}. */
export type WebBrowseInput = Static<typeof WebBrowseParamsSchema>;
279
+
280
/**
 * `web_browse` tool: opens a URL in an agent-browser session, replays the
 * requested actions as one command batch, then reports the page title, final
 * URL and extracted content.
 *
 * Flow of `execute`:
 *  1. Build the command batch with `buildBatchCommands`.
 *  2. Run it via `runAgentBrowserBatch`; the first unsuccessful result aborts
 *     the call with an error naming the failing command.
 *  3. Take content from the first `snapshot --json` or `get text` result, and
 *     title / URL from the `get title` / `get url` results.
 *  4. Truncate with `truncateHead`; when truncated, the full text is written
 *     to a fresh temp directory and its path is appended to the output.
 *  5. Always attempt to close the browser session afterwards.
 *
 * Any failure is rethrown as `Error("Error browsing <url>: <reason>")`.
 */
const webBrowseTool = defineTool({
  name: "web_browse",
  label: "Web Browse",
  description: [
    "Interact with a web page through a browser: navigate, click, fill forms, scroll,",
    "wait for content, and then extract text.",
    "Uses the agent-browser CLI for fast, native browser automation via Chrome CDP.",
    "Use web_browse when the target content requires interaction (clicking buttons,",
    "scrolling, filling search boxes, waiting for JS to load) before it becomes available.",
    "For static pages that need no interaction, use web_fetch instead.",
    `Output is truncated to ${DEFAULT_MAX_LINES} lines or ${formatSize(DEFAULT_MAX_BYTES)}; if truncated, full output is saved to a temp file.`,
  ].join(" "),
  promptSnippet: "Interact with a web page (click, scroll, fill) and extract content",
  promptGuidelines: [
    "Use web_browse when a page requires clicking, scrolling, or form submission before showing target content.",
    "Use web_browse for SPAs, pagination (click 'Load more'), search forms, tab switching, and modal dialogs.",
    "For static articles, docs, or blogs that load everything on first request, prefer web_fetch.",
    "After web_search returns results, prefer web_fetch for reading individual articles.",
    "Only use web_browse if web_fetch fails to get the needed content.",
    "Always provide a selector to extract only the relevant content area — avoid dumping full page text.",
  ],
  parameters: WebBrowseParamsSchema,

  async execute(toolCallId, params, signal, onUpdate) {
    let fullOutputPath: string | undefined;
    // One browser session per tool call, so it can be closed by name afterwards.
    const session = `pi-web-browse-${toolCallId}`;

    try {
      onUpdate?.({ content: [{ type: "text", text: `Browsing ${params.url}...` }], details: {} });

      const commands = buildBatchCommands(
        params.url,
        params.actions as BrowseAction[],
        params.selector,
      );

      const results = await runAgentBrowserBatch(commands, {
        session,
        headless: params.headless ?? true,
        signal,
        timeout: params.timeout ?? 30000,
      });

      const failed = results.find((r) => !r.success);
      if (failed) {
        const cmdStr = failed.command?.join(" ") ?? "unknown command";
        const errMsg = failed.error ?? "unknown error";
        throw new Error(`Browser action failed: ${cmdStr} — ${errMsg}`);
      }

      // `command` is treated as possibly absent above, so every lookup below
      // guards it the same way instead of dereferencing it directly.
      const contentResult = results.find((r) => {
        const cmd = r.command;
        if (!cmd) return false;
        if (cmd[0] === "snapshot" && cmd.includes("--json")) return true;
        return cmd[0] === "get" && cmd[1] === "text";
      });
      const titleResult = results.find((r) => r.command?.[0] === "get" && r.command?.[1] === "title");
      const urlResult = results.find((r) => r.command?.[0] === "get" && r.command?.[1] === "url");

      let content = "";
      if (contentResult?.success) {
        content =
          contentResult.command?.[0] === "snapshot"
            ? contentResult.result?.snapshot ?? ""
            : contentResult.result?.text ?? "";
      }

      const title = titleResult?.result?.title ?? "";
      // Fall back to the requested URL when the batch reported no final URL.
      const finalUrl = urlResult?.result?.url ?? params.url;

      const lines: string[] = [
        `Title: ${title || "(no title)"}`,
        `URL: ${finalUrl}`,
        "",
        "---",
        "",
        content || "(no content extracted)",
      ];

      const rawText = lines.join("\n");
      const truncation = truncateHead(rawText, {
        maxLines: DEFAULT_MAX_LINES,
        maxBytes: DEFAULT_MAX_BYTES,
      });

      let finalText = truncation.content;
      if (truncation.truncated) {
        // The untruncated text is kept on disk so the caller can read it later.
        const fullOutputDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), "pi-web-browse-"));
        fullOutputPath = path.join(fullOutputDir, "output.txt");
        await fs.promises.writeFile(fullOutputPath, rawText, "utf-8");
        finalText += `\n\n[Output truncated: ${truncation.outputLines} of ${truncation.totalLines} lines (${formatSize(truncation.outputBytes)} of ${formatSize(truncation.totalBytes)}). Full output saved to: ${fullOutputPath}]`;
      }

      onUpdate?.({ content: [{ type: "text", text: `Extracted from ${finalUrl}` }], details: {} });

      return {
        content: [{ type: "text", text: finalText }],
        details: { title, url: finalUrl, fullOutputPath },
      };
    } catch (err: unknown) {
      // Tolerate non-Error throwables (including null/undefined) without crashing here.
      const reason = (err as { message?: unknown } | null | undefined)?.message ?? err;
      throw new Error(`Error browsing ${params.url}: ${reason}`);
    } finally {
      // Cleanup is best-effort: a failure to close the session must not replace
      // the result or the error produced above.
      // NOTE(review): the same (possibly already aborted) signal is forwarded to
      // the close call — confirm the helper still closes the session in that case.
      try {
        await closeAgentBrowserSession(session, signal);
      } catch {
        /* ignore cleanup failure */
      }
    }
  },

  renderCall(args, theme) {
    let text = theme.fg("toolTitle", theme.bold("web_browse "));
    text += theme.fg("muted", args.url);
    text += theme.fg("dim", ` (${args.actions?.length ?? 0} actions)`);
    return new Text(text, 0, 0);
  },

  renderResult(result, { expanded, isPartial }, theme) {
    if (isPartial) {
      return new Text(theme.fg("warning", "Browsing..."), 0, 0);
    }
    const details = result.details as { title?: string; url?: string; fullOutputPath?: string } | undefined;
    let text = theme.fg("success", "✓ Browsed");
    if (details?.title) {
      text += theme.fg("muted", ` — ${details.title}`);
    }
    if (expanded && details?.url) {
      text += `\n${theme.fg("dim", details.url)}`;
    }
    if (expanded && details?.fullOutputPath) {
      text += `\n${theme.fg("dim", `Full output: ${details.fullOutputPath}`)}`;
    }
    return new Text(text, 0, 0);
  },
});
412
+
413
/**
 * Extension entry point: registers the `web_browse` tool on the given
 * extension API instance.
 */
export default function (pi: ExtensionAPI): void {
  pi.registerTool(webBrowseTool);
}
@@ -0,0 +1,138 @@
1
+ /**
2
+ * Web Fetch Extension — Fetch full page content via scrapling
3
+ *
4
+ * Provides a `web_fetch` tool that downloads and extracts readable
5
+ * content from any URL using the scrapling CLI.
6
+ *
7
+ * Requires: `pip install "scrapling[all]"` and `scrapling install`
8
+ *
9
+ * Usage:
10
+ * The LLM calls web_fetch with a URL after web_search finds relevant pages.
11
+ */
12
+
13
+ import {
14
+ defineTool,
15
+ type ExtensionAPI,
16
+ truncateHead,
17
+ formatSize,
18
+ DEFAULT_MAX_BYTES,
19
+ DEFAULT_MAX_LINES,
20
+ } from "@earendil-works/pi-coding-agent";
21
+ import { Text } from "@earendil-works/pi-tui";
22
+ import { Type, type Static } from "typebox";
23
+ import * as fs from "node:fs";
24
+ import * as os from "node:os";
25
+ import * as path from "node:path";
26
+ import { runScrapling } from "./utils/scrapling";
27
+
28
/**
 * Parameter schema for the `web_fetch` tool.
 *
 * - `url`: page to download.
 * - `selector`: optional CSS selector restricting the extracted region.
 * - `stealthy`: optional flag selecting the `stealthy-fetch` extractor;
 *   the tool treats an omitted value as `false`.
 */
export const WebFetchParamsSchema = Type.Object({
  url: Type.String({
    description: "Full URL to fetch (e.g. https://example.com/article)",
  }),
  selector: Type.Optional(
    Type.String({
      description: "CSS selector to extract only a specific part of the page",
    }),
  ),
  stealthy: Type.Optional(
    Type.Boolean({
      description: "Use stealthy mode for protected/anti-bot sites. Default: false",
      default: false,
    }),
  ),
});

/** Static TypeScript shape derived from {@link WebFetchParamsSchema}. */
export type WebFetchInput = Static<typeof WebFetchParamsSchema>;
35
+
36
/**
 * `web_fetch` tool: downloads a page with the scrapling CLI into a temporary
 * markdown file and returns its (possibly truncated) content.
 *
 * Flow of `execute`:
 *  1. Run `scrapling extract fetch|stealthy-fetch <url> <tmpFile>`.
 *  2. If the non-stealthy attempt exits non-zero, retry once with
 *     `scrapling extract get`. Both attempts receive the same extraction
 *     options, so a requested `selector` is honoured by the fallback as well.
 *  3. Read the file, prepend a small header, and truncate with `truncateHead`.
 *     When truncated, the full text is written to a separate temp directory
 *     that is intentionally left on disk so it can be read later.
 *  4. Remove the working temp directory in all cases (best-effort).
 *
 * Any failure is rethrown as `Error("Error fetching <url>: <reason>")`.
 */
const webFetchTool = defineTool({
  name: "web_fetch",
  label: "Web Fetch",
  description: [
    "Fetch and extract readable content from a web page URL.",
    "Uses scrapling to download the page and convert it to clean markdown.",
    "Use web_fetch AFTER web_search to read the full content of a result page.",
    "Respects robots.txt and site ToS.",
    `Output is truncated to ${DEFAULT_MAX_LINES} lines or ${formatSize(DEFAULT_MAX_BYTES)}; if truncated, full output is saved to a temp file.`,
  ].join(" "),
  promptSnippet: "Fetch full page content from a URL as markdown",
  promptGuidelines: [
    "Use web_fetch after web_search to read full articles, docs, or pages found in search results.",
    "Always pass the full URL including https://.",
    "If the page is dynamic/JavaScript-heavy, the tool automatically uses browser automation.",
  ],
  parameters: WebFetchParamsSchema,

  async execute(_toolCallId, params, signal) {
    const tmpDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), "pi-web-fetch-"));
    const tmpFile = path.join(tmpDir, "page.md");
    let tmpFull: string | undefined;

    try {
      // Options shared by the primary attempt and the fallback, so that the
      // fallback does not silently return the whole page when a selector was given.
      const extractOptions = ["--ai-targeted"];
      if (params.selector) {
        extractOptions.push("--css-selector", params.selector);
      }

      const primaryCmd = params.stealthy ? "stealthy-fetch" : "fetch";
      const primary = await runScrapling(
        ["extract", primaryCmd, params.url, tmpFile, ...extractOptions],
        signal,
      );

      if (primary.exitCode !== 0) {
        if (params.stealthy) {
          // No fallback for stealthy mode: a plain GET would not be stealthy.
          throw new Error(`Failed to fetch ${params.url}\n\nscrapling error:\n${primary.stderr}`);
        }

        // Fall back to a simple HTTP GET when the browser-based fetch failed.
        const fallback = await runScrapling(
          ["extract", "get", params.url, tmpFile, ...extractOptions],
          signal,
        );
        if (fallback.exitCode !== 0) {
          // Report both attempts; either stderr may be empty.
          const combined = [primary.stderr, fallback.stderr].filter(Boolean).join("\n");
          throw new Error(`Failed to fetch ${params.url}\n\nscrapling error:\n${combined}`);
        }
      }

      const content = await fs.promises.readFile(tmpFile, "utf-8");
      const stats = await fs.promises.stat(tmpFile);

      const rawText = `Fetched: ${params.url}\nSize: ${stats.size} bytes\n\n---\n\n${content}`;
      const truncation = truncateHead(rawText, {
        maxLines: DEFAULT_MAX_LINES,
        maxBytes: DEFAULT_MAX_BYTES,
      });

      let finalText = truncation.content;
      if (truncation.truncated) {
        // Written outside `tmpDir` so it survives the cleanup in `finally`.
        const tmpFullDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), "pi-web-fetch-full-"));
        tmpFull = path.join(tmpFullDir, "output.txt");
        await fs.promises.writeFile(tmpFull, rawText, "utf-8");
        finalText += `\n\n[Output truncated: ${truncation.outputLines} of ${truncation.totalLines} lines (${formatSize(truncation.outputBytes)} of ${formatSize(truncation.totalBytes)}). Full output saved to: ${tmpFull}]`;
      }

      return {
        content: [{ type: "text", text: finalText }],
        details: { url: params.url, bytes: stats.size, fullOutputPath: tmpFull },
      };
    } catch (err: unknown) {
      // Tolerate non-Error throwables (including null/undefined) without crashing here.
      const reason = (err as { message?: unknown } | null | undefined)?.message ?? err;
      throw new Error(`Error fetching ${params.url}: ${reason}`);
    } finally {
      // Best-effort, non-blocking removal of the working directory.
      try {
        await fs.promises.rm(tmpDir, { recursive: true, force: true });
      } catch {
        /* ignore */
      }
    }
  },

  renderCall(args, theme) {
    let text = theme.fg("toolTitle", theme.bold("web_fetch "));
    text += theme.fg("muted", args.url);
    if (args.selector) {
      text += theme.fg("dim", ` selector=${args.selector}`);
    }
    return new Text(text, 0, 0);
  },

  renderResult(result, { expanded, isPartial }, theme) {
    if (isPartial) {
      return new Text(theme.fg("warning", "Fetching..."), 0, 0);
    }
    const details = result.details as { url?: string; bytes?: number; fullOutputPath?: string } | undefined;
    let text = theme.fg("success", "✓ Fetched");
    if (details?.bytes) {
      text += theme.fg("muted", ` (${formatSize(details.bytes)})`);
    }
    if (expanded) {
      text += `\n${theme.fg("dim", details?.url ?? "")}`;
      if (details?.fullOutputPath) {
        text += `\n${theme.fg("dim", `Full output: ${details.fullOutputPath}`)}`;
      }
    }
    return new Text(text, 0, 0);
  },
});
135
+
136
/**
 * Extension entry point: registers the `web_fetch` tool on the given
 * extension API instance.
 */
export default function (pi: ExtensionAPI): void {
  pi.registerTool(webFetchTool);
}
@@ -0,0 +1,179 @@
1
+ /**
2
+ * SearXNG Web Search Extension for Pi
3
+ *
4
+ * Provides a `web_search` tool that queries a SearXNG instance.
5
+ *
6
+ * Configuration:
7
+ * Set SEARXNG_URL env var to your instance (default: http://localhost:8080)
8
+ *
9
+ * Usage:
10
+ * The LLM can call web_search with a query to get search results.
11
+ */
12
+
13
+ import {
14
+ defineTool,
15
+ type ExtensionAPI,
16
+ truncateHead,
17
+ formatSize,
18
+ DEFAULT_MAX_BYTES,
19
+ DEFAULT_MAX_LINES,
20
+ } from "@earendil-works/pi-coding-agent";
21
+ import { Text } from "@earendil-works/pi-tui";
22
+ import { Type, type Static } from "typebox";
23
+ import { mkdtemp, writeFile } from "node:fs/promises";
24
+ import * as os from "node:os";
25
+ import * as path from "node:path";
26
+
27
/**
 * Normalizes a configured SearXNG base URL.
 *
 * Surrounding whitespace is removed, an unset / empty / whitespace-only value
 * falls back to `http://localhost:8080`, and every trailing slash is dropped
 * so that `${base}/search` never contains a doubled slash.
 *
 * @param raw - Raw configuration value (typically an environment variable).
 * @returns The base URL without trailing slashes.
 */
function normalizeSearxngUrl(raw: string | undefined): string {
  let base = (raw ?? "").trim();
  if (base === "") {
    base = "http://localhost:8080";
  }
  while (base.endsWith("/")) {
    base = base.slice(0, -1);
  }
  return base;
}

/** Base URL of the SearXNG instance, read once from `SEARXNG_URL` at module load. */
const SEARXNG_URL = normalizeSearxngUrl(process.env.SEARXNG_URL);
28
+
29
/** One entry of the `results` array in a SearXNG JSON response (fields read by this extension). */
type SearxResult = {
  /** Result headline. */
  title: string;
  /** Target URL of the result. */
  url: string;
  /** Optional snippet text; whitespace is collapsed before display. */
  content?: string;
  /** Optional name of the engine that produced the result. */
  engine?: string;
  /** Optional score; not used when formatting output. */
  score?: number;
};

/** Subset of the SearXNG JSON response body that this extension consumes. */
type SearxResponse = {
  /** Query string as reported back in the response. */
  query: string;
  /** Result entries. */
  results: SearxResult[];
  /** Optional alternative queries, shown when there are no results. */
  suggestions?: string[];
};
42
+
43
/**
 * Parameter schema for the `web_search` tool.
 *
 * - `query`: search text.
 * - `language`: optional language code; the tool sends `"auto"` when omitted.
 * - `results`: optional cap on returned results (1-50); the tool uses `10`
 *   when omitted and clamps the value to that range again at call time.
 */
export const WebSearchParamsSchema = Type.Object({
  query: Type.String({
    description: "Search query",
  }),
  language: Type.Optional(
    Type.String({
      description: "Language code (e.g. en, en-US, de). Default: auto",
      default: "auto",
    }),
  ),
  results: Type.Optional(
    Type.Integer({
      description: "Max number of results to return (1-50). Default: 10",
      minimum: 1,
      maximum: 50,
      default: 10,
    }),
  ),
});

/** Static TypeScript shape derived from {@link WebSearchParamsSchema}. */
export type WebSearchInput = Static<typeof WebSearchParamsSchema>;
50
+
51
/**
 * `web_search` tool: queries the configured SearXNG instance's `/search`
 * endpoint with `format=json` and renders the results as a numbered list.
 *
 * Flow of `execute`:
 *  1. Clamp the requested result count to 1-50 (default 10).
 *  2. GET `${SEARXNG_URL}/search?...`; a non-OK status becomes an error that
 *     includes the response body when it can be read.
 *  3. With no results, return a "No results found" message plus any suggestions.
 *  4. Otherwise format up to `maxResults` entries, truncate with `truncateHead`,
 *     and when truncated write the full text to a temp file whose path is
 *     appended to the output.
 *
 * Any failure is rethrown as
 * `Error("Failed to query SearXNG at <base>: <reason>")`.
 */
const webSearchTool = defineTool({
  name: "web_search",
  label: "Web Search",
  description: [
    "Search the web using a SearXNG instance.",
    "Returns a list of results with title, URL, and snippet.",
    "Use web_search when the user asks about current events, facts, or anything",
    "that requires up-to-date information beyond the model's training data.",
    `Output is truncated to ${DEFAULT_MAX_LINES} lines or ${formatSize(DEFAULT_MAX_BYTES)}; if truncated, full output is saved to a temp file.`,
  ].join(" "),
  promptSnippet: "Search the web for current information",
  promptGuidelines: [
    "Use web_search when the user asks about recent events, current data, or external facts.",
    "Use web_search to verify claims, find documentation, or discover resources online.",
  ],
  parameters: WebSearchParamsSchema,

  async execute(_toolCallId, params, signal) {
    const maxResults = Math.floor(Math.min(50, Math.max(1, params.results ?? 10)));
    const searchParams = new URLSearchParams({
      q: params.query,
      format: "json",
      language: params.language ?? "auto",
    });
    const url = `${SEARXNG_URL}/search?${searchParams.toString()}`;

    let fullOutputPath: string | undefined;

    try {
      const response = await fetch(url, {
        method: "GET",
        headers: { Accept: "application/json" },
        signal,
      });

      if (!response.ok) {
        const body = await response.text().catch(() => "");
        throw new Error(`SearXNG error: ${response.status} ${response.statusText}\n${body}`);
      }

      // The body is external input: do not assume every field is present.
      const data = (await response.json()) as Partial<SearxResponse> | null;
      // Echo the caller's query when the server does not report one back.
      const query = data?.query ?? params.query;
      const allResults = Array.isArray(data?.results) ? data.results : [];

      if (allResults.length === 0) {
        let text = `No results found for "${query}".`;
        const suggestions = data?.suggestions;
        if (Array.isArray(suggestions) && suggestions.length > 0) {
          text += `\n\nSuggestions:\n${suggestions.map((s) => `- ${s}`).join("\n")}`;
        }
        return {
          content: [{ type: "text", text }],
          details: { query, totalResults: 0, results: [], fullOutputPath: undefined },
        };
      }

      // Computed once and reused for both the rendered text and `details`.
      const shown = allResults.slice(0, maxResults);

      const lines: string[] = [`Results for "${query}":`, ""];
      shown.forEach((r, i) => {
        lines.push(`${i + 1}. ${r.title}`);
        lines.push(`   URL: ${r.url}`);
        if (r.content) {
          // Collapse newlines and runs of whitespace so each snippet stays on one line.
          const snippet = r.content.replace(/\s+/g, " ").trim();
          lines.push(`   ${snippet}`);
        }
        if (r.engine) {
          lines.push(`   [engine: ${r.engine}]`);
        }
        lines.push("");
      });

      const rawText = lines.join("\n");
      const truncation = truncateHead(rawText, {
        maxLines: DEFAULT_MAX_LINES,
        maxBytes: DEFAULT_MAX_BYTES,
      });

      let finalText = truncation.content;
      if (truncation.truncated) {
        const tmpDir = await mkdtemp(path.join(os.tmpdir(), "pi-web-search-"));
        fullOutputPath = path.join(tmpDir, "output.txt");
        await writeFile(fullOutputPath, rawText, "utf-8");
        finalText += `\n\n[Output truncated: ${truncation.outputLines} of ${truncation.totalLines} lines (${formatSize(truncation.outputBytes)} of ${formatSize(truncation.totalBytes)}). Full output saved to: ${fullOutputPath}]`;
      }

      return {
        content: [{ type: "text", text: finalText }],
        // `totalResults` counts what the server returned; `results` holds only the shown entries.
        details: { query, totalResults: allResults.length, results: shown, fullOutputPath },
      };
    } catch (err: unknown) {
      // Tolerate non-Error throwables (including null/undefined) without crashing here.
      const reason = (err as { message?: unknown } | null | undefined)?.message ?? err;
      throw new Error(`Failed to query SearXNG at ${SEARXNG_URL}: ${reason}`);
    }
  },

  renderCall(args, theme) {
    let text = theme.fg("toolTitle", theme.bold("web_search "));
    text += theme.fg("muted", args.query);
    if (args.results) {
      text += theme.fg("dim", ` results=${args.results}`);
    }
    return new Text(text, 0, 0);
  },

  renderResult(result, { expanded, isPartial }, theme) {
    if (isPartial) {
      return new Text(theme.fg("warning", "Searching..."), 0, 0);
    }
    const details = result.details as { query?: string; totalResults?: number; results?: Array<{ title?: string; url?: string }>; fullOutputPath?: string } | undefined;
    let text = theme.fg("success", `✓ ${details?.totalResults ?? 0} results`);
    if (details?.query) {
      text += theme.fg("muted", ` for ${details.query}`);
    }
    if (expanded && details?.results?.length) {
      // The expanded view lists at most ten entries.
      for (const r of details.results.slice(0, 10)) {
        text += `\n  ${theme.fg("dim", `${r.title ?? "(untitled)"} — ${r.url ?? ""}`)}`;
      }
    }
    if (expanded && details?.fullOutputPath) {
      text += `\n${theme.fg("dim", `Full output: ${details.fullOutputPath}`)}`;
    }
    return new Text(text, 0, 0);
  },
});
176
+
177
/**
 * Extension entry point: registers the `web_search` tool on the given
 * extension API instance.
 */
export default function (pi: ExtensionAPI): void {
  pi.registerTool(webSearchTool);
}
package/package.json ADDED
@@ -0,0 +1,34 @@
1
+ {
2
+ "name": "pi-web-toolkit",
3
+ "version": "0.1.0",
4
+ "description": "Web research toolkit for the pi coding agent. Search via SearXNG, fetch static pages with scrapling, browse interactively via agent-browser, and batch-read sources in parallel.",
5
+ "author": "Wade Huang <fastwade11@gmail.com>",
6
+ "license": "MIT",
7
+ "repository": {
8
+ "type": "git",
9
+ "url": "git+https://github.com/Wade11s/pi-web-toolkit.git"
10
+ },
11
+ "bugs": {
12
+ "url": "https://github.com/Wade11s/pi-web-toolkit/issues"
13
+ },
14
+ "homepage": "https://github.com/Wade11s/pi-web-toolkit#readme",
15
+ "keywords": ["pi-package", "pi-extension", "web-search", "scrapling", "agent-browser"],
16
+ "files": ["extensions", "docs", "README.md", "package.json", "LICENSE"],
17
+ "engines": {
18
+ "node": ">=22.0.0"
19
+ },
20
+ "peerDependencies": {
21
+ "@earendil-works/pi-ai": "*",
22
+ "@earendil-works/pi-coding-agent": "*",
23
+ "@earendil-works/pi-tui": "*",
24
+ "typebox": "*"
25
+ },
26
+ "pi": {
27
+ "extensions": [
28
+ "./extensions/web_search.ts",
29
+ "./extensions/web_fetch.ts",
30
+ "./extensions/web_browse.ts",
31
+ "./extensions/web_batch_fetch.ts"
32
+ ]
33
+ }
34
+ }