@aprimediet/webtools 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +76 -0
- package/browser.ts +242 -0
- package/config.ts +39 -0
- package/crawl.ts +178 -0
- package/extract.ts +90 -0
- package/index.ts +248 -0
- package/markdown.ts +99 -0
- package/package.json +25 -0
- package/search.ts +141 -0
package/README.md
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# @aprimediet/webtools
|
|
2
|
+
|
|
3
|
+
Self-hosted **web tools** for the [pi coding agent](https://www.npmjs.com/package/@earendil-works/pi-coding-agent): fetch, extract, search, and crawl — using only npm/npx/docker backends. **No MCP, no hosted scraping APIs.** The engine is **Playwright**; extraction is **Mozilla Readability + Turndown**; search drives the browser over a public engine's HTML.
|
|
4
|
+
|
|
5
|
+
## Tools
|
|
6
|
+
|
|
7
|
+
| Tool | Params | Returns |
|
|
8
|
+
|---|---|---|
|
|
9
|
+
| `web_fetch` | `url`, `render?`(auto/static/browser), `format?`(markdown/text/html), `timeoutMs?`, `maxBytes?` | page content as markdown/text/html + `{finalUrl,status,title,bytes,renderUsed,truncated}` |
|
|
10
|
+
| `web_extract` | `url`, `html?`, `render?`, `format?`(markdown/text), `includeMetadata?`, `selectors?`(name→CSS) | main article markdown + `{title,byline,siteName,publishedTime,lang,excerpt,fields}` |
|
|
11
|
+
| `web_search` | `query`, `limit?`, `engine?`(duckduckgo/bing/brave) | `[{title,url,snippet}]` scraped from the engine's HTML |
|
|
12
|
+
| `web_crawl` | `startUrl`, `maxPages?`(20, cap 100), `maxDepth?`(2), `sameOrigin?`, `include?`/`exclude?`, `render?`, `format?`, `concurrency?`(3, cap 5) | combined per-page markdown + `{pageCount,pages[]}` (streams progress) |
|
|
13
|
+
|
|
14
|
+
`render:auto` fetches statically and **escalates to the headless browser** if the page looks like a JS shell. The browser is launched **once** and reused, then closed on session shutdown.
|
|
15
|
+
|
|
16
|
+
## Setup (self-hosted backends)
|
|
17
|
+
|
|
18
|
+
Third-party libs (`playwright`, `@mozilla/readability`, `jsdom`, `turndown`, `turndown-plugin-gfm`) install automatically with the package. **Playwright browser binaries do not** — install them once:
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
npx playwright install --with-deps chromium
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Or point at a **Docker / remote browser** (zero local install) and set:
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
export PLAYWRIGHT_CDP=http://localhost:9222 # Chrome DevTools Protocol endpoint
|
|
28
|
+
# or
|
|
29
|
+
export PLAYWRIGHT_WS_ENDPOINT=ws://localhost:3000 # a Playwright server / browserless container
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Configuration (env + flags)
|
|
33
|
+
|
|
34
|
+
| Env | Flag | Meaning |
|
|
35
|
+
|---|---|---|
|
|
36
|
+
| `WEBTOOLS_RENDER` | `--render-default` | default render mode (`auto`) |
|
|
37
|
+
| `WEBTOOLS_SEARCH_ENGINE` | `--search-engine` | default engine (`duckduckgo`) |
|
|
38
|
+
| `WEBTOOLS_USER_AGENT` | `--user-agent` | UA string |
|
|
39
|
+
| `WEBTOOLS_ALLOW_PRIVATE` | `--allow-private` | allow private/loopback hosts (off by default) |
|
|
40
|
+
| `WEBTOOLS_IGNORE_ROBOTS` | — | crawl ignores robots.txt (off by default) |
|
|
41
|
+
| `PLAYWRIGHT_CDP` / `PLAYWRIGHT_WS_ENDPOINT` | — | connect to a remote/Docker browser |
|
|
42
|
+
|
|
43
|
+
`/webtools` prints the current config and chosen browser backend.
|
|
44
|
+
|
|
45
|
+
## Safety
|
|
46
|
+
|
|
47
|
+
- **SSRF guard:** `web_fetch`/`web_extract`/`web_crawl` resolve the host and **block loopback/private/link-local ranges** (`127/8`, `10/8`, `172.16/12`, `192.168/16`, `169.254/16`, `::1`, `fc00::/7`, `*.local`) unless `WEBTOOLS_ALLOW_PRIVATE=1`.
|
|
48
|
+
- **Caps:** per-request timeout (30s) and `maxBytes` (~5 MB); the crawler caps per-page output (30 KB) and total output (200 KB), honors robots.txt, rate-limits requests, and enforces a concurrency cap.
|
|
49
|
+
- **Search** scrapes a public engine's HTML, so it's best-effort: on a consent/captcha/changed page it **throws** a clear error (try another `engine`).
|
|
50
|
+
|
|
51
|
+
## Install / run
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pi install npm:@aprimediet/webtools
|
|
55
|
+
pi list
|
|
56
|
+
|
|
57
|
+
# Quick try without installing
|
|
58
|
+
pi -e ./extensions/webtools/index.ts
|
|
59
|
+
|
|
60
|
+
# Hot-reload during dev
|
|
61
|
+
/reload
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Layout
|
|
65
|
+
|
|
66
|
+
```
|
|
67
|
+
webtools/ # @aprimediet/webtools
|
|
68
|
+
├── package.json # pi manifest + deps
|
|
69
|
+
├── index.ts # factory: 4 tools + flags + /webtools + cleanup
|
|
70
|
+
├── browser.ts # SSRF guard + shared Playwright + fetchHtml
|
|
71
|
+
├── extract.ts # Readability + selectors
|
|
72
|
+
├── search.ts # Playwright SERP scraping (ddg/bing/brave)
|
|
73
|
+
├── crawl.ts # BFS crawler
|
|
74
|
+
├── markdown.ts # Turndown/jsdom HTML→markdown/text helpers
|
|
75
|
+
└── config.ts # env/flag-driven config
|
|
76
|
+
```
|
package/browser.ts
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* The fetch engine: SSRF guard + a shared (lazy) Playwright browser + `fetchHtml`
|
|
3
|
+
* with static / browser / auto render modes.
|
|
4
|
+
*
|
|
5
|
+
* Dependency-free of pi so it can be unit-tested directly with Node.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { isIP } from "node:net";
|
|
9
|
+
import { lookup } from "node:dns/promises";
|
|
10
|
+
import type { Browser } from "playwright";
|
|
11
|
+
import { chromium } from "playwright";
|
|
12
|
+
import { config, type RenderMode } from "./config.ts";
|
|
13
|
+
|
|
14
|
+
// ---------------------------------------------------------------- SSRF guard
|
|
15
|
+
|
|
16
|
+
/**
 * IPv4 prefixes the SSRF guard refuses to contact. The first six entries are
 * the ranges the guard has always blocked; the last two extend it to
 * carrier-grade NAT space and to multicast/reserved space, neither of which
 * hosts public web content.
 */
const BLOCKED_V4_PATTERNS: readonly RegExp[] = [
  /^127\./, // loopback
  /^10\./, // private
  /^192\.168\./, // private
  /^169\.254\./, // link-local / metadata
  /^172\.(1[6-9]|2\d|3[01])\./, // private
  /^0\./, // this host
  /^100\.(6[4-9]|[7-9]\d|1[01]\d|12[0-7])\./, // carrier-grade NAT 100.64.0.0/10
  /^(22[4-9]|2[3-5]\d)\./, // multicast + reserved 224.0.0.0/3 (incl. broadcast)
];

/** True when the dotted-quad `ip` falls in a blocked (non-public) IPv4 range. */
function isPrivateV4(ip: string): boolean {
  return ip === "0.0.0.0" || BLOCKED_V4_PATTERNS.some((re) => re.test(ip));
}

/**
 * If `ip` (already lower-cased) is an IPv4-mapped IPv6 address, return the
 * embedded IPv4 address in dotted form; otherwise return `undefined`.
 *
 * Both spellings are handled: `::ffff:127.0.0.1` and the hex form
 * `::ffff:7f00:1`, which is what `new URL(...).hostname` produces for a
 * bracketed literal.
 */
function mappedV4(ip: string): string | undefined {
  const tail = /^::ffff:(.+)$/.exec(ip)?.[1];
  if (tail === undefined) return undefined;
  if (/^\d{1,3}(\.\d{1,3}){3}$/.test(tail)) return tail;
  const hex = /^([0-9a-f]{1,4}):([0-9a-f]{1,4})$/.exec(tail);
  if (!hex) return undefined;
  const [, hiHex = "0", loHex = "0"] = hex;
  const hi = parseInt(hiHex, 16);
  const lo = parseInt(loHex, 16);
  return `${hi >> 8}.${hi & 0xff}.${lo >> 8}.${lo & 0xff}`;
}

/** True when the IPv6 address `ip` is loopback, unspecified, link-local, unique-local, or maps to a blocked IPv4 address. */
function isPrivateV6(ip: string): boolean {
  const s = ip.toLowerCase();
  // Mapped addresses are judged by the IPv4 rules so every blocked v4 range
  // (including 172.16/12) is covered in both dotted and hex notation.
  const v4 = mappedV4(s);
  if (v4 !== undefined) return isPrivateV4(v4);
  return (
    s === "::1" || // loopback
    s === "::" || // unspecified
    /^fe[89ab][0-9a-f]:/.test(s) || // link-local fe80::/10 (fe80–febf)
    s.startsWith("fc") || // unique-local fc00::/7
    s.startsWith("fd")
  );
}

/**
 * Classify an IP literal for the SSRF guard.
 *
 * @param ip An IPv4 or IPv6 address string. Anything `isIP` does not
 *   recognise as IPv6 is evaluated with the IPv4 rules.
 * @returns `true` when the address must not be fetched.
 */
export function isPrivateAddress(ip: string): boolean {
  return isIP(ip) === 6 ? isPrivateV6(ip) : isPrivateV4(ip);
}
|
|
46
|
+
|
|
47
|
+
/**
 * Throw unless `url` is a public http(s) URL (unless WEBTOOLS_ALLOW_PRIVATE).
 *
 * Checks, in order: the URL parses; the scheme is http/https; the hostname is
 * not a local name (`localhost`, `*.localhost`, `*.local`); every address the
 * host resolves to passes `isPrivateAddress`. When `config.allowPrivate` is
 * set, only the first two checks run.
 *
 * The guard fails closed: a hostname that cannot be resolved here is rejected
 * rather than waved through, because a different resolver (for example a
 * remote browser) might resolve it to an internal address.
 *
 * @param url The URL to validate.
 * @returns The parsed `URL`.
 * @throws Error when the URL is invalid, non-http(s), local/private, or unresolvable.
 */
export async function assertPublicUrl(url: string): Promise<URL> {
  let u: URL;
  try {
    u = new URL(url);
  } catch {
    throw new Error(`Invalid URL: ${url}`);
  }
  if (u.protocol !== "http:" && u.protocol !== "https:") {
    throw new Error(`Blocked non-http(s) URL: ${url}`);
  }
  if (config.allowPrivate) return u;

  // Strip IPv6 brackets and a trailing root dot so "foo.local." is treated like "foo.local".
  const host = u.hostname.replace(/^\[|\]$/g, "").replace(/\.$/, "");
  if (host === "localhost" || host.endsWith(".localhost") || host.endsWith(".local")) {
    throw new Error(`Blocked local hostname "${host}". Set WEBTOOLS_ALLOW_PRIVATE=1 to override.`);
  }

  let ips: string[];
  if (isIP(host)) {
    ips = [host];
  } else {
    let resolved: { address: string }[];
    try {
      resolved = await lookup(host, { all: true });
    } catch (err) {
      const cause = err instanceof Error ? err.message : String(err);
      throw new Error(
        `Could not resolve host "${host}" (${cause}). Set WEBTOOLS_ALLOW_PRIVATE=1 to skip the address check.`,
      );
    }
    ips = resolved.map((r) => r.address);
  }
  if (ips.length === 0) {
    throw new Error(`Could not resolve host "${host}". Set WEBTOOLS_ALLOW_PRIVATE=1 to skip the address check.`);
  }
  for (const ip of ips) {
    if (isPrivateAddress(ip)) {
      throw new Error(
        `Blocked private/loopback address ${ip} for ${host}. Set WEBTOOLS_ALLOW_PRIVATE=1 to override.`,
      );
    }
  }
  return u;
}
|
|
81
|
+
|
|
82
|
+
// ------------------------------------------------------------ shared browser
|
|
83
|
+
|
|
84
|
+
let browserPromise: Promise<Browser> | undefined;
|
|
85
|
+
|
|
86
|
+
/**
 * Obtain a Chromium `Browser`: attach over CDP when `config.cdp` is set,
 * otherwise connect to `config.wsEndpoint` when that is set, otherwise launch
 * a local headless instance.
 */
function launchBrowser(): Promise<Browser> {
  const { cdp, wsEndpoint } = config;
  if (cdp) {
    return chromium.connectOverCDP(cdp);
  }
  return wsEndpoint ? chromium.connect(wsEndpoint) : chromium.launch({ headless: true });
}
|
|
91
|
+
|
|
92
|
+
/**
 * Return the shared browser, launching it on first use.
 *
 * The launch promise is cached in `browserPromise`. If the cached browser has
 * since disconnected (crash, remote endpoint closed), the cache is dropped and
 * one fresh launch is attempted so a single failure does not poison every
 * later call.
 *
 * @throws Error with install/endpoint hints when the browser cannot be started.
 */
export async function getBrowser(): Promise<Browser> {
  // At most two rounds: the cached browser, then one relaunch if it was stale.
  for (let attempt = 0; ; attempt++) {
    if (!browserPromise) browserPromise = launchBrowser();
    const pending = browserPromise;
    let browser: Browser;
    try {
      browser = await pending;
    } catch (err) {
      // Only clear the cache if nobody replaced it while we were waiting.
      if (browserPromise === pending) browserPromise = undefined;
      const cause = err instanceof Error ? err.message : String(err);
      throw new Error(
        `Failed to start Playwright. Install browsers with "npx playwright install --with-deps chromium" or set PLAYWRIGHT_CDP. Cause: ${cause}`,
      );
    }
    if (browser.isConnected() || attempt > 0) return browser;
    if (browserPromise === pending) browserPromise = undefined;
  }
}
|
|
103
|
+
|
|
104
|
+
/**
 * Close the shared browser if one was started. The cached promise is cleared
 * before awaiting so a later `getBrowser()` launches a new instance; launch
 * and close failures are ignored.
 */
export async function closeBrowser(): Promise<void> {
  const pending = browserPromise;
  if (!pending) return;
  browserPromise = undefined;

  let browser: Browser | undefined;
  try {
    browser = await pending;
  } catch {
    browser = undefined;
  }
  if (!browser) return;

  try {
    await browser.close();
  } catch {
    /* best-effort shutdown */
  }
}
|
|
111
|
+
|
|
112
|
+
// ------------------------------------------------------------------- fetchHtml
|
|
113
|
+
|
|
114
|
+
export interface FetchOptions {
|
|
115
|
+
render?: RenderMode;
|
|
116
|
+
timeoutMs?: number;
|
|
117
|
+
maxBytes?: number;
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
export interface FetchResult {
|
|
121
|
+
finalUrl: string;
|
|
122
|
+
status: number;
|
|
123
|
+
html: string;
|
|
124
|
+
contentType: string;
|
|
125
|
+
renderUsed: "static" | "browser";
|
|
126
|
+
truncated: boolean;
|
|
127
|
+
bytes: number;
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
/** Throw `Error("web fetch aborted")` when `signal` is present and already aborted; otherwise do nothing. */
function throwIfAborted(signal?: AbortSignal): void {
  if (!signal?.aborted) {
    return;
  }
  throw new Error("web fetch aborted");
}
|
|
133
|
+
|
|
134
|
+
/**
 * Heuristic used by `render:auto`: report `true` when the HTML is empty, when
 * the body has fewer than 200 characters of text once scripts, styles and tags
 * are removed, or when it contains an empty `root` / `app` / `__next` /
 * `__nuxt` mount `<div>`.
 */
function looksLikeShell(html: string): boolean {
  if (!html) return true;

  const lowered = html.toLowerCase();
  const bodyMatch = /<body[^>]*>([\s\S]*?)<\/body>/.exec(lowered);
  const region = bodyMatch?.[1] ?? lowered;

  // Applied in order: drop scripts, drop styles, turn tags into spaces, collapse whitespace.
  const passes: [RegExp, string][] = [
    [/<script[\s\S]*?<\/script>/g, ""],
    [/<style[\s\S]*?<\/style>/g, ""],
    [/<[^>]+>/g, " "],
    [/\s+/g, " "],
  ];
  let visible = region;
  for (const [pattern, replacement] of passes) {
    visible = visible.replace(pattern, replacement);
  }
  visible = visible.trim();

  if (visible.length < 200) return true;
  return /<div id="(root|app|__next|__nuxt)">\s*<\/div>/.test(lowered);
}
|
|
148
|
+
|
|
149
|
+
/**
 * Fetch `url` with the platform `fetch` (no JavaScript rendering).
 *
 * Redirects are followed manually so every hop is re-validated with
 * `assertPublicUrl`; with automatic following, a public URL could redirect to
 * a loopback/private address and bypass the SSRF guard. The body is streamed
 * and cut off at `opts.maxBytes`.
 *
 * @param url    Address to fetch (the caller has already validated it).
 * @param opts   Fully-resolved options; `timeoutMs` bounds the whole request
 *               including redirects and body download.
 * @param signal Optional caller cancellation.
 * @returns The decoded body and response metadata, `renderUsed: "static"`.
 * @throws Error("web fetch aborted") when `signal` is aborted, otherwise
 *         Error("static fetch failed for …") wrapping the underlying cause.
 */
async function staticFetch(url: string, opts: Required<FetchOptions>, signal?: AbortSignal): Promise<FetchResult> {
  throwIfAborted(signal);
  const maxRedirects = 10;
  const ctrl = new AbortController();
  const onAbort = () => ctrl.abort();
  signal?.addEventListener("abort", onAbort, { once: true });
  const timer = setTimeout(() => ctrl.abort(), opts.timeoutMs);
  const request = (target: string): Promise<Response> =>
    fetch(target, {
      redirect: "manual",
      signal: ctrl.signal,
      headers: { "user-agent": config.userAgent, accept: "text/html,application/xhtml+xml,*/*" },
    });
  try {
    let current = url;
    let res = await request(current);
    for (let hops = 0; ; hops++) {
      const location = res.status >= 300 && res.status < 400 ? res.headers.get("location") : null;
      if (!location) break;
      // Release the redirect response's body before moving on.
      await res.body?.cancel().catch(() => {});
      if (hops >= maxRedirects) throw new Error(`too many redirects (more than ${maxRedirects})`);
      // Resolve relative Location headers against the current hop, then re-run the guard.
      current = (await assertPublicUrl(new URL(location, current).toString())).toString();
      res = await request(current);
    }

    const contentType = res.headers.get("content-type") ?? "";
    const reader = res.body?.getReader();
    const decoder = new TextDecoder();
    let html = "";
    let bytes = 0;
    let truncated = false;
    if (reader) {
      for (;;) {
        const { done, value } = await reader.read();
        if (done) break;
        const room = Math.max(0, opts.maxBytes - bytes);
        if (value.byteLength > room) {
          // Keep only what fits. Streaming mode holds back a split multi-byte
          // character instead of emitting U+FFFD; it is never flushed below.
          html += decoder.decode(value.subarray(0, room), { stream: true });
          bytes += room;
          truncated = true;
          await reader.cancel().catch(() => {});
          break;
        }
        bytes += value.byteLength;
        html += decoder.decode(value, { stream: true });
      }
      if (!truncated) html += decoder.decode();
    } else {
      html = await res.text();
      bytes = Buffer.byteLength(html);
    }
    return { finalUrl: res.url || current, status: res.status, html, contentType, renderUsed: "static", truncated, bytes };
  } catch (err) {
    if (signal?.aborted) throw new Error("web fetch aborted");
    const cause = err instanceof Error ? err.message : String(err);
    throw new Error(`static fetch failed for ${url}: ${cause}`);
  } finally {
    clearTimeout(timer);
    signal?.removeEventListener("abort", onAbort);
  }
}
|
|
194
|
+
|
|
195
|
+
/**
 * Fetch `url` by navigating a fresh page in the shared Playwright browser and
 * returning the serialized DOM after `domcontentloaded`.
 *
 * Each call uses its own browser context, which is always closed afterwards
 * (closing the context also closes its pages). Aborting `signal` closes the
 * context so an in-flight navigation is interrupted. The URL the page ends up
 * on is re-validated with `assertPublicUrl`, so a redirect to a private
 * address is rejected instead of having its content returned.
 *
 * @param url    Address to navigate to (the caller has already validated it).
 * @param opts   Fully-resolved options; `timeoutMs` is the navigation timeout
 *               and `maxBytes` caps the UTF-8 size of the returned HTML.
 * @param signal Optional caller cancellation.
 * @returns The page HTML and response metadata, `renderUsed: "browser"`.
 * @throws Error("web fetch aborted") when `signal` is aborted; other
 *         navigation and guard errors propagate unchanged.
 */
async function browserFetch(url: string, opts: Required<FetchOptions>, signal?: AbortSignal): Promise<FetchResult> {
  throwIfAborted(signal);
  const browser = await getBrowser();
  const context = await browser.newContext({ userAgent: config.userAgent });
  const onAbort = () => {
    void context.close().catch(() => {});
  };
  signal?.addEventListener("abort", onAbort, { once: true });
  try {
    // Re-check: the signal may have fired while the browser/context was starting.
    throwIfAborted(signal);
    const page = await context.newPage();
    const response = await page.goto(url, { waitUntil: "domcontentloaded", timeout: opts.timeoutMs });
    throwIfAborted(signal);
    const finalUrl = page.url();
    await assertPublicUrl(finalUrl);

    const html = await page.content();
    let body = html;
    let bytes = Buffer.byteLength(html);
    let truncated = false;
    if (bytes > opts.maxBytes) {
      // Truncate by bytes, not UTF-16 units, backing up to a character
      // boundary (0b10xxxxxx marks a UTF-8 continuation byte).
      const raw = Buffer.from(html, "utf8");
      let end = Math.max(0, Math.floor(opts.maxBytes));
      while (end > 0 && ((raw[end] ?? 0) & 0xc0) === 0x80) end--;
      body = raw.subarray(0, end).toString("utf8");
      bytes = end;
      truncated = true;
    }
    return {
      finalUrl,
      status: response?.status() ?? 0,
      html: body,
      contentType: response?.headers()["content-type"] ?? "text/html",
      renderUsed: "browser",
      truncated,
      bytes,
    };
  } catch (err) {
    // Report a cancellation as such rather than as a "target closed" navigation error.
    throwIfAborted(signal);
    throw err;
  } finally {
    signal?.removeEventListener("abort", onAbort);
    await context.close().catch(() => {});
  }
}
|
|
226
|
+
|
|
227
|
+
/**
 * Fetch a page's HTML with the requested render mode.
 *
 * - `static`: plain HTTP fetch.
 * - `browser`: headless browser navigation.
 * - anything else (`auto`): try static first and fall back to the browser
 *   when the static fetch fails, is not HTML, or looks like a JS shell.
 *
 * `timeoutMs` and `maxBytes` fall back to the configured defaults when they
 * are missing, non-finite, or not positive. Without this, `timeoutMs: 0` would
 * abort a static fetch immediately while disabling the timeout in Playwright.
 *
 * @param url     Address to fetch; validated with `assertPublicUrl` first.
 * @param options Optional render mode and limits.
 * @param signal  Optional caller cancellation.
 * @throws Error when the URL is blocked or the fetch fails.
 */
export async function fetchHtml(url: string, options: FetchOptions = {}, signal?: AbortSignal): Promise<FetchResult> {
  await assertPublicUrl(url);
  const positiveOr = (value: number | undefined, fallback: number): number =>
    typeof value === "number" && Number.isFinite(value) && value > 0 ? value : fallback;
  const opts: Required<FetchOptions> = {
    render: options.render ?? config.renderDefault,
    timeoutMs: positiveOr(options.timeoutMs, config.timeoutMs),
    maxBytes: positiveOr(options.maxBytes, config.maxBytes),
  };

  if (opts.render === "static") return staticFetch(url, opts, signal);
  if (opts.render === "browser") return browserFetch(url, opts, signal);

  // auto: try static, escalate to the browser if it looks like a JS shell or non-HTML.
  // A missing content-type is treated as HTML.
  const s = await staticFetch(url, opts, signal).catch(() => undefined);
  if (s && /html/i.test(s.contentType || "html") && !looksLikeShell(s.html)) return s;
  return browserFetch(url, opts, signal);
}
|
package/config.ts
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared runtime configuration for @aprimediet/webtools.
|
|
3
|
+
*
|
|
4
|
+
* Defaults come from environment variables (WEBTOOLS_* / PLAYWRIGHT_*); index.ts
|
|
5
|
+
* may override fields from CLI flags on session start. Kept dependency-free so the
|
|
6
|
+
* logic modules can be unit-tested outside pi.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
export type RenderMode = "auto" | "static" | "browser";
|
|
10
|
+
export type SearchEngine = "duckduckgo" | "bing" | "brave";
|
|
11
|
+
|
|
12
|
+
/** True when `v` is exactly `1`, `true`, `yes` or `on` (case-insensitive); `undefined` and anything else is false. */
const truthy = (v: string | undefined): boolean => {
  const normalized = (v ?? "").toLowerCase();
  return normalized === "1" || normalized === "true" || normalized === "yes" || normalized === "on";
};
|
|
13
|
+
|
|
14
|
+
export const DEFAULT_USER_AGENT =
|
|
15
|
+
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36 webtools-bot";
|
|
16
|
+
|
|
17
|
+
export interface WebToolsConfig {
|
|
18
|
+
userAgent: string;
|
|
19
|
+
renderDefault: RenderMode;
|
|
20
|
+
searchEngine: SearchEngine;
|
|
21
|
+
allowPrivate: boolean;
|
|
22
|
+
ignoreRobots: boolean;
|
|
23
|
+
cdp: string;
|
|
24
|
+
wsEndpoint: string;
|
|
25
|
+
timeoutMs: number;
|
|
26
|
+
maxBytes: number;
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
/**
 * Return the entry of `allowed` equal to `value` (trimmed, lower-cased), or
 * `fallback` when `value` is unset or not one of the allowed names.
 */
function envChoice<T extends string>(value: string | undefined, allowed: readonly T[], fallback: T): T {
  const wanted = value?.trim().toLowerCase();
  return allowed.find((candidate) => candidate === wanted) ?? fallback;
}

/**
 * Process-wide configuration, initialised from the environment at import time.
 *
 * `WEBTOOLS_RENDER` and `WEBTOOLS_SEARCH_ENGINE` are validated against the
 * supported names; an unrecognised value falls back to the default instead of
 * being cast into the union type unchecked. Timeout and byte cap are fixed
 * defaults here.
 */
export const config: WebToolsConfig = {
  userAgent: process.env.WEBTOOLS_USER_AGENT || DEFAULT_USER_AGENT,
  renderDefault: envChoice<RenderMode>(process.env.WEBTOOLS_RENDER, ["auto", "static", "browser"], "auto"),
  searchEngine: envChoice<SearchEngine>(
    process.env.WEBTOOLS_SEARCH_ENGINE,
    ["duckduckgo", "bing", "brave"],
    "duckduckgo",
  ),
  allowPrivate: truthy(process.env.WEBTOOLS_ALLOW_PRIVATE),
  ignoreRobots: truthy(process.env.WEBTOOLS_IGNORE_ROBOTS),
  cdp: process.env.PLAYWRIGHT_CDP || "",
  wsEndpoint: process.env.PLAYWRIGHT_WS_ENDPOINT || "",
  timeoutMs: 30_000, // 30 s per request
  maxBytes: 5 * 1024 * 1024, // 5 MiB per response
};
|
package/crawl.ts
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Breadth-first crawler built on `fetchHtml` + `extractArticle`.
|
|
3
|
+
* Same-origin by default, depth/page-capped, robots-aware, concurrency-limited.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { assertPublicUrl, fetchHtml, type FetchResult } from "./browser.ts";
|
|
7
|
+
import { config, type RenderMode } from "./config.ts";
|
|
8
|
+
import { extractArticle } from "./extract.ts";
|
|
9
|
+
import { extractLinks } from "./markdown.ts";
|
|
10
|
+
|
|
11
|
+
export interface CrawlOptions {
|
|
12
|
+
startUrl: string;
|
|
13
|
+
maxPages?: number;
|
|
14
|
+
maxDepth?: number;
|
|
15
|
+
sameOrigin?: boolean;
|
|
16
|
+
include?: string;
|
|
17
|
+
exclude?: string;
|
|
18
|
+
render?: RenderMode;
|
|
19
|
+
format?: "markdown" | "text";
|
|
20
|
+
concurrency?: number;
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
export interface CrawlPage {
|
|
24
|
+
url: string;
|
|
25
|
+
title?: string;
|
|
26
|
+
depth: number;
|
|
27
|
+
markdown: string;
|
|
28
|
+
bytes: number;
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
const HARD_MAX_PAGES = 100;
|
|
32
|
+
const MAX_CONCURRENCY = 5;
|
|
33
|
+
const TOTAL_OUTPUT_CAP = 200 * 1024;
|
|
34
|
+
const PER_PAGE_CAP = 30 * 1024;
|
|
35
|
+
const PER_ORIGIN_DELAY_MS = 250;
|
|
36
|
+
|
|
37
|
+
/** Return a promise that resolves after roughly `ms` milliseconds. */
const sleep = (ms: number) =>
  new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
|
|
38
|
+
|
|
39
|
+
/**
 * Minimal robots.txt parser for `User-agent: *` Disallow rules.
 *
 * Consecutive `User-agent` lines form one group, so rules under
 * `User-agent: *` followed by `User-agent: foo` (in either order) still apply
 * to `*`. Comments (`# …`) and blank lines are ignored, as are directives
 * other than `user-agent` / `disallow` / `allow`. `Allow` values are not
 * collected; the line only marks the end of a run of `User-agent` lines.
 *
 * @param robotsTxt Raw robots.txt content.
 * @returns The non-empty Disallow values that apply to `*`, in file order.
 */
function parseDisallow(robotsTxt: string): string[] {
  const disallow: string[] = [];
  let appliesToAll = false;
  // True while the previous directive was also a User-agent line (same group).
  let inAgentRun = false;
  for (const line of robotsTxt.split(/\r?\n/)) {
    const clean = line.replace(/#.*$/, "").trim();
    if (!clean) continue;
    const sep = clean.indexOf(":");
    if (sep < 0) continue;
    const key = clean.slice(0, sep).trim().toLowerCase();
    const value = clean.slice(sep + 1).trim();
    if (key === "user-agent") {
      const isWildcard = value === "*";
      appliesToAll = inAgentRun ? appliesToAll || isWildcard : isWildcard;
      inAgentRun = true;
    } else if (key === "disallow" || key === "allow") {
      inAgentRun = false;
      if (key === "disallow" && appliesToAll && value) disallow.push(value);
    }
  }
  return disallow;
}
|
|
55
|
+
|
|
56
|
+
/**
 * Fetch `<origin>/robots.txt` statically (capped at 256 KiB) and return its
 * `User-agent: *` Disallow rules. Yields an empty list when robots handling is
 * disabled via `config.ignoreRobots`, when the response is not 2xx, or when
 * the fetch or parse fails.
 */
async function loadRobots(origin: string, signal?: AbortSignal): Promise<string[]> {
  if (config.ignoreRobots) {
    return [];
  }
  const robotsUrl = `${origin}/robots.txt`;
  try {
    const { status, html } = await fetchHtml(robotsUrl, { render: "static", maxBytes: 256 * 1024 }, signal);
    const ok = status >= 200 && status < 300;
    if (!ok) return [];
    return parseDisallow(html);
  } catch {
    return [];
  }
}
|
|
65
|
+
|
|
66
|
+
/**
 * Decide whether `url` may be crawled given a list of robots.txt Disallow rules.
 *
 * Rules are matched against the URL's path plus query string. A rule with no
 * special characters is a plain prefix match. `*` matches any run of
 * characters and a trailing `$` anchors the rule at the end of the URL, so
 * rules such as `/*.pdf$` are honoured instead of never matching.
 *
 * @param url      Absolute URL to test.
 * @param disallow Disallow values as returned by `parseDisallow`.
 * @returns `false` when any rule matches, otherwise `true`.
 */
function isAllowed(url: string, disallow: string[]): boolean {
  if (!disallow.length) return true;
  const { pathname, search } = new URL(url);
  const target = pathname + search;
  const escapeLiteral = (text: string): string => text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  return !disallow.some((rule) => {
    if (!/[*$]/.test(rule)) return target.startsWith(rule);
    const anchored = rule.endsWith("$");
    const body = anchored ? rule.slice(0, -1) : rule;
    // Every literal segment is escaped; only `*` becomes a wildcard.
    const pattern = body.split("*").map(escapeLiteral).join(".*");
    return new RegExp(`^${pattern}${anchored ? "$" : ""}`).test(target);
  });
}
|
|
71
|
+
|
|
72
|
+
/**
 * Test `url` against an include/exclude `pattern`. The pattern is treated as a
 * regular expression; if it does not compile, it is used as a plain substring
 * instead. An absent or empty pattern never matches.
 */
function matches(pattern: string | undefined, url: string): boolean {
  if (!pattern) return false;

  let compiled: RegExp | undefined;
  try {
    compiled = new RegExp(pattern);
  } catch {
    compiled = undefined;
  }
  return compiled ? compiled.test(url) : url.includes(pattern);
}
|
|
80
|
+
|
|
81
|
+
/**
 * Breadth-first crawl starting at `options.startUrl`.
 *
 * Pages are fetched in batches of up to `concurrency`, converted with
 * `extractArticle`, and their links queued until `maxPages` or `maxDepth` is
 * reached. robots.txt rules are loaded once per origin. URLs are compared with
 * their `#fragment` removed, and a page is recorded once per *final* URL, so
 * redirects (http→https, trailing slash) and fragment-only variants do not
 * produce duplicate pages.
 *
 * @param options    Crawl limits and filters; see `CrawlOptions`.
 * @param signal     Optional caller cancellation.
 * @param onProgress Called with the accumulated page list after each page is recorded.
 * @returns The recorded pages, their combined markdown (capped at
 *          `TOTAL_OUTPUT_CAP` bytes) and whether that cap cut pages off.
 * @throws Error when the start URL is blocked or the crawl is aborted.
 */
export async function webCrawl(
  options: CrawlOptions,
  signal?: AbortSignal,
  onProgress?: (pages: CrawlPage[]) => void,
): Promise<{ pages: CrawlPage[]; markdown: string; truncated: boolean }> {
  const start = await assertPublicUrl(options.startUrl);
  const maxPages = Math.min(options.maxPages ?? 20, HARD_MAX_PAGES);
  const maxDepth = options.maxDepth ?? 2;
  const sameOrigin = options.sameOrigin ?? true;
  const concurrency = Math.min(Math.max(options.concurrency ?? 3, 1), MAX_CONCURRENCY);
  const render = options.render;
  const format = options.format ?? "markdown";

  /** Absolute URL without its fragment, or `undefined` when `raw` does not parse. */
  const canonical = (raw: string): string | undefined => {
    try {
      const parsed = new URL(raw);
      parsed.hash = "";
      return parsed.toString();
    } catch {
      return undefined;
    }
  };

  const disallowByOrigin = new Map<string, string[]>();
  disallowByOrigin.set(start.origin, await loadRobots(start.origin, signal));

  const startUrl = start.toString();
  const queue: { url: string; depth: number }[] = [{ url: canonical(startUrl) ?? startUrl, depth: 0 }];
  const visited = new Set<string>(); // URLs already requested or already recorded
  const recorded = new Set<string>(); // final URLs of pages already in `pages`
  const pages: CrawlPage[] = [];

  while (queue.length && pages.length < maxPages) {
    if (signal?.aborted) throw new Error("web crawl aborted");

    const batch: { url: string; depth: number }[] = [];
    while (queue.length && batch.length < concurrency && pages.length + batch.length < maxPages) {
      const item = queue.shift();
      if (!item || visited.has(item.url)) continue;
      visited.add(item.url);

      const origin = new URL(item.url).origin;
      if (!disallowByOrigin.has(origin)) disallowByOrigin.set(origin, await loadRobots(origin, signal));
      if (!isAllowed(item.url, disallowByOrigin.get(origin) ?? [])) continue;

      batch.push(item);
    }
    if (!batch.length) break;

    const results = await Promise.all(
      batch.map(async (item): Promise<{ page: CrawlPage; links: string[] } | null> => {
        if (signal?.aborted) throw new Error("web crawl aborted");
        let fetched: FetchResult;
        try {
          fetched = await fetchHtml(item.url, { render }, signal);
        } catch {
          // A page that cannot be fetched is skipped; the crawl continues.
          return null;
        }
        let markdown = "";
        let title: string | undefined;
        try {
          const ex = extractArticle(fetched.html, fetched.finalUrl, { format });
          markdown = ex.content;
          title = ex.meta.title;
        } catch {
          /* keep empty markdown */
        }
        if (markdown.length > PER_PAGE_CAP) markdown = `${markdown.slice(0, PER_PAGE_CAP)}\n\n…[page truncated]`;
        const links = extractLinks(fetched.html, fetched.finalUrl);
        return {
          page: { url: fetched.finalUrl, title, depth: item.depth, markdown, bytes: Buffer.byteLength(markdown) },
          links,
        };
      }),
    );

    for (const r of results) {
      if (!r) continue;
      // Two requested URLs can land on the same final page; record it once.
      const finalKey = canonical(r.page.url) ?? r.page.url;
      if (recorded.has(finalKey)) continue;
      recorded.add(finalKey);
      visited.add(finalKey);

      pages.push(r.page);
      onProgress?.(pages);
      if (r.page.depth >= maxDepth) continue;
      for (const raw of r.links) {
        const link = canonical(raw);
        if (link === undefined || visited.has(link)) continue;
        if (sameOrigin && new URL(link).origin !== start.origin) continue;
        if (options.include && !matches(options.include, link)) continue;
        if (options.exclude && matches(options.exclude, link)) continue;
        queue.push({ url: link, depth: r.page.depth + 1 });
      }
    }

    // Pause between batches only when another batch will actually run.
    if (queue.length && pages.length < maxPages) await sleep(PER_ORIGIN_DELAY_MS);
  }

  // Assemble combined markdown, capped.
  let markdown = "";
  let truncated = false;
  for (const p of pages) {
    const section = `\n\n## ${p.title ?? p.url}\n${p.url}\n\n${p.markdown}\n`;
    if (Buffer.byteLength(markdown) + Buffer.byteLength(section) > TOTAL_OUTPUT_CAP) {
      truncated = true;
      break;
    }
    markdown += section;
  }
  return { pages, markdown: markdown.trim(), truncated };
}
|
package/extract.ts
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Readable-article extraction (Mozilla Readability) + optional CSS-selector fields.
|
|
3
|
+
*
|
|
4
|
+
* Dependency-free of pi so it can be unit-tested directly with Node.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { Readability } from "@mozilla/readability";
|
|
8
|
+
import { JSDOM } from "jsdom";
|
|
9
|
+
import { htmlToMarkdown, pageToMarkdown, pageToText } from "./markdown.ts";
|
|
10
|
+
|
|
11
|
+
export interface ExtractOptions {
|
|
12
|
+
format?: "markdown" | "text";
|
|
13
|
+
includeMetadata?: boolean;
|
|
14
|
+
selectors?: Record<string, string>;
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
export interface ExtractMeta {
|
|
18
|
+
title?: string;
|
|
19
|
+
byline?: string;
|
|
20
|
+
siteName?: string;
|
|
21
|
+
publishedTime?: string;
|
|
22
|
+
lang?: string;
|
|
23
|
+
length?: number;
|
|
24
|
+
excerpt?: string;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
export interface ExtractResult {
|
|
28
|
+
content: string;
|
|
29
|
+
meta: ExtractMeta;
|
|
30
|
+
fields?: Record<string, string | string[]>;
|
|
31
|
+
fallback: boolean;
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
/**
 * Evaluate each named CSS selector against `doc` and collect the matched
 * elements' text (whitespace collapsed, empty matches dropped).
 *
 * A field is a string when zero or one element yields text (empty string for
 * none) and an array when several do. A selector that throws yields "".
 */
function collectFields(doc: Document, selectors: Record<string, string>): Record<string, string | string[]> {
  const collected: Record<string, string | string[]> = {};
  for (const [fieldName, selector] of Object.entries(selectors)) {
    let texts: string[];
    try {
      texts = Array.from(doc.querySelectorAll(selector), (node) =>
        (node.textContent ?? "").replace(/\s+/g, " ").trim(),
      ).filter((text) => text.length > 0);
    } catch {
      collected[fieldName] = "";
      continue;
    }
    collected[fieldName] = texts.length > 1 ? texts : (texts[0] ?? "");
  }
  return collected;
}
|
|
47
|
+
|
|
48
|
+
/**
 * Extract the main article from `html`. Selectors are evaluated BEFORE Readability
 * (which mutates/strips the DOM). Falls back to full-page conversion when the page
 * is not article-like.
 *
 * @param html    Page markup to parse.
 * @param url     Document URL handed to jsdom and to the fallback converters.
 * @param options Output format (default `markdown`), whether to include
 *                metadata (default yes), and optional field selectors.
 * @returns The converted content, metadata (empty when `includeMetadata` is
 *          `false`), any selector fields, and `fallback: true` when
 *          Readability produced no article.
 */
export function extractArticle(html: string, url: string, options: ExtractOptions = {}): ExtractResult {
  const format = options.format ?? "markdown";
  const wantMeta = options.includeMetadata !== false;
  const dom = new JSDOM(html, { url });
  try {
    const doc = dom.window.document;

    const fields = options.selectors ? collectFields(doc, options.selectors) : undefined;
    const docLang = doc.documentElement.getAttribute("lang") ?? undefined;

    let article: ReturnType<Readability["parse"]> = null;
    try {
      // Readability mutates the document, so clone for a clean parse.
      article = new Readability(doc.cloneNode(true) as Document).parse();
    } catch {
      article = null;
    }

    if (article?.content) {
      const content = format === "text" ? (article.textContent ?? "").trim() : htmlToMarkdown(article.content);
      const meta: ExtractMeta = wantMeta
        ? {
            title: article.title ?? undefined,
            byline: article.byline ?? undefined,
            siteName: article.siteName ?? undefined,
            // Normalised like the sibling fields so a null never leaks into the result.
            publishedTime: (article as { publishedTime?: string | null }).publishedTime ?? undefined,
            lang: article.lang ?? docLang,
            length: article.length ?? undefined,
            excerpt: article.excerpt ?? undefined,
          }
        : {};
      return { content, meta, fields, fallback: false };
    }

    // Fallback: non-article page → clean full-page conversion.
    const content = format === "text" ? pageToText(html, url) : pageToMarkdown(html, url);
    const meta: ExtractMeta = wantMeta ? { title: doc.title?.trim() || undefined, lang: docLang } : {};
    return { content, meta, fields, fallback: true };
  } finally {
    // Everything returned above is plain strings/objects, so the window can be released now.
    dom.window.close();
  }
}
|
package/index.ts
ADDED
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @aprimediet/webtools
|
|
3
|
+
*
|
|
4
|
+
* Self-hosted web tools for the pi coding agent — fetch, extract, search, crawl.
|
|
5
|
+
* Engine: Playwright (+ static fetch). Extraction: Mozilla Readability + Turndown.
|
|
6
|
+
* Search: Playwright scrapes a public engine's HTML. No MCP, no hosted APIs.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import type { ExtensionAPI, ExtensionContext } from "@earendil-works/pi-coding-agent";
|
|
10
|
+
import { StringEnum } from "@earendil-works/pi-ai";
|
|
11
|
+
import { Text } from "@earendil-works/pi-tui";
|
|
12
|
+
import { Type } from "typebox";
|
|
13
|
+
|
|
14
|
+
import { closeBrowser, fetchHtml } from "./browser.ts";
|
|
15
|
+
import { config, type RenderMode, type SearchEngine } from "./config.ts";
|
|
16
|
+
import { webCrawl } from "./crawl.ts";
|
|
17
|
+
import { extractArticle } from "./extract.ts";
|
|
18
|
+
import { extractTitle, pageToMarkdown, pageToText, sanitizeHtml } from "./markdown.ts";
|
|
19
|
+
import { webSearch } from "./search.ts";
|
|
20
|
+
|
|
21
|
+
const RENDER = StringEnum(["auto", "static", "browser"] as const);
|
|
22
|
+
const FETCH_FORMAT = StringEnum(["markdown", "text", "html"] as const);
|
|
23
|
+
const TEXT_FORMAT = StringEnum(["markdown", "text"] as const);
|
|
24
|
+
const ENGINE = StringEnum(["duckduckgo", "bing", "brave"] as const);
|
|
25
|
+
|
|
26
|
+
/**
 * Extension entry point: registers the four web tools (`web_fetch`,
 * `web_extract`, `web_search`, `web_crawl`), the CLI flags that override the
 * shared `config`, the `/webtools` status command, and the session hooks that
 * apply flags on start and close the browser on shutdown.
 *
 * @param pi - Extension API handle supplied by the host agent.
 */
export default function webtoolsExtension(pi: ExtensionAPI): void {
  // ----------------------------------------------------------- web_fetch
  pi.registerTool({
    name: "web_fetch",
    label: "Web Fetch",
    description:
      "Fetch a URL and return its content as clean markdown, text, or html. Renders JavaScript when needed (render:auto/browser). Use for raw page content; prefer web_extract for article bodies.",
    promptSnippet: "Fetch a web page as markdown/text/html",
    parameters: Type.Object({
      url: Type.String({ description: "The URL to fetch (http/https)" }),
      render: Type.Optional(RENDER),
      format: Type.Optional(FETCH_FORMAT),
      timeoutMs: Type.Optional(Type.Number({ description: "Per-request timeout (ms)" })),
      maxBytes: Type.Optional(Type.Number({ description: "Max bytes to read" })),
    }),
    async execute(_id, params, signal) {
      const res = await fetchHtml(
        params.url,
        { render: params.render as RenderMode, timeoutMs: params.timeoutMs, maxBytes: params.maxBytes },
        signal,
      );
      // Markdown is the default output when no format is requested.
      const format = params.format ?? "markdown";
      const text =
        format === "html"
          ? sanitizeHtml(res.html, res.finalUrl)
          : format === "text"
            ? pageToText(res.html, res.finalUrl)
            : pageToMarkdown(res.html, res.finalUrl);
      const title = extractTitle(res.html, res.finalUrl);
      return {
        content: [{ type: "text" as const, text: text || "(empty document)" }],
        details: {
          finalUrl: res.finalUrl,
          status: res.status,
          title,
          contentType: res.contentType,
          renderUsed: res.renderUsed,
          bytes: res.bytes,
          truncated: res.truncated,
        },
      };
    },
    renderResult(result, _opts, theme) {
      const d = result.details as { title?: string; finalUrl?: string; bytes?: number; renderUsed?: string };
      return new Text(
        `${theme.fg("toolTitle", theme.bold("web_fetch "))}${theme.fg("muted", d.title ?? d.finalUrl ?? "")} ${theme.fg("dim", `(${d.bytes ?? 0}b · ${d.renderUsed ?? "?"})`)}`,
        0,
        0,
      );
    },
  });

  // --------------------------------------------------------- web_extract
  pi.registerTool({
    name: "web_extract",
    label: "Web Extract",
    description:
      "Extract the main readable article (content + title/byline/date/lang metadata) from a URL or supplied HTML. Optionally pull structured fields via CSS selectors. Deterministic — no LLM, no service.",
    promptSnippet: "Extract the main article + metadata from a page",
    parameters: Type.Object({
      url: Type.String({ description: "The URL to extract from" }),
      html: Type.Optional(Type.String({ description: "Extract from this HTML instead of fetching" })),
      render: Type.Optional(RENDER),
      format: Type.Optional(TEXT_FORMAT),
      includeMetadata: Type.Optional(Type.Boolean({ description: "Include title/byline/etc. (default true)" })),
      selectors: Type.Optional(
        Type.Record(Type.String(), Type.String(), {
          description: "Map of field name → CSS selector for structured extraction",
        }),
      ),
    }),
    async execute(_id, params, signal) {
      let html = params.html;
      let finalUrl = params.url;
      // Only hit the network when the caller did not supply (non-empty) HTML.
      if (!html) {
        const res = await fetchHtml(params.url, { render: params.render as RenderMode }, signal);
        html = res.html;
        finalUrl = res.finalUrl;
      }
      const ex = extractArticle(html, finalUrl, {
        format: params.format as "markdown" | "text" | undefined,
        includeMetadata: params.includeMetadata,
        selectors: params.selectors as Record<string, string> | undefined,
      });
      const note = ex.fallback ? "\n\n_(no article detected — full-page extraction)_" : "";
      return {
        content: [{ type: "text" as const, text: (ex.content || "(no content)") + note }],
        details: {
          url: finalUrl,
          ...ex.meta,
          fields: ex.fields,
          fallback: ex.fallback,
        },
      };
    },
    renderResult(result, _opts, theme) {
      const d = result.details as { title?: string; url?: string; fallback?: boolean };
      return new Text(
        `${theme.fg("toolTitle", theme.bold("web_extract "))}${theme.fg("muted", d.title ?? d.url ?? "")}${d.fallback ? theme.fg("dim", " (full-page)") : ""}`,
        0,
        0,
      );
    },
  });

  // ---------------------------------------------------------- web_search
  pi.registerTool({
    name: "web_search",
    label: "Web Search",
    description:
      "Search the web by driving a headless browser over a public search engine's HTML results. Returns a list of {title, url, snippet}. Use to find URLs, then web_fetch/web_extract them.",
    promptSnippet: "Search the web for URLs",
    parameters: Type.Object({
      query: Type.String({ description: "The search query" }),
      limit: Type.Optional(Type.Number({ description: "Max results (default 10)" })),
      engine: Type.Optional(ENGINE),
    }),
    async execute(_id, params, signal) {
      const engine = (params.engine as SearchEngine) ?? config.searchEngine;
      // Clamp to a whole number in [1, 25]. Without the truncation a fractional
      // limit (e.g. 2.5) yields one extra hit, and NaN would disable the cap.
      const requested = params.limit ?? 10;
      const limit = Number.isNaN(requested) ? 10 : Math.max(1, Math.min(Math.trunc(requested), 25));
      const { results } = await webSearch(params.query, limit, engine, signal);
      const text = results.map((r, i) => `${i + 1}. [${r.title}](${r.url})${r.snippet ? ` — ${r.snippet}` : ""}`).join("\n");
      return {
        content: [{ type: "text" as const, text }],
        details: { engine, query: params.query, results },
      };
    },
    renderResult(result, _opts, theme) {
      const d = result.details as { engine?: string; results?: unknown[] };
      return new Text(
        `${theme.fg("toolTitle", theme.bold("web_search "))}${theme.fg("muted", `${d.results?.length ?? 0} results`)} ${theme.fg("dim", `via ${d.engine ?? "?"}`)}`,
        0,
        0,
      );
    },
  });

  // ----------------------------------------------------------- web_crawl
  pi.registerTool({
    name: "web_crawl",
    label: "Web Crawl",
    description:
      "Breadth-first crawl from a start URL (same-origin by default), extracting each page to markdown. Depth/page-capped, robots-aware, concurrency-limited. Use to ingest a docs section.",
    promptSnippet: "Crawl a site section and extract pages",
    parameters: Type.Object({
      startUrl: Type.String({ description: "The URL to start crawling from" }),
      maxPages: Type.Optional(Type.Number({ description: "Max pages (default 20, cap 100)" })),
      maxDepth: Type.Optional(Type.Number({ description: "Max link depth (default 2)" })),
      sameOrigin: Type.Optional(Type.Boolean({ description: "Restrict to the start origin (default true)" })),
      include: Type.Optional(Type.String({ description: "Only crawl URLs matching this regex/substring" })),
      exclude: Type.Optional(Type.String({ description: "Skip URLs matching this regex/substring" })),
      render: Type.Optional(RENDER),
      format: Type.Optional(TEXT_FORMAT),
      concurrency: Type.Optional(Type.Number({ description: "Parallel fetches (default 3, cap 5)" })),
    }),
    async execute(_id, params, signal, onUpdate) {
      const { pages, markdown, truncated } = await webCrawl(
        params as Parameters<typeof webCrawl>[0],
        signal,
        // Progress callback: forward a lightweight per-page summary to the host.
        (p) =>
          onUpdate?.({
            content: [{ type: "text", text: `crawled ${p.length} page(s)…` }],
            details: { pages: p.map((x) => ({ url: x.url, title: x.title, depth: x.depth, bytes: x.bytes })) },
          }),
      );
      const header = `Crawled ${pages.length} page(s) from ${params.startUrl}${truncated ? " (output truncated)" : ""}.\n`;
      return {
        content: [{ type: "text" as const, text: header + markdown }],
        details: {
          startUrl: params.startUrl,
          pageCount: pages.length,
          truncated,
          pages: pages.map((p) => ({ url: p.url, title: p.title, depth: p.depth, bytes: p.bytes })),
        },
      };
    },
    renderResult(result, _opts, theme) {
      const d = result.details as { pageCount?: number; startUrl?: string };
      return new Text(
        `${theme.fg("toolTitle", theme.bold("web_crawl "))}${theme.fg("muted", `${d.pageCount ?? 0} pages`)} ${theme.fg("dim", d.startUrl ?? "")}`,
        0,
        0,
      );
    },
  });

  // ------------------------------------------------------------- flags
  pi.registerFlag("render-default", { description: "Default render mode: auto, static, browser", type: "string" });
  pi.registerFlag("search-engine", { description: "Default search engine: duckduckgo, bing, brave", type: "string" });
  pi.registerFlag("user-agent", { description: "HTTP/browser User-Agent string", type: "string" });
  pi.registerFlag("allow-private", { description: "Allow fetching private/loopback addresses (SSRF)", type: "boolean" });

  // Accepted values mirror the flag descriptions above. Anything else is
  // ignored so a typo cannot push an unsupported mode/engine into `config`;
  // the effective values can be inspected with the /webtools command.
  const renderModes: readonly string[] = ["auto", "static", "browser"];
  const searchEngines: readonly string[] = ["duckduckgo", "bing", "brave"];

  /** Copies recognised flag values onto the shared `config` object. */
  function applyFlags(): void {
    const render = pi.getFlag("render-default");
    if (typeof render === "string" && renderModes.includes(render)) config.renderDefault = render as RenderMode;
    const engine = pi.getFlag("search-engine");
    if (typeof engine === "string" && searchEngines.includes(engine)) config.searchEngine = engine as SearchEngine;
    const ua = pi.getFlag("user-agent");
    if (typeof ua === "string" && ua) config.userAgent = ua;
    if (pi.getFlag("allow-private") === true) config.allowPrivate = true;
  }

  pi.on("session_start", async () => applyFlags());

  // ------------------------------------------------------- /webtools cmd
  pi.registerCommand("webtools", {
    description: "Show webtools config and backend availability",
    handler: async (_args, ctx: ExtensionContext) => {
      const lines = [
        `render default : ${config.renderDefault}`,
        `search engine : ${config.searchEngine}`,
        `user agent : ${config.userAgent}`,
        `allow private : ${config.allowPrivate}`,
        `ignore robots : ${config.ignoreRobots}`,
        `browser : ${config.cdp ? `CDP ${config.cdp}` : config.wsEndpoint ? `WS ${config.wsEndpoint}` : "local Playwright (npx playwright install chromium)"}`,
      ];
      if (ctx.hasUI) ctx.ui.notify(lines.join("\n"), "info");
    },
  });

  // --------------------------------------------------------- cleanup
  pi.on("session_shutdown", async () => closeBrowser());
}
|
package/markdown.ts
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTML → Markdown / text helpers (Turndown + jsdom).
|
|
3
|
+
*
|
|
4
|
+
* Dependency-free of pi so it can be unit-tested directly with Node.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { JSDOM } from "jsdom";
|
|
8
|
+
import TurndownService from "turndown";
|
|
9
|
+
// @ts-expect-error - no bundled types for the gfm plugin
|
|
10
|
+
import { gfm } from "turndown-plugin-gfm";
|
|
11
|
+
|
|
12
|
+
/** Lazily created converter shared by every HTML → Markdown call in this module. */
let turndown: TurndownService | undefined;

/**
 * Returns the shared Turndown instance, configuring it on first use:
 * ATX headings, fenced code blocks, "-" bullets, the GFM plugin (best effort),
 * and outright removal of non-content elements.
 */
function getTurndown(): TurndownService {
  if (turndown) return turndown;

  const service = new TurndownService({
    headingStyle: "atx",
    codeBlockStyle: "fenced",
    bulletListMarker: "-",
  });

  try {
    // GFM tables / strikethrough / task lists
    service.use(gfm);
  } catch {
    /* plugin optional */
  }

  // These elements never carry readable content, so drop them entirely.
  service.remove(["script", "style", "noscript", "iframe", "svg", "head"]);

  turndown = service;
  return service;
}
|
|
31
|
+
|
|
32
|
+
/**
 * Converts an HTML fragment (for example Readability output) to Markdown.
 *
 * @param html - HTML markup; an empty string short-circuits to "".
 * @returns The trimmed Markdown rendering.
 */
export function htmlToMarkdown(html: string): string {
  return html ? getTurndown().turndown(html).trim() : "";
}
|
|
37
|
+
|
|
38
|
+
/**
 * Removes non-content elements from `doc` in place, one selector at a time.
 *
 * @param doc - Document to mutate.
 */
function stripNoise(doc: Document): void {
  const noiseSelectors = ["script", "style", "noscript", "iframe", "svg", "template"];
  noiseSelectors.forEach((selector) => {
    // Snapshot the matches first so removal cannot disturb the iteration.
    Array.from(doc.querySelectorAll(selector)).forEach((node) => node.remove());
  });
}
|
|
43
|
+
|
|
44
|
+
/**
 * Converts a whole HTML page to Markdown: parses it, strips scripts/styles
 * and similar noise, then converts the body markup.
 *
 * @param html - Full page HTML.
 * @param url - URL handed to the parser as the document URL.
 * @returns Markdown for the page body (or the root element when there is no body).
 */
export function pageToMarkdown(html: string, url: string): string {
  const { document } = new JSDOM(html, { url }).window;
  stripNoise(document);
  const root = document.body ?? document.documentElement;
  return htmlToMarkdown(root.innerHTML);
}
|
|
51
|
+
|
|
52
|
+
/**
 * Converts a whole HTML page to plain text with every whitespace run
 * collapsed to a single space.
 *
 * @param html - Full page HTML.
 * @param url - URL handed to the parser as the document URL.
 * @returns The trimmed, whitespace-collapsed text of the page body.
 */
export function pageToText(html: string, url: string): string {
  const { document } = new JSDOM(html, { url }).window;
  stripNoise(document);
  const root = document.body ?? document.documentElement;
  const rawText = root.textContent ?? "";
  return rawText.replace(/\s+/g, " ").trim();
}
|
|
59
|
+
|
|
60
|
+
/**
 * Returns the page's body markup with scripts, styles and similar noise
 * elements removed.
 *
 * @param html - Full page HTML.
 * @param url - URL handed to the parser as the document URL.
 * @returns Trimmed inner HTML of the body (or the root element when there is no body).
 */
export function sanitizeHtml(html: string, url: string): string {
  const { document } = new JSDOM(html, { url }).window;
  stripNoise(document);
  const root = document.body ?? document.documentElement;
  return root.innerHTML.trim();
}
|
|
67
|
+
|
|
68
|
+
/**
 * Best-effort lookup of the document title.
 *
 * @param html - Full page HTML.
 * @param url - URL handed to the parser as the document URL.
 * @returns The trimmed title, or `undefined` when it is empty or parsing fails.
 */
export function extractTitle(html: string, url: string): string | undefined {
  let title: string | undefined;
  try {
    title = new JSDOM(html, { url }).window.document.title?.trim();
  } catch {
    return undefined;
  }
  // An empty title is reported the same way as a missing one.
  return title || undefined;
}
|
|
77
|
+
|
|
78
|
+
/**
 * Collects the http(s) links found in `<a href>` elements of a page.
 * Each href is resolved against `baseUrl`, its fragment is stripped, and
 * duplicates are dropped. Parse failures yield whatever was collected so far.
 *
 * @param html - Page HTML to scan.
 * @param baseUrl - Base used to resolve relative hrefs.
 * @returns Absolute URLs in first-seen order.
 */
export function extractLinks(html: string, baseUrl: string): string[] {
  const skippedPrefixes = ["#", "mailto:", "javascript:"];
  const links = new Set<string>();

  // Resolve one href to an absolute, fragment-free http(s) URL, or undefined.
  const resolve = (href: string): string | undefined => {
    try {
      const target = new URL(href, baseUrl);
      if (target.protocol === "http:" || target.protocol === "https:") {
        target.hash = "";
        return target.toString();
      }
    } catch {
      /* skip bad href */
    }
    return undefined;
  };

  try {
    const { document } = new JSDOM(html, { url: baseUrl }).window;
    Array.from(document.querySelectorAll("a[href]")).forEach((anchor) => {
      const href = anchor.getAttribute("href");
      if (!href || skippedPrefixes.some((prefix) => href.startsWith(prefix))) return;
      const absolute = resolve(href);
      if (absolute !== undefined) links.add(absolute);
    });
  } catch {
    /* skip */
  }

  return Array.from(links);
}
|
package/package.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@aprimediet/webtools",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"type": "module",
|
|
5
|
+
"description": "Self-hosted web fetch / extract / search / crawl tools for the pi coding agent (Playwright + Readability, no MCP).",
|
|
6
|
+
"keywords": ["pi-package"],
|
|
7
|
+
"pi": {
|
|
8
|
+
"extensions": ["./index.ts"]
|
|
9
|
+
},
|
|
10
|
+
"files": ["*.ts", "README.md"],
|
|
11
|
+
"dependencies": {
|
|
12
|
+
"playwright": "^1.48.0",
|
|
13
|
+
"@mozilla/readability": "^0.5.0",
|
|
14
|
+
"jsdom": "^25.0.0",
|
|
15
|
+
"turndown": "^7.2.0",
|
|
16
|
+
"turndown-plugin-gfm": "^1.0.2"
|
|
17
|
+
},
|
|
18
|
+
"peerDependencies": {
|
|
19
|
+
"@earendil-works/pi-coding-agent": "*",
|
|
20
|
+
"@earendil-works/pi-agent-core": "*",
|
|
21
|
+
"@earendil-works/pi-ai": "*",
|
|
22
|
+
"@earendil-works/pi-tui": "*",
|
|
23
|
+
"typebox": "*"
|
|
24
|
+
}
|
|
25
|
+
}
|
package/search.ts
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Playwright-only web search: drive the shared headless browser over a public
|
|
3
|
+
* search engine's HTML results and scrape the rows. No search API, no service.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { config, type SearchEngine } from "./config.ts";
|
|
7
|
+
import { getBrowser } from "./browser.ts";
|
|
8
|
+
|
|
9
|
+
/** One scraped search result row. */
export interface SearchHit {
  /** Result title text, whitespace-collapsed. */
  title: string;
  /** Absolute http(s) URL of the result, after any redirect unwrapping. */
  url: string;
  /** Result snippet text; empty when the row has none. */
  snippet: string;
}
|
|
14
|
+
|
|
15
|
+
/** Scraping recipe for one search engine's HTML results page. */
interface EngineSpec {
  /** Builds the results-page URL for a query. */
  url: (q: string) => string;
  /** CSS selector matching one result row. */
  row: string;
  /** CSS selector, relative to a row, for the title anchor. */
  titleSel: string;
  /** CSS selector, relative to a row, for the snippet element. */
  snippetSel: string;
  /** Some engines wrap result hrefs in a redirect; how to recover the real URL. */
  unwrap?: "ddg" | "bing";
}
|
|
23
|
+
|
|
24
|
+
/**
 * Per-engine scraping recipes. All selectors live in this single table so a
 * results-page markup change only needs an edit here.
 */
const ENGINES: Record<SearchEngine, EngineSpec> = {
  // DuckDuckGo's HTML endpoint; result hrefs are redirect-wrapped.
  duckduckgo: {
    url(q) {
      return `https://html.duckduckgo.com/html/?q=${encodeURIComponent(q)}`;
    },
    row: ".result",
    titleSel: "a.result__a",
    snippetSel: ".result__snippet",
    unwrap: "ddg",
  },
  // Bing; result hrefs are redirect-wrapped.
  bing: {
    url(q) {
      return `https://www.bing.com/search?q=${encodeURIComponent(q)}`;
    },
    row: "li.b_algo",
    titleSel: "h2 a",
    snippetSel: ".b_caption p",
    unwrap: "bing",
  },
  // Brave Search; hrefs are used as-is.
  brave: {
    url(q) {
      return `https://search.brave.com/search?q=${encodeURIComponent(q)}`;
    },
    row: "[data-type=web]",
    titleSel: "a",
    snippetSel: ".snippet-description",
  },
};
|
|
47
|
+
|
|
48
|
+
/**
 * Recovers the real target from a DuckDuckGo redirect link.
 *
 * @param href - Result href, possibly relative or protocol-relative.
 * @returns The `uddg` query parameter when present and non-empty; otherwise
 *   the href resolved against https://duckduckgo.com; or the input unchanged
 *   when it cannot be parsed as a URL.
 */
function unwrapDdg(href: string): string {
  let parsed: URL;
  try {
    parsed = new URL(href, "https://duckduckgo.com");
  } catch {
    return href;
  }

  const target = parsed.searchParams.get("uddg");
  if (target) return target;
  return parsed.toString();
}
|
|
57
|
+
|
|
58
|
+
/**
 * Recovers the real target from a Bing redirect link of the form
 * `bing.com/ck/a?...&u=a1<base64url(realUrl)>`.
 *
 * @param href - Result href, possibly relative.
 * @returns The decoded target when the `u` parameter decodes to an http(s)
 *   URL; otherwise the href resolved against https://www.bing.com; or the
 *   input unchanged when it cannot be parsed.
 */
function unwrapBing(href: string): string {
  try {
    const parsed = new URL(href, "https://www.bing.com");
    const token = parsed.searchParams.get("u");
    if (!token) return parsed.toString();

    // Drop the "a1" marker, then map the URL-safe alphabet back to standard base64.
    const payload = token.startsWith("a1") ? token.substring(2) : token;
    const standardB64 = payload.split("-").join("+").split("_").join("/");
    const target = Buffer.from(standardB64, "base64").toString("utf8");

    // Only trust the decoded value when it actually looks like a web URL.
    return /^https?:\/\//.test(target) ? target : parsed.toString();
  } catch {
    return href;
  }
}
|
|
73
|
+
|
|
74
|
+
/**
 * Applies the engine-specific redirect unwrapping to a result href.
 *
 * @param kind - Unwrap strategy from the engine spec; `undefined` means none.
 * @param href - Raw href scraped from the result row.
 * @returns The unwrapped URL, or `href` unchanged when no strategy applies.
 */
function unwrapHref(kind: EngineSpec["unwrap"], href: string): string {
  switch (kind) {
    case "ddg":
      return unwrapDdg(href);
    case "bing":
      return unwrapBing(href);
    default:
      return href;
  }
}
|
|
79
|
+
|
|
80
|
+
/**
 * Runs a web search by loading an engine's HTML results page in the shared
 * browser and scraping the result rows.
 *
 * @param query - Search query text.
 * @param limit - Maximum number of hits to return.
 * @param engineName - Key into the `ENGINES` table.
 * @param signal - Optional abort signal. Aborting closes the browser context,
 *   so an in-flight navigation is cancelled instead of running to its timeout.
 * @returns The engine used and the scraped hits (never empty).
 * @throws Error when the engine is unknown, the search is aborted, navigation
 *   fails, or no result rows could be scraped.
 */
export async function webSearch(
  query: string,
  limit: number,
  engineName: SearchEngine,
  signal?: AbortSignal,
): Promise<{ engine: SearchEngine; results: SearchHit[] }> {
  const spec = ENGINES[engineName];
  if (!spec) throw new Error(`Unknown search engine: ${engineName}`);
  // Do not touch the browser for a request that is already cancelled.
  if (signal?.aborted) throw new Error("web search aborted");

  const searchUrl = spec.url(query);
  const browser = await getBrowser();
  const context = await browser.newContext({ userAgent: config.userAgent });

  // Closing the context also closes its pages, which rejects pending page calls.
  const onAbort = (): void => {
    void context.close().catch(() => {});
  };
  signal?.addEventListener("abort", onAbort, { once: true });

  try {
    // The signal may have fired while the browser/context were being created.
    if (signal?.aborted) throw new Error("web search aborted");

    // Created inside the try so the context is released even if this throws.
    const page = await context.newPage();
    try {
      await page.goto(searchUrl, { waitUntil: "domcontentloaded", timeout: config.timeoutMs });
    } catch (err) {
      // Report cancellation as such rather than as a "target closed" failure.
      if (signal?.aborted) throw new Error("web search aborted");
      throw err;
    }
    if (signal?.aborted) throw new Error("web search aborted");

    const raw = (await page
      .$$eval(
        spec.row,
        (rows, sel) =>
          rows.map((el) => {
            const a = el.querySelector(sel.titleSel) as HTMLAnchorElement | null;
            const snip = el.querySelector(sel.snippetSel);
            return {
              title: (a?.textContent ?? "").replace(/\s+/g, " ").trim(),
              href: a?.getAttribute("href") ?? "",
              snippet: (snip?.textContent ?? "").replace(/\s+/g, " ").trim(),
            };
          }),
        { titleSel: spec.titleSel, snippetSel: spec.snippetSel },
      )
      // Best effort: a scraping failure is treated as "no rows" and reported below.
      .catch(() => [] as { title: string; href: string; snippet: string }[])) as {
      title: string;
      href: string;
      snippet: string;
    }[];
    // An abort during scraping is swallowed by the catch above; surface it here
    // so it is not misreported as an empty results page.
    if (signal?.aborted) throw new Error("web search aborted");

    const results: SearchHit[] = [];
    for (const r of raw) {
      if (!r.href || !r.title) continue;
      let url = unwrapHref(spec.unwrap, r.href);
      try {
        // Resolve relative hrefs against the results page.
        url = new URL(url, searchUrl).toString();
      } catch {
        continue;
      }
      if (!/^https?:/.test(url)) continue;
      results.push({ title: r.title, url, snippet: r.snippet });
      if (results.length >= limit) break;
    }

    if (results.length === 0) {
      throw new Error(
        `web_search: no results from "${engineName}". The page may be a consent/captcha/"unusual traffic" page, or its markup changed. Try engine:"bing" or "brave".`,
      );
    }
    return { engine: engineName, results };
  } finally {
    signal?.removeEventListener("abort", onAbort);
    // Closes the page as well; errors (e.g. already closed on abort) are ignored.
    await context.close().catch(() => {});
  }
}
|