agentic-pi 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +83 -27
- package/dist/args.d.ts +16 -0
- package/dist/args.js +29 -0
- package/dist/args.js.map +1 -1
- package/dist/extensions/web-search/extract.d.ts +18 -0
- package/dist/extensions/web-search/extract.js +110 -0
- package/dist/extensions/web-search/extract.js.map +1 -0
- package/dist/extensions/web-search/index.d.ts +43 -0
- package/dist/extensions/web-search/index.js +86 -0
- package/dist/extensions/web-search/index.js.map +1 -0
- package/dist/extensions/web-search/providers/brave.d.ts +21 -0
- package/dist/extensions/web-search/providers/brave.js +73 -0
- package/dist/extensions/web-search/providers/brave.js.map +1 -0
- package/dist/extensions/web-search/providers/exa.d.ts +16 -0
- package/dist/extensions/web-search/providers/exa.js +85 -0
- package/dist/extensions/web-search/providers/exa.js.map +1 -0
- package/dist/extensions/web-search/providers/tavily.d.ts +18 -0
- package/dist/extensions/web-search/providers/tavily.js +85 -0
- package/dist/extensions/web-search/providers/tavily.js.map +1 -0
- package/dist/extensions/web-search/rate-limit.d.ts +14 -0
- package/dist/extensions/web-search/rate-limit.js +24 -0
- package/dist/extensions/web-search/rate-limit.js.map +1 -0
- package/dist/extensions/web-search/safe-fetch.d.ts +54 -0
- package/dist/extensions/web-search/safe-fetch.js +172 -0
- package/dist/extensions/web-search/safe-fetch.js.map +1 -0
- package/dist/extensions/web-search/selection.d.ts +42 -0
- package/dist/extensions/web-search/selection.js +64 -0
- package/dist/extensions/web-search/selection.js.map +1 -0
- package/dist/extensions/web-search/tools.d.ts +13 -0
- package/dist/extensions/web-search/tools.js +136 -0
- package/dist/extensions/web-search/tools.js.map +1 -0
- package/dist/extensions/web-search/types.d.ts +65 -0
- package/dist/extensions/web-search/types.js +10 -0
- package/dist/extensions/web-search/types.js.map +1 -0
- package/dist/run.d.ts +27 -0
- package/dist/run.js +13 -0
- package/dist/run.js.map +1 -1
- package/dist/runner.js +29 -1
- package/dist/runner.js.map +1 -1
- package/dist/sandbox/gondolin.d.ts +13 -4
- package/dist/sandbox/gondolin.js +10 -3
- package/dist/sandbox/gondolin.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -42,8 +42,8 @@ single line you parse.
|
|
|
42
42
|
|
|
43
43
|
Pi explicitly does not support MCP. agentic-pi ships a native Pi extension
|
|
44
44
|
exposing **31 GitHub tools** ported from lastlight's `mcp-github-app`:
|
|
45
|
-
clone/push, issues, PRs, reviews, labels, search.
|
|
46
|
-
|
|
45
|
+
clone/push, issues, PRs, reviews, labels, search. Tool names are prefixed
|
|
46
|
+
with `github_`.
|
|
47
47
|
|
|
48
48
|
Auth is opinionated: **GitHub App credentials preferred**, static
|
|
49
49
|
`GITHUB_TOKEN` only as a low-trust fallback. JWT-minted installation tokens
|
|
@@ -81,38 +81,29 @@ The `extension_status` JSONL event always reports `status`, `reason`,
|
|
|
81
81
|
`message`, `profile`, and `toolCount` so the orchestrator can log the
|
|
82
82
|
outcome programmatically without parsing stderr.
|
|
83
83
|
|
|
84
|
-
### 5.
|
|
84
|
+
### 5. Model selection
|
|
85
85
|
|
|
86
|
-
`--model provider/id`
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
`getModel()`.
|
|
86
|
+
`--model provider/id` (e.g. `anthropic/claude-opus-4-5`, `openai/gpt-4o`).
|
|
87
|
+
Credentials come from environment variables (`OPENAI_API_KEY`,
|
|
88
|
+
`ANTHROPIC_API_KEY`, `OPENROUTER_API_KEY`) or Pi's `~/.pi/agent/auth.json`
|
|
89
|
+
if you've logged in interactively. Provider/id mapping is delegated to
|
|
90
|
+
`@earendil-works/pi-ai`'s `getModel()`.
|
|
92
91
|
|
|
93
92
|
`--thinking <level>` maps directly to Pi's `thinkingLevel`
|
|
94
93
|
(`off`/`minimal`/`low`/`medium`/`high`/`xhigh`). Per-provider effort is
|
|
95
94
|
handled by Pi.
|
|
96
95
|
|
|
97
|
-
### 6.
|
|
98
|
-
|
|
99
|
-
- `--dangerously-skip-permissions` — Pi has no permission prompts to skip
|
|
100
|
-
("run in a container" is Pi's design stance). The flag is accepted so a
|
|
101
|
-
caller that previously spawned opencode does not need to strip it.
|
|
102
|
-
- `--variant <level>` — alias for `--thinking`.
|
|
103
|
-
|
|
104
|
-
### 7. Defaults that match a containerized sandbox
|
|
96
|
+
### 6. Defaults that match a containerized sandbox
|
|
105
97
|
|
|
106
98
|
- **`--no-session`** is intended to be the default in sandboxed runs (state
|
|
107
99
|
lives outside the container).
|
|
108
100
|
- **Built-in tools** (read, write, edit, bash, grep, find, ls) are enabled
|
|
109
101
|
by default. Add `--no-builtin-tools` if you want a GitHub-only agent.
|
|
110
102
|
- **`AGENTS.md`** in the working directory is auto-loaded as the agent's
|
|
111
|
-
system prompt — same convention Pi
|
|
112
|
-
|
|
113
|
-
up.
|
|
103
|
+
system prompt — same convention Pi uses. Drop your workflow's
|
|
104
|
+
`AGENTS.md` into the mounted workspace and the agent picks it up.
|
|
114
105
|
|
|
115
|
-
###
|
|
106
|
+
### 7. Optional micro-VM sandboxing via `--sandbox gondolin`
|
|
116
107
|
|
|
117
108
|
By default Pi's file and bash tools run on the host. Pass `--sandbox gondolin`
|
|
118
109
|
and they get routed through a per-run [Gondolin](https://github.com/earendil-works/gondolin)
|
|
@@ -195,14 +186,73 @@ The **App PEM is never copied into the VM** — only the resulting token,
|
|
|
195
186
|
which is short-lived. User-supplied `--sandbox-env GITHUB_TOKEN=…`
|
|
196
187
|
overrides the auto-injected value if you need to scope down further.
|
|
197
188
|
|
|
189
|
+
### 8. Safe web search via the `web-search` extension
|
|
190
|
+
|
|
191
|
+
agentic-pi can register two native Pi tools — `web_search` and `web_fetch` —
|
|
192
|
+
so the agent can do general-purpose research. Backed by a configurable
|
|
193
|
+
provider:
|
|
194
|
+
|
|
195
|
+
| Provider | API key env var | Native content extraction |
|
|
196
|
+
| --- | --- | --- |
|
|
197
|
+
| Tavily (default) | `TAVILY_API_KEY` | yes (search + extract) |
|
|
198
|
+
| Exa | `EXA_API_KEY` | yes (search + contents) |
|
|
199
|
+
| Brave Search | `BRAVE_SEARCH_API_KEY` | no — `web_fetch` falls back to a safe HTML→text extractor |
|
|
200
|
+
|
|
201
|
+
**Auto-enable.** When at least one API key env var is present, the
|
|
202
|
+
extension is configured automatically. With multiple keys set, priority is
|
|
203
|
+
**Tavily → Exa → Brave**; override with `--web-search-provider` or the
|
|
204
|
+
`WEB_SEARCH_PROVIDER` env var. Pass `--no-web-search` to suppress the
|
|
205
|
+
tools entirely.
|
|
206
|
+
|
|
207
|
+
**Host-process egress.** Both tools run in the agentic-pi process, **not**
|
|
208
|
+
inside the Gondolin guest. That means:
|
|
209
|
+
|
|
210
|
+
- The provider API host is **not** added to the Gondolin egress
|
|
211
|
+
allowlist, and the API key is **never** injected into the VM.
|
|
212
|
+
- Behavior is identical under `--sandbox=none`, `--sandbox=gondolin`, and
|
|
213
|
+
when agentic-pi itself is containerized. The host's own network policy
|
|
214
|
+
controls reachability to the provider + arbitrary http(s) URLs.
|
|
215
|
+
|
|
216
|
+
**Safety rails (built-in, non-configurable in v1).**
|
|
217
|
+
|
|
218
|
+
| Rail | Default |
|
|
219
|
+
| --- | --- |
|
|
220
|
+
| URL scheme allowlist | `http`, `https` only (`web_fetch`) |
|
|
221
|
+
| Request timeout | 15 s |
|
|
222
|
+
| Max response bytes | 1 MiB (streamed, aborted on overflow) |
|
|
223
|
+
| Max redirects | 3 (scheme re-checked at each hop) |
|
|
224
|
+
| Content-type gate (`web_fetch`) | `text/*`, `application/(xhtml+xml\|xml\|json)` |
|
|
225
|
+
| Max search results | 10 (regardless of `max_results` arg) |
|
|
226
|
+
| Extracted text cap | ~200 KiB |
|
|
227
|
+
| HTML cleaning | `<script>`, `<style>`, `<noscript>`, `<iframe>`, comments stripped before extraction |
|
|
228
|
+
| Per-run call budget | 30 combined `web_search` + `web_fetch` calls (override with `--web-search-max-calls`) |
|
|
229
|
+
|
|
230
|
+
When the call budget is hit, further invocations return a structured
|
|
231
|
+
rate-limit error result so the agent can recover; the run is **not**
|
|
232
|
+
aborted.
|
|
233
|
+
|
|
234
|
+
**No SSRF blocking.** Loopback / private IP ranges are **not** blocked by
|
|
235
|
+
default. Operators who care should run agentic-pi behind their own
|
|
236
|
+
egress firewall.
|
|
237
|
+
|
|
238
|
+
**Event stream.** A second `extension_status` event mirrors GitHub's:
|
|
239
|
+
|
|
240
|
+
```jsonl
|
|
241
|
+
{"type":"extension_status","extension":"web-search","status":"configured","provider":"tavily","toolCount":2,"maxCalls":30,"sessionId":"…","timestamp":"…"}
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
When skipped (no keys / `--no-web-search`), `status: "skipped"` carries a
|
|
245
|
+
`reason` of `disabled-by-flag` or `no-credentials`. Misconfigurations
|
|
246
|
+
(explicit provider whose key is missing, or an unknown provider name)
|
|
247
|
+
surface as a warning before the run starts.
|
|
248
|
+
|
|
198
249
|
## When to use this
|
|
199
250
|
|
|
200
251
|
- You have an orchestrator that calls a coding agent once per workflow
|
|
201
252
|
phase, in a container, and parses a JSONL stream.
|
|
202
|
-
- You used to call `opencode run --format json` and want a less-opaque
|
|
203
|
-
replacement built on a more hackable substrate.
|
|
204
253
|
- You need GitHub repo operations available to the agent without standing
|
|
205
254
|
up an MCP server.
|
|
255
|
+
- You want safe, sandbox-mode-agnostic web search available to the agent.
|
|
206
256
|
|
|
207
257
|
## When **not** to use this
|
|
208
258
|
|
|
@@ -245,16 +295,19 @@ GITHUB_TOKEN=ghp_…
|
|
|
245
295
|
| --- | --- |
|
|
246
296
|
| `--model <provider/id>` | Required. e.g. `anthropic/claude-opus-4-5`, `openai/gpt-4o`. |
|
|
247
297
|
| `--thinking <level>` | `off` \| `minimal` \| `low` \| `medium` \| `high` \| `xhigh`. |
|
|
248
|
-
| `--variant <level>` | Alias for `--thinking`. |
|
|
249
298
|
| `--profile <name>` | `read` \| `issues-write` \| `review-write` \| `repo-write`. Omit to disable GitHub tools entirely. |
|
|
250
299
|
| `--cwd <path>` | Working directory for the agent. Default: `$PWD`. |
|
|
251
300
|
| `--no-session` | Ephemeral run — do not persist session jsonl. Recommended in sandboxed containers. |
|
|
252
301
|
| `--session-dir <path>` | Override session storage location. |
|
|
253
302
|
| `--no-builtin-tools` | Disable Pi's `read,write,edit,bash,grep,find,ls`. |
|
|
254
303
|
| `--tools <a,b,c>` | Explicit tool allowlist (combined with profile if set). |
|
|
255
|
-
| `--sandbox <none\|gondolin>` | Route `read`/`write`/`edit`/`bash` through a sandbox backend. Default `none`. `gondolin` boots a QEMU micro-VM mounting cwd at `/workspace`. Requires QEMU on the host; native-only (not Docker-in-Docker). See section
|
|
304
|
+
| `--sandbox <none\|gondolin>` | Route `read`/`write`/`edit`/`bash` through a sandbox backend. Default `none`. `gondolin` boots a QEMU micro-VM mounting cwd at `/workspace`. Requires QEMU on the host; native-only (not Docker-in-Docker). See section 7. |
|
|
256
305
|
| `--sandbox-env KEY=VAL` | Inject env var into the sandbox VM (repeatable). Ignored when `--sandbox=none`. Auto-injects a minted `GITHUB_TOKEN`/`GH_TOKEN` when `--profile` is also active. |
|
|
257
|
-
| `--
|
|
306
|
+
| `--allow-host <host>` | Add host to the sandbox HTTP egress allowlist (repeatable). Ignored when `--sandbox=none`. |
|
|
307
|
+
| `--no-network` | Disable sandbox HTTP egress entirely. Ignored when `--sandbox=none`. |
|
|
308
|
+
| `--web-search-provider <p>` | Force web-search provider: `tavily` \| `brave` \| `exa`. Default: auto-detect by env. See section 8. |
|
|
309
|
+
| `--no-web-search` | Disable the web-search extension (no `web_search`/`web_fetch` tools). |
|
|
310
|
+
| `--web-search-max-calls <n>` | Cap combined `web_search` + `web_fetch` calls per run. Default: 30. |
|
|
258
311
|
|
|
259
312
|
Reads the prompt from stdin. Emits JSONL on stdout. Exits 0 on `agent_end`,
|
|
260
313
|
1 on fatal error.
|
|
@@ -265,6 +318,7 @@ Reads the prompt from stdin. Emits JSONL on stdout. Exits 0 on `agent_end`,
|
|
|
265
318
|
{"type":"session","version":3,"id":"<uuid>","timestamp":"…","cwd":"…"}
|
|
266
319
|
{"type":"sandbox_status","backend":"none","status":{"backend":"none"},"sessionId":"<uuid>","timestamp":"…"}
|
|
267
320
|
{"type":"extension_status","extension":"github","status":"configured","profile":"read","toolCount":18,"sessionId":"<uuid>","timestamp":"…"}
|
|
321
|
+
{"type":"extension_status","extension":"web-search","status":"configured","provider":"tavily","toolCount":2,"maxCalls":30,"sessionId":"<uuid>","timestamp":"…"}
|
|
268
322
|
{"type":"agent_start","sessionId":"<uuid>","timestamp":"…"}
|
|
269
323
|
{"type":"turn_start","sessionId":"<uuid>","timestamp":"…"}
|
|
270
324
|
{"type":"message_start","message":{…},"sessionId":"<uuid>","timestamp":"…"}
|
|
@@ -343,7 +397,8 @@ console.log(result.records.length); // full event log
|
|
|
343
397
|
| `messages` | `unknown[]` | Full Pi message array from `agent_end`. |
|
|
344
398
|
| `stats` | `{userMessages, assistantMessages, toolCalls, toolResults, tokens: {input, output, cacheRead, cacheWrite, total}, cost}` \| `undefined` | Token + cost rollup. |
|
|
345
399
|
| `sandbox` | `{backend, status}` \| `undefined` | Mirror of the `sandbox_status` event. |
|
|
346
|
-
| `github` | `{status, reason, profile, toolCount}` \| `undefined` | Mirror of the `extension_status` event. |
|
|
400
|
+
| `github` | `{status, reason, profile, toolCount}` \| `undefined` | Mirror of the GitHub `extension_status` event. |
|
|
401
|
+
| `webSearch` | `{status, reason, provider, toolCount, maxCalls}` \| `undefined` | Mirror of the web-search `extension_status` event. |
|
|
347
402
|
| `records` | `EmitterRecord[]` | Every JSONL record in order. Same shape that the CLI writes. |
|
|
348
403
|
| `warnings` | `string[]` | Warnings that would have gone to stderr in CLI mode. |
|
|
349
404
|
|
|
@@ -394,6 +449,7 @@ which walks `test/` for `*.test.ts`.
|
|
|
394
449
|
| `test/models.test.ts` | `provider/id` parsing including openrouter triple-slash | — |
|
|
395
450
|
| `test/extensions/github/profiles.test.ts` | Profile → tool allowlist (counts, superset structure, scope tiering) | — |
|
|
396
451
|
| `test/extensions/github/credentials.test.ts` | `assertSafeToken` and `credentialsFilePath` validation | — |
|
|
452
|
+
| `test/extensions/web-search/*.test.ts` | Provider selection, extension wiring, safe-fetch rails, HTML extraction, rate limiter, per-provider normalization (all with injected `fetchImpl`) | — |
|
|
397
453
|
| `test/sandbox/preflight.test.ts` | Preflight returns a structured ok\|error result | — |
|
|
398
454
|
| `test/run.integration.test.ts` | Programmatic `run()`: RunResult populated, onEvent fires for every record, **child-process check confirms zero stdout/stderr leak from library** | `OPENAI_API_KEY` not set |
|
|
399
455
|
| `test/run-sandbox.integration.test.ts` | `run({ sandbox: "gondolin" })` boots a VM, agent's `write` tool produces a host file via the mount | `OPENAI_API_KEY` not set OR QEMU/preflight unavailable |
|
package/dist/args.d.ts
CHANGED
|
@@ -60,6 +60,22 @@ export interface RunConfig {
|
|
|
60
60
|
* entirely. Ignored when `sandbox === "none"`.
|
|
61
61
|
*/
|
|
62
62
|
allowedHttpHosts?: string[] | null;
|
|
63
|
+
/**
|
|
64
|
+
* Web-search extension toggle. Default: true (auto-enables when a
|
|
65
|
+
* provider API key env var is present). Pass `--no-web-search` to
|
|
66
|
+
* force-disable.
|
|
67
|
+
*/
|
|
68
|
+
webSearch: boolean;
|
|
69
|
+
/**
|
|
70
|
+
* Explicit web-search provider. Overrides auto-detection by env var.
|
|
71
|
+
* Set via `--web-search-provider <tavily|brave|exa>`.
|
|
72
|
+
*/
|
|
73
|
+
webSearchProvider?: string;
|
|
74
|
+
/**
|
|
75
|
+
* Per-run cap on combined web_search + web_fetch calls. Default: 30.
|
|
76
|
+
* Set via `--web-search-max-calls <n>`.
|
|
77
|
+
*/
|
|
78
|
+
webSearchMaxCalls?: number;
|
|
63
79
|
}
|
|
64
80
|
export declare function printHelp(): void;
|
|
65
81
|
export declare function parseArgs(argv: string[]): RunConfig;
|
package/dist/args.js
CHANGED
|
@@ -34,6 +34,15 @@ Flags:
|
|
|
34
34
|
Ignored when --sandbox=none.
|
|
35
35
|
--no-network Disable HTTP egress from the sandbox entirely.
|
|
36
36
|
Ignored when --sandbox=none.
|
|
37
|
+
--web-search-provider <p> Force a web-search provider: tavily | brave | exa.
|
|
38
|
+
Default: auto-detect from env (Tavily > Exa > Brave).
|
|
39
|
+
Provider's API key env var must be set:
|
|
40
|
+
TAVILY_API_KEY, EXA_API_KEY, or BRAVE_SEARCH_API_KEY.
|
|
41
|
+
--no-web-search Disable the web-search extension entirely
|
|
42
|
+
(web_search / web_fetch tools not registered).
|
|
43
|
+
--web-search-max-calls <n> Cap combined web_search + web_fetch calls per run.
|
|
44
|
+
Default: 30. When exceeded, further calls return a
|
|
45
|
+
structured error result.
|
|
37
46
|
--sandbox-image <name> Image to boot when --sandbox=gondolin. Values:
|
|
38
47
|
'default' (recommended) — bundled agentic-pi-dev image
|
|
39
48
|
with git/gh/node/python/rust baked in (auto-downloaded).
|
|
@@ -54,6 +63,7 @@ export function parseArgs(argv) {
|
|
|
54
63
|
noBuiltinTools: false,
|
|
55
64
|
dangerouslySkipPermissions: false,
|
|
56
65
|
sandbox: "none",
|
|
66
|
+
webSearch: true,
|
|
57
67
|
};
|
|
58
68
|
for (let i = 0; i < argv.length; i++) {
|
|
59
69
|
const arg = argv[i];
|
|
@@ -147,6 +157,25 @@ export function parseArgs(argv) {
|
|
|
147
157
|
case "--no-network":
|
|
148
158
|
config.allowedHttpHosts = null;
|
|
149
159
|
break;
|
|
160
|
+
case "--no-web-search":
|
|
161
|
+
config.webSearch = false;
|
|
162
|
+
break;
|
|
163
|
+
case "--web-search-provider": {
|
|
164
|
+
const v = next().trim();
|
|
165
|
+
if (!v)
|
|
166
|
+
throw new Error("--web-search-provider requires a value");
|
|
167
|
+
config.webSearchProvider = v;
|
|
168
|
+
break;
|
|
169
|
+
}
|
|
170
|
+
case "--web-search-max-calls": {
|
|
171
|
+
const v = next();
|
|
172
|
+
const n = Number(v);
|
|
173
|
+
if (!Number.isFinite(n) || Math.floor(n) !== n || n < 1) {
|
|
174
|
+
throw new Error(`--web-search-max-calls must be a positive integer (got '${v}')`);
|
|
175
|
+
}
|
|
176
|
+
config.webSearchMaxCalls = n;
|
|
177
|
+
break;
|
|
178
|
+
}
|
|
150
179
|
case "-h":
|
|
151
180
|
case "--help":
|
|
152
181
|
printHelp();
|
package/dist/args.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"args.js","sourceRoot":"","sources":["../src/args.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;
|
|
1
|
+
{"version":3,"file":"args.js","sourceRoot":"","sources":["../src/args.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AA2EH,MAAM,UAAU,SAAS;IACvB,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CA+CtB,CAAC,CAAC;AACH,CAAC;AAED,MAAM,UAAU,SAAS,CAAC,IAAc;IACtC,MAAM,MAAM,GAAc;QACxB,KAAK,EAAE,EAAE;QACT,GAAG,EAAE,OAAO,CAAC,GAAG,EAAE;QAClB,SAAS,EAAE,KAAK;QAChB,cAAc,EAAE,KAAK;QACrB,0BAA0B,EAAE,KAAK;QACjC,OAAO,EAAE,MAAM;QACf,SAAS,EAAE,IAAI;KAChB,CAAC;IAEF,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;QACpB,MAAM,IAAI,GAAG,GAAW,EAAE;YACxB,MAAM,CAAC,GAAG,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;YACpB,IAAI,CAAC,KAAK,SAAS;gBAAE,MAAM,IAAI,KAAK,CAAC,QAAQ,GAAG,mBAAmB,CAAC,CAAC;YACrE,OAAO,CAAC,CAAC;QACX,CAAC,CAAC;QACF,QAAQ,GAAG,EAAE,CAAC;YACZ,KAAK,SAAS,CAAC;YACf,KAAK,IAAI;gBACP,MAAM,CAAC,KAAK,GAAG,IAAI,EAAE,CAAC;gBACtB,MAAM;YACR,KAAK,YAAY,CAAC;YAClB,KAAK,WAAW,CAAC,CAAC,CAAC;gBACjB,MAAM,CAAC,GAAG,IAAI,EAAE,CAAC;gBACjB,IAAI,CAAC,CAAC,KAAK,EAAE,SAAS,EAAE,KAAK,EAAE,QAAQ,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC;oBACtE,MAAM,IAAI,KAAK,CAAC,6BAA6B,CAAC,EAAE,CAAC,CAAC;gBACpD,CAAC;gBACD,MAAM,CAAC,QAAQ,GAAG,CAA0B,CAAC;gBAC7C,MAAM;YACR,CAAC;YACD,KAAK,WAAW;gBACd,MAAM,CAAC,OAAO,GAAG,IAAI,EAAE,CAAC;gBACxB,MAAM;YACR,KAAK,OAAO;gBACV,MAAM,CAAC,GAAG,GAAG,IAAI,EAAE,CAAC;gBACpB,MAAM;YACR,KAAK,cAAc;gBACjB,MAAM,CAAC,SAAS,GAAG,IAAI,CAAC;gBACxB,MAAM;YACR,KAAK,eAAe;gBAClB,MAAM,CAAC,UAAU,GAAG,IAAI,EAAE,CAAC;gBAC3B,MAAM;YACR,KAAK,oBAAoB;gBACvB,MAAM,CAAC,cAAc,GAAG,IAAI,CAAC;gBAC7B,MAAM;YACR,KAAK,SAAS;gBACZ,MAAM,CAAC,KAAK,GAAG,IAAI,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;gBACtE,MAAM;YACR,KAAK,gCAAgC;gBACnC,MAAM,CAAC,0BAA0B,GAAG,IAAI,CAAC;gBACzC,MAAM;YACR,KAAK,WAAW,CAAC,CAAC,CAAC;gBACjB,MAAM,CAAC,GAAG,IAAI,EAAE,CAAC;gBACjB,IAAI,CAAC,KAAK,MAAM,IAAI,CAAC,KAAK,UAAU,EAAE,CAAC;oBACrC,MAAM,IAAI,KAAK,CAAC,sBAAsB,CAAC,8BAA8B,CAAC,C
AAC;gBACzE,CAAC;gBACD,MAAM,CAAC,OAAO,GAAG,CAAC,CAAC;gBACnB,MAAM;YACR,CAAC;YACD,KAAK,iBAAiB,CAAC,CAAC,CAAC;gBACvB,MAAM,CAAC,GAAG,IAAI,EAAE,CAAC;gBACjB,IAAI,CAAC,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;oBACnB,MAAM,IAAI,KAAK,CAAC,4CAA4C,CAAC,CAAC;gBAChE,CAAC;gBACD,MAAM,CAAC,YAAY,GAAG,CAAC,CAAC;gBACxB,MAAM;YACR,CAAC;YACD,KAAK,eAAe,CAAC,CAAC,CAAC;gBACrB,MAAM,CAAC,GAAG,IAAI,EAAE,CAAC;gBACjB,MAAM,EAAE,GAAG,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;gBAC1B,IAAI,EAAE,GAAG,CAAC,EAAE,CAAC;oBACX,MAAM,IAAI,KAAK,CAAC,uCAAuC,CAAC,IAAI,CAAC,CAAC;gBAChE,CAAC;gBACD,MAAM,GAAG,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;gBAC3B,MAAM,GAAG,GAAG,CAAC,CAAC,KAAK,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC;gBAC5B,IAAI,CAAC,0BAA0B,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;oBAC1C,MAAM,IAAI,KAAK,CAAC,6DAA6D,GAAG,IAAI,CAAC,CAAC;gBACxF,CAAC;gBACD,MAAM,CAAC,UAAU,GAAG,EAAE,GAAG,CAAC,MAAM,CAAC,UAAU,IAAI,EAAE,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC;gBACjE,MAAM;YACR,CAAC;YACD,KAAK,cAAc,CAAC,CAAC,CAAC;gBACpB,MAAM,CAAC,GAAG,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;gBACxB,IAAI,CAAC,CAAC;oBAAE,MAAM,IAAI,KAAK,CAAC,wCAAwC,CAAC,CAAC;gBAClE,IAAI,CAAC,oBAAoB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC;oBAClC,MAAM,IAAI,KAAK,CAAC,6CAA6C,CAAC,IAAI,CAAC,CAAC;gBACtE,CAAC;gBACD,gEAAgE;gBAChE,gEAAgE;gBAChE,+DAA+D;gBAC/D,gEAAgE;gBAChE,kEAAkE;gBAClE,MAAM,GAAG,GAAG,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,gBAAgB,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,gBAAgB,CAAC,CAAC,CAAC,EAAE,CAAC;gBAClF,MAAM,CAAC,gBAAgB,GAAG,CAAC,GAAG,GAAG,EAAE,CAAC,CAAC,CAAC;gBACtC,MAAM;YACR,CAAC;YACD,KAAK,cAAc;gBACjB,MAAM,CAAC,gBAAgB,GAAG,IAAI,CAAC;gBAC/B,MAAM;YACR,KAAK,iBAAiB;gBACpB,MAAM,CAAC,SAAS,GAAG,KAAK,CAAC;gBACzB,MAAM;YACR,KAAK,uBAAuB,CAAC,CAAC,CAAC;gBAC7B,MAAM,CAAC,GAAG,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;gBACxB,IAAI,CAAC,CAAC;oBAAE,MAAM,IAAI,KAAK,CAAC,wCAAwC,CAAC,CAAC;gBAClE,MAAM,CAAC,iBAAiB,GAAG,CAAC,CAAC;gBAC7B,MAAM;YACR,CAAC;YACD,KAAK,wBAAwB,CAAC,CAAC,CAAC;gBAC9B,MAAM,CAAC,GAAG,IAAI,EAAE,CAAC;gBACjB,MAAM,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;gBACpB,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,C
AAC,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;oBACxD,MAAM,IAAI,KAAK,CAAC,2DAA2D,CAAC,IAAI,CAAC,CAAC;gBACpF,CAAC;gBACD,MAAM,CAAC,iBAAiB,GAAG,CAAC,CAAC;gBAC7B,MAAM;YACR,CAAC;YACD,KAAK,IAAI,CAAC;YACV,KAAK,QAAQ;gBACX,SAAS,EAAE,CAAC;gBACZ,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB;gBACE,MAAM,IAAI,KAAK,CAAC,iBAAiB,GAAG,EAAE,CAAC,CAAC;QAC5C,CAAC;IACH,CAAC;IAED,IAAI,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC;QAClB,MAAM,IAAI,KAAK,CAAC,uDAAuD,CAAC,CAAC;IAC3E,CAAC;IACD,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;QAChC,MAAM,IAAI,KAAK,CAAC,uCAAuC,MAAM,CAAC,KAAK,GAAG,CAAC,CAAC;IAC1E,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Minimal HTML → readable text extractor. No dependencies.
|
|
3
|
+
*
|
|
4
|
+
* Approach:
|
|
5
|
+
* 1. Strip <script>, <style>, <noscript>, <iframe>, and HTML comments
|
|
6
|
+
* so the agent never sees code or hidden trackers.
|
|
7
|
+
* 2. Replace block-level tags with newlines so paragraphs stay separated.
|
|
8
|
+
* 3. Drop all other tags.
|
|
9
|
+
* 4. Decode the common named entities and any &#NN; / &#xHH; numeric
|
|
10
|
+
* escapes.
|
|
11
|
+
* 5. Collapse runs of whitespace; cap output at `maxBytes` (default EXTRACT_MAX_BYTES).
|
|
12
|
+
*
|
|
13
|
+
* Not a Readability-style content-only extractor — for that, use Tavily or
|
|
14
|
+
* Exa's native extraction, which the provider clients invoke directly.
|
|
15
|
+
*/
|
|
16
|
+
/** Default byte cap used by `htmlToText` when no `maxBytes` is passed (defined as `200 * 1024` in extract.js). */
export declare const EXTRACT_MAX_BYTES: number;
|
|
17
|
+
/**
 * Returns the text of the first `<title>` element in `html`, with entities
 * decoded and whitespace runs collapsed to single spaces, or `undefined`
 * when there is no `<title>` element or its text is empty after trimming.
 */
export declare function extractTitle(html: string): string | undefined;
|
|
18
|
+
/**
 * Converts an HTML document to plain text following the steps listed in the
 * module header (strip script/style/noscript/iframe/comments, turn block tags
 * into newlines, drop other tags, decode entities, normalize whitespace).
 *
 * @param html     Raw HTML source.
 * @param maxBytes Upper bound on the UTF-8 size of the result; defaults to
 *                 `EXTRACT_MAX_BYTES`.
 * @returns The extracted text, truncated when it would exceed `maxBytes`.
 */
export declare function htmlToText(html: string, maxBytes?: number): string;
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Minimal HTML → readable text extractor. No dependencies.
|
|
3
|
+
*
|
|
4
|
+
* Approach:
|
|
5
|
+
* 1. Strip <script>, <style>, <noscript>, <iframe>, and HTML comments
|
|
6
|
+
* so the agent never sees code or hidden trackers.
|
|
7
|
+
* 2. Replace block-level tags with newlines so paragraphs stay separated.
|
|
8
|
+
* 3. Drop all other tags.
|
|
9
|
+
* 4. Decode the common named entities and any &#NN; / &#xHH; numeric
|
|
10
|
+
* escapes.
|
|
11
|
+
* 5. Collapse runs of whitespace; cap output at `maxBytes` (default EXTRACT_MAX_BYTES).
|
|
12
|
+
*
|
|
13
|
+
* Not a Readability-style content-only extractor — for that, use Tavily or
|
|
14
|
+
* Exa's native extraction, which the provider clients invoke directly.
|
|
15
|
+
*/
|
|
16
|
+
/** Default cap, in UTF-8 bytes, on the text returned by `htmlToText` (200 KiB). */
export const EXTRACT_MAX_BYTES = 200 * 1024;

/**
 * Named character references we decode. Stored in a Map (not a plain object)
 * so that entity-looking text such as `&constructor;` or `&toString;` can
 * never resolve to an inherited Object.prototype member.
 */
const NAMED_ENTITIES = new Map([
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
    ["nbsp", " "],
    ["copy", "(c)"],
    ["reg", "(R)"],
    ["trade", "(TM)"],
    ["hellip", "…"],
    ["mdash", "—"],
    ["ndash", "–"],
    ["lsquo", "‘"],
    ["rsquo", "’"],
    ["ldquo", "“"],
    ["rdquo", "”"],
]);

/**
 * Matches one character reference. Exactly one capture group is set:
 *   1: hex digits of `&#xHH;` / `&#XHH;`
 *   2: decimal digits of `&#NN;`
 *   3: name of `&name;`
 * Decimal references accept digits only, so malformed input like `&#12ab;`
 * is left untouched instead of being half-parsed.
 */
const ENTITY_PATTERN = /&(?:#[xX]([0-9a-fA-F]+)|#([0-9]+)|([a-zA-Z]+));/g;

/**
 * Decodes numeric and the supported named character references in `s`.
 * Unknown named references are returned verbatim; numeric references that
 * do not denote a usable code point decode to the empty string.
 *
 * @param {string} s Text that may contain character references.
 * @returns {string} Text with references decoded.
 */
function decodeEntities(s) {
    return s.replace(ENTITY_PATTERN, (match, hex, dec, name) => {
        if (hex !== undefined) {
            return safeFromCodePoint(Number.parseInt(hex, 16));
        }
        if (dec !== undefined) {
            return safeFromCodePoint(Number.parseInt(dec, 10));
        }
        return NAMED_ENTITIES.get(name) ?? match;
    });
}

/**
 * Converts a code point to a string, returning "" for anything that should
 * not appear in extracted text: non-integers (e.g. overflow to Infinity),
 * NUL, values above U+10FFFF, and lone surrogates (which would make the
 * result an ill-formed string).
 *
 * @param {number} code Candidate Unicode code point.
 * @returns {string} The character, or "" when `code` is not usable.
 */
function safeFromCodePoint(code) {
    if (!Number.isInteger(code) || code <= 0 || code > 0x10ffff) {
        return "";
    }
    if (code >= 0xd800 && code <= 0xdfff) {
        return "";
    }
    return String.fromCodePoint(code);
}

/** Tags whose open/close markers become a newline so paragraphs stay separated. */
const BLOCK_TAGS = new Set([
    "p", "br", "div", "section", "article", "header", "footer", "main",
    "nav", "aside", "ul", "ol", "li", "table", "tr", "td", "th",
    "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "hr", "pre", "code",
    "dl", "dt", "dd", "figure", "figcaption",
]);

/**
 * Extracts the document title.
 *
 * @param {string} html Raw HTML source.
 * @returns {string | undefined} Entity-decoded, whitespace-collapsed title,
 *   or `undefined` when there is no `<title>` element or it is empty.
 */
export function extractTitle(html) {
    // `\b` keeps us from matching other elements that merely start with
    // "title"; `\s*` tolerates `</title >`.
    const found = /<title\b[^>]*>([\s\S]*?)<\/title\s*>/i.exec(html);
    if (!found) {
        return undefined;
    }
    const title = decodeEntities(found[1]).replace(/\s+/g, " ").trim();
    return title.length > 0 ? title : undefined;
}

/**
 * Converts HTML to readable plain text.
 *
 * @param {string} html Raw HTML source.
 * @param {number} [maxBytes=EXTRACT_MAX_BYTES] Upper bound on the UTF-8 size
 *   of the returned text.
 * @returns {string} Extracted text. When the text exceeds `maxBytes` it is
 *   truncated on a character boundary, so the result is never longer than
 *   `maxBytes` bytes and never ends in a U+FFFD replacement character
 *   produced by a split multi-byte sequence.
 */
export function htmlToText(html, maxBytes = EXTRACT_MAX_BYTES) {
    // 1. Strip dangerous / noise sections wholesale.
    let s = html.replace(/<!--[\s\S]*?-->/g, "");
    s = s.replace(/<script\b[\s\S]*?<\/script\s*>/gi, "");
    s = s.replace(/<style\b[\s\S]*?<\/style\s*>/gi, "");
    s = s.replace(/<noscript\b[\s\S]*?<\/noscript\s*>/gi, "");
    s = s.replace(/<iframe\b[\s\S]*?<\/iframe\s*>/gi, "");

    // 2. Convert block-level open/close tags to newlines; drop all other tags.
    s = s.replace(/<\/?([a-zA-Z][a-zA-Z0-9]*)\b[^>]*>/g, (_match, tag) =>
        BLOCK_TAGS.has(tag.toLowerCase()) ? "\n" : "");

    // 3. Decode entities (after tag removal, so `&lt;b&gt;` stays literal text).
    s = decodeEntities(s);

    // 4. Normalize whitespace: collapse runs of spaces/tabs, trim each line,
    //    and collapse runs of blank lines down to a single blank line.
    s = s.replace(/\r\n?/g, "\n");
    s = s
        .split("\n")
        .map((line) => line.replace(/[ \t\f\v]+/g, " ").trim())
        .filter((line, i, lines) => line !== "" || lines[i - 1] !== "")
        .join("\n")
        .trim();

    // 5. Byte cap, measured on the UTF-8 encoding.
    const bytes = new TextEncoder().encode(s);
    if (bytes.byteLength <= maxBytes) {
        return s;
    }
    let end = Math.max(0, Math.floor(maxBytes));
    // `bytes[end]` is the first byte that would be cut off. If it is a UTF-8
    // continuation byte (10xxxxxx) the cut falls inside a character, so back
    // up to that character's lead byte and drop the whole character.
    while (end > 0 && (bytes[end] & 0xc0) === 0x80) {
        end--;
    }
    return new TextDecoder("utf-8", { fatal: false }).decode(bytes.subarray(0, end));
}
|
|
110
|
+
//# sourceMappingURL=extract.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"extract.js","sourceRoot":"","sources":["../../../src/extensions/web-search/extract.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;AAEH,MAAM,CAAC,MAAM,iBAAiB,GAAG,GAAG,GAAG,IAAI,CAAC;AAE5C,MAAM,cAAc,GAA2B;IAC7C,GAAG,EAAE,GAAG;IACR,EAAE,EAAE,GAAG;IACP,EAAE,EAAE,GAAG;IACP,IAAI,EAAE,GAAG;IACT,IAAI,EAAE,GAAG;IACT,IAAI,EAAE,GAAG;IACT,IAAI,EAAE,KAAK;IACX,GAAG,EAAE,KAAK;IACV,KAAK,EAAE,MAAM;IACb,MAAM,EAAE,GAAG;IACX,KAAK,EAAE,GAAG;IACV,KAAK,EAAE,GAAG;IACV,KAAK,EAAE,GAAG;IACV,KAAK,EAAE,GAAG;IACV,KAAK,EAAE,GAAG;IACV,KAAK,EAAE,GAAG;CACX,CAAC;AAEF,SAAS,cAAc,CAAC,CAAS;IAC/B,OAAO,CAAC,CAAC,OAAO,CAAC,gCAAgC,EAAE,CAAC,EAAE,EAAE,IAAY,EAAE,EAAE;QACtE,IAAI,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;YACnD,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACzC,OAAO,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAC9D,CAAC;QACD,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;YACzB,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACzC,OAAO,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAC9D,CAAC;QACD,MAAM,MAAM,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;QACpC,OAAO,MAAM,IAAI,EAAE,CAAC;IACtB,CAAC,CAAC,CAAC;AACL,CAAC;AAED,SAAS,iBAAiB,CAAC,IAAY;IACrC,IAAI,IAAI,GAAG,CAAC,IAAI,IAAI,GAAG,QAAQ;QAAE,OAAO,EAAE,CAAC;IAC3C,IAAI,CAAC;QACH,OAAO,MAAM,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC;IACpC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,CAAC;IACZ,CAAC;AACH,CAAC;AAED,MAAM,UAAU,GAAG,IAAI,GAAG,CAAC;IACzB,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,QAAQ,EAAE,QAAQ,EAAE,MAAM;IAClE,KAAK,EAAE,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI;IAC3D,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,YAAY,EAAE,IAAI,EAAE,KAAK,EAAE,MAAM;IACrE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE,YAAY;CACzC,CAAC,CAAC;AAEH,MAAM,UAAU,YAAY,CAAC,IAAY;IACvC,MAAM,CAAC,GAAG,kCAAkC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACxD,IAAI,CAAC,CAAC;QAAE,
OAAO,SAAS,CAAC;IACzB,MAAM,CAAC,GAAG,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;IAC3D,OAAO,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;AACtC,CAAC;AAED,MAAM,UAAU,UAAU,CAAC,IAAY,EAAE,QAAQ,GAAG,iBAAiB;IACnE,iDAAiD;IACjD,IAAI,CAAC,GAAG,IAAI,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC;IAC7C,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,kCAAkC,EAAE,EAAE,CAAC,CAAC;IACtD,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,gCAAgC,EAAE,EAAE,CAAC,CAAC;IACpD,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,sCAAsC,EAAE,EAAE,CAAC,CAAC;IAC1D,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,kCAAkC,EAAE,EAAE,CAAC,CAAC;IAEtD,4EAA4E;IAC5E,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,qCAAqC,EAAE,CAAC,MAAM,EAAE,GAAW,EAAE,EAAE;QAC3E,IAAI,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,WAAW,EAAE,CAAC;YAAE,OAAO,IAAI,CAAC;QACnD,OAAO,EAAE,CAAC;IACZ,CAAC,CAAC,CAAC;IAEH,sBAAsB;IACtB,CAAC,GAAG,cAAc,CAAC,CAAC,CAAC,CAAC;IAEtB,wEAAwE;IACxE,mCAAmC;IACnC,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,QAAQ,EAAE,IAAI,CAAC,CAAC;IAC9B,CAAC,GAAG,CAAC;SACF,KAAK,CAAC,IAAI,CAAC;SACX,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,OAAO,CAAC,aAAa,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;SACtD,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC,EAAE,GAAG,EAAE,EAAE;QACvB,oCAAoC;QACpC,IAAI,IAAI,KAAK,EAAE;YAAE,OAAO,IAAI,CAAC;QAC7B,OAAO,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC;IAC3B,CAAC,CAAC;SACD,IAAI,CAAC,IAAI,CAAC;SACV,IAAI,EAAE,CAAC;IAEV,wEAAwE;IACxE,4CAA4C;IAC5C,MAAM,OAAO,GAAG,IAAI,WAAW,EAAE,CAAC;IAClC,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;IAChC,IAAI,KAAK,CAAC,UAAU,IAAI,QAAQ;QAAE,OAAO,CAAC,CAAC;IAC3C,0BAA0B;IAC1B,OAAO,IAAI,WAAW,CAAC,OAAO,EAAE,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,CAAC,MAAM,CACtD,KAAK,CAAC,QAAQ,CAAC,CAAC,EAAE,QAAQ,CAAC,CAC5B,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Web-search extension entry point.
|
|
3
|
+
*
|
|
4
|
+
* Mirrors `src/extensions/github/index.ts`:
|
|
5
|
+
* - silent skip when no provider is selected / no key present
|
|
6
|
+
* - loud (warning-worthy) skip when the user explicitly asked for a
|
|
7
|
+
* provider but didn't supply its key, or set an unknown name
|
|
8
|
+
* - on success, hands a typed customTools list back to the runner
|
|
9
|
+
*
|
|
10
|
+
* Selection logic lives in `selection.ts`; this module wires the chosen
|
|
11
|
+
* provider to its tool builder and a per-run RateLimiter.
|
|
12
|
+
*/
|
|
13
|
+
import type { ToolDefinition } from "@earendil-works/pi-coding-agent";
|
|
14
|
+
import type { ProviderName, WebSearchSkipReason } from "./types.js";
|
|
15
|
+
export declare const DEFAULT_MAX_CALLS = 30;
|
|
16
|
+
export interface WebSearchExtensionConfig {
|
|
17
|
+
/** When false, the extension is force-skipped (disabled-by-flag). Default: true. */
|
|
18
|
+
webSearch?: boolean;
|
|
19
|
+
/** Explicit provider override; the loader can instantiate "tavily", "brave", or "exa". */
|
|
20
|
+
webSearchProvider?: string;
|
|
21
|
+
/** Per-run call budget shared across web_search + web_fetch. Default: 30. The loader floors the value and clamps it to 1..1000; undefined or non-finite values use the default. */
|
|
22
|
+
webSearchMaxCalls?: number;
|
|
23
|
+
/** Env override (defaults to process.env). Injected by tests. */
|
|
24
|
+
env?: Record<string, string | undefined>;
|
|
25
|
+
}
|
|
26
|
+
export interface WebSearchExtensionResult {
|
|
27
|
+
/** Tools to merge into createAgentSession({ customTools }). */
|
|
28
|
+
customTools: ToolDefinition<any>[];
|
|
29
|
+
toolNames: string[]; // Names of the entries in customTools, in the same order.
|
|
30
|
+
status: "configured" | "skipped"; // "skipped" results carry empty customTools/toolNames.
|
|
31
|
+
reason?: WebSearchSkipReason; // Populated only on skipped results (copied from the provider selection).
|
|
32
|
+
message?: string; // Message from the provider selection; passed through on both outcomes.
|
|
33
|
+
provider?: ProviderName; // Provider from the selection; passed through on skipped results too.
|
|
34
|
+
/** The cap actually enforced (echoed for observability). */
|
|
35
|
+
maxCalls?: number;
|
|
36
|
+
}
|
|
37
|
+
export declare function loadWebSearchExtension(config?: WebSearchExtensionConfig): WebSearchExtensionResult;
|
|
38
|
+
/**
|
|
39
|
+
* True if the skip is something the user almost certainly wants surfaced
|
|
40
|
+
* as a warning (vs. the silent "no keys set" case).
|
|
41
|
+
*/
|
|
42
|
+
export declare function isMisconfigurationSkip(result: WebSearchExtensionResult): boolean;
|
|
43
|
+
export type { ProviderName, WebSearchSkipReason } from "./types.js";
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Web-search extension entry point.
|
|
3
|
+
*
|
|
4
|
+
* Mirrors `src/extensions/github/index.ts`:
|
|
5
|
+
* - silent skip when no provider is selected / no key present
|
|
6
|
+
* - loud (warning-worthy) skip when the user explicitly asked for a
|
|
7
|
+
* provider but didn't supply its key, or set an unknown name
|
|
8
|
+
* - on success, hands a typed customTools list back to the runner
|
|
9
|
+
*
|
|
10
|
+
* Selection logic lives in `selection.ts`; this module wires the chosen
|
|
11
|
+
* provider to its tool builder and a per-run RateLimiter.
|
|
12
|
+
*/
|
|
13
|
+
import { createBraveProvider } from "./providers/brave.js";
|
|
14
|
+
import { createExaProvider } from "./providers/exa.js";
|
|
15
|
+
import { createTavilyProvider } from "./providers/tavily.js";
|
|
16
|
+
import { RateLimiter } from "./rate-limit.js";
|
|
17
|
+
import { selectProvider } from "./selection.js";
|
|
18
|
+
import { buildWebSearchTools } from "./tools.js";
|
|
19
|
+
export const DEFAULT_MAX_CALLS = 30;
|
|
20
|
+
/**
 * Resolve the web-search provider for this run and build its tools.
 *
 * @param {object} [config] Optional settings: `webSearch` (default true),
 *   `webSearchProvider`, `webSearchMaxCalls`, and `env` (default `process.env`).
 * @returns {object} A "skipped" result with empty tool lists plus the
 *   selection's reason/message/provider, or a "configured" result carrying
 *   the tools, their names, the provider and the enforced call cap.
 */
export function loadWebSearchExtension(config = {}) {
    const selection = selectProvider({
        webSearch: config.webSearch ?? true,
        webSearchProvider: config.webSearchProvider,
        env: config.env ?? process.env,
    });

    // Nothing to wire up: report why, with no tools attached.
    if (selection.status === "skipped") {
        const { reason, message, provider: skippedProvider } = selection;
        return {
            customTools: [],
            toolNames: [],
            status: "skipped",
            reason,
            message,
            provider: skippedProvider,
        };
    }

    const activeProvider = instantiateProvider(selection.provider, selection.apiKey);
    const maxCalls = clampMaxCalls(config.webSearchMaxCalls);
    // One limiter per load, handed to the tool builder together with the provider.
    const customTools = buildWebSearchTools(activeProvider, new RateLimiter(maxCalls));
    const toolNames = customTools.map((tool) => tool.name);

    return {
        customTools,
        toolNames,
        status: "configured",
        provider: selection.provider,
        message: selection.message,
        maxCalls,
    };
}
|
|
51
|
+
/**
|
|
52
|
+
* True if the skip is something the user almost certainly wants surfaced
|
|
53
|
+
* as a warning (vs. the silent "no keys set" case).
|
|
54
|
+
*/
|
|
55
|
+
/**
 * Decide whether a skipped result should be surfaced as a warning.
 *
 * @param {object} result A result object with `status`, `reason` and `provider`.
 * @returns {boolean} True for an "invalid-config" skip, or a "no-credentials"
 *   skip that names a provider; false for everything else, including any
 *   non-skipped result.
 */
export function isMisconfigurationSkip(result) {
    const { status, reason, provider } = result;
    if (status !== "skipped") {
        return false;
    }
    switch (reason) {
        case "invalid-config":
            return true;
        case "no-credentials":
            // A named provider without a key is louder than the generic
            // "no creds at all" skip.
            return provider !== undefined;
        default:
            return false;
    }
}
|
|
66
|
+
/**
 * Build the provider implementation for a selected provider name.
 *
 * @param {string} name Provider identifier: "tavily", "brave" or "exa".
 * @param {string} apiKey API key handed to the provider factory.
 * @returns {object} The provider returned by the matching factory.
 * @throws {Error} If `name` is not one of the known providers. Without this
 *   the switch fell through and returned `undefined`, which only failed
 *   later, far from the cause.
 */
function instantiateProvider(name, apiKey) {
    switch (name) {
        case "tavily":
            return createTavilyProvider({ apiKey });
        case "brave":
            return createBraveProvider({ apiKey });
        case "exa":
            return createExaProvider({ apiKey });
        default:
            // String() keeps the message safe for non-string values (e.g. symbols).
            throw new Error(`web-search: unknown provider "${String(name)}"`);
    }
}
|
|
76
|
+
/**
 * Normalise the configured per-run call budget.
 *
 * @param {number|undefined} v Requested budget.
 * @returns {number} `DEFAULT_MAX_CALLS` when `v` is undefined or not a finite
 *   number; otherwise `v` rounded down and limited to the range 1..1000.
 */
function clampMaxCalls(v) {
    const usable = v !== undefined && Number.isFinite(v);
    if (!usable) {
        return DEFAULT_MAX_CALLS;
    }
    const floored = Math.floor(v);
    return Math.min(1000, Math.max(1, floored));
}
|
|
86
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/extensions/web-search/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAIH,OAAO,EAAE,mBAAmB,EAAE,MAAM,sBAAsB,CAAC;AAC3D,OAAO,EAAE,iBAAiB,EAAE,MAAM,oBAAoB,CAAC;AACvD,OAAO,EAAE,oBAAoB,EAAE,MAAM,uBAAuB,CAAC;AAC7D,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC;AAC9C,OAAO,EAAE,cAAc,EAAuB,MAAM,gBAAgB,CAAC;AACrE,OAAO,EAAE,mBAAmB,EAAE,MAAM,YAAY,CAAC;AAGjD,MAAM,CAAC,MAAM,iBAAiB,GAAG,EAAE,CAAC;AA0BpC,MAAM,UAAU,sBAAsB,CACpC,SAAmC,EAAE;IAErC,MAAM,GAAG,GAAG,MAAM,CAAC,GAAG,IAAK,OAAO,CAAC,GAA0C,CAAC;IAC9E,MAAM,KAAK,GAAmB;QAC5B,SAAS,EAAE,MAAM,CAAC,SAAS,IAAI,IAAI;QACnC,iBAAiB,EAAE,MAAM,CAAC,iBAAiB;QAC3C,GAAG;KACJ,CAAC;IAEF,MAAM,SAAS,GAAG,cAAc,CAAC,KAAK,CAAC,CAAC;IACxC,IAAI,SAAS,CAAC,MAAM,KAAK,SAAS,EAAE,CAAC;QACnC,OAAO;YACL,WAAW,EAAE,EAAE;YACf,SAAS,EAAE,EAAE;YACb,MAAM,EAAE,SAAS;YACjB,MAAM,EAAE,SAAS,CAAC,MAAM;YACxB,OAAO,EAAE,SAAS,CAAC,OAAO;YAC1B,QAAQ,EAAE,SAAS,CAAC,QAAQ;SAC7B,CAAC;IACJ,CAAC;IAED,MAAM,QAAQ,GAAG,mBAAmB,CAAC,SAAS,CAAC,QAAQ,EAAE,SAAS,CAAC,MAAM,CAAC,CAAC;IAC3E,MAAM,QAAQ,GAAG,aAAa,CAAC,MAAM,CAAC,iBAAiB,CAAC,CAAC;IACzD,MAAM,OAAO,GAAG,IAAI,WAAW,CAAC,QAAQ,CAAC,CAAC;IAC1C,MAAM,KAAK,GAAG,mBAAmB,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IAErD,OAAO;QACL,WAAW,EAAE,KAAK;QAClB,SAAS,EAAE,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;QACnC,MAAM,EAAE,YAAY;QACpB,QAAQ,EAAE,SAAS,CAAC,QAAQ;QAC5B,OAAO,EAAE,SAAS,CAAC,OAAO;QAC1B,QAAQ;KACT,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,sBAAsB,CAAC,MAAgC;IACrE,IAAI,MAAM,CAAC,MAAM,KAAK,SAAS;QAAE,OAAO,KAAK,CAAC;IAC9C,IAAI,MAAM,CAAC,MAAM,KAAK,gBAAgB;QAAE,OAAO,IAAI,CAAC;IACpD,oEAAoE;IACpE,0BAA0B;IAC1B,IAAI,MAAM,CAAC,MAAM,KAAK,gBAAgB,IAAI,MAAM,CAAC,QAAQ,KAAK,SAAS;QAAE,OAAO,IAAI,CAAC;IACrF,OAAO,KAAK,CAAC;AACf,CAAC;AAED,SAAS,mBAAmB,CAAC,IAAkB,EAAE,MAAc;IAC7D,QAAQ,IAAI,EAAE,CAAC;QACb,KAAK,QAAQ;YACX,OAAO,oBAAoB,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;QAC1C,KAAK,OAAO;YACV,OAAO,mBAAmB,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;QACzC,KAAK,KAAK;YACR,OAAO,iBAAiB,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;IACzC,CAAC;AACH,CAAC;
AAED,SAAS,aAAa,CAAC,CAAqB;IAC1C,IAAI,CAAC,KAAK,SAAS,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC;QAAE,OAAO,iBAAiB,CAAC;IACrE,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IACxB,IAAI,CAAC,GAAG,CAAC;QAAE,OAAO,CAAC,CAAC;IACpB,IAAI,CAAC,GAAG,IAAI;QAAE,OAAO,IAAI,CAAC;IAC1B,OAAO,CAAC,CAAC;AACX,CAAC"}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Brave Search provider. https://api.search.brave.com/app/documentation
|
|
3
|
+
*
|
|
4
|
+
* Endpoint used:
|
|
5
|
+
* GET https://api.search.brave.com/res/v1/web/search?q=…&count=…
|
|
6
|
+
* Header: X-Subscription-Token: <key>
|
|
7
|
+
*
|
|
8
|
+
* Brave has no content-extraction endpoint, so this provider has no
|
|
9
|
+
* `fetch()` method — the tool layer falls back to safeFetch + the HTML
|
|
10
|
+
* extractor for `web_fetch`.
|
|
11
|
+
*
|
|
12
|
+
* `include_domains` / `exclude_domains` are honored via client-side
|
|
13
|
+
* post-filtering so the tool's schema behaves uniformly across providers.
|
|
14
|
+
*/
|
|
15
|
+
import type { FetchImpl, Provider } from "../types.js";
|
|
16
|
+
export interface BraveOptions {
|
|
17
|
+
apiKey: string; // Sent as the X-Subscription-Token request header.
|
|
18
|
+
fetchImpl?: FetchImpl; // Defaults to globalThis.fetch.
|
|
19
|
+
baseUrl?: string; // Defaults to "https://api.search.brave.com/res/v1".
|
|
20
|
+
}
|
|
21
|
+
export declare function createBraveProvider(options: BraveOptions): Provider;
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Brave Search provider. https://api.search.brave.com/app/documentation
|
|
3
|
+
*
|
|
4
|
+
* Endpoint used:
|
|
5
|
+
* GET https://api.search.brave.com/res/v1/web/search?q=…&count=…
|
|
6
|
+
* Header: X-Subscription-Token: <key>
|
|
7
|
+
*
|
|
8
|
+
* Brave has no content-extraction endpoint, so this provider has no
|
|
9
|
+
* `fetch()` method — the tool layer falls back to safeFetch + the HTML
|
|
10
|
+
* extractor for `web_fetch`.
|
|
11
|
+
*
|
|
12
|
+
* `include_domains` / `exclude_domains` are honored via client-side
|
|
13
|
+
* post-filtering so the tool's schema behaves uniformly across providers.
|
|
14
|
+
*/
|
|
15
|
+
/**
 * Test whether a URL's host is the given domain or one of its subdomains.
 *
 * The pattern is normalised before comparison: surrounding whitespace is
 * trimmed, it is lower-cased, a leading "*." or "." is dropped, and trailing
 * dots are removed. A trailing dot on the URL's host (fully-qualified form)
 * is ignored as well.
 *
 * @param {string} url Absolute URL whose hostname is inspected.
 * @param {string} pattern Domain such as "example.com" (also accepts
 *   "*.example.com", ".example.com", "Example.com.").
 * @returns {boolean} True when the host equals the domain or ends with
 *   "." + domain; false when the URL cannot be parsed, has no host, or the
 *   pattern is empty after normalisation.
 */
function hostMatches(url, pattern) {
    let host;
    try {
        host = new URL(url).hostname.toLowerCase();
    }
    catch {
        return false;
    }
    // "example.com." and "example.com" name the same host.
    host = host.replace(/\.+$/, "");
    const domain = pattern
        .trim()
        .toLowerCase()
        .replace(/^(?:\*\.|\.)+/, "")
        .replace(/\.+$/, "");
    // An empty pattern previously matched host-less URLs ("" === ""); it
    // carries no domain, so it must never match.
    if (domain === "" || host === "") {
        return false;
    }
    return host === domain || host.endsWith(`.${domain}`);
}
|
|
26
|
+
/**
 * Create a search provider backed by the Brave web-search endpoint.
 *
 * @param {object} options `apiKey` (sent as the `x-subscription-token`
 *   header), optional `fetchImpl` (defaults to `globalThis.fetch`) and
 *   optional `baseUrl` (defaults to the public Brave API root).
 * @returns {object} Provider with `name: "brave"`,
 *   `supportsExtractedContent: false` and an async `search(params)` method.
 *   `search` rejects with an Error when the HTTP response is not ok.
 */
export function createBraveProvider(options) {
    const doFetch = options.fetchImpl ?? globalThis.fetch;
    const apiRoot = options.baseUrl ?? "https://api.search.brave.com/res/v1";

    // True when the hit has a URL whose host matches any listed domain.
    const matchesAny = (hit, domains) => Boolean(hit.url) && domains.some((domain) => hostMatches(hit.url, domain));

    async function search(params) {
        const endpoint = new URL(`${apiRoot}/web/search`);
        endpoint.searchParams.set("q", params.query);
        // Ask for twice the wanted amount (Brave caps `count` at 20) so the
        // client-side domain filters below still leave enough results.
        const requested = Math.min(20, Math.max(1, params.maxResults * 2));
        endpoint.searchParams.set("count", String(requested));

        const response = await doFetch(endpoint.toString(), {
            method: "GET",
            headers: {
                accept: "application/json",
                "x-subscription-token": options.apiKey,
            },
        });
        if (!response.ok) {
            // The body is best-effort context; an unreadable body becomes "".
            const body = await response.text().catch(() => "");
            throw new Error(`brave search failed: http ${response.status} ${body.slice(0, 200)}`);
        }

        const payload = await response.json();
        const rawHits = payload.web?.results ?? [];
        let hits = rawHits.map((raw) => {
            return {
                title: raw.title ?? "",
                url: raw.url ?? "",
                snippet: raw.description,
                publishedDate: raw.age,
            };
        });

        const { includeDomains, excludeDomains } = params;
        if (includeDomains?.length) {
            hits = hits.filter((hit) => matchesAny(hit, includeDomains));
        }
        if (excludeDomains?.length) {
            hits = hits.filter((hit) => !matchesAny(hit, excludeDomains));
        }

        return {
            provider: "brave",
            query: params.query,
            results: hits.slice(0, params.maxResults),
        };
    }

    return {
        name: "brave",
        supportsExtractedContent: false,
        search,
    };
}
|
|
73
|
+
//# sourceMappingURL=brave.js.map
|