@juicesharp/rpiv-web-tools 1.7.0 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -22
- package/index.ts +5 -2
- package/package.json +12 -3
- package/providers/brave.ts +76 -0
- package/providers/exa.ts +106 -0
- package/providers/factory.ts +26 -0
- package/providers/fetch-helpers.ts +117 -0
- package/providers/firecrawl.ts +119 -0
- package/providers/index.ts +24 -0
- package/providers/jina.ts +104 -0
- package/providers/serper.ts +81 -0
- package/providers/tavily.ts +117 -0
- package/providers/types.ts +25 -0
- package/web-tools.ts +134 -222
package/README.md
CHANGED
|
@@ -11,16 +11,18 @@
|
|
|
11
11
|
[](https://www.npmjs.com/package/@juicesharp/rpiv-web-tools)
|
|
12
12
|
[](https://opensource.org/licenses/MIT)
|
|
13
13
|
|
|
14
|
-
Let the model search the web and read pages. `rpiv-web-tools` adds `web_search` and `web_fetch` tools to [Pi Agent](https://github.com/badlogic/pi-mono)
|
|
14
|
+
Let the model search the web and read pages. `rpiv-web-tools` adds `web_search` and `web_fetch` tools to [Pi Agent](https://github.com/badlogic/pi-mono) with pluggable providers (Brave, Tavily, Serper, Exa, Jina, Firecrawl), plus `/web-search-config` for interactive provider selection and API-key setup.
|
|
15
15
|
|
|
16
|
-

|
|
17
17
|
|
|
18
18
|
## Features
|
|
19
19
|
|
|
20
|
-
- **
|
|
21
|
-
- **
|
|
20
|
+
- **Six pluggable providers** - Brave, Tavily, Serper, Exa, Jina, Firecrawl. Pick one as the active backend; switch any time without losing the others' keys.
|
|
21
|
+
- **Per-provider fetch strategy** - Brave and Serper read the URL directly and strip HTML to text; Tavily/Exa/Jina/Firecrawl use their native extraction endpoints (markdown for Jina/Firecrawl, plain text for Tavily/Exa).
|
|
22
|
+
- **Read any URL** - fetch http/https pages with HTML-to-text extraction, or get the raw response with `raw: true` (honoured by Brave/Serper; extraction providers always return their parsed text).
|
|
22
23
|
- **Large-page spillover** - oversized responses truncate inline and spill the full body to a temp file the model can read on demand.
|
|
23
|
-
- **
|
|
24
|
+
- **SSRF guard** - refuses loopback, RFC 1918, link-local, and cloud-metadata addresses (`localhost`, `127.0.0.0/8`, `10.0.0.0/8`, `169.254.0.0/16`, `172.16.0.0/12`, `192.168.0.0/16`, `::1`, `fc00::/7`, `fe80::/10`).
|
|
25
|
+
- **Interactive setup** - `/web-search-config` lists providers (active one first, configured ones marked) and writes to `~/.config/rpiv-web-tools/config.json` (chmod 0600); per-provider env vars also work and take precedence over persisted keys.
|
|
24
26
|
|
|
25
27
|
## Install
|
|
26
28
|
|
|
@@ -32,11 +34,11 @@ Then restart your Pi session.
|
|
|
32
34
|
|
|
33
35
|
## Tools
|
|
34
36
|
|
|
35
|
-
- **`web_search`** - query the
|
|
37
|
+
- **`web_search`** - query the active provider's search API and return titled snippets.
|
|
36
38
|
1–10 results per call.
|
|
37
|
-
- **`web_fetch`** - fetch an http/https URL
|
|
38
|
-
|
|
39
|
-
the full content.
|
|
39
|
+
- **`web_fetch`** - fetch an http/https URL through the active provider's content path
|
|
40
|
+
(raw HTTP+htmlToText for Brave/Serper; native extraction for Tavily/Exa/Jina/Firecrawl),
|
|
41
|
+
truncate large responses with a temp-file spill for the full content.
|
|
40
42
|
|
|
41
43
|
### Schema - `web_search`
|
|
42
44
|
|
|
@@ -54,14 +56,14 @@ Returns:
|
|
|
54
56
|
content: [{ type: "text", text: string }], // markdown list of "**title**\n url\n snippet"
|
|
55
57
|
details: {
|
|
56
58
|
query: string,
|
|
57
|
-
backend: "brave",
|
|
59
|
+
backend: "brave" | "tavily" | "serper" | "exa" | "jina" | "firecrawl",
|
|
58
60
|
resultCount: number,
|
|
59
61
|
results?: Array<{ title: string, url: string, snippet: string }>,
|
|
60
62
|
}
|
|
61
63
|
}
|
|
62
64
|
```
|
|
63
65
|
|
|
64
|
-
Throws when
|
|
66
|
+
Throws when the active provider's API key is unset (e.g. `EXA_API_KEY is not set`) or the provider's API returns a non-2xx response.
|
|
65
67
|
|
|
66
68
|
### Schema - `web_fetch`
|
|
67
69
|
|
|
@@ -88,20 +90,24 @@ Returns:
|
|
|
88
90
|
}
|
|
89
91
|
```
|
|
90
92
|
|
|
91
|
-
Throws on invalid URL, non-http(s) protocol, non-2xx response, or `image/` / `video/` / `audio/` content types.
|
|
93
|
+
Throws on invalid URL, non-http(s) protocol, private/loopback hostnames (SSRF guard), non-2xx response, or `image/` / `video/` / `audio/` content types. Extraction providers (Tavily/Exa/Jina/Firecrawl) additionally throw when the API returns an empty body or a vendor-level failure (e.g. Firecrawl `success: false`, Tavily `failed_results`).
|
|
92
94
|
|
|
93
95
|
## Commands
|
|
94
96
|
|
|
95
|
-
- **`/web-search-config`** -
|
|
96
|
-
|
|
97
|
-
the
|
|
97
|
+
- **`/web-search-config`** - pick the active provider and set its API key interactively.
|
|
98
|
+
Providers already configured show `(configured)`; the active one is listed first with a `✓`.
|
|
99
|
+
Pressing Enter on an empty input keeps the existing key for the chosen provider while
|
|
100
|
+
persisting the provider switch. Pass `--show` to see all per-provider keys (masked) and env var status.
|
|
98
101
|
|
|
99
|
-
## API key resolution
|
|
102
|
+
## API key resolution (per active provider)
|
|
100
103
|
|
|
101
104
|
First match wins:
|
|
102
105
|
|
|
103
|
-
1. `BRAVE_SEARCH_API_KEY`
|
|
104
|
-
2. `
|
|
106
|
+
1. The active provider's environment variable: `BRAVE_SEARCH_API_KEY`, `TAVILY_API_KEY`, `SERPER_API_KEY`, `EXA_API_KEY`, `JINA_API_KEY`, or `FIRECRAWL_API_KEY`
|
|
107
|
+
2. `apiKeys.<provider>` field in `~/.config/rpiv-web-tools/config.json`
|
|
108
|
+
3. Legacy `apiKey` field (Brave only — auto-migrated to the new shape on next save)
|
|
109
|
+
|
|
110
|
+
The active provider is `config.provider` (set by `/web-search-config`); falls back to `brave` if absent.
|
|
105
111
|
|
|
106
112
|
## Executor guidance overrides
|
|
107
113
|
|
|
@@ -109,10 +115,14 @@ Override the `promptSnippet` / `promptGuidelines` the model sees for each tool b
|
|
|
109
115
|
|
|
110
116
|
```json
|
|
111
117
|
{
|
|
112
|
-
"
|
|
118
|
+
"provider": "exa",
|
|
119
|
+
"apiKeys": {
|
|
120
|
+
"exa": "sk-...",
|
|
121
|
+
"brave": "sk-..."
|
|
122
|
+
},
|
|
113
123
|
"guidance": {
|
|
114
124
|
"web_search": {
|
|
115
|
-
"promptSnippet": "Search
|
|
125
|
+
"promptSnippet": "Search the web for current docs and library versions",
|
|
116
126
|
"promptGuidelines": [
|
|
117
127
|
"Only call web_search when training-data answers may be stale.",
|
|
118
128
|
"Always include a Sources: section with markdown hyperlinks."
|
|
@@ -127,9 +137,11 @@ Override the `promptSnippet` / `promptGuidelines` the model sees for each tool b
|
|
|
127
137
|
|
|
128
138
|
Each field is independent: omit one and the built-in default is kept. Invalid values (empty string, wrong type, empty array) silently fall back to defaults. Changes take effect on the next Pi session start.
|
|
129
139
|
|
|
130
|
-
## Security note: `web_fetch`
|
|
140
|
+
## Security note: `web_fetch` host guard
|
|
141
|
+
|
|
142
|
+
`web_fetch` refuses URLs targeting loopback (`localhost`, `127.0.0.0/8`, `::1`), RFC 1918 private ranges (`10.0.0.0/8`, `172.16.0.0/12`, `192.168.0.0/16`), link-local (`169.254.0.0/16`, including cloud-metadata at `169.254.169.254`), and IPv6 unique-local / link-local (`fc00::/7`, `fe80::/10`). Attempts surface as `Refusing to fetch private/loopback address: <host>`. This blocks the most common SSRF class — direct-literal targeting of internal services or cloud-metadata endpoints — without preventing legitimate public-web fetches.
|
|
131
143
|
|
|
132
|
-
|
|
144
|
+
The guard is host-literal only; it does NOT resolve DNS or validate redirects. A public hostname that resolves to a private IP, or a public URL that 302-redirects to one, will still reach the target. For untrusted automation environments, layer an egress proxy or firewall on top.
|
|
133
145
|
|
|
134
146
|
## License
|
|
135
147
|
|
package/index.ts
CHANGED
|
@@ -4,13 +4,16 @@
|
|
|
4
4
|
* Registers the `web_search` and `web_fetch` tools, plus the
|
|
5
5
|
* `/web-search-config` slash command. Body lives in `web-tools.ts`.
|
|
6
6
|
*
|
|
7
|
-
* Config persists at ~/.config/rpiv-web-tools/config.json.
|
|
8
|
-
* BRAVE_SEARCH_API_KEY
|
|
7
|
+
* Config persists at ~/.config/rpiv-web-tools/config.json. Per-provider env
|
|
8
|
+
* vars (e.g. BRAVE_SEARCH_API_KEY, TAVILY_API_KEY) win over the config file.
|
|
9
9
|
*/
|
|
10
10
|
|
|
11
11
|
import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
|
|
12
12
|
import { registerWebFetchTool, registerWebSearchConfigCommand, registerWebSearchTool } from "./web-tools.js";
|
|
13
13
|
|
|
14
|
+
export { createSearchProvider } from "./providers/factory.js";
|
|
15
|
+
|
|
16
|
+
export type { FetchResponse, SearchProvider, SearchResponse, SearchResult } from "./providers/types.js";
|
|
14
17
|
export {
|
|
15
18
|
DEFAULT_WEB_FETCH_GUIDELINES,
|
|
16
19
|
DEFAULT_WEB_FETCH_SNIPPET,
|
package/package.json
CHANGED
|
@@ -1,13 +1,21 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@juicesharp/rpiv-web-tools",
|
|
3
|
-
"version": "1.
|
|
4
|
-
"description": "Pi extension. Web search and fetch for the model
|
|
3
|
+
"version": "1.8.0",
|
|
4
|
+
"description": "Pi extension. Web search and fetch for the model with pluggable providers (Brave, Tavily, Serper, Exa, Jina, Firecrawl).",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"pi-package",
|
|
7
7
|
"pi-extension",
|
|
8
8
|
"rpiv",
|
|
9
9
|
"web-search",
|
|
10
|
-
"
|
|
10
|
+
"search",
|
|
11
|
+
"fetch",
|
|
12
|
+
"scrape",
|
|
13
|
+
"brave",
|
|
14
|
+
"tavily",
|
|
15
|
+
"serper",
|
|
16
|
+
"exa",
|
|
17
|
+
"jina",
|
|
18
|
+
"firecrawl"
|
|
11
19
|
],
|
|
12
20
|
"type": "module",
|
|
13
21
|
"license": "MIT",
|
|
@@ -30,6 +38,7 @@
|
|
|
30
38
|
"files": [
|
|
31
39
|
"index.ts",
|
|
32
40
|
"web-tools.ts",
|
|
41
|
+
"providers/",
|
|
33
42
|
"README.md",
|
|
34
43
|
"LICENSE"
|
|
35
44
|
],
|
|
package/providers/brave.ts
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import { assertTextContentType, extractBodyAsText, fetchUrlOrThrow } from "./fetch-helpers.js";
|
|
2
|
+
import type { FetchResponse, SearchProvider, SearchResponse, SearchResult } from "./types.js";
|
|
3
|
+
|
|
4
|
+
/** Endpoint queried by the Brave provider's `search()`. */
const BRAVE_SEARCH_API_URL = "https://api.search.brave.com/res/v1/web/search";

/** Name of the environment variable that carries the Brave API key. */
export const BRAVE_API_KEY_ENV_VAR = "BRAVE_SEARCH_API_KEY";

/** Static descriptor for the Brave provider: machine name, display label, env var. */
export const BRAVE_PROVIDER_META = {
	name: "brave",
	label: "Brave",
	envVar: BRAVE_API_KEY_ENV_VAR,
} as const;

/** One entry of `web.results` in a Brave payload; every field may be absent. */
interface BraveRawResult {
	title?: string;
	url?: string;
	description?: string;
}

/** Subset of the Brave search payload that this module reads. */
interface BraveRawResponse {
	web?: { results?: Array<BraveRawResult> };
}
|
|
15
|
+
|
|
16
|
+
/**
 * Converts a raw Brave payload into the provider-agnostic result shape.
 * A missing `web` / `results` yields an empty list; a missing or null field
 * becomes an empty string. Brave's `description` is exposed as `snippet`.
 */
function normalizeBraveResults(raw: BraveRawResponse): SearchResult[] {
	const hits = raw.web?.results ?? [];
	return hits.map((hit) => {
		const title = hit.title ?? "";
		const url = hit.url ?? "";
		const snippet = hit.description ?? "";
		return { title, url, snippet };
	});
}
|
|
23
|
+
|
|
24
|
+
/**
 * Brave provider: `search()` calls the Brave Search API, while `fetch()` reads
 * the target URL directly through the shared fetch helpers.
 */
export class BraveProvider implements SearchProvider {
	readonly name = BRAVE_PROVIDER_META.name;
	readonly label = BRAVE_PROVIDER_META.label;
	readonly envVar = BRAVE_PROVIDER_META.envVar;

	/** @param apiKey Sent as `X-Subscription-Token`; an empty value makes `search()` throw. */
	constructor(private readonly apiKey: string) {}

	/**
	 * Runs a web search against the Brave Search API.
	 *
	 * @param query Search terms, sent as the `q` query parameter.
	 * @param maxResults Requested result count, sent as the `count` query parameter.
	 * @param signal Optional abort signal forwarded to `fetch`.
	 * @returns The query together with the normalized results.
	 * @throws Error when the API key is empty or the API answers with a non-2xx status.
	 */
	async search(query: string, maxResults: number, signal?: AbortSignal): Promise<SearchResponse> {
		if (!this.apiKey) {
			throw new Error(`${this.envVar} is not set. Run /web-search-config to configure, or export the env var.`);
		}

		const url = new URL(BRAVE_SEARCH_API_URL);
		url.searchParams.set("q", query);
		url.searchParams.set("count", String(maxResults));

		const res = await fetch(url.toString(), {
			method: "GET",
			headers: {
				Accept: "application/json",
				"Accept-Encoding": "gzip",
				"X-Subscription-Token": this.apiKey,
			},
			signal,
		});

		if (!res.ok) {
			const text = await res.text();
			throw new Error(`${this.label} Search API error (${res.status}): ${text}`);
		}

		const raw = (await res.json()) as BraveRawResponse;
		return { query, results: normalizeBraveResults(raw) };
	}

	// No apiKey guard: Brave's fetch() wraps the built-in HTTP+htmlToText
	// pipeline and does not call any vendor endpoint. Adding a guard would
	// break the "use any provider for fetch" contract.
	/**
	 * Fetches `url` directly and returns its body as text.
	 *
	 * @param url Page to fetch.
	 * @param raw Forwarded to `extractBodyAsText` to request the unprocessed body.
	 * @param signal Optional abort signal forwarded to the HTTP helper.
	 * @returns Body text, optional title, and the response's content type/length
	 * (each `undefined` when the header is missing or unusable).
	 */
	async fetch(url: string, raw: boolean, signal?: AbortSignal): Promise<FetchResponse> {
		const res = await fetchUrlOrThrow(url, signal);
		const contentType = res.headers.get("content-type") ?? "";
		assertTextContentType(contentType);

		const { text, title } = await extractBodyAsText(res, contentType, raw);
		const contentLengthHeader = res.headers.get("content-length");
		// A malformed header would otherwise be reported as NaN; treat it as unknown.
		const parsedLength = contentLengthHeader ? Number(contentLengthHeader) : Number.NaN;
		return {
			text,
			title,
			contentType: contentType || undefined,
			contentLength: Number.isFinite(parsedLength) && parsedLength >= 0 ? parsedLength : undefined,
		};
	}
}
|
package/providers/exa.ts
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
import type { FetchResponse, SearchProvider, SearchResponse, SearchResult } from "./types.js";
|
|
2
|
+
|
|
3
|
+
/** Name of the environment variable that carries the Exa API key. */
export const EXA_API_KEY_ENV_VAR = "EXA_API_KEY";

/** Static descriptor for the Exa provider: machine name, display label, env var. */
export const EXA_PROVIDER_META = {
	name: "exa",
	label: "Exa",
	envVar: EXA_API_KEY_ENV_VAR,
} as const;

/** Endpoint used by `search()`. */
const EXA_API_URL = "https://api.exa.ai/search";
/** Endpoint used by `fetch()`. */
const EXA_CONTENTS_API_URL = "https://api.exa.ai/contents";

/** `maxCharacters` requested for the text attached to each search hit. */
const EXA_MAX_SNIPPET_CHARACTERS = 300;
/** `maxCharacters` requested when fetching a single page. */
const EXA_MAX_FETCH_CHARACTERS = 10_000;

/** One entry of `results` in an Exa payload; every field may be absent. */
interface ExaRawResult {
	title?: string;
	url?: string;
	text?: string;
}

/** Subset of the Exa payload (search and contents) that this module reads. */
interface ExaRawResponse {
	results?: ExaRawResult[];
	error?: string;
}
|
|
24
|
+
|
|
25
|
+
/**
 * Converts raw Exa hits into the provider-agnostic result shape. Exa's `text`
 * is exposed as `snippet`; a missing or null field becomes an empty string.
 */
function normalizeExaResults(results: ExaRawResult[]): SearchResult[] {
	return results.map(({ title, url, text }) => {
		return { title: title ?? "", url: url ?? "", snippet: text ?? "" };
	});
}
|
|
32
|
+
|
|
33
|
+
/**
 * Exa provider: `search()` posts to the Exa search endpoint and `fetch()`
 * posts to the Exa contents endpoint. Both require an API key.
 */
export class ExaProvider implements SearchProvider {
	readonly name = EXA_PROVIDER_META.name;
	readonly label = EXA_PROVIDER_META.label;
	readonly envVar = EXA_PROVIDER_META.envVar;

	/** @param apiKey Sent as the `x-api-key` header; an empty value makes both methods throw. */
	constructor(private readonly apiKey: string) {}

	/**
	 * Searches Exa and returns normalized results.
	 *
	 * @param query Search terms.
	 * @param maxResults Sent as `numResults`.
	 * @param signal Optional abort signal forwarded to `fetch`.
	 * @throws Error when the API key is empty or the API answers with a non-2xx status.
	 */
	async search(query: string, maxResults: number, signal?: AbortSignal): Promise<SearchResponse> {
		this.requireApiKey();

		const payload = {
			query,
			numResults: maxResults,
			contents: {
				text: { maxCharacters: EXA_MAX_SNIPPET_CHARACTERS },
			},
		};
		const response = await this.postJson(EXA_API_URL, payload, signal);

		if (!response.ok) {
			const detail = await response.text();
			throw new Error(`${this.label} Search API error (${response.status}): ${detail}`);
		}

		const parsed = (await response.json()) as ExaRawResponse;
		const hits = parsed.results ?? [];
		return { query, results: normalizeExaResults(hits) };
	}

	/**
	 * Retrieves the text of a single URL through the Exa contents endpoint.
	 *
	 * @param url Page to retrieve, sent as the only entry of `ids`.
	 * @param _raw Unused by this provider.
	 * @param signal Optional abort signal forwarded to `fetch`.
	 * @returns The extracted text, labelled `text/plain`, with the title when present.
	 * @throws Error when the API key is empty, the API answers with a non-2xx
	 * status, or the first result carries no text.
	 */
	async fetch(url: string, _raw: boolean, signal?: AbortSignal): Promise<FetchResponse> {
		this.requireApiKey();

		const payload = {
			ids: [url],
			text: { maxCharacters: EXA_MAX_FETCH_CHARACTERS },
		};
		const response = await this.postJson(EXA_CONTENTS_API_URL, payload, signal);

		if (!response.ok) {
			const detail = await response.text();
			throw new Error(`${this.label} Fetch API error (${response.status}): ${detail}`);
		}

		const parsed = (await response.json()) as ExaRawResponse;
		const first = parsed.results?.[0];
		if (first?.text) {
			return {
				text: first.text,
				title: first.title || undefined,
				contentType: "text/plain",
			};
		}

		throw new Error(`${this.label} Fetch API error: no content returned for ${url}`);
	}

	/** Throws the "not set" error when no API key was supplied. */
	private requireApiKey(): void {
		if (this.apiKey) return;
		throw new Error(`${this.envVar} is not set. Run /web-search-config to configure, or export the env var.`);
	}

	/** POSTs `payload` as JSON to `endpoint` with the API-key header attached. */
	private postJson(endpoint: string, payload: unknown, signal: AbortSignal | undefined): Promise<Response> {
		return fetch(endpoint, {
			method: "POST",
			headers: {
				"Content-Type": "application/json",
				"x-api-key": this.apiKey,
			},
			body: JSON.stringify(payload),
			signal,
		});
	}
}
|
|
package/providers/factory.ts
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import { BraveProvider } from "./brave.js";
|
|
2
|
+
import { ExaProvider } from "./exa.js";
|
|
3
|
+
import { FirecrawlProvider } from "./firecrawl.js";
|
|
4
|
+
import { JinaProvider } from "./jina.js";
|
|
5
|
+
import { SerperProvider } from "./serper.js";
|
|
6
|
+
import { TavilyProvider } from "./tavily.js";
|
|
7
|
+
import type { SearchProvider } from "./types.js";
|
|
8
|
+
|
|
9
|
+
/**
 * Provider constructors keyed by provider name. Entries are lazy closures so
 * that nothing is instantiated until a provider is actually requested.
 */
const PROVIDER_FACTORIES: ReadonlyMap<string, (apiKey: string) => SearchProvider> = new Map<
	string,
	(apiKey: string) => SearchProvider
>([
	["brave", (apiKey) => new BraveProvider(apiKey)],
	["tavily", (apiKey) => new TavilyProvider(apiKey)],
	["serper", (apiKey) => new SerperProvider(apiKey)],
	["exa", (apiKey) => new ExaProvider(apiKey)],
	["jina", (apiKey) => new JinaProvider(apiKey)],
	["firecrawl", (apiKey) => new FirecrawlProvider(apiKey)],
]);

/**
 * Instantiates the provider registered under `name`.
 *
 * @param name Provider name; matched exactly (case-sensitive).
 * @param apiKey Key handed to the provider's constructor.
 * @returns A new provider instance.
 * @throws Error when `name` is not one of the known providers.
 */
export function createSearchProvider(name: string, apiKey: string): SearchProvider {
	const build = PROVIDER_FACTORIES.get(name);
	if (build === undefined) {
		throw new Error(`Unknown search provider: "${name}"`);
	}
	return build(apiKey);
}
|
|
package/providers/fetch-helpers.ts
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared fetch helpers — HTTP client, content-type guards, and HTML-to-text
|
|
3
|
+
* extraction used by providers that wrap the built-in pipeline (Brave, Serper).
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
// ---------------------------------------------------------------------------
|
|
7
|
+
// Constants
|
|
8
|
+
// ---------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
/** Token whose presence in a content-type marks the body as HTML. */
const HTML_CONTENT_TYPE_TOKEN = "text/html";

/** Content-type fragments that identify media web_fetch refuses to return. */
const BINARY_CONTENT_TYPE_PREFIXES = ["image/", "video/", "audio/"];

/** `User-Agent` header sent with direct page fetches. */
const USER_AGENT = "Mozilla/5.0 (compatible; rpiv-pi/1.0)";

/** `Accept` header sent with direct page fetches, in descending preference. */
const FETCH_ACCEPT_HEADER = [
	"text/html",
	"application/xhtml+xml",
	"application/xml;q=0.9",
	"text/plain;q=0.8",
	"*/*;q=0.5",
].join(",");
|
|
14
|
+
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
// HTML-to-text extraction
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
|
|
19
|
+
// Non-content elements removed together with everything inside them. The `\b`
// after the tag name keeps longer names (e.g. <scripture>) from matching, and
// the closing tag tolerates whitespace/junk before `>` (e.g. `</script >`),
// which browsers also accept as a valid end tag.
const SCRIPT_BLOCK_REGEX = /<script\b[\s\S]*?<\/script\b[^>]*>/gi;
const STYLE_BLOCK_REGEX = /<style\b[\s\S]*?<\/style\b[^>]*>/gi;
const NOSCRIPT_BLOCK_REGEX = /<noscript\b[\s\S]*?<\/noscript\b[^>]*>/gi;
// Closing tags of block-level elements; each one becomes a line break.
// Optional whitespace before `>` is accepted (`</p >`).
const BLOCK_CLOSER_REGEX =
	/<\/(p|div|h[1-6]|li|tr|br|blockquote|pre|section|article|header|footer|nav|details|summary)\s*>/gi;
// <br>, <br/> and <br /> also become line breaks.
const SELF_CLOSING_BR_REGEX = /<br\s*\/?>/gi;
// Any tag still present after the passes above.
const ANY_REMAINING_TAG_REGEX = /<[^>]+>/g;
// First <title> element; group 1 captures its inner markup.
const TITLE_TAG_REGEX = /<title\b[^>]*>([\s\S]*?)<\/title\s*>/i;
// Decimal numeric character references such as &#65; (group 1 = the digits).
const NUMERIC_HTML_ENTITY_REGEX = /&#(\d+);/g;
// Runs of spaces/tabs, collapsed to a single space.
const HORIZONTAL_WHITESPACE_RUN = /[ \t]+/g;
// Three or more consecutive newlines, collapsed to one blank line.
const BLANK_LINE_RUN = /\n{3,}/g;
|
|
30
|
+
|
|
31
|
+
/** Removes script, style and noscript elements (content included), in that order. */
function stripNonContentBlocks(html: string): string {
	let remaining = html;
	for (const pattern of [SCRIPT_BLOCK_REGEX, STYLE_BLOCK_REGEX, NOSCRIPT_BLOCK_REGEX]) {
		remaining = remaining.replace(pattern, "");
	}
	return remaining;
}
|
|
34
|
+
|
|
35
|
+
/** Replaces block-level closing tags, then `<br>` variants, with a newline each. */
function convertBlockTagsToNewlines(text: string): string {
	const afterClosers = text.replace(BLOCK_CLOSER_REGEX, "\n");
	return afterClosers.replace(SELF_CLOSING_BR_REGEX, "\n");
}
|
|
38
|
+
|
|
39
|
+
/** Swaps every remaining tag for a single space so adjacent words stay separated. */
function stripRemainingTags(text: string): string {
	const textBetweenTags = text.split(ANY_REMAINING_TAG_REGEX);
	return textBetweenTags.join(" ");
}
|
|
42
|
+
|
|
43
|
+
/** Named character references this module decodes; names are case-sensitive. */
const NAMED_HTML_ENTITIES: ReadonlyMap<string, string> = new Map([
	["amp", "&"],
	["lt", "<"],
	["gt", ">"],
	["quot", '"'],
	["apos", "'"],
	["nbsp", " "],
]);

/** Matches one reference: hex (`&#x41;`), decimal (`&#65;`) or named (`&amp;`). */
const HTML_ENTITY_REGEX = /&(#[xX][0-9a-fA-F]+|#\d+|[a-zA-Z]+);/g;

/** Largest valid Unicode code point. */
const MAX_UNICODE_CODE_POINT = 0x10ffff;

/**
 * Decodes HTML character references in `text`.
 *
 * All references are resolved in a single pass, so already-decoded output is
 * never decoded again (`&amp;lt;` yields the literal text `&lt;`, not `<`).
 * Decimal references, previously matched through NUMERIC_HTML_ENTITY_REGEX,
 * are handled by the same pattern as hex and named ones.
 *
 * @param text Text that may contain `&name;`, `&#NN;` or `&#xHH;` references.
 * @returns The text with known references replaced. Unknown names and numeric
 * references outside the Unicode range are left untouched.
 */
function decodeHtmlEntities(text: string): string {
	return text.replace(HTML_ENTITY_REGEX, (match: string, body: string) => {
		if (!body.startsWith("#")) {
			return NAMED_HTML_ENTITIES.get(body) ?? match;
		}
		const isHex = body[1] === "x" || body[1] === "X";
		const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
		// fromCodePoint (unlike fromCharCode) handles astral characters such as
		// emoji, but throws on out-of-range input, so guard it explicitly.
		if (!Number.isInteger(codePoint) || codePoint > MAX_UNICODE_CODE_POINT) {
			return match;
		}
		return String.fromCodePoint(codePoint);
	});
}
|
|
53
|
+
|
|
54
|
+
/** Squeezes space/tab runs to one space, then runs of 3+ newlines to one blank line. */
function collapseWhitespace(text: string): string {
	const singleSpaced = text.replace(HORIZONTAL_WHITESPACE_RUN, " ");
	return singleSpaced.replace(BLANK_LINE_RUN, "\n\n");
}
|
|
57
|
+
|
|
58
|
+
/**
 * Reduces an HTML document to plain text by running it through the module's
 * passes in a fixed order and trimming the result: drop script/style/noscript,
 * turn block closers into newlines, strip remaining tags, decode entities,
 * collapse whitespace.
 */
export function htmlToText(html: string): string {
	const stages: Array<(input: string) => string> = [
		stripNonContentBlocks,
		convertBlockTagsToNewlines,
		stripRemainingTags,
		decodeHtmlEntities,
		collapseWhitespace,
	];
	return stages.reduce((current, stage) => stage(current), html).trim();
}
|
|
66
|
+
|
|
67
|
+
/**
 * Extracts the document title from the first `<title>` element.
 *
 * Nested tags are removed, character references are decoded (so
 * `Tom &amp; Jerry` reads `Tom & Jerry`), and whitespace runs, including
 * line breaks inside the element, are folded into single spaces.
 *
 * @param html Markup to scan.
 * @returns The cleaned title, or `undefined` when there is no `<title>` or it
 * is empty after cleaning.
 */
export function extractTitle(html: string): string | undefined {
	const inner = TITLE_TAG_REGEX.exec(html)?.[1];
	if (inner === undefined) return undefined;

	// Strip tags before decoding so an escaped "&lt;b&gt;" stays literal text.
	const withoutTags = inner.replace(ANY_REMAINING_TAG_REGEX, "");
	const title = decodeHtmlEntities(withoutTags).replace(/\s+/g, " ").trim();
	return title || undefined;
}
|
|
72
|
+
|
|
73
|
+
// ---------------------------------------------------------------------------
|
|
74
|
+
// URL + content-type guards
|
|
75
|
+
// ---------------------------------------------------------------------------
|
|
76
|
+
|
|
77
|
+
/**
 * Reports whether a `content-type` header value denotes an HTML document.
 *
 * Matching is case-insensitive because media types are case-insensitive, and
 * `application/xhtml+xml` counts as HTML since the fetch `Accept` header asks
 * for it explicitly.
 *
 * @param contentType Raw header value; may include parameters such as charset.
 */
export function isHtmlContentType(contentType: string): boolean {
	const normalized = contentType.toLowerCase();
	return normalized.includes(HTML_CONTENT_TYPE_TOKEN) || normalized.includes("application/xhtml+xml");
}
|
|
80
|
+
|
|
81
|
+
/**
 * Rejects responses whose content type is image, video or audio.
 *
 * The comparison is case-insensitive so that values such as `Image/PNG` are
 * rejected as well.
 *
 * @param contentType Raw `content-type` header value (an empty string passes).
 * @throws Error naming the original header value when it denotes binary media.
 */
export function assertTextContentType(contentType: string): void {
	const normalized = contentType.toLowerCase();
	if (BINARY_CONTENT_TYPE_PREFIXES.some((prefix) => normalized.includes(prefix))) {
		throw new Error(`Unsupported content type: ${contentType}. web_fetch supports text pages only.`);
	}
}
|
|
86
|
+
|
|
87
|
+
// ---------------------------------------------------------------------------
|
|
88
|
+
// HTTP fetch
|
|
89
|
+
// ---------------------------------------------------------------------------
|
|
90
|
+
|
|
91
|
+
/**
 * Builds the options used for direct page fetches: redirects are followed and
 * the module's `User-Agent` and `Accept` headers are attached.
 *
 * @param signal Abort signal to attach, or `undefined` for none.
 */
export function buildFetchRequestInit(signal: AbortSignal | undefined): RequestInit {
	const headers = { "User-Agent": USER_AGENT, Accept: FETCH_ACCEPT_HEADER };
	const init: RequestInit = { signal, redirect: "follow", headers };
	return init;
}
|
|
98
|
+
|
|
99
|
+
/**
 * Fetches `url` with the module's standard request options and returns the
 * response when its status is 2xx.
 *
 * @param url Address to request.
 * @param signal Abort signal forwarded to `fetch`, or `undefined` for none.
 * @returns The successful response, with its body still unread.
 * @throws Error `HTTP <status> <statusText> for <url>` on a non-2xx status.
 */
export async function fetchUrlOrThrow(url: string, signal: AbortSignal | undefined): Promise<Response> {
	const res = await fetch(url, buildFetchRequestInit(signal));
	if (res.ok) return res;

	// The error body is never read, so cancel it to release the underlying
	// connection instead of leaving that to garbage collection.
	try {
		await res.body?.cancel();
	} catch {
		// Best effort only: the HTTP error thrown below is the failure callers care about.
	}
	throw new Error(`HTTP ${res.status} ${res.statusText} for ${url}`);
}
|
|
106
|
+
|
|
107
|
+
/**
 * Reads the response body and, for HTML that was not requested raw, converts
 * it to plain text and extracts the title.
 *
 * @param res Response whose body is consumed here.
 * @param contentType The response's `content-type` value.
 * @param raw When true the body is returned untouched.
 * @returns `{ text, title }` for converted HTML, otherwise `{ text }` only.
 */
export async function extractBodyAsText(
	res: Response,
	contentType: string,
	raw: boolean,
): Promise<{ text: string; title?: string }> {
	const body = await res.text();
	if (raw || !isHtmlContentType(contentType)) {
		return { text: body };
	}
	return { text: htmlToText(body), title: extractTitle(body) };
}
|
|
package/providers/firecrawl.ts
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import type { FetchResponse, SearchProvider, SearchResponse, SearchResult } from "./types.js";
|
|
2
|
+
|
|
3
|
+
/** Base URL of the Firecrawl API; the provider appends `/search` and `/scrape`. */
const FIRECRAWL_API_URL = "https://api.firecrawl.dev/v1";

/** Name of the environment variable that carries the Firecrawl API key. */
export const FIRECRAWL_API_KEY_ENV_VAR = "FIRECRAWL_API_KEY";

/** Static descriptor for the Firecrawl provider: machine name, display label, env var. */
export const FIRECRAWL_PROVIDER_META = {
	name: "firecrawl",
	label: "Firecrawl",
	envVar: FIRECRAWL_API_KEY_ENV_VAR,
} as const;

/** One search hit as returned in `data`; every field may be absent. */
interface FirecrawlSearchResult {
	title?: string;
	url?: string;
	description?: string;
}

/** Subset of the `/search` payload that this module reads. */
interface FirecrawlSearchResponse {
	success?: boolean;
	data?: FirecrawlSearchResult[];
	error?: string;
}

/** Page metadata nested under `data.metadata` of a scrape payload. */
interface FirecrawlScrapeMetadata {
	title?: string;
	description?: string;
	language?: string;
	statusCode?: number;
}

/** Content object nested under `data` of a scrape payload. */
interface FirecrawlScrapeData {
	markdown?: string;
	html?: string;
	metadata?: FirecrawlScrapeMetadata;
}

/** Subset of the `/scrape` payload that this module reads. */
interface FirecrawlScrapeResponse {
	success?: boolean;
	data?: FirecrawlScrapeData;
	error?: string;
}
|
|
37
|
+
|
|
38
|
+
/**
 * Converts raw Firecrawl hits into the provider-agnostic result shape.
 * `description` is exposed as `snippet`; a missing or null field becomes "".
 */
function normalizeFirecrawlResults(results: FirecrawlSearchResult[]): SearchResult[] {
	const toSearchResult = (hit: FirecrawlSearchResult): SearchResult => {
		const { title, url, description } = hit;
		return { title: title ?? "", url: url ?? "", snippet: description ?? "" };
	};
	return results.map(toSearchResult);
}
|
|
45
|
+
|
|
46
|
+
/**
 * Firecrawl provider: `search()` posts to `/search` and `fetch()` posts to
 * `/scrape` requesting markdown. Both require an API key.
 */
export class FirecrawlProvider implements SearchProvider {
	readonly name = FIRECRAWL_PROVIDER_META.name;
	readonly label = FIRECRAWL_PROVIDER_META.label;
	readonly envVar = FIRECRAWL_PROVIDER_META.envVar;

	/** @param apiKey Sent as a Bearer token; an empty value makes both methods throw. */
	constructor(private readonly apiKey: string) {}

	/**
	 * Searches Firecrawl and returns normalized results.
	 *
	 * @param query Search terms.
	 * @param maxResults Sent as `limit`.
	 * @param signal Optional abort signal forwarded to `fetch`.
	 * @returns The query together with the normalized results.
	 * @throws Error when the API key is empty, the API answers with a non-2xx
	 * status, or a 2xx payload explicitly reports `success: false`.
	 */
	async search(query: string, maxResults: number, signal?: AbortSignal): Promise<SearchResponse> {
		if (!this.apiKey) {
			throw new Error(`${this.envVar} is not set. Run /web-search-config to configure, or export the env var.`);
		}

		const res = await fetch(`${FIRECRAWL_API_URL}/search`, {
			method: "POST",
			headers: {
				"Content-Type": "application/json",
				Authorization: `Bearer ${this.apiKey}`,
			},
			body: JSON.stringify({
				query,
				limit: maxResults,
			}),
			signal,
		});

		if (!res.ok) {
			const text = await res.text();
			throw new Error(`${this.label} Search API error (${res.status}): ${text}`);
		}

		const raw = (await res.json()) as FirecrawlSearchResponse;

		// Surface a vendor-level failure instead of reporting "zero results".
		// Only an explicit `false` counts, so payloads that omit `success`
		// keep working as before.
		if (raw.success === false) {
			throw new Error(`${this.label} Search API error: ${raw.error ?? "search failed"}`);
		}

		return { query, results: normalizeFirecrawlResults(raw.data ?? []) };
	}

	/**
	 * Scrapes a single URL through Firecrawl and returns its markdown.
	 *
	 * @param url Page to scrape.
	 * @param _raw Unused by this provider.
	 * @param signal Optional abort signal forwarded to `fetch`.
	 * @returns The markdown, labelled `text/markdown`, with the title when present.
	 * @throws Error when the API key is empty, the API answers with a non-2xx
	 * status, `success` is not truthy, or no markdown is returned.
	 */
	async fetch(url: string, _raw: boolean, signal?: AbortSignal): Promise<FetchResponse> {
		if (!this.apiKey) {
			throw new Error(`${this.envVar} is not set. Run /web-search-config to configure, or export the env var.`);
		}

		const res = await fetch(`${FIRECRAWL_API_URL}/scrape`, {
			method: "POST",
			headers: {
				"Content-Type": "application/json",
				Authorization: `Bearer ${this.apiKey}`,
			},
			body: JSON.stringify({
				url,
				formats: ["markdown"],
			}),
			signal,
		});

		if (!res.ok) {
			const text = await res.text();
			throw new Error(`${this.label} Fetch API error (${res.status}): ${text}`);
		}

		const raw = (await res.json()) as FirecrawlScrapeResponse;

		if (!raw.success) {
			throw new Error(`${this.label} Fetch API error: ${raw.error ?? "scrape failed"}`);
		}

		if (!raw.data?.markdown) {
			throw new Error(`${this.label} Fetch API error: no content returned for ${url}`);
		}

		return {
			text: raw.data.markdown,
			title: raw.data.metadata?.title || undefined,
			contentType: "text/markdown",
		};
	}
}
|
|
package/providers/index.ts
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { BRAVE_PROVIDER_META } from "./brave.js";
|
|
2
|
+
import { EXA_PROVIDER_META } from "./exa.js";
|
|
3
|
+
import { FIRECRAWL_PROVIDER_META } from "./firecrawl.js";
|
|
4
|
+
import { JINA_PROVIDER_META } from "./jina.js";
|
|
5
|
+
import { SERPER_PROVIDER_META } from "./serper.js";
|
|
6
|
+
import { TAVILY_PROVIDER_META } from "./tavily.js";
|
|
7
|
+
|
|
8
|
+
export { BRAVE_API_KEY_ENV_VAR, BRAVE_PROVIDER_META, BraveProvider } from "./brave.js";
|
|
9
|
+
export { EXA_API_KEY_ENV_VAR, EXA_PROVIDER_META, ExaProvider } from "./exa.js";
|
|
10
|
+
export { createSearchProvider } from "./factory.js";
|
|
11
|
+
export { FIRECRAWL_API_KEY_ENV_VAR, FIRECRAWL_PROVIDER_META, FirecrawlProvider } from "./firecrawl.js";
|
|
12
|
+
export { JINA_API_KEY_ENV_VAR, JINA_PROVIDER_META, JinaProvider } from "./jina.js";
|
|
13
|
+
export { SERPER_API_KEY_ENV_VAR, SERPER_PROVIDER_META, SerperProvider } from "./serper.js";
|
|
14
|
+
export { TAVILY_API_KEY_ENV_VAR, TAVILY_PROVIDER_META, TavilyProvider } from "./tavily.js";
|
|
15
|
+
export type { FetchResponse, SearchProvider, SearchResponse, SearchResult } from "./types.js";
|
|
16
|
+
|
|
17
|
+
/**
 * Descriptors (name, label, env var) of every bundled provider, as a readonly
 * tuple in a fixed order with Brave first.
 */
export const PROVIDERS = [
	BRAVE_PROVIDER_META, TAVILY_PROVIDER_META, SERPER_PROVIDER_META,
	EXA_PROVIDER_META, JINA_PROVIDER_META, FIRECRAWL_PROVIDER_META,
] as const;
|