@juicesharp/rpiv-web-tools 1.14.6 → 1.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +52 -4
- package/index.ts +29 -2
- package/package.json +2 -2
- package/providers/brave.ts +2 -20
- package/providers/config.test.ts +129 -0
- package/providers/config.ts +96 -0
- package/providers/exa.ts +3 -2
- package/providers/factory.ts +6 -2
- package/providers/fetch-helpers.ts +23 -1
- package/providers/firecrawl.ts +3 -2
- package/providers/index.ts +14 -0
- package/providers/interceptors/chain.test.ts +209 -0
- package/providers/interceptors/github.test.ts +933 -0
- package/providers/interceptors/github.ts +890 -0
- package/providers/interceptors/index.ts +63 -0
- package/providers/interceptors/types.ts +12 -0
- package/providers/jina.ts +3 -2
- package/providers/ollama.ts +3 -2
- package/providers/searxng.ts +1 -19
- package/providers/serper.ts +2 -19
- package/providers/tavily.ts +3 -2
- package/providers/types.ts +22 -0
- package/web-tools.ts +53 -29
package/README.md
CHANGED
|
@@ -30,6 +30,7 @@ Pick one as the active backend; switch any time without losing the others' keys.
|
|
|
30
30
|
## Features
|
|
31
31
|
|
|
32
32
|
- **Read any URL** - fetch http/https pages with HTML-to-text extraction, or get the raw response with `raw: true` (honoured by Brave/Serper/SearXNG; extraction providers — Tavily/Exa/Jina/Firecrawl/Ollama — always return their parsed text).
|
|
33
|
+
- **GitHub URL interceptor (opt-in)** - github.com URLs route through `gh`/`git` for full repository content (file tree, README, individual file contents) instead of the rendered HTML page. Off by default; enable per-user via config or per-consumer at registration time. See [§GitHub URL interceptor](#github-url-interceptor).
|
|
33
34
|
- **Large-page spillover** - oversized responses truncate inline and spill the full body to a temp file the model can read on demand.
|
|
34
35
|
- **SSRF guard** - refuses loopback, RFC 1918, link-local, and cloud-metadata addresses (`localhost`, `127.0.0.0/8`, `10.0.0.0/8`, `169.254.0.0/16`, `172.16.0.0/12`, `192.168.0.0/16`, `::1`, `fc00::/7`, `fe80::/10`).
|
|
35
36
|
- **Interactive setup** - `/web-tools` lists providers (active one first, configured ones marked) and writes to `~/.config/rpiv-web-tools/config.json` (chmod 0600); per-provider env vars also work and take precedence over persisted keys.
|
|
@@ -46,9 +47,11 @@ Then restart your Pi session.
|
|
|
46
47
|
|
|
47
48
|
- **`web_search`** - query the active provider's search API and return titled snippets.
|
|
48
49
|
1–10 results per call.
|
|
49
|
-
- **`web_fetch`** -
|
|
50
|
-
(
|
|
51
|
-
|
|
50
|
+
- **`web_fetch`** - read an http/https URL. Lookup order: opt-in URL interceptors
|
|
51
|
+
(see [§GitHub URL interceptor](#github-url-interceptor)), then the active provider's native
|
|
52
|
+
fetch endpoint when it has one (Tavily/Exa/Jina/Firecrawl/Ollama → vendor extraction),
|
|
53
|
+
otherwise (Brave/Serper/SearXNG) the shared raw HTTP + HTML-to-text fallback. Large responses truncate
|
|
54
|
+
inline and spill the full body to a temp file the model can read on demand.
|
|
52
55
|
|
|
53
56
|
### Schema - `web_search`
|
|
54
57
|
|
|
@@ -107,7 +110,8 @@ Throws on invalid URL, non-http(s) protocol, private/loopback hostnames (SSRF gu
|
|
|
107
110
|
- **`/web-tools`** - pick the active provider and set its API key interactively.
|
|
108
111
|
Providers already configured show `(configured)`; the active one is listed first with a `✓`.
|
|
109
112
|
Pressing Enter on an empty input keeps the existing key for the chosen provider while
|
|
110
|
-
persisting the provider switch. Pass `--show` to see all per-provider keys (masked)
|
|
113
|
+
persisting the provider switch. Pass `--show` to see all per-provider keys (masked), env var status,
|
|
114
|
+
and current URL interceptor states (see [§GitHub URL interceptor](#github-url-interceptor)).
|
|
111
115
|
|
|
112
116
|
## API key resolution (per active provider)
|
|
113
117
|
|
|
@@ -189,6 +193,45 @@ The provider automatically uses the correct API paths:
|
|
|
189
193
|
- **Local** (`localhost`, `127.0.0.1`, `0.0.0.0`): `/api/experimental/web_search` and `/api/experimental/web_fetch`
|
|
190
194
|
- **Cloud** (any other host): `/api/web_search` and `/api/web_fetch`
|
|
191
195
|
|
|
196
|
+
## GitHub URL interceptor
|
|
197
|
+
|
|
198
|
+
Routes github.com URLs through `gh` / `git` to return repository content (file tree, README, file content) instead of the rendered HTML. **Off by default.** Opt in two ways:
|
|
199
|
+
|
|
200
|
+
```json
|
|
201
|
+
// ~/.config/rpiv-web-tools/config.json — end-user opt-in
|
|
202
|
+
{ "interceptors": { "github": true } }
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
```ts
|
|
206
|
+
// or per-consumer at registration time (user config still wins)
|
|
207
|
+
registerWebTools(pi, { interceptors: { github: true } });
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
When enabled, github.com URLs are parsed into `owner/repo/ref/path`; non-code paths (`/issues`, `/pulls`, `/discussions`, `/releases`, …) fall through to the active provider. The interceptor probes for `gh`, falls back to plain `git clone` (with a stderr hint to install `gh`), and uses the `gh api` JSON view for SHA-pinned URLs and repos above `maxRepoSizeMB`. Shallow clones (`--depth 1 --single-branch`) land in `clonePath`; successful clones cache by `owner/repo@ref` for the session. Auth flows through `gh`'s normal `GH_TOKEN`/`GITHUB_TOKEN` precedence — export `GITHUB_TOKEN` to reach private repos.
|
|
211
|
+
|
|
212
|
+
Replace the boolean shorthand with an object to tune the defaults; object form implies opt-in.
|
|
213
|
+
|
|
214
|
+
```json
|
|
215
|
+
{
|
|
216
|
+
"interceptors": {
|
|
217
|
+
"github": {
|
|
218
|
+
"maxRepoSizeMB": 1000,
|
|
219
|
+
"cloneTimeoutSeconds": 90,
|
|
220
|
+
"clonePath": "/Users/me/.cache/pi-github-repos"
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
}
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
| Field | Default | Purpose |
|
|
227
|
+
|---|---|---|
|
|
228
|
+
| `enabled` | `false` (top-level) / `true` (inside object form) | Master switch |
|
|
229
|
+
| `maxRepoSizeMB` | `350` | Repos above this threshold skip the clone and use the API view |
|
|
230
|
+
| `cloneTimeoutSeconds` | `30` | Kill the clone process after this many seconds |
|
|
231
|
+
| `clonePath` | `$TMPDIR/pi-github-repos` | Where shallow clones land; one subdir per `owner/repo@ref` |
|
|
232
|
+
|
|
233
|
+
`/web-tools --show` reports the current state at the bottom of its output (resolved token masked, `clonePath`, `maxRepoSizeMB`). The SSRF guard still runs first — a URL with a private/loopback host can't bypass it via a github.com path shape.
|
|
234
|
+
|
|
192
235
|
## Executor guidance overrides
|
|
193
236
|
|
|
194
237
|
Override the `promptSnippet` / `promptGuidelines` the model sees for each tool by editing `~/.config/rpiv-web-tools/config.json`. Note the per-tool nesting under `guidance.web_search` / `guidance.web_fetch` — this differs from the flat `guidance` shape used by single-tool siblings (`rpiv-advisor`, `rpiv-todo`, `rpiv-ask-user-question`):
|
|
@@ -200,6 +243,9 @@ Override the `promptSnippet` / `promptGuidelines` the model sees for each tool b
|
|
|
200
243
|
"exa": "sk-...",
|
|
201
244
|
"brave": "sk-..."
|
|
202
245
|
},
|
|
246
|
+
"interceptors": {
|
|
247
|
+
"github": true
|
|
248
|
+
},
|
|
203
249
|
"guidance": {
|
|
204
250
|
"web_search": {
|
|
205
251
|
"promptSnippet": "Search the web for current docs and library versions",
|
|
@@ -217,6 +263,8 @@ Override the `promptSnippet` / `promptGuidelines` the model sees for each tool b
|
|
|
217
263
|
|
|
218
264
|
Each field is independent: omit one and the built-in default is kept. Invalid values (empty string, wrong type, empty array) silently fall back to defaults. Changes take effect on the next Pi session start.
|
|
219
265
|
|
|
266
|
+
The `interceptors` key is the GitHub URL interceptor opt-in — see [§GitHub URL interceptor](#github-url-interceptor) for the full schema (boolean shorthand or per-field overrides).
|
|
267
|
+
|
|
220
268
|
## Security note: `web_fetch` host guard
|
|
221
269
|
|
|
222
270
|
`web_fetch` refuses URLs targeting loopback (`localhost`, `127.0.0.0/8`, `::1`), RFC 1918 private ranges (`10.0.0.0/8`, `172.16.0.0/12`, `192.168.0.0/16`), link-local (`169.254.0.0/16`, including cloud-metadata at `169.254.169.254`), and IPv6 unique-local / link-local (`fc00::/7`, `fe80::/10`). Attempts surface as `Refusing to fetch private/loopback address: <host>`. This blocks the most common SSRF class — direct-literal targeting of internal services or cloud-metadata endpoints — without preventing legitimate public-web fetches.
|
package/index.ts
CHANGED
|
@@ -9,11 +9,28 @@
|
|
|
9
9
|
*/
|
|
10
10
|
|
|
11
11
|
import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
|
|
12
|
+
import { buildInterceptors } from "./providers/interceptors/index.js";
|
|
12
13
|
import { registerWebFetchTool, registerWebSearchConfigCommand, registerWebSearchTool } from "./web-tools.js";
|
|
13
14
|
|
|
14
15
|
export { createSearchProvider } from "./providers/factory.js";
|
|
16
|
+
export {
|
|
17
|
+
GITHUB_TOKEN_ENV_VAR,
|
|
18
|
+
GitHubInterceptor,
|
|
19
|
+
type GitHubInterceptorOptions,
|
|
20
|
+
type GitHubUrlInfo,
|
|
21
|
+
parseGitHubUrl,
|
|
22
|
+
resolveGitHubOptions,
|
|
23
|
+
type UrlInterceptor,
|
|
24
|
+
} from "./providers/interceptors/index.js";
|
|
15
25
|
|
|
16
|
-
export type {
|
|
26
|
+
export type {
|
|
27
|
+
FetchProvider,
|
|
28
|
+
FetchResponse,
|
|
29
|
+
FullProvider,
|
|
30
|
+
SearchProvider,
|
|
31
|
+
SearchResponse,
|
|
32
|
+
SearchResult,
|
|
33
|
+
} from "./providers/types.js";
|
|
17
34
|
export {
|
|
18
35
|
DEFAULT_WEB_FETCH_GUIDELINES,
|
|
19
36
|
DEFAULT_WEB_FETCH_SNIPPET,
|
|
@@ -24,7 +41,17 @@ export {
|
|
|
24
41
|
registerWebSearchTool,
|
|
25
42
|
} from "./web-tools.js";
|
|
26
43
|
|
|
27
|
-
|
|
44
|
+
// Programmatic consumer-side opt-in for URL interceptors. Tier 2 in the
|
|
45
|
+
// resolution model: end-user config (Tier 1) still wins. Default OFF —
|
|
46
|
+
// existing rpiv-web-tools users see zero behavior change.
|
|
47
|
+
export interface RegisterOptions {
|
|
48
|
+
interceptors?: {
|
|
49
|
+
github?: boolean;
|
|
50
|
+
};
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
export default function registerWebTools(pi: ExtensionAPI, opts?: RegisterOptions): void {
|
|
54
|
+
buildInterceptors(opts?.interceptors);
|
|
28
55
|
registerWebSearchTool(pi);
|
|
29
56
|
registerWebFetchTool(pi);
|
|
30
57
|
registerWebSearchConfigCommand(pi);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@juicesharp/rpiv-web-tools",
|
|
3
|
-
"version": "1.14.6",
|
|
3
|
+
"version": "1.15.0",
|
|
4
4
|
"description": "Pi extension. Web search and fetch for the model with pluggable providers (Brave, Tavily, Serper, Exa, Jina, Firecrawl, SearXNG, Ollama).",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"pi-package",
|
|
@@ -57,7 +57,7 @@
|
|
|
57
57
|
]
|
|
58
58
|
},
|
|
59
59
|
"dependencies": {
|
|
60
|
-
"@juicesharp/rpiv-config": "^1.14.6"
|
|
60
|
+
"@juicesharp/rpiv-config": "^1.15.0"
|
|
61
61
|
},
|
|
62
62
|
"peerDependencies": {
|
|
63
63
|
"@earendil-works/pi-coding-agent": "*",
|
package/providers/brave.ts
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
|
-
import { assertTextContentType, extractBodyAsText, fetchUrlOrThrow } from "./fetch-helpers.js";
|
|
2
|
-
import type { FetchResponse, SearchProvider, SearchResponse, SearchResult } from "./types.js";
|
|
1
|
+
import type { SearchProvider, SearchResponse, SearchResult } from "./types.js";
|
|
3
2
|
|
|
4
3
|
const BRAVE_SEARCH_API_URL = "https://api.search.brave.com/res/v1/web/search";
|
|
5
4
|
export const BRAVE_API_KEY_ENV_VAR = "BRAVE_SEARCH_API_KEY";
|
|
@@ -7,6 +6,7 @@ export const BRAVE_PROVIDER_META = {
|
|
|
7
6
|
name: "brave",
|
|
8
7
|
label: "Brave",
|
|
9
8
|
envVar: BRAVE_API_KEY_ENV_VAR,
|
|
9
|
+
roles: ["search"] as const,
|
|
10
10
|
} as const;
|
|
11
11
|
|
|
12
12
|
interface BraveRawResponse {
|
|
@@ -55,22 +55,4 @@ export class BraveProvider implements SearchProvider {
|
|
|
55
55
|
const raw = (await res.json()) as BraveRawResponse;
|
|
56
56
|
return { query, results: normalizeBraveResults(raw) };
|
|
57
57
|
}
|
|
58
|
-
|
|
59
|
-
// No apiKey guard: Brave's fetch() wraps the built-in HTTP+htmlToText
|
|
60
|
-
// pipeline and does not call any vendor endpoint. Adding a guard would
|
|
61
|
-
// break the "use any provider for fetch" contract.
|
|
62
|
-
async fetch(url: string, raw: boolean, signal?: AbortSignal): Promise<FetchResponse> {
|
|
63
|
-
const res = await fetchUrlOrThrow(url, signal);
|
|
64
|
-
const contentType = res.headers.get("content-type") ?? "";
|
|
65
|
-
assertTextContentType(contentType);
|
|
66
|
-
|
|
67
|
-
const { text, title } = await extractBodyAsText(res, contentType, raw);
|
|
68
|
-
const contentLengthHeader = res.headers.get("content-length");
|
|
69
|
-
return {
|
|
70
|
-
text,
|
|
71
|
-
title,
|
|
72
|
-
contentType: contentType || undefined,
|
|
73
|
-
contentLength: contentLengthHeader ? Number(contentLengthHeader) : undefined,
|
|
74
|
-
};
|
|
75
|
-
}
|
|
76
58
|
}
|
|
package/providers/config.test.ts
@@ -0,0 +1,129 @@
|
|
|
1
|
+
import { mkdirSync, rmSync, writeFileSync } from "node:fs";
|
|
2
|
+
import { dirname } from "node:path";
|
|
3
|
+
import { configPath } from "@juicesharp/rpiv-config";
|
|
4
|
+
import { beforeEach, describe, expect, it } from "vitest";
|
|
5
|
+
import { getConfigPath, readConfig, WebToolsConfigSchema, writeConfig } from "./config.js";
|
|
6
|
+
|
|
7
|
+
const CONFIG_PATH = configPath("rpiv-web-tools");
|
|
8
|
+
|
|
9
|
+
beforeEach(() => {
|
|
10
|
+
rmSync(CONFIG_PATH, { force: true });
|
|
11
|
+
});
|
|
12
|
+
|
|
13
|
+
function writeRaw(contents: string): void {
|
|
14
|
+
mkdirSync(dirname(CONFIG_PATH), { recursive: true });
|
|
15
|
+
writeFileSync(CONFIG_PATH, contents, "utf-8");
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
describe("getConfigPath", () => {
|
|
19
|
+
it("returns the canonical ~/.config/rpiv-web-tools/config.json", () => {
|
|
20
|
+
expect(getConfigPath()).toBe(CONFIG_PATH);
|
|
21
|
+
});
|
|
22
|
+
});
|
|
23
|
+
|
|
24
|
+
describe("readConfig — fail-soft posture", () => {
|
|
25
|
+
it("returns {} when the file does not exist", () => {
|
|
26
|
+
expect(readConfig()).toEqual({});
|
|
27
|
+
});
|
|
28
|
+
|
|
29
|
+
it("returns {} on malformed JSON (matches loadJsonConfig tolerance)", () => {
|
|
30
|
+
writeRaw("{ not valid json");
|
|
31
|
+
expect(readConfig()).toEqual({});
|
|
32
|
+
});
|
|
33
|
+
|
|
34
|
+
it("returns {} when the file is a directory (EISDIR)", () => {
|
|
35
|
+
mkdirSync(CONFIG_PATH, { recursive: true });
|
|
36
|
+
try {
|
|
37
|
+
expect(readConfig()).toEqual({});
|
|
38
|
+
} finally {
|
|
39
|
+
rmSync(CONFIG_PATH, { recursive: true, force: true });
|
|
40
|
+
}
|
|
41
|
+
});
|
|
42
|
+
|
|
43
|
+
it("returns {} when the schema validation fails hard (e.g. provider is a number)", () => {
|
|
44
|
+
writeRaw(JSON.stringify({ provider: 123 }));
|
|
45
|
+
expect(readConfig()).toEqual({});
|
|
46
|
+
});
|
|
47
|
+
});
|
|
48
|
+
|
|
49
|
+
describe("readConfig — released-shape compatibility", () => {
|
|
50
|
+
it("loads a minimal { provider, apiKeys } config unchanged", () => {
|
|
51
|
+
writeRaw(JSON.stringify({ provider: "brave", apiKeys: { brave: "k" } }));
|
|
52
|
+
expect(readConfig()).toEqual({ provider: "brave", apiKeys: { brave: "k" } });
|
|
53
|
+
});
|
|
54
|
+
|
|
55
|
+
it("loads the legacy top-level apiKey field", () => {
|
|
56
|
+
writeRaw(JSON.stringify({ apiKey: "legacy" }));
|
|
57
|
+
expect(readConfig()).toMatchObject({ apiKey: "legacy" });
|
|
58
|
+
});
|
|
59
|
+
|
|
60
|
+
it("preserves unknown top-level keys (otherField round-trip contract)", () => {
|
|
61
|
+
// The released /web-tools migrate-legacy-apiKey test relies on this:
|
|
62
|
+
// unknown keys MUST NOT be stripped by the schema reader.
|
|
63
|
+
writeRaw(JSON.stringify({ apiKey: "k", otherField: "keep" }));
|
|
64
|
+
const cfg = readConfig() as { otherField?: string };
|
|
65
|
+
expect(cfg.otherField).toBe("keep");
|
|
66
|
+
});
|
|
67
|
+
|
|
68
|
+
it("loads the guidance subtree with web_search + web_fetch", () => {
|
|
69
|
+
writeRaw(
|
|
70
|
+
JSON.stringify({
|
|
71
|
+
guidance: {
|
|
72
|
+
web_search: { promptSnippet: "snip", promptGuidelines: ["a", "b"] },
|
|
73
|
+
web_fetch: { promptSnippet: "snip2" },
|
|
74
|
+
},
|
|
75
|
+
}),
|
|
76
|
+
);
|
|
77
|
+
const cfg = readConfig();
|
|
78
|
+
expect(cfg.guidance?.web_search?.promptSnippet).toBe("snip");
|
|
79
|
+
expect(cfg.guidance?.web_fetch?.promptSnippet).toBe("snip2");
|
|
80
|
+
});
|
|
81
|
+
});
|
|
82
|
+
|
|
83
|
+
describe("readConfig — interceptors.github union", () => {
|
|
84
|
+
it("accepts the boolean true shorthand", () => {
|
|
85
|
+
writeRaw(JSON.stringify({ interceptors: { github: true } }));
|
|
86
|
+
expect(readConfig().interceptors?.github).toBe(true);
|
|
87
|
+
});
|
|
88
|
+
|
|
89
|
+
it("accepts the boolean false shorthand", () => {
|
|
90
|
+
writeRaw(JSON.stringify({ interceptors: { github: false } }));
|
|
91
|
+
expect(readConfig().interceptors?.github).toBe(false);
|
|
92
|
+
});
|
|
93
|
+
|
|
94
|
+
it("accepts the object override form", () => {
|
|
95
|
+
writeRaw(
|
|
96
|
+
JSON.stringify({
|
|
97
|
+
interceptors: { github: { maxRepoSizeMB: 1000, clonePath: "/x" } },
|
|
98
|
+
}),
|
|
99
|
+
);
|
|
100
|
+
const gh = readConfig().interceptors?.github;
|
|
101
|
+
expect(gh).toEqual({ maxRepoSizeMB: 1000, clonePath: "/x" });
|
|
102
|
+
});
|
|
103
|
+
|
|
104
|
+
it("falls back to {} when interceptors.github has a type-incompatible shape", () => {
|
|
105
|
+
// A number is neither boolean nor a GitHubInterceptorOptions object —
|
|
106
|
+
// hard schema failure → fail-soft to {}.
|
|
107
|
+
writeRaw(JSON.stringify({ interceptors: { github: 42 } }));
|
|
108
|
+
expect(readConfig()).toEqual({});
|
|
109
|
+
});
|
|
110
|
+
});
|
|
111
|
+
|
|
112
|
+
describe("writeConfig", () => {
|
|
113
|
+
it("round-trips a config through readConfig", () => {
|
|
114
|
+
expect(writeConfig({ provider: "brave", apiKeys: { brave: "k" } })).toBe(true);
|
|
115
|
+
expect(readConfig()).toEqual({ provider: "brave", apiKeys: { brave: "k" } });
|
|
116
|
+
});
|
|
117
|
+
|
|
118
|
+
it("preserves the interceptors.github stanza across save+load", () => {
|
|
119
|
+
expect(writeConfig({ interceptors: { github: { maxRepoSizeMB: 500 } } })).toBe(true);
|
|
120
|
+
expect(readConfig().interceptors?.github).toEqual({ maxRepoSizeMB: 500 });
|
|
121
|
+
});
|
|
122
|
+
});
|
|
123
|
+
|
|
124
|
+
describe("WebToolsConfigSchema — schema-only sanity", () => {
|
|
125
|
+
it("exists and is a TypeBox object", () => {
|
|
126
|
+
expect(WebToolsConfigSchema).toBeDefined();
|
|
127
|
+
expect(WebToolsConfigSchema.type).toBe("object");
|
|
128
|
+
});
|
|
129
|
+
});
|
|
package/providers/config.ts
@@ -0,0 +1,96 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Single typed reader/writer for ~/.config/rpiv-web-tools/config.json.
|
|
3
|
+
*
|
|
4
|
+
* Owns the canonical WebToolsConfigSchema. All schema fields are optional and
|
|
5
|
+
* unknown keys pass through (additionalProperties: true) so existing configs
|
|
6
|
+
* carrying legacy/unrelated fields keep working — required for the
|
|
7
|
+
* `otherField: "keep"` preservation contract the released `/web-tools`
|
|
8
|
+
* legacy-apiKey migration depends on.
|
|
9
|
+
*
|
|
10
|
+
* Validation is fail-soft (matching `loadJsonConfig` and `validateConfig` in
|
|
11
|
+
* rpiv-config): malformed JSON, EISDIR, or a hard schema violation all
|
|
12
|
+
* degrade to `{}`. The orchestrator never has to handle "config blew up at
|
|
13
|
+
* startup."
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import { configPath, GuidanceFieldsSchema, loadJsonConfig, saveJsonConfig } from "@juicesharp/rpiv-config";
|
|
17
|
+
import { type Static, Type } from "typebox";
|
|
18
|
+
import { Value } from "typebox/value";
|
|
19
|
+
|
|
20
|
+
// The web_search / web_fetch tool-namespace wrapper is web-tools' concept, not
|
|
21
|
+
// rpiv-config's. The leaf schema (`GuidanceFieldsSchema`) is sibling-agnostic
|
|
22
|
+
// and lives in rpiv-config; this file only composes the tool-namespaced shell
|
|
23
|
+
// around it.
|
|
24
|
+
const WebToolsGuidanceSchema = Type.Object(
|
|
25
|
+
{
|
|
26
|
+
web_search: Type.Optional(GuidanceFieldsSchema),
|
|
27
|
+
web_fetch: Type.Optional(GuidanceFieldsSchema),
|
|
28
|
+
},
|
|
29
|
+
{ additionalProperties: true },
|
|
30
|
+
);
|
|
31
|
+
|
|
32
|
+
const GitHubInterceptorOptionsSchema = Type.Object(
|
|
33
|
+
{
|
|
34
|
+
enabled: Type.Optional(Type.Boolean()),
|
|
35
|
+
maxRepoSizeMB: Type.Optional(Type.Number()),
|
|
36
|
+
cloneTimeoutSeconds: Type.Optional(Type.Number()),
|
|
37
|
+
clonePath: Type.Optional(Type.String()),
|
|
38
|
+
},
|
|
39
|
+
{ additionalProperties: true },
|
|
40
|
+
);
|
|
41
|
+
|
|
42
|
+
const InterceptorsConfigSchema = Type.Object(
|
|
43
|
+
{
|
|
44
|
+
// Boolean shorthand or per-field overrides. `enabled: false` inside the
|
|
45
|
+
// object form is allowed but redundant — use the top-level `false`.
|
|
46
|
+
github: Type.Optional(Type.Union([Type.Boolean(), GitHubInterceptorOptionsSchema])),
|
|
47
|
+
},
|
|
48
|
+
{ additionalProperties: true },
|
|
49
|
+
);
|
|
50
|
+
|
|
51
|
+
export const WebToolsConfigSchema = Type.Object(
|
|
52
|
+
{
|
|
53
|
+
provider: Type.Optional(Type.String()),
|
|
54
|
+
apiKeys: Type.Optional(Type.Record(Type.String(), Type.String())),
|
|
55
|
+
baseUrls: Type.Optional(Type.Record(Type.String(), Type.String())),
|
|
56
|
+
// Legacy top-level Brave key. Auto-migrated to `apiKeys.brave` by the
|
|
57
|
+
// /web-tools save path — kept here for the load+rewrite round-trip.
|
|
58
|
+
apiKey: Type.Optional(Type.String()),
|
|
59
|
+
guidance: Type.Optional(WebToolsGuidanceSchema),
|
|
60
|
+
interceptors: Type.Optional(InterceptorsConfigSchema),
|
|
61
|
+
},
|
|
62
|
+
{ additionalProperties: true },
|
|
63
|
+
);
|
|
64
|
+
|
|
65
|
+
export type WebToolsConfig = Static<typeof WebToolsConfigSchema>;
|
|
66
|
+
|
|
67
|
+
const CONFIG_PATH = configPath("rpiv-web-tools");
|
|
68
|
+
|
|
69
|
+
export function getConfigPath(): string {
|
|
70
|
+
return CONFIG_PATH;
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
// Tolerant read: loadJsonConfig already swallows JSON parse failures + EISDIR
|
|
74
|
+
// into `{}`; we then run a schema check that — on hard failure — falls back to
|
|
75
|
+
// the same `{}`. Validation uses `Value.Check` (no mutation) rather than
|
|
76
|
+
// `Value.Clean` (would strip unknown fields like the released `otherField`
|
|
77
|
+
// pass-through contract).
|
|
78
|
+
export function readConfig(): WebToolsConfig {
|
|
79
|
+
const raw = loadJsonConfig<unknown>(CONFIG_PATH);
|
|
80
|
+
if (!Value.Check(WebToolsConfigSchema, raw)) {
|
|
81
|
+
return {} as WebToolsConfig;
|
|
82
|
+
}
|
|
83
|
+
return raw as WebToolsConfig;
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
export function writeConfig(c: WebToolsConfig): boolean {
|
|
87
|
+
return saveJsonConfig(CONFIG_PATH, c);
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
// Plan-surface no-op. Phase 4 omits the in-memory cache the plan sketched —
|
|
91
|
+
// the tests' direct-writeFileSync pattern makes per-test invalidation a
|
|
92
|
+
// rewrite-the-suite job for marginal perf gain. Kept exported so that
|
|
93
|
+
// consumers writing against the plan's API can call it without breaking.
|
|
94
|
+
export function invalidateConfigCache(): void {
|
|
95
|
+
// no-op
|
|
96
|
+
}
|
package/providers/exa.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { FetchResponse, SearchProvider, SearchResponse, SearchResult } from "./types.js";
|
|
1
|
+
import type { FetchResponse, FullProvider, SearchResponse, SearchResult } from "./types.js";
|
|
2
2
|
|
|
3
3
|
const EXA_API_URL = "https://api.exa.ai/search";
|
|
4
4
|
const EXA_CONTENTS_API_URL = "https://api.exa.ai/contents";
|
|
@@ -7,6 +7,7 @@ export const EXA_PROVIDER_META = {
|
|
|
7
7
|
name: "exa",
|
|
8
8
|
label: "Exa",
|
|
9
9
|
envVar: EXA_API_KEY_ENV_VAR,
|
|
10
|
+
roles: ["search", "fetch"] as const,
|
|
10
11
|
} as const;
|
|
11
12
|
const EXA_MAX_SNIPPET_CHARACTERS = 300;
|
|
12
13
|
const EXA_MAX_FETCH_CHARACTERS = 10000;
|
|
@@ -30,7 +31,7 @@ function normalizeExaResults(results: ExaRawResult[]): SearchResult[] {
|
|
|
30
31
|
}));
|
|
31
32
|
}
|
|
32
33
|
|
|
33
|
-
export class ExaProvider implements SearchProvider {
|
|
34
|
+
export class ExaProvider implements FullProvider {
|
|
34
35
|
readonly name = EXA_PROVIDER_META.name;
|
|
35
36
|
readonly label = EXA_PROVIDER_META.label;
|
|
36
37
|
readonly envVar = EXA_PROVIDER_META.envVar;
|
package/providers/factory.ts
CHANGED
|
@@ -6,14 +6,18 @@ import { OllamaProvider } from "./ollama.js";
|
|
|
6
6
|
import { SearxngProvider } from "./searxng.js";
|
|
7
7
|
import { SerperProvider } from "./serper.js";
|
|
8
8
|
import { TavilyProvider } from "./tavily.js";
|
|
9
|
-
import type { SearchProvider } from "./types.js";
|
|
9
|
+
import type { FullProvider, SearchProvider } from "./types.js";
|
|
10
10
|
|
|
11
11
|
export interface ProviderCredentials {
|
|
12
12
|
apiKey?: string;
|
|
13
13
|
baseUrl?: string;
|
|
14
14
|
}
|
|
15
15
|
|
|
16
|
-
|
|
16
|
+
// The return union mirrors the role split: Brave/Serper/SearXNG are search-
|
|
17
|
+
// only (SearchProvider); the other five expose native fetch endpoints too
|
|
18
|
+
// (FullProvider). Consumers narrow with `"fetch" in provider` when they need
|
|
19
|
+
// to dispatch on capability.
|
|
20
|
+
export function createSearchProvider(name: string, creds: ProviderCredentials): SearchProvider | FullProvider {
|
|
17
21
|
const apiKey = creds.apiKey ?? "";
|
|
18
22
|
switch (name) {
|
|
19
23
|
case "brave":
|
|
package/providers/fetch-helpers.ts
@@ -1,8 +1,12 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Shared fetch helpers — HTTP client, content-type guards, and HTML-to-text
|
|
3
|
-
* extraction used by providers that wrap the built-in pipeline (Brave, Serper
|
|
3
|
+
* extraction used by providers that wrap the built-in pipeline (Brave, Serper,
|
|
4
|
+
* SearXNG). `fetchViaGenericHtml` is the one-stop entry point those providers
|
|
5
|
+
* delegate their `fetch()` method to.
|
|
4
6
|
*/
|
|
5
7
|
|
|
8
|
+
import type { FetchResponse } from "./types.js";
|
|
9
|
+
|
|
6
10
|
// ---------------------------------------------------------------------------
|
|
7
11
|
// Constants
|
|
8
12
|
// ---------------------------------------------------------------------------
|
|
@@ -115,3 +119,21 @@ export async function extractBodyAsText(
|
|
|
115
119
|
}
|
|
116
120
|
return { text: body };
|
|
117
121
|
}
|
|
122
|
+
|
|
123
|
+
// One-stop fetch helper for providers that have no native fetch endpoint
|
|
124
|
+
// (Brave/Serper/SearXNG). Bundles the quartet — fetchUrlOrThrow →
|
|
125
|
+
// content-type assertion → body extraction → FetchResponse envelope — so
|
|
126
|
+
// providers collapse to a single delegating call.
|
|
127
|
+
export async function fetchViaGenericHtml(url: string, raw: boolean, signal?: AbortSignal): Promise<FetchResponse> {
|
|
128
|
+
const res = await fetchUrlOrThrow(url, signal);
|
|
129
|
+
const contentType = res.headers.get("content-type") ?? "";
|
|
130
|
+
assertTextContentType(contentType);
|
|
131
|
+
const { text, title } = await extractBodyAsText(res, contentType, raw);
|
|
132
|
+
const contentLengthHeader = res.headers.get("content-length");
|
|
133
|
+
return {
|
|
134
|
+
text,
|
|
135
|
+
title,
|
|
136
|
+
contentType: contentType || undefined,
|
|
137
|
+
contentLength: contentLengthHeader ? Number(contentLengthHeader) : undefined,
|
|
138
|
+
};
|
|
139
|
+
}
|
package/providers/firecrawl.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { FetchResponse, SearchProvider, SearchResponse, SearchResult } from "./types.js";
|
|
1
|
+
import type { FetchResponse, FullProvider, SearchResponse, SearchResult } from "./types.js";
|
|
2
2
|
|
|
3
3
|
const FIRECRAWL_API_URL = "https://api.firecrawl.dev/v1";
|
|
4
4
|
export const FIRECRAWL_API_KEY_ENV_VAR = "FIRECRAWL_API_KEY";
|
|
@@ -6,6 +6,7 @@ export const FIRECRAWL_PROVIDER_META = {
|
|
|
6
6
|
name: "firecrawl",
|
|
7
7
|
label: "Firecrawl",
|
|
8
8
|
envVar: FIRECRAWL_API_KEY_ENV_VAR,
|
|
9
|
+
roles: ["search", "fetch"] as const,
|
|
9
10
|
} as const;
|
|
10
11
|
|
|
11
12
|
interface FirecrawlSearchResult {
|
|
@@ -43,7 +44,7 @@ function normalizeFirecrawlResults(results: FirecrawlSearchResult[]): SearchResu
|
|
|
43
44
|
}));
|
|
44
45
|
}
|
|
45
46
|
|
|
46
|
-
export class FirecrawlProvider implements SearchProvider {
|
|
47
|
+
export class FirecrawlProvider implements FullProvider {
|
|
47
48
|
readonly name = FIRECRAWL_PROVIDER_META.name;
|
|
48
49
|
readonly label = FIRECRAWL_PROVIDER_META.label;
|
|
49
50
|
readonly envVar = FIRECRAWL_PROVIDER_META.envVar;
|
package/providers/index.ts
CHANGED
|
@@ -12,6 +12,17 @@ export { BRAVE_API_KEY_ENV_VAR, BRAVE_PROVIDER_META, BraveProvider } from "./bra
|
|
|
12
12
|
export { EXA_API_KEY_ENV_VAR, EXA_PROVIDER_META, ExaProvider } from "./exa.js";
|
|
13
13
|
export { createSearchProvider, type ProviderCredentials } from "./factory.js";
|
|
14
14
|
export { FIRECRAWL_API_KEY_ENV_VAR, FIRECRAWL_PROVIDER_META, FirecrawlProvider } from "./firecrawl.js";
|
|
15
|
+
// URL interceptors live in providers/interceptors/. The github primitives
|
|
16
|
+
// (parseGitHubUrl, GitHubUrlInfo, etc.) are re-exported from there.
|
|
17
|
+
export {
|
|
18
|
+
clearCloneCache,
|
|
19
|
+
GITHUB_TOKEN_ENV_VAR,
|
|
20
|
+
GitHubInterceptor,
|
|
21
|
+
type GitHubInterceptorOptions,
|
|
22
|
+
type GitHubUrlInfo,
|
|
23
|
+
parseGitHubUrl,
|
|
24
|
+
type UrlInterceptor,
|
|
25
|
+
} from "./interceptors/index.js";
|
|
15
26
|
export { JINA_API_KEY_ENV_VAR, JINA_PROVIDER_META, JinaProvider } from "./jina.js";
|
|
16
27
|
export {
|
|
17
28
|
configureOllama,
|
|
@@ -35,11 +46,14 @@ export {
|
|
|
35
46
|
export { SERPER_API_KEY_ENV_VAR, SERPER_PROVIDER_META, SerperProvider } from "./serper.js";
|
|
36
47
|
export { TAVILY_API_KEY_ENV_VAR, TAVILY_PROVIDER_META, TavilyProvider } from "./tavily.js";
|
|
37
48
|
export type {
|
|
49
|
+
FetchProvider,
|
|
38
50
|
FetchResponse,
|
|
51
|
+
FullProvider,
|
|
39
52
|
ProviderConfigChange,
|
|
40
53
|
ProviderConfigCurrent,
|
|
41
54
|
ProviderConfigUi,
|
|
42
55
|
ProviderMeta,
|
|
56
|
+
ProviderRole,
|
|
43
57
|
SearchProvider,
|
|
44
58
|
SearchResponse,
|
|
45
59
|
SearchResult,
|