@juicesharp/rpiv-web-tools 1.14.6 → 1.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -30,6 +30,7 @@ Pick one as the active backend; switch any time without losing the others' keys.
30
30
  ## Features
31
31
 
32
32
  - **Read any URL** - fetch http/https pages with HTML-to-text extraction, or get the raw response with `raw: true` (honoured by Brave/Serper/SearXNG; extraction providers — Tavily/Exa/Jina/Firecrawl/Ollama — always return their parsed text).
33
+ - **GitHub URL interceptor (opt-in)** - github.com URLs route through `gh`/`git` for full repository content (file tree, README, individual file contents) instead of the rendered HTML page. Off by default; enable per-user via config or per-consumer at registration time. See [§GitHub URL interceptor](#github-url-interceptor).
33
34
  - **Large-page spillover** - oversized responses truncate inline and spill the full body to a temp file the model can read on demand.
34
35
  - **SSRF guard** - refuses loopback, RFC 1918, link-local, and cloud-metadata addresses (`localhost`, `127.0.0.0/8`, `10.0.0.0/8`, `169.254.0.0/16`, `172.16.0.0/12`, `192.168.0.0/16`, `::1`, `fc00::/7`, `fe80::/10`).
35
36
  - **Interactive setup** - `/web-tools` lists providers (active one first, configured ones marked) and writes to `~/.config/rpiv-web-tools/config.json` (chmod 0600); per-provider env vars also work and take precedence over persisted keys.
@@ -46,9 +47,11 @@ Then restart your Pi session.
46
47
 
47
48
  - **`web_search`** - query the active provider's search API and return titled snippets.
48
49
  1–10 results per call.
49
- - **`web_fetch`** - fetch an http/https URL through the active provider's content path
50
- (raw HTTP+htmlToText for Brave/Serper/SearXNG; native extraction for Tavily/Exa/Jina/Firecrawl/Ollama),
51
- truncate large responses with a temp-file spill for the full content.
50
+ - **`web_fetch`** - read an http/https URL. Lookup order: opt-in URL interceptors
51
+ (see [§GitHub URL interceptor](#github-url-interceptor)), then the active provider's native
52
+ fetch endpoint when it has one (Tavily/Exa/Jina/Firecrawl/Ollama vendor extraction),
53
+ otherwise (Brave/Serper/SearXNG) the shared raw HTTP + HTML-to-text fallback. Large responses truncate
54
+ inline and spill the full body to a temp file the model can read on demand.
52
55
 
53
56
  ### Schema - `web_search`
54
57
 
@@ -107,7 +110,8 @@ Throws on invalid URL, non-http(s) protocol, private/loopback hostnames (SSRF gu
107
110
  - **`/web-tools`** - pick the active provider and set its API key interactively.
108
111
  Providers already configured show `(configured)`; the active one is listed first with a `✓`.
109
112
  Pressing Enter on an empty input keeps the existing key for the chosen provider while
110
- persisting the provider switch. Pass `--show` to see all per-provider keys (masked) and env var status.
113
+ persisting the provider switch. Pass `--show` to see all per-provider keys (masked), env var status,
114
+ and current URL interceptor states (see [§GitHub URL interceptor](#github-url-interceptor)).
111
115
 
112
116
  ## API key resolution (per active provider)
113
117
 
@@ -189,6 +193,45 @@ The provider automatically uses the correct API paths:
189
193
  - **Local** (`localhost`, `127.0.0.1`, `0.0.0.0`): `/api/experimental/web_search` and `/api/experimental/web_fetch`
190
194
  - **Cloud** (any other host): `/api/web_search` and `/api/web_fetch`
191
195
 
196
+ ## GitHub URL interceptor
197
+
198
+ Routes github.com URLs through `gh` / `git` to return repository content (file tree, README, file content) instead of the rendered HTML. **Off by default.** Opt in two ways:
199
+
200
+ ```json
201
+ // ~/.config/rpiv-web-tools/config.json — end-user opt-in
202
+ { "interceptors": { "github": true } }
203
+ ```
204
+
205
+ ```ts
206
+ // or per-consumer at registration time (user config still wins)
207
+ registerWebTools(pi, { interceptors: { github: true } });
208
+ ```
209
+
210
+ When enabled, github.com URLs are parsed into `owner/repo/ref/path`; non-code paths (`/issues`, `/pulls`, `/discussions`, `/releases`, …) fall through to the active provider. The interceptor probes for `gh`, falls back to plain `git clone` (with a stderr hint to install `gh`), and uses the `gh api` JSON view for SHA-pinned URLs and repos above `maxRepoSizeMB`. Shallow clones (`--depth 1 --single-branch`) land in `clonePath`; successful clones cache by `owner/repo@ref` for the session. Auth flows through `gh`'s normal `GH_TOKEN`/`GITHUB_TOKEN` precedence — export `GITHUB_TOKEN` to reach private repos.
211
+
212
+ Replace the boolean shorthand with an object to tune the defaults; object form implies opt-in.
213
+
214
+ ```json
215
+ {
216
+ "interceptors": {
217
+ "github": {
218
+ "maxRepoSizeMB": 1000,
219
+ "cloneTimeoutSeconds": 90,
220
+ "clonePath": "/Users/me/.cache/pi-github-repos"
221
+ }
222
+ }
223
+ }
224
+ ```
225
+
226
+ | Field | Default | Purpose |
227
+ |---|---|---|
228
+ | `enabled` | `false` (top-level) / `true` (inside object form) | Master switch |
229
+ | `maxRepoSizeMB` | `350` | Repos above this threshold skip the clone and use the API view |
230
+ | `cloneTimeoutSeconds` | `30` | Kill the clone process after this many seconds |
231
+ | `clonePath` | `$TMPDIR/pi-github-repos` | Where shallow clones land; one subdir per `owner/repo@ref` |
232
+
233
+ `/web-tools --show` reports the current state at the bottom of its output (resolved token masked, `clonePath`, `maxRepoSizeMB`). The SSRF guard still runs first — a URL with a private/loopback host can't bypass it via a github.com path shape.
234
+
192
235
  ## Executor guidance overrides
193
236
 
194
237
  Override the `promptSnippet` / `promptGuidelines` the model sees for each tool by editing `~/.config/rpiv-web-tools/config.json`. Note the per-tool nesting under `guidance.web_search` / `guidance.web_fetch` — this differs from the flat `guidance` shape used by single-tool siblings (`rpiv-advisor`, `rpiv-todo`, `rpiv-ask-user-question`):
@@ -200,6 +243,9 @@ Override the `promptSnippet` / `promptGuidelines` the model sees for each tool b
200
243
  "exa": "sk-...",
201
244
  "brave": "sk-..."
202
245
  },
246
+ "interceptors": {
247
+ "github": true
248
+ },
203
249
  "guidance": {
204
250
  "web_search": {
205
251
  "promptSnippet": "Search the web for current docs and library versions",
@@ -217,6 +263,8 @@ Override the `promptSnippet` / `promptGuidelines` the model sees for each tool b
217
263
 
218
264
  Each field is independent: omit one and the built-in default is kept. Invalid values (empty string, wrong type, empty array) silently fall back to defaults. Changes take effect on the next Pi session start.
219
265
 
266
+ The `interceptors` key is the GitHub URL interceptor opt-in — see [§GitHub URL interceptor](#github-url-interceptor) for the full schema (boolean shorthand or per-field overrides).
267
+
220
268
  ## Security note: `web_fetch` host guard
221
269
 
222
270
  `web_fetch` refuses URLs targeting loopback (`localhost`, `127.0.0.0/8`, `::1`), RFC 1918 private ranges (`10.0.0.0/8`, `172.16.0.0/12`, `192.168.0.0/16`), link-local (`169.254.0.0/16`, including cloud-metadata at `169.254.169.254`), and IPv6 unique-local / link-local (`fc00::/7`, `fe80::/10`). Attempts surface as `Refusing to fetch private/loopback address: <host>`. This blocks the most common SSRF class — direct-literal targeting of internal services or cloud-metadata endpoints — without preventing legitimate public-web fetches.
package/index.ts CHANGED
@@ -9,11 +9,28 @@
9
9
  */
10
10
 
11
11
  import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
12
+ import { buildInterceptors } from "./providers/interceptors/index.js";
12
13
  import { registerWebFetchTool, registerWebSearchConfigCommand, registerWebSearchTool } from "./web-tools.js";
13
14
 
14
15
  export { createSearchProvider } from "./providers/factory.js";
16
+ export {
17
+ GITHUB_TOKEN_ENV_VAR,
18
+ GitHubInterceptor,
19
+ type GitHubInterceptorOptions,
20
+ type GitHubUrlInfo,
21
+ parseGitHubUrl,
22
+ resolveGitHubOptions,
23
+ type UrlInterceptor,
24
+ } from "./providers/interceptors/index.js";
15
25
 
16
- export type { FetchResponse, SearchProvider, SearchResponse, SearchResult } from "./providers/types.js";
26
+ export type {
27
+ FetchProvider,
28
+ FetchResponse,
29
+ FullProvider,
30
+ SearchProvider,
31
+ SearchResponse,
32
+ SearchResult,
33
+ } from "./providers/types.js";
17
34
  export {
18
35
  DEFAULT_WEB_FETCH_GUIDELINES,
19
36
  DEFAULT_WEB_FETCH_SNIPPET,
@@ -24,7 +41,17 @@ export {
24
41
  registerWebSearchTool,
25
42
  } from "./web-tools.js";
26
43
 
27
- export default function (pi: ExtensionAPI) {
44
+ // Programmatic consumer-side opt-in for URL interceptors. Tier 2 in the
45
+ // resolution model: end-user config (Tier 1) still wins. Default OFF —
46
+ // existing rpiv-web-tools users see zero behavior change.
47
+ export interface RegisterOptions {
48
+ interceptors?: {
49
+ github?: boolean;
50
+ };
51
+ }
52
+
53
+ export default function registerWebTools(pi: ExtensionAPI, opts?: RegisterOptions): void {
54
+ buildInterceptors(opts?.interceptors);
28
55
  registerWebSearchTool(pi);
29
56
  registerWebFetchTool(pi);
30
57
  registerWebSearchConfigCommand(pi);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@juicesharp/rpiv-web-tools",
3
- "version": "1.14.6",
3
+ "version": "1.15.0",
4
4
  "description": "Pi extension. Web search and fetch for the model with pluggable providers (Brave, Tavily, Serper, Exa, Jina, Firecrawl, SearXNG, Ollama).",
5
5
  "keywords": [
6
6
  "pi-package",
@@ -57,7 +57,7 @@
57
57
  ]
58
58
  },
59
59
  "dependencies": {
60
- "@juicesharp/rpiv-config": "^1.14.6"
60
+ "@juicesharp/rpiv-config": "^1.15.0"
61
61
  },
62
62
  "peerDependencies": {
63
63
  "@earendil-works/pi-coding-agent": "*",
@@ -1,5 +1,4 @@
1
- import { assertTextContentType, extractBodyAsText, fetchUrlOrThrow } from "./fetch-helpers.js";
2
- import type { FetchResponse, SearchProvider, SearchResponse, SearchResult } from "./types.js";
1
+ import type { SearchProvider, SearchResponse, SearchResult } from "./types.js";
3
2
 
4
3
  const BRAVE_SEARCH_API_URL = "https://api.search.brave.com/res/v1/web/search";
5
4
  export const BRAVE_API_KEY_ENV_VAR = "BRAVE_SEARCH_API_KEY";
@@ -7,6 +6,7 @@ export const BRAVE_PROVIDER_META = {
7
6
  name: "brave",
8
7
  label: "Brave",
9
8
  envVar: BRAVE_API_KEY_ENV_VAR,
9
+ roles: ["search"] as const,
10
10
  } as const;
11
11
 
12
12
  interface BraveRawResponse {
@@ -55,22 +55,4 @@ export class BraveProvider implements SearchProvider {
55
55
  const raw = (await res.json()) as BraveRawResponse;
56
56
  return { query, results: normalizeBraveResults(raw) };
57
57
  }
58
-
59
- // No apiKey guard: Brave's fetch() wraps the built-in HTTP+htmlToText
60
- // pipeline and does not call any vendor endpoint. Adding a guard would
61
- // break the "use any provider for fetch" contract.
62
- async fetch(url: string, raw: boolean, signal?: AbortSignal): Promise<FetchResponse> {
63
- const res = await fetchUrlOrThrow(url, signal);
64
- const contentType = res.headers.get("content-type") ?? "";
65
- assertTextContentType(contentType);
66
-
67
- const { text, title } = await extractBodyAsText(res, contentType, raw);
68
- const contentLengthHeader = res.headers.get("content-length");
69
- return {
70
- text,
71
- title,
72
- contentType: contentType || undefined,
73
- contentLength: contentLengthHeader ? Number(contentLengthHeader) : undefined,
74
- };
75
- }
76
58
  }
@@ -0,0 +1,129 @@
1
+ import { mkdirSync, rmSync, writeFileSync } from "node:fs";
2
+ import { dirname } from "node:path";
3
+ import { configPath } from "@juicesharp/rpiv-config";
4
+ import { beforeEach, describe, expect, it } from "vitest";
5
+ import { getConfigPath, readConfig, WebToolsConfigSchema, writeConfig } from "./config.js";
6
+
7
+ const CONFIG_PATH = configPath("rpiv-web-tools");
8
+
9
+ beforeEach(() => {
10
+ rmSync(CONFIG_PATH, { force: true });
11
+ });
12
+
13
+ function writeRaw(contents: string): void {
14
+ mkdirSync(dirname(CONFIG_PATH), { recursive: true });
15
+ writeFileSync(CONFIG_PATH, contents, "utf-8");
16
+ }
17
+
18
+ describe("getConfigPath", () => {
19
+ it("returns the canonical ~/.config/rpiv-web-tools/config.json", () => {
20
+ expect(getConfigPath()).toBe(CONFIG_PATH);
21
+ });
22
+ });
23
+
24
+ describe("readConfig — fail-soft posture", () => {
25
+ it("returns {} when the file does not exist", () => {
26
+ expect(readConfig()).toEqual({});
27
+ });
28
+
29
+ it("returns {} on malformed JSON (matches loadJsonConfig tolerance)", () => {
30
+ writeRaw("{ not valid json");
31
+ expect(readConfig()).toEqual({});
32
+ });
33
+
34
+ it("returns {} when the file is a directory (EISDIR)", () => {
35
+ mkdirSync(CONFIG_PATH, { recursive: true });
36
+ try {
37
+ expect(readConfig()).toEqual({});
38
+ } finally {
39
+ rmSync(CONFIG_PATH, { recursive: true, force: true });
40
+ }
41
+ });
42
+
43
+ it("returns {} when the schema validation fails hard (e.g. provider is a number)", () => {
44
+ writeRaw(JSON.stringify({ provider: 123 }));
45
+ expect(readConfig()).toEqual({});
46
+ });
47
+ });
48
+
49
+ describe("readConfig — released-shape compatibility", () => {
50
+ it("loads a minimal { provider, apiKeys } config unchanged", () => {
51
+ writeRaw(JSON.stringify({ provider: "brave", apiKeys: { brave: "k" } }));
52
+ expect(readConfig()).toEqual({ provider: "brave", apiKeys: { brave: "k" } });
53
+ });
54
+
55
+ it("loads the legacy top-level apiKey field", () => {
56
+ writeRaw(JSON.stringify({ apiKey: "legacy" }));
57
+ expect(readConfig()).toMatchObject({ apiKey: "legacy" });
58
+ });
59
+
60
+ it("preserves unknown top-level keys (otherField round-trip contract)", () => {
61
+ // The released /web-tools migrate-legacy-apiKey test relies on this:
62
+ // unknown keys MUST NOT be stripped by the schema reader.
63
+ writeRaw(JSON.stringify({ apiKey: "k", otherField: "keep" }));
64
+ const cfg = readConfig() as { otherField?: string };
65
+ expect(cfg.otherField).toBe("keep");
66
+ });
67
+
68
+ it("loads the guidance subtree with web_search + web_fetch", () => {
69
+ writeRaw(
70
+ JSON.stringify({
71
+ guidance: {
72
+ web_search: { promptSnippet: "snip", promptGuidelines: ["a", "b"] },
73
+ web_fetch: { promptSnippet: "snip2" },
74
+ },
75
+ }),
76
+ );
77
+ const cfg = readConfig();
78
+ expect(cfg.guidance?.web_search?.promptSnippet).toBe("snip");
79
+ expect(cfg.guidance?.web_fetch?.promptSnippet).toBe("snip2");
80
+ });
81
+ });
82
+
83
+ describe("readConfig — interceptors.github union", () => {
84
+ it("accepts the boolean true shorthand", () => {
85
+ writeRaw(JSON.stringify({ interceptors: { github: true } }));
86
+ expect(readConfig().interceptors?.github).toBe(true);
87
+ });
88
+
89
+ it("accepts the boolean false shorthand", () => {
90
+ writeRaw(JSON.stringify({ interceptors: { github: false } }));
91
+ expect(readConfig().interceptors?.github).toBe(false);
92
+ });
93
+
94
+ it("accepts the object override form", () => {
95
+ writeRaw(
96
+ JSON.stringify({
97
+ interceptors: { github: { maxRepoSizeMB: 1000, clonePath: "/x" } },
98
+ }),
99
+ );
100
+ const gh = readConfig().interceptors?.github;
101
+ expect(gh).toEqual({ maxRepoSizeMB: 1000, clonePath: "/x" });
102
+ });
103
+
104
+ it("falls back to {} when interceptors.github has a type-incompatible shape", () => {
105
+ // A number is neither boolean nor a GitHubInterceptorOptions object —
106
+ // hard schema failure → fail-soft to {}.
107
+ writeRaw(JSON.stringify({ interceptors: { github: 42 } }));
108
+ expect(readConfig()).toEqual({});
109
+ });
110
+ });
111
+
112
+ describe("writeConfig", () => {
113
+ it("round-trips a config through readConfig", () => {
114
+ expect(writeConfig({ provider: "brave", apiKeys: { brave: "k" } })).toBe(true);
115
+ expect(readConfig()).toEqual({ provider: "brave", apiKeys: { brave: "k" } });
116
+ });
117
+
118
+ it("preserves the interceptors.github stanza across save+load", () => {
119
+ expect(writeConfig({ interceptors: { github: { maxRepoSizeMB: 500 } } })).toBe(true);
120
+ expect(readConfig().interceptors?.github).toEqual({ maxRepoSizeMB: 500 });
121
+ });
122
+ });
123
+
124
+ describe("WebToolsConfigSchema — schema-only sanity", () => {
125
+ it("exists and is a TypeBox object", () => {
126
+ expect(WebToolsConfigSchema).toBeDefined();
127
+ expect(WebToolsConfigSchema.type).toBe("object");
128
+ });
129
+ });
@@ -0,0 +1,96 @@
1
+ /**
2
+ * Single typed reader/writer for ~/.config/rpiv-web-tools/config.json.
3
+ *
4
+ * Owns the canonical WebToolsConfigSchema. All schema fields are optional and
5
+ * unknown keys pass through (additionalProperties: true) so existing configs
6
+ * carrying legacy/unrelated fields keep working — required for the
7
+ * `otherField: "keep"` preservation contract the released `/web-tools`
8
+ * legacy-apiKey migration depends on.
9
+ *
10
+ * Validation is fail-soft (matching `loadJsonConfig` and `validateConfig` in
11
+ * rpiv-config): malformed JSON, EISDIR, or a hard schema violation all
12
+ * degrade to `{}`. The orchestrator never has to handle "config blew up at
13
+ * startup."
14
+ */
15
+
16
+ import { configPath, GuidanceFieldsSchema, loadJsonConfig, saveJsonConfig } from "@juicesharp/rpiv-config";
17
+ import { type Static, Type } from "typebox";
18
+ import { Value } from "typebox/value";
19
+
20
+ // The web_search / web_fetch tool-namespace wrapper is web-tools' concept, not
21
+ // rpiv-config's. The leaf schema (`GuidanceFieldsSchema`) is sibling-agnostic
22
+ // and lives in rpiv-config; this file only composes the tool-namespaced shell
23
+ // around it.
24
+ const WebToolsGuidanceSchema = Type.Object(
25
+ {
26
+ web_search: Type.Optional(GuidanceFieldsSchema),
27
+ web_fetch: Type.Optional(GuidanceFieldsSchema),
28
+ },
29
+ { additionalProperties: true },
30
+ );
31
+
32
+ const GitHubInterceptorOptionsSchema = Type.Object(
33
+ {
34
+ enabled: Type.Optional(Type.Boolean()),
35
+ maxRepoSizeMB: Type.Optional(Type.Number()),
36
+ cloneTimeoutSeconds: Type.Optional(Type.Number()),
37
+ clonePath: Type.Optional(Type.String()),
38
+ },
39
+ { additionalProperties: true },
40
+ );
41
+
42
+ const InterceptorsConfigSchema = Type.Object(
43
+ {
44
+ // Boolean shorthand or per-field overrides. `enabled: false` inside the
45
+ // object form is allowed but redundant — use the top-level `false`.
46
+ github: Type.Optional(Type.Union([Type.Boolean(), GitHubInterceptorOptionsSchema])),
47
+ },
48
+ { additionalProperties: true },
49
+ );
50
+
51
+ export const WebToolsConfigSchema = Type.Object(
52
+ {
53
+ provider: Type.Optional(Type.String()),
54
+ apiKeys: Type.Optional(Type.Record(Type.String(), Type.String())),
55
+ baseUrls: Type.Optional(Type.Record(Type.String(), Type.String())),
56
+ // Legacy top-level Brave key. Auto-migrated to `apiKeys.brave` by the
57
+ // /web-tools save path — kept here for the load+rewrite round-trip.
58
+ apiKey: Type.Optional(Type.String()),
59
+ guidance: Type.Optional(WebToolsGuidanceSchema),
60
+ interceptors: Type.Optional(InterceptorsConfigSchema),
61
+ },
62
+ { additionalProperties: true },
63
+ );
64
+
65
+ export type WebToolsConfig = Static<typeof WebToolsConfigSchema>;
66
+
67
+ const CONFIG_PATH = configPath("rpiv-web-tools");
68
+
69
+ export function getConfigPath(): string {
70
+ return CONFIG_PATH;
71
+ }
72
+
73
+ // Tolerant read: loadJsonConfig already swallows JSON parse failures + EISDIR
74
+ // into `{}`; we then run a schema check that — on hard failure — falls back to
75
+ // the same `{}`. Validation uses `Value.Check` (no mutation) rather than
76
+ // `Value.Clean` (which would strip unknown fields, breaking the released
77
+ // `otherField` pass-through contract).
78
+ export function readConfig(): WebToolsConfig {
79
+ const raw = loadJsonConfig<unknown>(CONFIG_PATH);
80
+ if (!Value.Check(WebToolsConfigSchema, raw)) {
81
+ return {} as WebToolsConfig;
82
+ }
83
+ return raw as WebToolsConfig;
84
+ }
85
+
86
+ export function writeConfig(c: WebToolsConfig): boolean {
87
+ return saveJsonConfig(CONFIG_PATH, c);
88
+ }
89
+
90
+ // Plan-surface no-op. Phase 4 omits the in-memory cache the plan sketched —
91
+ // the tests' direct-writeFileSync pattern makes per-test invalidation a
92
+ // rewrite-the-suite job for marginal perf gain. Kept exported so that
93
+ // consumers writing against the plan's API can call it without breaking.
94
+ export function invalidateConfigCache(): void {
95
+ // no-op
96
+ }
package/providers/exa.ts CHANGED
@@ -1,4 +1,4 @@
1
- import type { FetchResponse, SearchProvider, SearchResponse, SearchResult } from "./types.js";
1
+ import type { FetchResponse, FullProvider, SearchResponse, SearchResult } from "./types.js";
2
2
 
3
3
  const EXA_API_URL = "https://api.exa.ai/search";
4
4
  const EXA_CONTENTS_API_URL = "https://api.exa.ai/contents";
@@ -7,6 +7,7 @@ export const EXA_PROVIDER_META = {
7
7
  name: "exa",
8
8
  label: "Exa",
9
9
  envVar: EXA_API_KEY_ENV_VAR,
10
+ roles: ["search", "fetch"] as const,
10
11
  } as const;
11
12
  const EXA_MAX_SNIPPET_CHARACTERS = 300;
12
13
  const EXA_MAX_FETCH_CHARACTERS = 10000;
@@ -30,7 +31,7 @@ function normalizeExaResults(results: ExaRawResult[]): SearchResult[] {
30
31
  }));
31
32
  }
32
33
 
33
- export class ExaProvider implements SearchProvider {
34
+ export class ExaProvider implements FullProvider {
34
35
  readonly name = EXA_PROVIDER_META.name;
35
36
  readonly label = EXA_PROVIDER_META.label;
36
37
  readonly envVar = EXA_PROVIDER_META.envVar;
@@ -6,14 +6,18 @@ import { OllamaProvider } from "./ollama.js";
6
6
  import { SearxngProvider } from "./searxng.js";
7
7
  import { SerperProvider } from "./serper.js";
8
8
  import { TavilyProvider } from "./tavily.js";
9
- import type { SearchProvider } from "./types.js";
9
+ import type { FullProvider, SearchProvider } from "./types.js";
10
10
 
11
11
  export interface ProviderCredentials {
12
12
  apiKey?: string;
13
13
  baseUrl?: string;
14
14
  }
15
15
 
16
- export function createSearchProvider(name: string, creds: ProviderCredentials): SearchProvider {
16
+ // The return union mirrors the role split: Brave/Serper/SearXNG are search-
17
+ // only (SearchProvider); the other five expose native fetch endpoints too
18
+ // (FullProvider). Consumers narrow with `"fetch" in provider` when they need
19
+ // to dispatch on capability.
20
+ export function createSearchProvider(name: string, creds: ProviderCredentials): SearchProvider | FullProvider {
17
21
  const apiKey = creds.apiKey ?? "";
18
22
  switch (name) {
19
23
  case "brave":
@@ -1,8 +1,12 @@
1
1
  /**
2
2
  * Shared fetch helpers — HTTP client, content-type guards, and HTML-to-text
3
- * extraction used by providers that wrap the built-in pipeline (Brave, Serper).
3
+ * extraction used by providers that wrap the built-in pipeline (Brave, Serper,
4
+ * SearXNG). `fetchViaGenericHtml` is the one-stop entry point for fetching on
5
+ * behalf of those providers, which have no native fetch endpoint.
4
6
  */
5
7
 
8
+ import type { FetchResponse } from "./types.js";
9
+
6
10
  // ---------------------------------------------------------------------------
7
11
  // Constants
8
12
  // ---------------------------------------------------------------------------
@@ -115,3 +119,21 @@ export async function extractBodyAsText(
115
119
  }
116
120
  return { text: body };
117
121
  }
122
+
123
+ // One-stop fetch helper for providers that have no native fetch endpoint
124
+ // (Brave/Serper/SearXNG). Bundles the quartet — fetchUrlOrThrow →
125
+ // content-type assertion → body extraction → FetchResponse envelope — so
126
+ // callers need only a single call.
127
+ export async function fetchViaGenericHtml(url: string, raw: boolean, signal?: AbortSignal): Promise<FetchResponse> {
128
+ const res = await fetchUrlOrThrow(url, signal);
129
+ const contentType = res.headers.get("content-type") ?? "";
130
+ assertTextContentType(contentType);
131
+ const { text, title } = await extractBodyAsText(res, contentType, raw);
132
+ const contentLengthHeader = res.headers.get("content-length");
133
+ return {
134
+ text,
135
+ title,
136
+ contentType: contentType || undefined,
137
+ contentLength: contentLengthHeader ? Number(contentLengthHeader) : undefined,
138
+ };
139
+ }
@@ -1,4 +1,4 @@
1
- import type { FetchResponse, SearchProvider, SearchResponse, SearchResult } from "./types.js";
1
+ import type { FetchResponse, FullProvider, SearchResponse, SearchResult } from "./types.js";
2
2
 
3
3
  const FIRECRAWL_API_URL = "https://api.firecrawl.dev/v1";
4
4
  export const FIRECRAWL_API_KEY_ENV_VAR = "FIRECRAWL_API_KEY";
@@ -6,6 +6,7 @@ export const FIRECRAWL_PROVIDER_META = {
6
6
  name: "firecrawl",
7
7
  label: "Firecrawl",
8
8
  envVar: FIRECRAWL_API_KEY_ENV_VAR,
9
+ roles: ["search", "fetch"] as const,
9
10
  } as const;
10
11
 
11
12
  interface FirecrawlSearchResult {
@@ -43,7 +44,7 @@ function normalizeFirecrawlResults(results: FirecrawlSearchResult[]): SearchResu
43
44
  }));
44
45
  }
45
46
 
46
- export class FirecrawlProvider implements SearchProvider {
47
+ export class FirecrawlProvider implements FullProvider {
47
48
  readonly name = FIRECRAWL_PROVIDER_META.name;
48
49
  readonly label = FIRECRAWL_PROVIDER_META.label;
49
50
  readonly envVar = FIRECRAWL_PROVIDER_META.envVar;
@@ -12,6 +12,17 @@ export { BRAVE_API_KEY_ENV_VAR, BRAVE_PROVIDER_META, BraveProvider } from "./bra
12
12
  export { EXA_API_KEY_ENV_VAR, EXA_PROVIDER_META, ExaProvider } from "./exa.js";
13
13
  export { createSearchProvider, type ProviderCredentials } from "./factory.js";
14
14
  export { FIRECRAWL_API_KEY_ENV_VAR, FIRECRAWL_PROVIDER_META, FirecrawlProvider } from "./firecrawl.js";
15
+ // URL interceptors live in providers/interceptors/. The github primitives
16
+ // (parseGitHubUrl, GitHubUrlInfo, etc.) are re-exported from there.
17
+ export {
18
+ clearCloneCache,
19
+ GITHUB_TOKEN_ENV_VAR,
20
+ GitHubInterceptor,
21
+ type GitHubInterceptorOptions,
22
+ type GitHubUrlInfo,
23
+ parseGitHubUrl,
24
+ type UrlInterceptor,
25
+ } from "./interceptors/index.js";
15
26
  export { JINA_API_KEY_ENV_VAR, JINA_PROVIDER_META, JinaProvider } from "./jina.js";
16
27
  export {
17
28
  configureOllama,
@@ -35,11 +46,14 @@ export {
35
46
  export { SERPER_API_KEY_ENV_VAR, SERPER_PROVIDER_META, SerperProvider } from "./serper.js";
36
47
  export { TAVILY_API_KEY_ENV_VAR, TAVILY_PROVIDER_META, TavilyProvider } from "./tavily.js";
37
48
  export type {
49
+ FetchProvider,
38
50
  FetchResponse,
51
+ FullProvider,
39
52
  ProviderConfigChange,
40
53
  ProviderConfigCurrent,
41
54
  ProviderConfigUi,
42
55
  ProviderMeta,
56
+ ProviderRole,
43
57
  SearchProvider,
44
58
  SearchResponse,
45
59
  SearchResult,