webveil 0.0.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +661 -0
- package/README.md +326 -0
- package/dist/cli.d.ts +58 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +91 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/backends/custom.d.ts +15 -0
- package/dist/core/backends/custom.d.ts.map +1 -0
- package/dist/core/backends/custom.js +106 -0
- package/dist/core/backends/custom.js.map +1 -0
- package/dist/core/backends/registry.d.ts +13 -0
- package/dist/core/backends/registry.d.ts.map +1 -0
- package/dist/core/backends/registry.js +31 -0
- package/dist/core/backends/registry.js.map +1 -0
- package/dist/core/backends/searxng.d.ts +8 -0
- package/dist/core/backends/searxng.d.ts.map +1 -0
- package/dist/core/backends/searxng.js +43 -0
- package/dist/core/backends/searxng.js.map +1 -0
- package/dist/core/backends/tavily-compat.d.ts +10 -0
- package/dist/core/backends/tavily-compat.d.ts.map +1 -0
- package/dist/core/backends/tavily-compat.js +85 -0
- package/dist/core/backends/tavily-compat.js.map +1 -0
- package/dist/core/backends/types.d.ts +48 -0
- package/dist/core/backends/types.d.ts.map +1 -0
- package/dist/core/backends/types.js +5 -0
- package/dist/core/backends/types.js.map +1 -0
- package/dist/core/baseurl.d.ts +42 -0
- package/dist/core/baseurl.d.ts.map +1 -0
- package/dist/core/baseurl.js +79 -0
- package/dist/core/baseurl.js.map +1 -0
- package/dist/core/config.d.ts +39 -0
- package/dist/core/config.d.ts.map +1 -0
- package/dist/core/config.js +72 -0
- package/dist/core/config.js.map +1 -0
- package/dist/core/egress.d.ts +46 -0
- package/dist/core/egress.d.ts.map +1 -0
- package/dist/core/egress.js +113 -0
- package/dist/core/egress.js.map +1 -0
- package/dist/core/extract.d.ts +45 -0
- package/dist/core/extract.d.ts.map +1 -0
- package/dist/core/extract.js +36 -0
- package/dist/core/extract.js.map +1 -0
- package/dist/core/fetch.d.ts +42 -0
- package/dist/core/fetch.d.ts.map +1 -0
- package/dist/core/fetch.js +76 -0
- package/dist/core/fetch.js.map +1 -0
- package/dist/core/http.d.ts +8 -0
- package/dist/core/http.d.ts.map +1 -0
- package/dist/core/http.js +49 -0
- package/dist/core/http.js.map +1 -0
- package/dist/core/search.d.ts +34 -0
- package/dist/core/search.d.ts.map +1 -0
- package/dist/core/search.js +92 -0
- package/dist/core/search.js.map +1 -0
- package/dist/core/security.d.ts +35 -0
- package/dist/core/security.d.ts.map +1 -0
- package/dist/core/security.js +141 -0
- package/dist/core/security.js.map +1 -0
- package/dist/index.d.ts +22 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +40 -0
- package/dist/index.js.map +1 -0
- package/package.json +62 -2
- package/src/cli.ts +106 -0
- package/src/core/backends/custom.ts +159 -0
- package/src/core/backends/registry.ts +41 -0
- package/src/core/backends/searxng.ts +70 -0
- package/src/core/backends/tavily-compat.ts +156 -0
- package/src/core/backends/types.ts +61 -0
- package/src/core/baseurl.ts +104 -0
- package/src/core/config.ts +106 -0
- package/src/core/egress.ts +134 -0
- package/src/core/extract.ts +82 -0
- package/src/core/fetch.ts +132 -0
- package/src/core/http.ts +62 -0
- package/src/core/search.ts +140 -0
- package/src/core/security.ts +141 -0
- package/src/index.ts +82 -0
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
// core fetch: the plain, framework-agnostic `fetch()` BOTH frontends (the incur
|
|
2
|
+
// CLI/MCP and the pi extension) call. Returns clean, size-bounded markdown with
|
|
3
|
+
// distilly's `truncated` flag.
|
|
4
|
+
//
|
|
5
|
+
// Flow (per URL): pick the content source (a backend's own `/extract`
|
|
6
|
+
// (tavily-compat) when the configured backend provides one, OTHERWISE the
|
|
7
|
+
// default distilly Extractor seam, urlToMarkdown over webveil's egress). The
|
|
8
|
+
// SSRF guard lives INSIDE the egress-bound fetch injected into distilly, so it
|
|
9
|
+
// covers distilly's rule-rewritten requests too (docs/adr/0001).
|
|
10
|
+
//
|
|
11
|
+
// LIST-READY INTERNALS (story 12): the work happens in `fetchAll(urls, …)`, a
|
|
12
|
+
// list-processing internal, so a future `web_batch_fetch` tool is a trivial add
|
|
13
|
+
// with no redesign. The public `fetch()` is a thin single-URL wrapper over it.
|
|
14
|
+
|
|
15
|
+
import {resolveConfig as defaultResolveConfig} from './config.js';
|
|
16
|
+
import type {Config, ResolveOptions} from './config.js';
|
|
17
|
+
import {createEgressFetch as defaultCreateEgressFetch} from './egress.js';
|
|
18
|
+
import type {EgressFetch} from './egress.js';
|
|
19
|
+
import {guardEgressFetch as defaultGuardEgressFetch} from './security.js';
|
|
20
|
+
import {createHttp as defaultCreateHttp} from './http.js';
|
|
21
|
+
import {buildDispatcher as defaultBuildDispatcher} from './egress.js';
|
|
22
|
+
import type {Dispatcher} from './egress.js';
|
|
23
|
+
import {extract as defaultExtract} from './extract.js';
|
|
24
|
+
import type {ExtractDeps} from './extract.js';
|
|
25
|
+
import {getBackend as defaultGetBackend} from './backends/registry.js';
|
|
26
|
+
import type {
|
|
27
|
+
Backend,
|
|
28
|
+
FetchOptions,
|
|
29
|
+
FetchResult,
|
|
30
|
+
Http,
|
|
31
|
+
} from './backends/types.js';
|
|
32
|
+
|
|
33
|
+
/**
 * Injectable collaborators for the fetch core.
 *
 * Every member is optional: `fetchAll` substitutes the real module imported at
 * the top of this file for any member left out. Tests pass fakes here so the
 * backend-`/extract`-vs-Extractor branch, the list path, and the identity of
 * the fetch handed to the Extractor can be asserted without config files,
 * undici, or network access.
 */
export interface FetchDeps {
  /** Produces the effective config from the cwd/env/globalPath knobs. */
  resolveConfig?: (options?: ResolveOptions) => Config;
  /** Looks up the backend named by `config.backend`. */
  getBackend?: (name: string, config: Config) => Backend;
  /** Builds the egress dispatcher; used only on the backend-`fetch` path. */
  buildDispatcher?: (config: Config) => Dispatcher | undefined;
  /** Builds the http helper handed to a backend's own `fetch`. */
  createHttp?: (dispatcher: Dispatcher | undefined) => Http;
  /** Builds the egress-bound fetch; used only on the default Extractor path. */
  createEgressFetch?: (config: Config) => EgressFetch;
  /** Wraps the egress fetch with the SSRF guard before it is injected. */
  guardEgressFetch?: (fetch: EgressFetch, config: Config) => EgressFetch;
  /** The default content source, called once per url. */
  extract?: (
    url: string,
    config: Config,
    options: {size?: Config['fetchSize']},
    deps: ExtractDeps,
  ) => Promise<FetchResult>;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Options accepted by `fetch()` / `fetchAll()`: the per-call fetch options
 * merged with the config-resolution knobs (`cwd`, `env`, `globalPath`).
 */
export interface FetchCoreOptions extends FetchOptions, ResolveOptions {}
|
|
57
|
+
|
|
58
|
+
/**
 * Fetch every url in `urls`, in order, and return one `FetchResult` per url.
 *
 * The content source is chosen once for the whole list:
 *  - if the configured backend exposes its own `fetch`, that is used and is
 *    given only the http helper built from `buildDispatcher(config)`;
 *  - otherwise the Extractor (`extract`) is used, with the SSRF-guarded egress
 *    fetch injected through `ExtractDeps.createEgressFetch`.
 *
 * @param urls - urls to fetch, processed one after another.
 * @param options - per-call fetch options plus config-resolution knobs.
 * @param deps - optional collaborator overrides (defaults are the real modules).
 * @returns the results, in the same order as `urls`.
 */
export async function fetchAll(
  urls: string[],
  options: FetchCoreOptions = {},
  deps: FetchDeps = {},
): Promise<FetchResult[]> {
  const {cwd, env, globalPath} = options;
  const config = (deps.resolveConfig ?? defaultResolveConfig)({
    cwd,
    env,
    globalPath,
  });
  const backend = (deps.getBackend ?? defaultGetBackend)(config.backend, config);

  if (!backend.fetch) {
    // Default path. The egress-bound fetch is built once, wrapped with the SSRF
    // guard, and the SAME guarded instance is handed to every extract call.
    const makeEgressFetch = deps.createEgressFetch ?? defaultCreateEgressFetch;
    const applyGuard = deps.guardEgressFetch ?? defaultGuardEgressFetch;
    const runExtract = deps.extract ?? defaultExtract;

    const guardedFetch = applyGuard(makeEgressFetch(config), config);
    const extractDeps: ExtractDeps = {createEgressFetch: () => guardedFetch};
    return runAll(urls, (url) =>
      runExtract(url, config, {size: options.size}, extractDeps),
    );
  }

  // Backend-provided extraction overrides the Extractor. The backend receives
  // only the http helper bound to the dispatcher built from this config.
  const makeDispatcher = deps.buildDispatcher ?? defaultBuildDispatcher;
  const makeHttp = deps.createHttp ?? defaultCreateHttp;

  const http = makeHttp(makeDispatcher(config));
  const backendFetch = backend.fetch.bind(backend);
  return runAll(urls, (url) =>
    backendFetch(url, http, {size: options.size, signal: options.signal}),
  );
}
|
|
110
|
+
|
|
111
|
+
/**
 * Apply `work` to each url strictly one at a time (the next call starts only
 * after the previous promise settles) and gather the results in input order.
 * A rejection from `work` propagates and stops the remaining urls.
 */
async function runAll(
  urls: string[],
  work: (url: string) => Promise<FetchResult>,
): Promise<FetchResult[]> {
  const collected: FetchResult[] = [];
  for (const target of urls) {
    const result = await work(target);
    collected.push(result);
  }
  return collected;
}
|
|
120
|
+
|
|
121
|
+
/**
 * Fetch a single url. Delegates to `fetchAll` with a one-element list and
 * returns that list's only result.
 *
 * @param url - the url to fetch.
 * @param options - per-call fetch options plus config-resolution knobs.
 * @param deps - optional collaborator overrides, forwarded to `fetchAll`.
 */
export async function fetch(
  url: string,
  options: FetchCoreOptions = {},
  deps: FetchDeps = {},
): Promise<FetchResult> {
  const results = await fetchAll([url], options, deps);
  // fetchAll yields one result per input url, so index 0 is present.
  return results[0]!;
}
|
package/src/core/http.ts
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
// http helper — the proxied `http` handed to backends. fetchJson / fetchText
|
|
2
|
+
// apply the egress dispatcher + a per-request timeout + abort. Distinct from the
|
|
3
|
+
// egress-bound WHATWG `fetch` (egress.ts), but bound to the SAME dispatcher, so
|
|
4
|
+
// a backend physically cannot bypass the configured egress.
|
|
5
|
+
|
|
6
|
+
import {type Dispatcher, fetch as undiciFetch} from 'undici';
|
|
7
|
+
import type {Http, HttpRequestOptions} from './backends/types.js';
|
|
8
|
+
|
|
9
|
+
/** Timeout applied when the caller does not pass `timeoutMs`. */
const DEFAULT_TIMEOUT_MS = 30_000;

/**
 * Issue one request through undici's `fetch` on the given dispatcher, aborting
 * on timeout or when the caller's signal aborts.
 *
 * The timer is cleared as soon as `fetch` settles, so the timeout bounds the
 * time until the response is available, not the later body read.
 *
 * @param dispatcher - dispatcher to route the request through (may be undefined).
 * @param url - absolute url to request.
 * @param options - method/headers/body plus optional `timeoutMs` and `signal`.
 * @returns the response (status is NOT checked here).
 */
async function request(
  dispatcher: Dispatcher | undefined,
  url: string,
  options: HttpRequestOptions = {},
): Promise<Response> {
  const timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS;
  const controller = new AbortController();
  const abort = (): void => controller.abort();
  const timer = setTimeout(abort, timeoutMs);

  const {signal} = options;
  if (signal) {
    // An `abort` listener added to an already-aborted signal never fires, so
    // that case must be handled explicitly or the cancellation is lost.
    if (signal.aborted) abort();
    else signal.addEventListener('abort', abort, {once: true});
  }

  try {
    const res = await undiciFetch(url, {
      method: options.method,
      headers: options.headers,
      body: options.body,
      signal: controller.signal,
      dispatcher,
    } as never);
    return res as unknown as Response;
  } finally {
    clearTimeout(timer);
  }
}
|
|
36
|
+
|
|
37
|
+
/**
 * Create the http helper bound to `dispatcher`.
 *
 * `fetchJson` and `fetchText` both reject with an `Error` when the response
 * status is outside 2xx, before any body is read, so a caller never parses an
 * error payload as if it were a successful result.
 */
export function createHttp(dispatcher: Dispatcher | undefined): Http {
  // Shared by both methods: perform the request and insist on a 2xx status.
  const requestOk = async (
    url: string,
    options?: HttpRequestOptions,
  ): Promise<Response> => {
    const response = await request(dispatcher, url, options);
    if (response.ok) return response;
    throw new Error(`http ${response.status} ${response.statusText} for ${url}`);
  };

  return {
    async fetchJson<T = unknown>(
      url: string,
      options?: HttpRequestOptions,
    ): Promise<T> {
      const response = await requestOk(url, options);
      const payload = await response.json();
      return payload as T;
    },
    async fetchText(
      url: string,
      options?: HttpRequestOptions,
    ): Promise<string> {
      const response = await requestOk(url, options);
      return await response.text();
    },
  };
}
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
// core search — the plain, framework-agnostic `search()` BOTH frontends (the
|
|
2
|
+
// incur CLI/MCP and the pi extension) call. It owns the wiring and the
|
|
3
|
+
// caller-facing post-processing; the per-source parsing lives in the backend.
|
|
4
|
+
//
|
|
5
|
+
// Flow: resolve config → build the egress dispatcher → bind the proxied `http`
|
|
6
|
+
// helper to it → select the backend from the registry → call the backend with
|
|
7
|
+
// ONLY that proxied helper → normalize (dedup + clamp) the SearchResult[].
|
|
8
|
+
//
|
|
9
|
+
// The egress invariant (docs/adr/0001): the backend is handed only the
|
|
10
|
+
// dispatcher-bound `http` helper, so it physically cannot reach a global fetch
|
|
11
|
+
// and bypass the configured egress. A configured-but-unbuildable proxy throws at
|
|
12
|
+
// buildDispatcher (fail-loud), never silently un-proxied.
|
|
13
|
+
|
|
14
|
+
import {resolveConfig as defaultResolveConfig} from './config.js';
|
|
15
|
+
import type {Config, ResolveOptions} from './config.js';
|
|
16
|
+
import {
|
|
17
|
+
buildDispatcher as defaultBuildDispatcher,
|
|
18
|
+
assertEgressAllowsBaseUrl as defaultAssertEgressAllowsBaseUrl,
|
|
19
|
+
} from './egress.js';
|
|
20
|
+
import type {Dispatcher} from './egress.js';
|
|
21
|
+
import {resolveBackendTransport as defaultResolveBackendTransport} from './baseurl.js';
|
|
22
|
+
import type {BackendTransport} from './baseurl.js';
|
|
23
|
+
import {createHttp as defaultCreateHttp} from './http.js';
|
|
24
|
+
import {getBackend as defaultGetBackend} from './backends/registry.js';
|
|
25
|
+
import type {Http, SearchOptions, SearchResult} from './backends/types.js';
|
|
26
|
+
|
|
27
|
+
/**
 * Number of results `search()` returns when the caller omits `maxResults`.
 * There is no configured default, so the core supplies this one; callers can
 * override it per call.
 */
const DEFAULT_MAX_RESULTS = 10;
|
|
34
|
+
|
|
35
|
+
/**
 * Injectable collaborators for the search core.
 *
 * Every member is optional: `search` substitutes the real config / egress /
 * baseurl / http / registry module for any member left out. Tests pass fakes
 * to check which http helper the backend receives and to feed duplicate or
 * over-limit hits through dedup + clamp, without files, undici, or network.
 */
export interface SearchDeps {
  /** Produces the effective config from the cwd/env/globalPath knobs. */
  resolveConfig?: (options?: ResolveOptions) => Config;
  /** Builds the config-wide egress dispatcher. */
  buildDispatcher?: (config: Config) => Dispatcher | undefined;
  /** Called with the config before any transport is built. */
  assertEgressAllowsBaseUrl?: (config: Config) => void;
  /** Resolves the per-hop transport (dispatcher + effective baseUrl). */
  resolveBackendTransport?: (baseUrl: string) => BackendTransport;
  /** Builds the http helper handed to the backend. */
  createHttp?: (dispatcher: Dispatcher | undefined) => Http;
  /** Looks up the backend; only its `search` method is required here. */
  getBackend?: (
    name: string,
    config: Config,
  ) => {
    search: (
      query: string,
      http: Http,
      options?: SearchOptions,
    ) => Promise<SearchResult[]>;
  };
}
|
|
59
|
+
|
|
60
|
+
/**
 * Options accepted by `search()`: the per-call search options merged with the
 * config-resolution knobs (`cwd`, `env`, `globalPath`).
 */
export interface SearchCoreOptions extends SearchOptions, ResolveOptions {}
|
|
62
|
+
|
|
63
|
+
/**
 * Drop later hits whose `url` was already seen. The first occurrence of each
 * url is kept, relative order is preserved, and the input is not mutated.
 */
function dedup(results: SearchResult[]): SearchResult[] {
  const seenUrls = new Set<string>();
  return results.filter(({url}) => {
    if (seenUrls.has(url)) return false;
    seenUrls.add(url);
    return true;
  });
}
|
|
74
|
+
|
|
75
|
+
/**
 * Search the configured backend and return normalized results: deduplicated by
 * url, then clamped to `maxResults` (default `DEFAULT_MAX_RESULTS`).
 *
 * Dedup runs BEFORE the clamp so the caller gets up to `maxResults` UNIQUE
 * hits; for the same reason `maxResults` is not forwarded to the backend (only
 * the abort signal is).
 *
 * @param query - the search query, passed to the backend unchanged.
 * @param options - `maxResults`, `signal`, and the config-resolution knobs.
 *   A negative `maxResults` is treated as 0.
 * @param deps - optional collaborator overrides (defaults are the real modules).
 * @returns at most `maxResults` unique results, in backend order.
 * @throws whatever config resolution, the egress assertions, dispatcher
 *   construction, or the backend's `search` throw.
 */
export async function search(
  query: string,
  options: SearchCoreOptions = {},
  deps: SearchDeps = {},
): Promise<SearchResult[]> {
  const resolveConfig = deps.resolveConfig ?? defaultResolveConfig;
  const buildDispatcher = deps.buildDispatcher ?? defaultBuildDispatcher;
  const assertEgressAllowsBaseUrl =
    deps.assertEgressAllowsBaseUrl ?? defaultAssertEgressAllowsBaseUrl;
  const resolveBackendTransport =
    deps.resolveBackendTransport ?? defaultResolveBackendTransport;
  const createHttp = deps.createHttp ?? defaultCreateHttp;
  const getBackend = deps.getBackend ?? defaultGetBackend;

  const config = resolveConfig({
    cwd: options.cwd,
    env: options.env,
    globalPath: options.globalPath,
  });

  // Reject an incompatible baseUrl/egress combination before any transport
  // is built.
  assertEgressAllowsBaseUrl(config);

  // Per-hop transport for the backend: may carry its own dispatcher and a
  // rewritten baseUrl. It is never merged into the config-wide dispatcher.
  const transport = resolveBackendTransport(config.baseUrl);

  // A per-hop dispatcher, when present, takes precedence; otherwise the
  // config-wide egress dispatcher is built (and may throw) before any I/O.
  const dispatcher = transport.dispatcher ?? buildDispatcher(config);
  const http = createHttp(dispatcher);

  // The backend is given the transport's effective baseUrl; the original
  // config object is reused untouched when nothing was rewritten.
  const backendConfig: Config =
    transport.baseUrl === config.baseUrl
      ? config
      : {...config, baseUrl: transport.baseUrl};
  const backend = getBackend(backendConfig.backend, backendConfig);

  let raw: SearchResult[];
  try {
    raw = await backend.search(query, http, {signal: options.signal});
  } finally {
    // Best-effort close of the per-hop dispatcher only. The rejection is
    // swallowed deliberately: a failed close must neither mask the search
    // outcome nor surface as an unhandled promise rejection.
    if (transport.dispatcher) {
      void Promise.resolve(transport.dispatcher.close()).catch(() => undefined);
    }
  }

  // Clamp at 0: a negative end index would make slice() count from the end
  // ("all but the last k") instead of returning nothing.
  const maxResults = Math.max(0, options.maxResults ?? DEFAULT_MAX_RESULTS);
  return dedup(raw).slice(0, maxResults);
}
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
// SSRF guard: the security seam wrapped AROUND the egress-bound `fetch`, so it
|
|
2
|
+
// covers BOTH webveil's own GETs AND distilly's rule-rewritten requests (see
|
|
3
|
+
// docs/adr/0001: the guard lives inside the egress fetch). Adapts the range
|
|
4
|
+
// classification + DNS-resolve approach of leing2021/pi-search's `security.ts`.
|
|
5
|
+
//
|
|
6
|
+
// THE RELAXATION RULE (load-bearing, recorded in the task's Decisions): the
|
|
7
|
+
// guard BLOCKS private/loopback/link-local/etc. addresses on DIRECT egress, and
|
|
8
|
+
// RELAXES ENTIRELY under a proxy egress (`http` | `socks5`). Tor/Mullvad
|
|
9
|
+
// legitimately reach private-looking addresses (e.g. `10.64.0.1`), AND a local
|
|
10
|
+
// DNS lookup for a proxied request would itself be a deanonymizing leak, so
|
|
11
|
+
// under a proxy we neither block nor resolve locally; the proxy owns egress.
|
|
12
|
+
|
|
13
|
+
import {lookup} from 'node:dns/promises';
|
|
14
|
+
import {isIP} from 'node:net';
|
|
15
|
+
import type {Config} from './config.js';
|
|
16
|
+
import type {EgressFetch} from './egress.js';
|
|
17
|
+
|
|
18
|
+
/**
 * Error raised by the SSRF guard when a request targets a private or otherwise
 * blocked address, or when the url cannot be parsed. `name` is set to
 * `'SsrfError'` so it can be recognised without `instanceof`.
 */
export class SsrfError extends Error {
  constructor(message: string) {
    super(message);
    this.name = 'SsrfError';
  }
}
|
|
25
|
+
|
|
26
|
+
/**
 * True when the configured egress mode is a proxy (`http` or `socks5`). The
 * SSRF guard uses this to skip its local checks entirely.
 */
function egressIsProxy(config: Config): boolean {
  const {mode} = config.egress;
  return mode === 'http' || mode === 'socks5';
}
|
|
30
|
+
|
|
31
|
+
/**
 * Is this LITERAL IP private / non-public? Covers the ranges that must never be
 * reachable from a direct-egress web fetch:
 *   IPv4: 0.0.0.0/8, 10/8 (RFC1918), 127/8 (loopback), 169.254/16 (link-local,
 *         incl. the 169.254.169.254 cloud metadata endpoint), 172.16/12
 *         (RFC1918), 192.168/16 (RFC1918), 100.64/10 (CGNAT), 192.0.0/24,
 *         192.0.2/24, 198.18/15, 198.51.100/24, 203.0.113/24, 224/4
 *         (multicast), 240/4 (reserved).
 *   IPv6: ::1 (loopback), :: (unspecified), fc00::/7 (ULA), fe80::/10
 *         (link-local), ff00::/8 (multicast), plus IPv4-mapped addresses in
 *         either spelling (`::ffff:a.b.c.d` or `::ffff:hhhh:hhhh`), which are
 *         re-checked as IPv4. Default-deny: anything else outside global
 *         unicast (2000::/3) is treated as non-public.
 *
 * @param ip - candidate address text.
 * @returns `true` for a non-public literal IP; `false` for a public literal IP
 *   and for anything that is not a literal IP at all (hostnames must be
 *   resolved by the caller first).
 */
export function isPrivateIp(ip: string): boolean {
  const kind = isIP(ip);
  if (kind === 4) return isPrivateIpv4(ip);
  if (kind === 6) return isPrivateIpv6(ip);
  return false; // not a literal IP; hostname handling resolves it first
}

/** Classify a dotted-quad IPv4 literal; malformed input fails closed (true). */
function isPrivateIpv4(ip: string): boolean {
  const parts = ip.split('.').map((p) => Number(p));
  if (
    parts.length !== 4 ||
    parts.some((n) => !Number.isInteger(n) || n < 0 || n > 255)
  )
    return true; // malformed → treat as non-public (fail closed)
  const [a, b, c] = parts as [number, number, number, number];
  if (a === 0 || a === 10 || a === 127) return true;
  if (a === 169 && b === 254) return true; // link-local incl. cloud metadata
  if (a === 172 && b >= 16 && b <= 31) return true; // 172.16/12
  if (a === 192 && b === 168) return true; // 192.168/16
  if (a === 100 && b >= 64 && b <= 127) return true; // 100.64/10 CGNAT
  // The next /24 ranges need the THIRD octet: matching on two octets alone
  // would block the whole enclosing /16, which contains public address space.
  if (a === 192 && b === 0 && (c === 0 || c === 2)) return true; // 192.0.0/24 + 192.0.2/24
  if (a === 198 && (b === 18 || b === 19)) return true; // 198.18/15 benchmark
  if (a === 198 && b === 51 && c === 100) return true; // 198.51.100/24 TEST-NET-2
  if (a === 203 && b === 0 && c === 113) return true; // 203.0.113/24 TEST-NET-3
  if (a >= 224) return true; // 224/4 multicast + 240/4 reserved
  return false;
}

/** Classify an IPv6 literal; anything unparseable fails closed (true). */
function isPrivateIpv6(ip: string): boolean {
  const lower = ip.toLowerCase();
  if (lower === '::1' || lower === '::') return true; // loopback / unspecified
  // IPv4-mapped, dotted spelling (::ffff:a.b.c.d): re-check the embedded IPv4.
  const mapped = lower.match(/^::ffff:(\d+\.\d+\.\d+\.\d+)$/);
  if (mapped) return isPrivateIpv4(mapped[1]!);
  // IPv4-mapped, hex spelling (::ffff:hhhh:hhhh). This is how the WHATWG URL
  // parser serializes a mapped host, so it must get the same IPv4 re-check.
  const mappedHex = lower.match(/^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/);
  if (mappedHex) {
    const hi = parseInt(mappedHex[1]!, 16);
    const lo = parseInt(mappedHex[2]!, 16);
    return isPrivateIpv4(`${hi >> 8}.${hi & 0xff}.${lo >> 8}.${lo & 0xff}`);
  }
  // First 16-bit group decides the range; a leading "::" means the group is 0.
  const head = lower.split(':')[0] ?? '';
  const first = parseInt(head || '0', 16);
  if (Number.isNaN(first)) return true; // fail closed on anything unparseable
  if ((first & 0xfe00) === 0xfc00) return true; // fc00::/7 ULA
  if ((first & 0xffc0) === 0xfe80) return true; // fe80::/10 link-local
  if ((first & 0xff00) === 0xff00) return true; // ff00::/8 multicast
  // Default-deny: only global unicast 2000::/3 is public.
  return (first & 0xe000) !== 0x2000;
}
|
|
87
|
+
|
|
88
|
+
/**
 * Reject a url that targets a non-public address under THIS config's egress.
 *
 * - Proxy egress: returns immediately; nothing is parsed or resolved locally.
 * - Direct egress, literal IP host: throws if the IP is non-public.
 * - Direct egress, hostname: resolves it with `dns.lookup` (all addresses) and
 *   throws if ANY resolved address is non-public.
 *
 * NOTE(review): the addresses checked here come from a lookup made by this
 * function; whether the later connection reuses them is not visible from this
 * block — confirm if DNS rebinding is in scope for the threat model.
 *
 * @throws SsrfError for a malformed url or a blocked address. Errors from
 *   `lookup` itself propagate unchanged.
 */
export async function assertPublicUrl(
  url: string,
  config: Config,
): Promise<void> {
  if (egressIsProxy(config)) return; // proxy owns egress + DNS; relax entirely

  let hostname: string;
  try {
    hostname = new URL(url).hostname;
  } catch {
    throw new SsrfError(`webveil SSRF: malformed url ${url}`);
  }
  // URL keeps the square brackets around an IPv6 literal; isIP needs them gone.
  const host = hostname.replace(/^\[|\]$/g, '');

  if (!isIP(host)) {
    // Hostname: every address it maps to must be public.
    const resolved = await lookup(host, {all: true});
    const offending = resolved.find(({address}) => isPrivateIp(address));
    if (offending)
      throw new SsrfError(
        `webveil SSRF: ${host} resolves to private address ${offending.address}`,
      );
    return;
  }

  if (isPrivateIp(host))
    throw new SsrfError(`webveil SSRF: blocked private address ${host}`);
}
|
|
120
|
+
|
|
121
|
+
/**
 * Decorate an egress-bound `fetch` with the SSRF guard. The returned function
 * extracts the request url from its first argument (string, `URL`, or a
 * request object with `.url`), runs `assertPublicUrl` on it, and only then
 * calls the wrapped fetch with the original arguments.
 *
 * NOTE(review): only the url passed to each call is checked here. If the
 * wrapped fetch follows redirects on its own, the redirect targets do not pass
 * through this guard — confirm how the egress fetch handles redirects.
 *
 * @param fetch - the egress-bound fetch to protect.
 * @param config - config whose egress mode decides whether the guard applies.
 * @returns a fetch with the same call shape that rejects with `SsrfError`
 *   before any request is made to a blocked address.
 */
export function guardEgressFetch(
  fetch: EgressFetch,
  config: Config,
): EgressFetch {
  return (async (input: RequestInfo | URL, init?: RequestInit) => {
    let url: string;
    if (typeof input === 'string') {
      url = input;
    } else if (input instanceof URL) {
      url = input.href;
    } else {
      url = input.url;
    }
    await assertPublicUrl(url, config);
    return fetch(input as never, init as never);
  }) as EgressFetch;
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
// webveil — anonymous-capable, self-hosted, account-free web search + fetch for agents.
|
|
2
|
+
//
|
|
3
|
+
// This is the public surface. The framework-agnostic core lives under src/core:
|
|
4
|
+
// - core/config.ts : config seam (per-folder .pi/webveil.json + global + env)
|
|
5
|
+
// - core/egress.ts : egress seam (direct | http | socks5/Tor) — dispatcher + egress fetch
|
|
6
|
+
// - core/http.ts : the proxied `http` helper handed to backends
|
|
7
|
+
// - core/extract.ts : Extractor seam (distilly/fetch + injected egress fetch)
|
|
8
|
+
// - core/backends/types.ts : backend seam (the Backend interface + result shapes)
|
|
9
|
+
// - core/backends/registry.ts : name -> Backend dispatcher
|
|
10
|
+
// - core/backends/searxng.ts : the keyless self-hosted SearXNG backend
|
|
11
|
+
// - core/backends/tavily-compat.ts : the generic Tavily-shaped backend (/search + /extract)
|
|
12
|
+
// - core/search.ts : the framework-agnostic search() both frontends call
|
|
13
|
+
// - core/security.ts : SSRF guard wrapped around the egress fetch
|
|
14
|
+
// - core/fetch.ts : the framework-agnostic fetch() both frontends call
|
|
15
|
+
// - core/backends/custom.ts : the local-command escape hatch (JSON stdin/stdout)
|
|
16
|
+
// - cli.ts : the incur CLI + MCP frontend (the `webveil` bin)
|
|
17
|
+
// pi-webveil (sibling package) wraps the SAME core functions as registerTool
|
|
18
|
+
// web_search / web_fetch, in-process, as an Ollama drop-in.
|
|
19
|
+
|
|
20
|
+
// config seam
|
|
21
|
+
export {resolveConfig} from './core/config.js';
|
|
22
|
+
export type {
|
|
23
|
+
Config,
|
|
24
|
+
Egress,
|
|
25
|
+
FetchSize,
|
|
26
|
+
PartialConfig,
|
|
27
|
+
ResolveOptions,
|
|
28
|
+
} from './core/config.js';
|
|
29
|
+
|
|
30
|
+
// egress seam
|
|
31
|
+
export {
|
|
32
|
+
buildDispatcher,
|
|
33
|
+
createEgressFetch,
|
|
34
|
+
EgressError,
|
|
35
|
+
} from './core/egress.js';
|
|
36
|
+
export type {Dispatcher, EgressFetch} from './core/egress.js';
|
|
37
|
+
|
|
38
|
+
// http helper
|
|
39
|
+
export {createHttp} from './core/http.js';
|
|
40
|
+
|
|
41
|
+
// Extractor seam (distilly/fetch over webveil's egress)
|
|
42
|
+
export {extract} from './core/extract.js';
|
|
43
|
+
export type {ExtractOptions, ExtractDeps} from './core/extract.js';
|
|
44
|
+
|
|
45
|
+
// SSRF guard (wrapped around the egress fetch; covers distilly's requests too)
|
|
46
|
+
export {
|
|
47
|
+
assertPublicUrl,
|
|
48
|
+
guardEgressFetch,
|
|
49
|
+
isPrivateIp,
|
|
50
|
+
SsrfError,
|
|
51
|
+
} from './core/security.js';
|
|
52
|
+
|
|
53
|
+
// backend seam (the contract + result types)
|
|
54
|
+
export type {
|
|
55
|
+
Backend,
|
|
56
|
+
Http,
|
|
57
|
+
HttpRequestOptions,
|
|
58
|
+
SearchResult,
|
|
59
|
+
FetchResult,
|
|
60
|
+
SearchOptions,
|
|
61
|
+
FetchOptions,
|
|
62
|
+
} from './core/backends/types.js';
|
|
63
|
+
|
|
64
|
+
// backend registry + implementations
|
|
65
|
+
export {backendNames, getBackend} from './core/backends/registry.js';
|
|
66
|
+
export type {BackendFactory} from './core/backends/registry.js';
|
|
67
|
+
export {createSearxngBackend} from './core/backends/searxng.js';
|
|
68
|
+
export {createTavilyCompatBackend} from './core/backends/tavily-compat.js';
|
|
69
|
+
export {createCustomBackend} from './core/backends/custom.js';
|
|
70
|
+
export type {SpawnFn} from './core/backends/custom.js';
|
|
71
|
+
|
|
72
|
+
// core search (the framework-agnostic search() both frontends call)
|
|
73
|
+
export {search} from './core/search.js';
|
|
74
|
+
export type {SearchCoreOptions, SearchDeps} from './core/search.js';
|
|
75
|
+
|
|
76
|
+
// core fetch (the framework-agnostic fetch() + list-ready fetchAll internal)
|
|
77
|
+
export {fetch, fetchAll} from './core/fetch.js';
|
|
78
|
+
export type {FetchCoreOptions, FetchDeps} from './core/fetch.js';
|
|
79
|
+
|
|
80
|
+
// incur CLI + MCP frontend (the `webveil` bin builds and serves this)
|
|
81
|
+
export {createCli} from './cli.js';
|
|
82
|
+
export type {CliDeps} from './cli.js';
|