webveil 0.0.0 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +661 -0
- package/README.md +101 -0
- package/dist/cli.d.ts +58 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +91 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/backends/custom.d.ts +15 -0
- package/dist/core/backends/custom.d.ts.map +1 -0
- package/dist/core/backends/custom.js +106 -0
- package/dist/core/backends/custom.js.map +1 -0
- package/dist/core/backends/registry.d.ts +13 -0
- package/dist/core/backends/registry.d.ts.map +1 -0
- package/dist/core/backends/registry.js +31 -0
- package/dist/core/backends/registry.js.map +1 -0
- package/dist/core/backends/searxng.d.ts +8 -0
- package/dist/core/backends/searxng.d.ts.map +1 -0
- package/dist/core/backends/searxng.js +43 -0
- package/dist/core/backends/searxng.js.map +1 -0
- package/dist/core/backends/tavily-compat.d.ts +10 -0
- package/dist/core/backends/tavily-compat.d.ts.map +1 -0
- package/dist/core/backends/tavily-compat.js +85 -0
- package/dist/core/backends/tavily-compat.js.map +1 -0
- package/dist/core/backends/types.d.ts +48 -0
- package/dist/core/backends/types.d.ts.map +1 -0
- package/dist/core/backends/types.js +5 -0
- package/dist/core/backends/types.js.map +1 -0
- package/dist/core/config.d.ts +39 -0
- package/dist/core/config.d.ts.map +1 -0
- package/dist/core/config.js +72 -0
- package/dist/core/config.js.map +1 -0
- package/dist/core/egress.d.ts +30 -0
- package/dist/core/egress.d.ts.map +1 -0
- package/dist/core/egress.js +87 -0
- package/dist/core/egress.js.map +1 -0
- package/dist/core/extract.d.ts +45 -0
- package/dist/core/extract.d.ts.map +1 -0
- package/dist/core/extract.js +36 -0
- package/dist/core/extract.js.map +1 -0
- package/dist/core/fetch.d.ts +42 -0
- package/dist/core/fetch.d.ts.map +1 -0
- package/dist/core/fetch.js +76 -0
- package/dist/core/fetch.js.map +1 -0
- package/dist/core/http.d.ts +8 -0
- package/dist/core/http.d.ts.map +1 -0
- package/dist/core/http.js +49 -0
- package/dist/core/http.js.map +1 -0
- package/dist/core/search.d.ts +31 -0
- package/dist/core/search.d.ts.map +1 -0
- package/dist/core/search.js +65 -0
- package/dist/core/search.js.map +1 -0
- package/dist/core/security.d.ts +35 -0
- package/dist/core/security.d.ts.map +1 -0
- package/dist/core/security.js +141 -0
- package/dist/core/security.js.map +1 -0
- package/dist/index.d.ts +22 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +40 -0
- package/dist/index.js.map +1 -0
- package/package.json +62 -2
- package/src/cli.ts +106 -0
- package/src/core/backends/custom.ts +159 -0
- package/src/core/backends/registry.ts +41 -0
- package/src/core/backends/searxng.ts +70 -0
- package/src/core/backends/tavily-compat.ts +156 -0
- package/src/core/backends/types.ts +61 -0
- package/src/core/config.ts +106 -0
- package/src/core/egress.ts +106 -0
- package/src/core/extract.ts +82 -0
- package/src/core/fetch.ts +132 -0
- package/src/core/http.ts +62 -0
- package/src/core/search.ts +104 -0
- package/src/core/security.ts +141 -0
- package/src/index.ts +82 -0
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
// custom backend — the local-command escape hatch (contract lifted from
|
|
2
|
+
// pi-web-providers' custom-wrapper). Instead of an HTTP source, it spawns a
|
|
3
|
+
// configured local command, writes the request as JSON to its stdin, and parses
|
|
4
|
+
// `SearchResult[]` from its stdout. This lets any local script be a backend.
|
|
5
|
+
//
|
|
6
|
+
// Egress note: this backend owns its own I/O (the spawned command does whatever
|
|
7
|
+
// it wants), so the handed `http` helper is unused here — there is no outbound
|
|
8
|
+
// HTTP for webveil to proxy. It still returns the normalized SearchResult shape.
|
|
9
|
+
//
|
|
10
|
+
// Command source: the configured `baseUrl` carries the command line, parsed as a
|
|
11
|
+
// whitespace-separated argv (first token = executable, rest = args), matching how
|
|
12
|
+
// the other backends read `baseUrl` as "where results come from". (Recorded
|
|
13
|
+
// decision; see the task's Decisions block.)
|
|
14
|
+
//
|
|
15
|
+
// Contract:
|
|
16
|
+
// stdin <- JSON: {"query": string, "maxResults"?: number}
|
|
17
|
+
// stdout -> JSON: SearchResult[] (each {title, url, snippet?})
|
|
18
|
+
// Malformed stdout (non-JSON, not an array, or entries missing url/title) FAILS
|
|
19
|
+
// CLEARLY — it never silently returns an empty list.
|
|
20
|
+
|
|
21
|
+
import {spawn as defaultSpawn} from 'node:child_process';
|
|
22
|
+
import type {Config} from '../config.js';
|
|
23
|
+
import type {Backend, Http, SearchOptions, SearchResult} from './types.js';
|
|
24
|
+
|
|
25
|
+
/**
 * Payload serialized as JSON onto the spawned command's stdin.
 */
interface CustomRequest {
  /** The search query text. */
  query: string;
  /** Result limit; present only when the caller supplied one. */
  maxResults?: number;
}
|
|
30
|
+
|
|
31
|
+
/**
 * Everything collected from one run of the configured command.
 */
interface CommandRun {
  /** Text the command wrote to stdout. */
  stdout: string;
  /** Text the command wrote to stderr. */
  stderr: string;
  /** Exit code as reported by the child's `close` event (may be `null`). */
  code: number | null;
}
|
|
37
|
+
|
|
38
|
+
/**
 * The `spawn` signature this backend depends on. It is a seam: tests can pass
 * a fake so no real subprocess is started. Production code uses
 * `node:child_process` `spawn`.
 */
export type SpawnFn = typeof defaultSpawn;
|
|
43
|
+
|
|
44
|
+
/** Return `value` when it is a non-empty string; otherwise `undefined`. */
function str(value: unknown): string | undefined {
  if (typeof value !== 'string') return undefined;
  return value.length === 0 ? undefined : value;
}
|
|
47
|
+
|
|
48
|
+
/**
 * Split the configured command line into `[executable, args]`.
 *
 * Tokens are separated by runs of whitespace; there is no quoting or escaping.
 *
 * @throws Error when the command line contains no tokens.
 */
function parseCommand(baseUrl: string): [string, string[]] {
  const [exe, ...args] = baseUrl.split(/\s+/).filter((token) => token !== '');
  if (exe === undefined) {
    throw new Error(
      'custom: no command configured (set baseUrl to the command to run)',
    );
  }
  return [exe, args];
}
|
|
57
|
+
|
|
58
|
+
/**
 * Convert one parsed stdout entry into a SearchResult.
 *
 * The custom contract is explicit, so a bad entry is reported rather than
 * dropped: a non-object entry, or one without a non-empty string `url` and
 * `title`, throws. `snippet` is copied only when it is a non-empty string.
 *
 * @param entry - one element of the parsed stdout array.
 * @param index - position of the entry, used in error messages.
 */
function toResult(entry: unknown, index: number): SearchResult {
  if (entry === null || typeof entry !== 'object') {
    throw new Error(
      `custom: malformed output — result[${index}] is not an object`,
    );
  }
  const fields = entry as Record<string, unknown>;
  const url = str(fields.url);
  const title = str(fields.title);
  if (url === undefined || title === undefined) {
    throw new Error(
      `custom: malformed output — result[${index}] is missing a url or title`,
    );
  }
  const result: SearchResult = {title, url};
  const snippet = str(fields.snippet);
  if (snippet !== undefined) result.snippet = snippet;
  return result;
}
|
|
78
|
+
|
|
79
|
+
/**
 * Turn the command's stdout into SearchResult[].
 *
 * @throws Error when stdout is blank, is not valid JSON, is not a JSON array,
 *   or contains an entry `toResult` rejects.
 */
function parseOutput(stdout: string): SearchResult[] {
  const text = stdout.trim();
  if (text === '') throw new Error('custom: command produced no output');

  let payload: unknown;
  try {
    payload = JSON.parse(text);
  } catch (cause) {
    throw new Error(
      `custom: malformed output — stdout is not valid JSON: ${(cause as Error).message}`,
    );
  }

  if (!Array.isArray(payload)) {
    throw new Error(
      'custom: malformed output — expected a JSON array of results',
    );
  }
  return payload.map((entry: unknown, index: number) => toResult(entry, index));
}
|
|
98
|
+
|
|
99
|
+
/**
 * Spawn the command, write the request to its stdin as JSON, and collect what
 * it writes to stdout and stderr.
 *
 * Output is gathered as raw bytes and decoded as UTF-8 once, when the child
 * closes. Decoding each `data` chunk separately would corrupt a multi-byte
 * character that is split across two chunks.
 *
 * @param spawn - spawn implementation (injectable so tests can use a fake).
 * @param exe - executable to run.
 * @param args - arguments passed to the executable.
 * @param request - payload serialized onto the child's stdin.
 * @param signal - optional abort signal forwarded to `spawn`.
 * @returns the decoded stdout/stderr and the exit code from the `close` event.
 *   Rejects with an Error when the child emits `error`.
 */
function runCommand(
  spawn: SpawnFn,
  exe: string,
  args: string[],
  request: CustomRequest,
  signal?: AbortSignal,
): Promise<CommandRun> {
  return new Promise<CommandRun>((resolve, reject) => {
    const child = spawn(exe, args, {
      stdio: ['pipe', 'pipe', 'pipe'],
      signal,
    });
    const stdoutChunks: Buffer[] = [];
    const stderrChunks: Buffer[] = [];
    // Accept both Buffer chunks (real pipes) and string chunks (test fakes).
    const collectInto =
      (chunks: Buffer[]) =>
      (chunk: Buffer | string): void => {
        chunks.push(
          typeof chunk === 'string'
            ? Buffer.from(chunk, 'utf8')
            : Buffer.from(chunk),
        );
      };
    child.stdout?.on('data', collectInto(stdoutChunks));
    child.stderr?.on('data', collectInto(stderrChunks));
    child.on('error', (err) =>
      reject(new Error(`custom: failed to spawn '${exe}': ${err.message}`)),
    );
    child.on('close', (code) =>
      resolve({
        stdout: Buffer.concat(stdoutChunks).toString('utf8'),
        stderr: Buffer.concat(stderrChunks).toString('utf8'),
        code,
      }),
    );
    child.stdin?.on('error', () => {
      // A command that exits before reading stdin closes the pipe; ignore the
      // EPIPE here and let the close handler report via exit code/stderr.
    });
    child.stdin?.end(JSON.stringify(request));
  });
}
|
|
127
|
+
|
|
128
|
+
/**
 * Create the custom backend for the configured command line.
 *
 * The command line (`config.baseUrl`) is parsed once, at construction. Each
 * search spawns the command, sends the request as JSON on stdin, and parses
 * SearchResult[] from stdout. The handed `http` helper is not used.
 *
 * @param config - resolved config; `baseUrl` carries the command line.
 * @param spawn - spawn implementation, overridable in tests.
 * @throws Error (from `search`) when the command exits with a non-zero code
 *   or its output is malformed.
 */
export function createCustomBackend(
  config: Config,
  spawn: SpawnFn = defaultSpawn,
): Backend {
  const [exe, args] = parseCommand(config.baseUrl);
  return {
    async search(
      query: string,
      _http: Http,
      options: SearchOptions = {},
    ): Promise<SearchResult[]> {
      const request: CustomRequest = {query};
      if (options.maxResults !== undefined) {
        request.maxResults = options.maxResults;
      }

      const run = await runCommand(spawn, exe, args, request, options.signal);
      if (run.code !== 0) {
        const detail = run.stderr.trim() ? `: ${run.stderr.trim()}` : '';
        throw new Error(
          `custom: command '${exe}' exited with code ${run.code}` + detail,
        );
      }

      const results = parseOutput(run.stdout);
      if (options.maxResults === undefined) return results;
      return results.slice(0, options.maxResults);
    },
  };
}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
// backend registry — a tiny `name -> Backend` dispatcher (concept trimmed from
|
|
2
|
+
// pi-search-hub's registry). Each backend registers a factory keyed by its config
|
|
3
|
+
// `backend` name; `getBackend` resolves the name to a constructed Backend (handed
|
|
4
|
+
// the resolved config so it knows its instance baseUrl / apiKey) and fails clearly
|
|
5
|
+
// on an unknown name. All built-in backends (searxng, tavily-compat, custom) are
|
|
6
|
+
// registered in FACTORIES below; a new backend adds its own entry there.
|
|
7
|
+
|
|
8
|
+
import type {Config} from '../config.js';
|
|
9
|
+
import type {Backend} from './types.js';
|
|
10
|
+
import {createSearxngBackend} from './searxng.js';
|
|
11
|
+
import {createTavilyCompatBackend} from './tavily-compat.js';
|
|
12
|
+
import {createCustomBackend} from './custom.js';
|
|
13
|
+
|
|
14
|
+
/**
 * A function that constructs a Backend from the resolved config, from which it
 * reads its instance `baseUrl` / `apiKey`.
 */
export type BackendFactory = (config: Config) => Backend;
|
|
16
|
+
|
|
17
|
+
/**
 * Backend name -> factory. To add a backend, add its entry here. Key order is
 * the order `backendNames()` reports.
 */
const FACTORIES: Record<string, BackendFactory> = {
  'searxng': createSearxngBackend,
  'tavily-compat': createTavilyCompatBackend,
  'custom': createCustomBackend,
};
|
|
23
|
+
|
|
24
|
+
/** List every backend name that has a registered factory. */
export function backendNames(): string[] {
  return Object.entries(FACTORIES).map(([name]) => name);
}
|
|
28
|
+
|
|
29
|
+
/**
 * Resolve a backend name to a constructed Backend.
 *
 * Only names registered directly in FACTORIES are accepted. The lookup uses an
 * own-property check because a plain index would also find members inherited
 * from `Object.prototype` (for example `constructor` or `toString`) and call
 * them as if they were factories.
 *
 * @param name - configured backend name.
 * @param config - resolved config handed to the factory.
 * @throws Error naming the known backends when `name` is not registered, so a
 *   misconfigured `backend` fails loudly.
 */
export function getBackend(name: string, config: Config): Backend {
  const factory = Object.prototype.hasOwnProperty.call(FACTORIES, name)
    ? FACTORIES[name]
    : undefined;
  if (!factory)
    throw new Error(
      `webveil: unknown backend '${name}' (known: ${backendNames().join(', ')})`,
    );
  return factory(config);
}
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
// searxng backend — the keyless, self-hosted metasearch default. Queries a
|
|
2
|
+
// SearXNG instance's JSON API (`/search?format=json`) THROUGH the handed `http`
|
|
3
|
+
// helper (never a direct fetch, so egress is not bypassable) and normalizes the
|
|
4
|
+
// response into SearchResult[].
|
|
5
|
+
|
|
6
|
+
import type {Config} from '../config.js';
|
|
7
|
+
import type {Backend, Http, SearchOptions, SearchResult} from './types.js';
|
|
8
|
+
|
|
9
|
+
/**
 * One entry of a SearXNG JSON `results` array, limited to the fields read
 * here. All fields are `unknown` because the response is not validated.
 */
interface SearxngResult {
  /** Link to the hit. */
  url?: unknown;
  /** Title of the hit. */
  title?: unknown;
  /** Text used as the result snippet. */
  content?: unknown;
}
|
|
15
|
+
|
|
16
|
+
/** Body of a SearXNG JSON API response, limited to the part read here. */
interface SearxngResponse {
  /** The raw hits. */
  results?: SearxngResult[];
}
|
|
20
|
+
|
|
21
|
+
/** Keep `value` only when it is a string with at least one character. */
function str(value: unknown): string | undefined {
  if (typeof value === 'string' && value.length > 0) return value;
  return undefined;
}
|
|
24
|
+
|
|
25
|
+
/**
 * Map a SearXNG hit to a SearchResult. Returns `undefined` when the hit has no
 * usable (non-empty string) url and title. `content` becomes the snippet when
 * it is a non-empty string.
 */
function toResult(hit: SearxngResult): SearchResult | undefined {
  const url = str(hit.url);
  const title = str(hit.title);
  if (url === undefined || title === undefined) return undefined;

  const result: SearchResult = {title, url};
  const snippet = str(hit.content);
  if (snippet !== undefined) result.snippet = snippet;
  return result;
}
|
|
33
|
+
|
|
34
|
+
/**
 * Compose the SearXNG JSON search URL: `<baseUrl>/search?q=<query>&format=json`.
 * A trailing slash is added to `baseUrl` when missing, so a path prefix on the
 * instance URL is kept.
 */
function buildUrl(baseUrl: string, query: string): string {
  const base = baseUrl.endsWith('/') ? baseUrl : `${baseUrl}/`;
  const target = new URL('search', base);
  const params: Array<[string, string]> = [
    ['q', query],
    ['format', 'json'],
  ];
  for (const [key, value] of params) target.searchParams.set(key, value);
  return target.toString();
}
|
|
44
|
+
|
|
45
|
+
/**
 * Create a SearXNG backend for the instance at `config.baseUrl`.
 *
 * All network access goes through the `http` helper passed to `search`. Hits
 * without a usable url and title are dropped, and the list is cut to
 * `maxResults` when that option is given.
 */
export function createSearxngBackend(config: Config): Backend {
  const baseUrl = config.baseUrl;
  return {
    async search(
      query: string,
      http: Http,
      options: SearchOptions = {},
    ): Promise<SearchResult[]> {
      const requestUrl = buildUrl(baseUrl, query);
      const body = await http.fetchJson<SearxngResponse>(requestUrl, {
        headers: {accept: 'application/json'},
        signal: options.signal,
      });

      const rawHits = Array.isArray(body.results) ? body.results : [];
      const hits = rawHits
        .map((raw) => toResult(raw))
        .filter((hit): hit is SearchResult => hit !== undefined);

      if (options.maxResults === undefined) return hits;
      return hits.slice(0, options.maxResults);
    },
  };
}
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
// tavily-compat backend — a generic Tavily-shaped client (POST `/search` and an
|
|
2
|
+
// optional POST `/extract`) selected purely by `baseUrl`, so it covers
|
|
3
|
+
// orio-search / searcharvester / agent-search and any other Tavily-API-shaped
|
|
4
|
+
// instance. Both endpoints go THROUGH the handed `http` helper (never a direct
|
|
5
|
+
// fetch, so egress is not bypassable). `/search` normalizes to SearchResult[];
|
|
6
|
+
// `/extract` is exposed as the optional `Backend.fetch` a later task uses to
|
|
7
|
+
// override the distilly Extractor.
|
|
8
|
+
//
|
|
9
|
+
// Auth: a Bearer header is sent only when an apiKey is configured. The covered
|
|
10
|
+
// self-hosted instances are typically keyless, so a missing key is normal, not
|
|
11
|
+
// an error.
|
|
12
|
+
|
|
13
|
+
import type {Config} from '../config.js';
|
|
14
|
+
import type {
|
|
15
|
+
Backend,
|
|
16
|
+
FetchOptions,
|
|
17
|
+
FetchResult,
|
|
18
|
+
Http,
|
|
19
|
+
HttpRequestOptions,
|
|
20
|
+
SearchOptions,
|
|
21
|
+
SearchResult,
|
|
22
|
+
} from './types.js';
|
|
23
|
+
|
|
24
|
+
/**
 * One entry of a Tavily `/search` `results` array, limited to the fields read
 * here. Fields are `unknown` because the response is not validated.
 */
interface TavilySearchHit {
  /** Title of the hit. */
  title?: unknown;
  /** Link to the hit. */
  url?: unknown;
  /** Text used as the result snippet. */
  content?: unknown;
}
|
|
30
|
+
|
|
31
|
+
/** Body of a Tavily `/search` response, limited to the part read here. */
interface TavilySearchResponse {
  /** The raw hits. */
  results?: TavilySearchHit[];
}
|
|
35
|
+
|
|
36
|
+
/**
 * One entry of a Tavily `/extract` `results` array, limited to the fields read
 * here.
 */
interface TavilyExtractHit {
  /** URL the content was extracted from. */
  url?: unknown;
  /** The extracted page content. */
  raw_content?: unknown;
}
|
|
41
|
+
|
|
42
|
+
/**
 * One entry of a Tavily `/extract` `failed_results` array, limited to the
 * fields read here.
 */
interface TavilyExtractFailure {
  /** URL that could not be extracted. */
  url?: unknown;
  /** Reason reported by the server. */
  error?: unknown;
}
|
|
47
|
+
|
|
48
|
+
/** Body of a Tavily `/extract` response, limited to the parts read here. */
interface TavilyExtractResponse {
  /** Successfully extracted pages. */
  results?: TavilyExtractHit[];
  /** URLs the server could not extract, each with its error. */
  failed_results?: TavilyExtractFailure[];
}
|
|
53
|
+
|
|
54
|
+
/** Narrow `value` to a non-empty string; anything else becomes `undefined`. */
function str(value: unknown): string | undefined {
  return typeof value !== 'string' || value === '' ? undefined : value;
}
|
|
57
|
+
|
|
58
|
+
/**
 * Map a Tavily search hit to a SearchResult. Returns `undefined` when the hit
 * has no usable (non-empty string) url and title. `content` becomes the
 * snippet when it is a non-empty string.
 */
function toResult(hit: TavilySearchHit): SearchResult | undefined {
  const url = str(hit.url);
  const title = str(hit.title);
  if (url === undefined || title === undefined) return undefined;

  const snippet = str(hit.content);
  if (snippet === undefined) return {title, url};
  return {title, url, snippet};
}
|
|
66
|
+
|
|
67
|
+
/**
 * Join an endpoint `path` onto the instance `baseUrl`. A trailing slash is
 * added to the base when missing, so a path prefix on the base is kept.
 */
function endpoint(baseUrl: string, path: string): string {
  const base = baseUrl.endsWith('/') ? baseUrl : `${baseUrl}/`;
  const resolved = new URL(path, base);
  return resolved.toString();
}
|
|
74
|
+
|
|
75
|
+
/**
 * Create a Tavily-compatible backend for the instance at `config.baseUrl`.
 *
 * All network access goes through the `http` helper passed to each method. A
 * Bearer `authorization` header is sent only when `config.apiKey` is set; a
 * missing key is normal for the covered self-hosted instances.
 *
 * - `search` POSTs to `/search`, drops hits without a usable url and title,
 *   and cuts the list to `maxResults` when given.
 * - `fetch` POSTs the single URL to `/extract` requesting markdown. It throws
 *   when the server reports a failure or returns no content.
 */
export function createTavilyCompatBackend(config: Config): Backend {
  const baseUrl = config.baseUrl;
  const apiKey = config.apiKey;

  /** JSON request headers, plus the Bearer token when a key is configured. */
  function headers(): Record<string, string> {
    const h: Record<string, string> = {
      'content-type': 'application/json',
      accept: 'application/json',
    };
    if (apiKey) h.authorization = `Bearer ${apiKey}`;
    return h;
  }

  /** Request options for a JSON POST carrying `payload`. */
  function post(payload: unknown, signal?: AbortSignal): HttpRequestOptions {
    return {
      method: 'POST',
      headers: headers(),
      body: JSON.stringify(payload),
      signal,
    };
  }

  return {
    async search(
      query: string,
      http: Http,
      options: SearchOptions = {},
    ): Promise<SearchResult[]> {
      const payload: Record<string, unknown> = {query};
      if (options.maxResults !== undefined)
        payload.max_results = options.maxResults;
      const body = await http.fetchJson<TavilySearchResponse>(
        endpoint(baseUrl, 'search'),
        post(payload, options.signal),
      );
      const results = Array.isArray(body.results) ? body.results : [];
      const normalized = results
        .map(toResult)
        .filter((r): r is SearchResult => r !== undefined);
      return options.maxResults !== undefined
        ? normalized.slice(0, options.maxResults)
        : normalized;
    },

    async fetch(
      url: string,
      http: Http,
      options: FetchOptions = {},
    ): Promise<FetchResult> {
      // Tavily `/extract` has no `s/m/l/f` size knob (it has `format` /
      // `extract_depth` instead); always request markdown. The default
      // distilly Extractor owns webveil's size presets.
      const body = await http.fetchJson<TavilyExtractResponse>(
        endpoint(baseUrl, 'extract'),
        post({urls: url, format: 'markdown'}, options.signal),
      );
      // Guard both lists the way `search` does, so a response of the wrong
      // shape gives a clear error below instead of a TypeError.
      const failures = Array.isArray(body.failed_results)
        ? body.failed_results
        : [];
      const results = Array.isArray(body.results) ? body.results : [];
      // Only one URL is requested. When nothing was extracted, any reported
      // failure is about it, even if the server echoed the URL in a different
      // form (e.g. normalized), so its error detail is still surfaced.
      const failure =
        failures.find((f) => str(f.url) === url) ??
        (results.length === 0 ? failures[0] : undefined);
      if (failure)
        throw new Error(
          `tavily-compat: /extract failed for ${url}: ${str(failure.error) ?? 'unknown error'}`,
        );
      const hit = results[0];
      const markdown = hit ? str(hit.raw_content) : undefined;
      if (markdown === undefined)
        throw new Error(`tavily-compat: no extract result for ${url}`);
      // Tavily `/extract` returns no `truncated` flag and no page title.
      return {url, markdown, truncated: false};
    },
  };
}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
// backend seam — the contract every result source (searxng | tavily-compat |
|
|
2
|
+
// custom) implements. A Backend is HANDED a proxied `http` helper (bound to the
|
|
3
|
+
// configured egress dispatcher) so it physically cannot bypass the egress.
|
|
4
|
+
|
|
5
|
+
/** One normalized search hit, as returned by every backend. */
export interface SearchResult {
  /** Title of the hit. */
  title: string;
  /** Link to the hit. */
  url: string;
  /** Short excerpt, when the backend supplied one. */
  snippet?: string;
}
|
|
11
|
+
|
|
12
|
+
/** A fetched page after extraction, as budget-bounded markdown. */
export interface FetchResult {
  /** URL of the page. */
  url: string;
  /** Page title, when known. */
  title?: string;
  /** Extracted content as markdown. */
  markdown: string;
  /** Whether the content was cut short. */
  truncated: boolean;
}
|
|
19
|
+
|
|
20
|
+
/** Per-call options for `Backend.search`. */
export interface SearchOptions {
  /** Upper bound on the number of results returned. */
  maxResults?: number;
  /** Abort signal for cancelling the call. */
  signal?: AbortSignal;
}
|
|
24
|
+
|
|
25
|
+
/** Per-call options for `Backend.fetch`. */
export interface FetchOptions {
  /** Page-size preset: one of `s`, `m`, `l`, `f`. */
  size?: 's' | 'm' | 'l' | 'f';
  /** Abort signal for cancelling the call. */
  signal?: AbortSignal;
}
|
|
29
|
+
|
|
30
|
+
/** Options for a single request made through the http helper. */
export interface HttpRequestOptions {
  /** HTTP method. */
  method?: string;
  /** Request headers. */
  headers?: Record<string, string>;
  /** Request body, already serialized. */
  body?: string;
  /** Timeout for this request in milliseconds; the helper aborts past it. */
  timeoutMs?: number;
  /** Abort signal for cancelling the request. */
  signal?: AbortSignal;
}
|
|
39
|
+
|
|
40
|
+
/**
 * The proxied http helper given to backends. Both methods go through the
 * egress dispatcher, so a backend is never handed un-proxied transport.
 */
export interface Http {
  /** Request `url` and return the response body parsed as JSON. */
  fetchJson<T = unknown>(url: string, options?: HttpRequestOptions): Promise<T>;
  /** Request `url` and return the response body as text. */
  fetchText(url: string, options?: HttpRequestOptions): Promise<string>;
}
|
|
48
|
+
|
|
49
|
+
/**
 * A source of search results and, optionally, page content.
 *
 * Both methods are handed the proxied `http` helper so they stay inside the
 * configured egress.
 */
export interface Backend {
  /** Required: run `query` and return normalized hits. */
  search(
    query: string,
    http: Http,
    options?: SearchOptions,
  ): Promise<SearchResult[]>;
  /**
   * Optional: fetch and extract `url`. A backend that implements it overrides
   * the default distilly Extractor with its own `/extract`.
   */
  fetch?(url: string, http: Http, options?: FetchOptions): Promise<FetchResult>;
}
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
// config seam — per-folder resolution. Precedence (highest wins):
|
|
2
|
+
// env > nearest .pi/webveil.json (walking up from cwd) > global
|
|
3
|
+
// ~/.pi/agent/webveil.json > defaults.
|
|
4
|
+
// "Per folder = per account/egress." Each layer is a partial; later (lower)
|
|
5
|
+
// layers fill gaps the higher layers leave.
|
|
6
|
+
|
|
7
|
+
import {readFileSync} from 'node:fs';
|
|
8
|
+
import {homedir} from 'node:os';
|
|
9
|
+
import {dirname, join, parse} from 'node:path';
|
|
10
|
+
|
|
11
|
+
/**
 * The route outbound HTTP takes off the machine (see egress.ts): `direct`, or
 * through an `http` / `socks5` proxy identified by `url`.
 */
export type Egress =
  | {mode: 'direct'}
  | {mode: 'http'; url: string}
  | {mode: 'socks5'; url: string};
|
|
16
|
+
|
|
17
|
+
/** Preset for fetch's page-size budget; handed through to distilly. */
export type FetchSize = 's' | 'm' | 'l' | 'f';
|
|
19
|
+
|
|
20
|
+
/** The fully resolved configuration that every webveil module consumes. */
export interface Config {
  /** Name of the backend to use. */
  backend: string;
  /**
   * Where results come from: the instance URL for HTTP backends, or the
   * command line for the custom backend.
   */
  baseUrl: string;
  /** API key, when the backend needs one. */
  apiKey?: string;
  /** How outbound HTTP leaves the machine. */
  egress: Egress;
  /** Page-size preset used by fetch. */
  fetchSize: FetchSize;
}
|
|
28
|
+
|
|
29
|
+
/** One configuration layer (a file or the environment): any subset of Config. */
export type PartialConfig = Partial<Config>;
|
|
31
|
+
|
|
32
|
+
/** Inputs to `resolveConfig`; each one falls back to the process-wide value. */
export interface ResolveOptions {
  /** Starting directory for the per-folder walk (default: `process.cwd()`). */
  cwd?: string;
  /** Environment that overrides are read from (default: `process.env`). */
  env?: Record<string, string | undefined>;
  /**
   * Location of the global config file (default: `~/.pi/agent/webveil.json`).
   * Tests set this to a temp path so the real home directory is not read.
   */
  globalPath?: string;
}
|
|
43
|
+
|
|
44
|
+
/** Lowest-precedence layer: the values used when no other layer sets a key. */
const DEFAULTS: Config = {
  // Keyless SearXNG on the local machine.
  backend: 'searxng',
  baseUrl: 'http://127.0.0.1:8080',
  // No proxy unless one is configured.
  egress: {mode: 'direct'},
  fetchSize: 'm',
};
|
|
50
|
+
|
|
51
|
+
/** Per-folder config file path, relative to each directory on the walk. */
const PROJECT_FILE = join('.pi', 'webveil.json');
|
|
52
|
+
|
|
53
|
+
/**
 * Read one config layer from a JSON file.
 *
 * @param path - file to read.
 * @returns the parsed layer, or `undefined` when the file cannot be read
 *   (missing layers are expected).
 * @throws SyntaxError naming the file when its content is not valid JSON.
 * @throws Error when the JSON is valid but is not an object. An array, string
 *   or `null` would otherwise be merged into the config as if it were a layer.
 */
function readJson(path: string): PartialConfig | undefined {
  let text: string;
  try {
    text = readFileSync(path, 'utf8');
  } catch {
    return undefined; // absent file is fine; missing layers are expected
  }
  let parsed: unknown;
  try {
    parsed = JSON.parse(text);
  } catch (cause) {
    const detail = cause instanceof Error ? cause.message : String(cause);
    // Same error class as JSON.parse, but with the file named.
    throw new SyntaxError(`webveil: invalid JSON in ${path}: ${detail}`);
  }
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed))
    throw new Error(`webveil: ${path} must contain a JSON object`);
  return parsed as PartialConfig;
}
|
|
62
|
+
|
|
63
|
+
/**
 * Find the nearest `.pi/webveil.json`, starting at `cwd` and walking up one
 * directory at a time. The first file found wins.
 *
 * The walk stops when `dirname` no longer changes the path. For an absolute
 * path that is the filesystem root. For a relative path it is `.`, which
 * comparing against `parse(cwd).root` (an empty string there) never matches,
 * so that comparison would loop forever.
 *
 * @param cwd - directory the walk starts from.
 * @returns the parsed project layer, or `undefined` when none is found.
 */
function readProjectChain(cwd: string): PartialConfig | undefined {
  let dir = cwd;
  for (;;) {
    const found = readJson(join(dir, PROJECT_FILE));
    if (found) return found;
    const parent = dirname(dir);
    if (parent === dir) return undefined; // reached the top of the chain
    dir = parent;
  }
}
|
|
74
|
+
|
|
75
|
+
/**
 * Build the environment layer from `WEBVEIL_*` variables. Unset or empty
 * variables leave their key out, so lower layers can fill it.
 *
 * Recognized variables: `WEBVEIL_BACKEND`, `WEBVEIL_BASE_URL`,
 * `WEBVEIL_API_KEY`, `WEBVEIL_FETCH_SIZE` (`s` | `m` | `l` | `f`),
 * `WEBVEIL_EGRESS` (`direct` | `http` | `socks5`) and `WEBVEIL_EGRESS_URL`.
 *
 * @throws Error when `WEBVEIL_FETCH_SIZE` or `WEBVEIL_EGRESS` is set to an
 *   unrecognized value. A mistyped egress mode must not be silently ignored,
 *   because egress would then come from a lower layer and might be direct.
 */
function readEnv(env: Record<string, string | undefined>): PartialConfig {
  const layer: PartialConfig = {};
  if (env.WEBVEIL_BACKEND) layer.backend = env.WEBVEIL_BACKEND;
  if (env.WEBVEIL_BASE_URL) layer.baseUrl = env.WEBVEIL_BASE_URL;
  if (env.WEBVEIL_API_KEY) layer.apiKey = env.WEBVEIL_API_KEY;

  const size = env.WEBVEIL_FETCH_SIZE;
  if (size) {
    if (!['s', 'm', 'l', 'f'].includes(size))
      throw new Error(
        `webveil: invalid WEBVEIL_FETCH_SIZE '${size}' (expected s, m, l or f)`,
      );
    layer.fetchSize = size as FetchSize;
  }

  const mode = env.WEBVEIL_EGRESS;
  if (mode === 'direct') layer.egress = {mode: 'direct'};
  else if (mode === 'http' || mode === 'socks5')
    // NOTE(review): a missing WEBVEIL_EGRESS_URL is passed on as '' —
    // presumably rejected where the egress is set up; confirm in egress.ts.
    layer.egress = {mode, url: env.WEBVEIL_EGRESS_URL ?? ''};
  else if (mode)
    throw new Error(
      `webveil: invalid WEBVEIL_EGRESS '${mode}' (expected direct, http or socks5)`,
    );
  return layer;
}
|
|
88
|
+
|
|
89
|
+
/**
 * Compute the effective config by merging the layers key by key.
 *
 * Precedence, lowest to highest: defaults, global file, nearest project file,
 * environment. The merge is shallow, so a higher layer's `egress` replaces a
 * lower layer's `egress` as a whole.
 *
 * @param options - overrides for cwd, environment and global file path.
 */
export function resolveConfig(options: ResolveOptions = {}): Config {
  const startDir = options.cwd ?? process.cwd();
  const environment = options.env ?? process.env;
  const globalFile =
    options.globalPath ?? join(homedir(), '.pi', 'agent', 'webveil.json');

  const globalLayer: PartialConfig = readJson(globalFile) ?? {};
  const projectLayer: PartialConfig = readProjectChain(startDir) ?? {};
  const envLayer: PartialConfig = readEnv(environment);

  // Later arguments win, so they are passed from lowest to highest precedence.
  return Object.assign(
    {},
    DEFAULTS,
    globalLayer,
    projectLayer,
    envLayer,
  ) as Config;
}
|