@counterposition/pi-web-search 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.md +675 -0
- package/README.md +30 -0
- package/extensions/web-search.ts +249 -0
- package/package.json +60 -0
- package/src/config.ts +281 -0
- package/src/format.ts +348 -0
- package/src/page-cache.ts +58 -0
- package/src/pi-ambient.d.ts +30 -0
- package/src/provider-utils.ts +269 -0
- package/src/providers/brave.ts +292 -0
- package/src/providers/exa.ts +227 -0
- package/src/providers/firecrawl.ts +67 -0
- package/src/providers/index.ts +38 -0
- package/src/providers/jina.ts +131 -0
- package/src/providers/serper.ts +193 -0
- package/src/providers/tavily.ts +231 -0
- package/src/types.ts +131 -0
- package/src/url-safety.ts +92 -0
package/src/format.ts
ADDED
|
@@ -0,0 +1,348 @@
|
|
|
1
|
+
import { truncateSnippet, normalizeIsoDate } from "./provider-utils.js";
|
|
2
|
+
import type {
|
|
3
|
+
AppliedFilters,
|
|
4
|
+
FetchProviderName,
|
|
5
|
+
FormatSearchResultsArgs,
|
|
6
|
+
PaginatedContent,
|
|
7
|
+
SearchResult,
|
|
8
|
+
} from "./types.js";
|
|
9
|
+
|
|
10
|
+
/** Upper bound, in characters, on a rendered search-results document. */
const SEARCH_OUTPUT_BUDGET = 12_000;

/** Maximum length requested for a single result's content excerpt. */
const SEARCH_CONTENT_EXCERPT_LIMIT = 1_500;

/** Chunk size used by `paginateContent` when the caller passes no `maxChars`. */
const FETCH_DEFAULT_MAX_CHARS = 12_000;

/** Content excerpts shorter than this are left out instead of being rendered. */
const MIN_CONTENT_BLOCK_CHARS = 200;
|
|
14
|
+
|
|
15
|
+
/**
 * Renders search results as a markdown document of at most
 * `SEARCH_OUTPUT_BUDGET` characters.
 *
 * Every result gets a base block (title, URL, snippet, ...). In addition, the
 * first three results that carry extracted `content` are offered a
 * "Content:" excerpt, in rank order, as long as the document stays within the
 * budget. An excerpt that does not fit at full size is retried at whatever
 * size is still available and dropped if that is below
 * `MIN_CONTENT_BLOCK_CHARS`.
 *
 * When at least one excerpt was dropped, an omission note pointing at the
 * first *omitted* result's URL is appended. If the note pushes the document
 * over budget, the basic-depth hint is removed first; as a last resort the
 * text is hard-truncated with a trailing "...".
 *
 * @param args Results plus provider/depth/filter metadata to render.
 * @returns The rendered document, never longer than `SEARCH_OUTPUT_BUDGET`.
 */
export function formatSearchResults(args: FormatSearchResultsArgs): string {
  const notes = collectSearchNotes(args);
  const resultBlocks = args.results.map((result, index) =>
    renderBaseResultBlock(result, index + 1),
  );
  const wantsBasicHint = args.servedDepth === "basic";

  // Single place that knows how to call renderSearchDocument for this request.
  const render = (
    contents: Map<number, string>,
    omissionNote?: string,
    basicHint: boolean = wantsBasicHint,
  ): string =>
    renderSearchDocument({
      provider: args.provider,
      servedDepth: args.servedDepth,
      notes,
      resultBlocks,
      contentMap: contents,
      basicHint,
      omissionNote,
    });

  const topContentCandidates = args.results
    .slice(0, 3)
    .map((result, index) => ({ index, content: result.content?.trim() }))
    .filter((entry): entry is { index: number; content: string } => Boolean(entry.content));

  const contentMap = new Map<number, string>();
  let text = render(contentMap);

  // Characters renderSearchDocument adds around an excerpt, and the suffix
  // truncateSnippet appends when it shortens text. Both must be reserved or a
  // trimmed excerpt can overshoot the budget by a few characters.
  const contentBlockOverhead = "\n\nContent:\n".length;
  const ellipsisLength = "...".length;

  // Commits the excerpt only if the document still fits the budget.
  const tryAttach = (index: number, excerpt: string): boolean => {
    const rendered = render(new Map(contentMap).set(index, excerpt));
    if (rendered.length > SEARCH_OUTPUT_BUDGET) return false;
    contentMap.set(index, excerpt);
    text = rendered;
    return true;
  };

  for (const candidate of topContentCandidates) {
    const baseExcerpt = truncateSnippet(candidate.content, SEARCH_CONTENT_EXCERPT_LIMIT);
    if (tryAttach(candidate.index, baseExcerpt)) continue;

    const remaining =
      SEARCH_OUTPUT_BUDGET - text.length - contentBlockOverhead - ellipsisLength;
    if (remaining < MIN_CONTENT_BLOCK_CHARS) continue;

    const trimmedExcerpt = truncateSnippet(candidate.content, remaining);
    if (trimmedExcerpt.length < MIN_CONTENT_BLOCK_CHARS) continue;

    // Still verified against the real rendering; a miss simply leaves it omitted.
    tryAttach(candidate.index, trimmedExcerpt);
  }

  const omittedCandidates = topContentCandidates.filter(
    (candidate) => !contentMap.has(candidate.index),
  );
  const omittedCount = omittedCandidates.length;

  // Point the reader at a result whose content was actually left out.
  const omissionUrl = omittedCandidates
    .map((candidate) => args.results[candidate.index]?.url)
    .find((url) => Boolean(url));
  const omissionNote =
    omittedCount > 0 && omissionUrl
      ? `[Full extracted content omitted for ${omittedCount} result${omittedCount === 1 ? "" : "s"} due to output budget. Use web_fetch on ${omissionUrl} to read more.]`
      : undefined;

  text = render(contentMap, omissionNote);

  if (text.length > SEARCH_OUTPUT_BUDGET && omissionNote) {
    // The omission note already tells the reader to use web_fetch, so the
    // generic hint is the first thing to go.
    text = render(contentMap, omissionNote, false);
  }

  if (text.length > SEARCH_OUTPUT_BUDGET) {
    return `${text.slice(0, SEARCH_OUTPUT_BUDGET - 3).trimEnd()}...`;
  }

  return text;
}
|
|
137
|
+
|
|
138
|
+
/**
 * Cuts one chunk out of `content`, starting at `offset`.
 *
 * The chunk is at most `maxChars` characters (clamped to 1..20_000) and, when
 * more content follows, is shortened to end on a paragraph or line break if
 * `findSliceBoundary` finds one. The returned `text` is trimmed, so
 * `returnedChars` may be smaller than the range consumed; `nextOffset` always
 * continues from the end of the consumed range.
 *
 * @param content  Full document text.
 * @param offset   Start position. Negative or NaN values are treated as 0 and
 *                 fractional values are truncated, so `slice` never counts
 *                 from the end of the string.
 * @param maxChars Requested chunk size; NaN falls back to the default.
 * @returns The chunk plus paging metadata. When the offset is at or beyond
 *          the end of the content, `text` is empty and `hasMore` is false.
 */
export function paginateContent(
  content: string,
  offset: number,
  maxChars = FETCH_DEFAULT_MAX_CHARS,
): PaginatedContent {
  const totalChars = content.length;
  const start = Number.isNaN(offset) ? 0 : Math.max(0, Math.trunc(offset));

  if (start >= totalChars) {
    return {
      text: "",
      offset: start,
      returnedChars: 0,
      totalChars,
      hasMore: false,
    };
  }

  const requestedChars = Number.isNaN(maxChars) ? FETCH_DEFAULT_MAX_CHARS : Math.trunc(maxChars);
  const safeMaxChars = Math.max(1, Math.min(requestedChars, 20_000));
  const desiredEnd = Math.min(start + safeMaxChars, totalChars);
  const boundary = findSliceBoundary(content, start, desiredEnd);
  // A boundary at the start would produce an empty chunk and never advance.
  const end = boundary > start ? boundary : desiredEnd;
  const text = content.slice(start, end).trim();
  const hasMore = end < totalChars;

  return {
    text,
    offset: start,
    returnedChars: text.length,
    totalChars,
    nextOffset: hasMore ? end : undefined,
    hasMore,
  };
}
|
|
171
|
+
|
|
172
|
+
/**
 * Renders one paginated chunk of fetched page content as markdown.
 *
 * The output starts with a header naming the URL and provider. It is followed
 * either by a notice that the offset lies beyond the document, or by a
 * "Showing chars" range line, the chunk text and, when more content exists,
 * a hint showing the `web_fetch` call for the next chunk.
 *
 * @param url      URL the content was fetched from.
 * @param provider Fetch provider, shown through `providerLabel`.
 * @param chunk    Chunk and paging metadata to render.
 * @returns The rendered text, lines joined with "\n".
 */
export function formatFetchContent(
  url: string,
  provider: FetchProviderName,
  chunk: PaginatedContent,
): string {
  const { offset, returnedChars, totalChars } = chunk;
  const output: string[] = [`## Content from ${url} (via ${providerLabel(provider)})`, ""];

  if (offset >= totalChars) {
    output.push(
      `[Offset ${offset} is beyond the end of the document. Total content length: ${totalChars} characters.]`,
    );
    return output.join("\n");
  }

  output.push(
    `[Showing chars ${offset}-${offset + returnedChars - 1} of ${totalChars}]`,
    "",
    chunk.text,
  );

  const canContinue = chunk.hasMore && chunk.nextOffset !== undefined;
  if (canContinue) {
    output.push(
      "",
      `[More content available. Next chunk: web_fetch(url="${url}", offset=${chunk.nextOffset})]`,
    );
  }

  return output.join("\n");
}
|
|
204
|
+
|
|
205
|
+
/**
 * Builds the de-duplicated list of notes shown above the search results.
 *
 * Order: caller-supplied notes, then the freshness note, the domain note and,
 * when the served depth differs from the requested one, a degradation note.
 * Duplicates keep their first position.
 */
function collectSearchNotes(args: FormatSearchResultsArgs): string[] {
  // A Set keeps first-insertion order, which gives de-duplication for free.
  const unique = new Set<string>(args.notes ?? []);

  const freshness = formatFreshnessNote(args.freshness, args.appliedFilters);
  if (freshness) {
    unique.add(freshness);
  }

  const domain = formatDomainNote(args.domains, args.appliedFilters);
  if (domain) {
    unique.add(domain);
  }

  const depthChanged = args.requestedDepth !== args.servedDepth;
  if (depthChanged) {
    unique.add(
      `Requested ${args.requestedDepth} search degraded to ${args.servedDepth} because no content-capable provider was available.`,
    );
  }

  return Array.from(unique);
}
|
|
222
|
+
|
|
223
|
+
/**
 * Renders the header block of a single search result: numbered title, URL,
 * optional source domain, optional publication date (first 10 characters of
 * the normalized ISO timestamp) and a snippet limited to 320 characters.
 *
 * @param result Search result to render.
 * @param rank   1-based position shown in the heading.
 */
function renderBaseResultBlock(result: SearchResult, rank: number): string {
  const published = normalizeIsoDate(result.publishedAt);
  const snippet = truncateSnippet(result.snippet || "", 320) || "[No snippet available]";

  return [
    `### ${rank}. ${result.title}`,
    `URL: ${result.url}`,
    ...(result.sourceDomain ? [`Source: ${result.sourceDomain}`] : []),
    ...(published ? [`Published: ${published.slice(0, 10)}`] : []),
    `Snippet: ${snippet}`,
  ].join("\n");
}
|
|
238
|
+
|
|
239
|
+
/**
 * Assembles the complete search-results document.
 *
 * Sections, each separated by a blank line: heading, "Note:" lines (if any),
 * the result blocks joined by "---" rules (or a "no results" placeholder),
 * the omission note (if given) and the web_fetch hint (if `basicHint`).
 * A result block gets a "Content:" section when `contentMap` holds a
 * non-empty string for its zero-based index.
 */
function renderSearchDocument(args: {
  provider: string;
  servedDepth: string;
  notes: string[];
  resultBlocks: string[];
  contentMap: Map<number, string>;
  basicHint: boolean;
  omissionNote?: string;
}): string {
  const sections: string[] = [
    `## Search Results (via ${providerLabel(args.provider)}, ${args.servedDepth})`,
  ];

  if (args.notes.length > 0) {
    sections.push("", ...args.notes.map((note) => `Note: ${note}`));
  }

  const body =
    args.resultBlocks.length > 0
      ? args.resultBlocks
          .map((block, index) => {
            const content = args.contentMap.get(index);
            return content ? `${block}\n\nContent:\n${content}` : block;
          })
          .join("\n\n---\n\n")
      : "[No results returned.]";
  sections.push("", body);

  if (args.omissionNote) {
    sections.push("", args.omissionNote);
  }

  if (args.basicHint) {
    sections.push("", "_Use web_fetch on any URL above to read the full page content._");
  }

  return sections.join("\n");
}
|
|
279
|
+
|
|
280
|
+
/**
 * Describes how the requested freshness filter was handled.
 *
 * @param freshness      Requested freshness value; falsy means no filter.
 * @param appliedFilters Provider report; its `freshness` field selects the wording.
 * @returns A one-sentence note, or `undefined` when no freshness was requested.
 */
function formatFreshnessNote(
  freshness: FormatSearchResultsArgs["freshness"],
  appliedFilters?: AppliedFilters,
): string | undefined {
  if (!freshness) {
    return undefined;
  }

  const subject = `Freshness filter "${freshness}"`;
  const mode = appliedFilters?.freshness;

  if (mode === "native") {
    return `${subject} was applied natively by the provider.`;
  }
  if (mode === "approximate") {
    return `${subject} was applied approximately by the provider.`;
  }
  if (mode === "not_applied") {
    return `${subject} could not be applied strictly by the provider.`;
  }
  // Unknown or missing report: only state that the filter was asked for.
  return `${subject} was requested.`;
}
|
|
297
|
+
|
|
298
|
+
/**
 * Describes how the requested domain filter was handled.
 *
 * @param domains        Requested domains; missing or empty means no filter.
 * @param appliedFilters Provider report; its `domains` field selects the wording.
 * @returns A one-sentence note listing the domains, or `undefined` when no
 *          domain filter was requested.
 */
function formatDomainNote(
  domains: string[] | undefined,
  appliedFilters?: AppliedFilters,
): string | undefined {
  if (!domains || domains.length === 0) {
    return undefined;
  }

  const list = domains.join(", ");
  const mode = appliedFilters?.domains;

  if (mode === "native") {
    return `Domain filter was applied natively for ${list}.`;
  }
  if (mode === "query_rewrite") {
    return `Domain filter was approximated with a site: query rewrite for ${list}.`;
  }
  if (mode === "fanout_merge") {
    return `Domain filter was approximated by running one query per domain and merging results for ${list}.`;
  }
  if (mode === "not_applied") {
    return `Domain filter could not be applied strictly for ${list}.`;
  }
  // Unknown or missing report: only state that the filter was asked for.
  return `Domain filter requested for ${list}.`;
}
|
|
317
|
+
|
|
318
|
+
/**
 * Picks where a chunk starting at `start` should end so that it stops on a
 * natural break.
 *
 * If `end` already reaches the end of `content` it is returned unchanged.
 * Otherwise the last blank line ("\n\n") at or before `end` is preferred, then
 * the last single newline. A break is only accepted when it lies at least
 * 60% of the way from `start` to `end`; if none qualifies, `end` is returned.
 */
function findSliceBoundary(content: string, start: number, end: number): number {
  if (end >= content.length) {
    return end;
  }

  const earliestAccepted = start + Math.floor((end - start) * 0.6);

  // Strongest separator first.
  for (const separator of ["\n\n", "\n"]) {
    const position = content.lastIndexOf(separator, end);
    if (position >= earliestAccepted) {
      return position;
    }
  }

  return end;
}
|
|
330
|
+
|
|
331
|
+
/**
 * Maps a provider identifier to its display name.
 * Identifiers without a known display name are returned unchanged.
 */
function providerLabel(provider: string): string {
  // A Map (not a plain object) so inherited keys such as "constructor" never match.
  const displayNames = new Map<string, string>([
    ["brave", "Brave"],
    ["serper", "Serper"],
    ["tavily", "Tavily"],
    ["exa", "Exa"],
    ["jina", "Jina Reader"],
    ["firecrawl", "Firecrawl"],
  ]);

  return displayNames.get(provider) ?? provider;
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import { MAX_CACHE_CHARS_PER_PAGE } from "./provider-utils.js";
|
|
2
|
+
import type { FetchProviderName, PageCacheEntry } from "./types.js";
|
|
3
|
+
|
|
4
|
+
/** Default entry lifetime: five minutes, in milliseconds. */
const DEFAULT_TTL_MS = 300_000;

/** Default maximum number of entries kept in the cache. */
const DEFAULT_CAPACITY = 20;
|
|
6
|
+
|
|
7
|
+
/**
 * Small in-memory cache of fetched page content, keyed by URL.
 *
 * Entries expire `ttlMs` milliseconds after they were stored, and the cache
 * holds at most `capacity` entries. The `Map`'s insertion order doubles as
 * the recency order: both `get` and `set` re-insert the entry, so the first
 * key in the map is always the least recently used one.
 */
export class PageCache {
  /** Lifetime of an entry in milliseconds. */
  readonly ttlMs: number;
  /** Maximum number of entries kept. */
  readonly capacity: number;
  /** Content longer than this many characters is not cached. */
  readonly maxCharsPerPage: number;
  /** Backing store, ordered from least to most recently used. */
  readonly entries = new Map<string, PageCacheEntry>();

  /**
   * @param args Optional overrides; omitted fields fall back to the module defaults.
   */
  constructor(args?: { ttlMs?: number; capacity?: number; maxCharsPerPage?: number }) {
    this.ttlMs = args?.ttlMs ?? DEFAULT_TTL_MS;
    this.capacity = args?.capacity ?? DEFAULT_CAPACITY;
    this.maxCharsPerPage = args?.maxCharsPerPage ?? MAX_CACHE_CHARS_PER_PAGE;
  }

  /**
   * Returns the cached entry for `url`, or `undefined` when it is missing or
   * expired. Expired entries are removed; hits are marked most recently used.
   */
  get(url: string): PageCacheEntry | undefined {
    const entry = this.entries.get(url);
    if (!entry) return undefined;

    if (Date.now() - entry.fetchedAt > this.ttlMs) {
      this.entries.delete(url);
      return undefined;
    }

    // Re-insert so the entry moves to the most-recently-used end.
    this.entries.delete(url);
    this.entries.set(url, entry);
    return entry;
  }

  /**
   * Stores `content` for `url`, stamped with the current time, then evicts
   * least recently used entries until the cache is within `capacity`.
   * Content longer than `maxCharsPerPage` is ignored.
   */
  set(url: string, content: string, provider: FetchProviderName): void {
    if (content.length > this.maxCharsPerPage) return;

    const entry: PageCacheEntry = {
      url,
      content,
      provider,
      fetchedAt: Date.now(),
    };

    this.entries.delete(url);
    this.entries.set(url, entry);

    while (this.entries.size > this.capacity) {
      const oldestKey = this.entries.keys().next().value;
      // Compare against undefined explicitly: "" is a valid key and must
      // still be evictable, otherwise the cache could grow past capacity.
      if (oldestKey === undefined) break;
      this.entries.delete(oldestKey);
    }
  }

  /** Removes every entry. */
  clear(): void {
    this.entries.clear();
  }
}
|
|
57
|
+
|
|
58
|
+
/** Module-level cache instance built with the default TTL, capacity and page-size limit. */
export const pageCache: PageCache = new PageCache();
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
 * Ambient typing for "@mariozechner/pi-coding-agent".
 * Only the `ExtensionAPI` members declared here are typed.
 */
declare module "@mariozechner/pi-coding-agent" {
  export interface ExtensionAPI {
    /** Registers `handler` for the named event. */
    on(event: string, handler: (...args: any[]) => unknown): void;

    /** Registers a tool described by `tool`. */
    registerTool(tool: {
      name: string;
      label?: string;
      description?: string;
      /** Parameter schema; left opaque by this declaration. */
      parameters: unknown;
      /**
       * Runs the tool.
       * `params` is typed `any` so implementations can declare their own
       * parameter shape.
       */
      execute: (
        toolCallId: string,
        params: any,
        signal: AbortSignal,
        onUpdate?: (update: unknown) => void,
        ctx?: unknown,
      ) => Promise<unknown>;
    }): void;
  }
}
|
|
19
|
+
|
|
20
|
+
/**
 * Ambient typing for "@mariozechner/pi-ai".
 * Schema values are left opaque (`unknown`) by this declaration.
 */
declare module "@mariozechner/pi-ai" {
  /** Schema builder functions. */
  export const Type: {
    Object(schema: Record<string, unknown>, options?: Record<string, unknown>): unknown;
    String(options?: Record<string, unknown>): unknown;
    Optional(schema: unknown): unknown;
    Array(schema: unknown, options?: Record<string, unknown>): unknown;
    Number(options?: Record<string, unknown>): unknown;
  };

  /** Builds a schema from a fixed list of string values. */
  export function StringEnum(values: readonly string[], options?: Record<string, unknown>): unknown;
}
|
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
import type { FetchProviderName, ProviderName, SearchResult } from "./types.js";
|
|
2
|
+
|
|
3
|
+
/** Request timeouts in milliseconds, per kind of operation. */
export const TIMEOUTS = {
  searchBasicMs: 10_000,
  searchThoroughMs: 30_000,
  fetchMs: 30_000,
} as const;

/** Maximum response body sizes in bytes: 2 MiB for search, 10 MiB for fetch. */
export const MAX_RESPONSE_BYTES = {
  search: 2 * 1024 ** 2,
  fetch: 10 * 1024 ** 2,
} as const;

/** Pages with more characters than this are not stored in the page cache. */
export const MAX_CACHE_CHARS_PER_PAGE = 250_000;

/** Status codes below 500 that are nevertheless classified as transient. */
const TRANSIENT_STATUSES = new Set<number>([408, 429]);

/** Plain string-to-string header map passed to `fetch`. */
type RequestHeaders = Record<string, string>;
|
|
19
|
+
|
|
20
|
+
/**
 * Error raised for a failed provider request.
 *
 * It carries the provider name and a `transient` flag, which
 * `isTransientProviderError` reads, plus optional HTTP status, Retry-After
 * hint, machine-readable code and underlying cause.
 */
export class ProviderError extends Error {
  /** Provider the failed request was sent to. */
  readonly provider: ProviderName;
  /** Whether the failure was classified as transient. */
  readonly transient: boolean;
  /** HTTP status code, when the failure came from a response. */
  readonly status?: number;
  /** Value derived from the Retry-After header, when available. */
  readonly retryAfterSeconds?: number;
  /** Short machine-readable code such as "TIMEOUT". */
  readonly code?: string;

  constructor(args: {
    provider: ProviderName;
    message: string;
    transient: boolean;
    status?: number;
    retryAfterSeconds?: number;
    code?: string;
    cause?: unknown;
  }) {
    const { message, cause, provider, transient, status, retryAfterSeconds, code } = args;

    // Only pass error options when a (truthy) cause was supplied.
    super(message, cause ? { cause } : undefined);

    this.name = "ProviderError";
    this.provider = provider;
    this.transient = transient;
    this.status = status;
    this.retryAfterSeconds = retryAfterSeconds;
    this.code = code;
  }
}
|
|
45
|
+
|
|
46
|
+
/**
 * Combines the caller's abort signal with a timeout.
 *
 * @param signal    Caller-provided abort signal.
 * @param timeoutMs Timeout in milliseconds.
 * @returns A signal that aborts as soon as either `signal` aborts or
 *          `timeoutMs` elapses.
 */
export function buildRequestSignal(signal: AbortSignal, timeoutMs: number): AbortSignal {
  const deadline = AbortSignal.timeout(timeoutMs);
  return AbortSignal.any([signal, deadline]);
}
|
|
49
|
+
|
|
50
|
+
/**
 * Classifies an HTTP status code.
 *
 * @returns `true` for any status of 500 or above and for the codes listed in
 *          `TRANSIENT_STATUSES`; `false` otherwise.
 */
export function classifyStatus(status: number): boolean {
  if (status >= 500) {
    return true;
  }
  return TRANSIENT_STATUSES.has(status);
}
|
|
53
|
+
|
|
54
|
+
/**
 * Builds a `ProviderError` for a non-OK HTTP response.
 *
 * The error message has the form
 * "<provider> request failed: <status> <summary>". `transient` comes from
 * `classifyStatus`, and `retryAfterSeconds` from the response's Retry-After
 * header when it can be interpreted.
 *
 * @param provider Provider the request was sent to.
 * @param response The failed response.
 * @param summary  Short description appended after the status code.
 */
export function createHttpError(
  provider: ProviderName,
  response: Response,
  summary: string,
): ProviderError {
  const message = `${provider} request failed: ${response.status} ${summary}`.trim();

  return new ProviderError({
    provider,
    message,
    transient: classifyStatus(response.status),
    status: response.status,
    retryAfterSeconds: parseRetryAfterSeconds(response.headers.get("retry-after")),
  });
}

/**
 * Interprets a Retry-After header value.
 *
 * The header may carry either a number of seconds or an HTTP-date. A numeric
 * value is returned as parsed; a date is converted to the number of whole
 * seconds from now, never negative. Anything else yields `undefined`.
 */
function parseRetryAfterSeconds(header: string | null): number | undefined {
  if (!header) return undefined;

  const seconds = Number.parseInt(header, 10);
  if (Number.isFinite(seconds)) return seconds;

  // Not numeric: try the HTTP-date form, e.g. "Wed, 21 Oct 2015 07:28:00 GMT".
  const retryAt = Date.parse(header);
  if (Number.isNaN(retryAt)) return undefined;

  return Math.max(0, Math.ceil((retryAt - Date.now()) / 1_000));
}
|
|
71
|
+
|
|
72
|
+
/**
 * Reads a response body as text while enforcing a byte limit.
 *
 * The body is streamed and decoded chunk by chunk with a `TextDecoder`, so
 * multi-byte characters split across chunks are decoded correctly.
 *
 * @param response Response whose body should be read.
 * @param maxBytes Maximum number of body bytes accepted.
 * @returns The decoded body, or "" when the response has no body.
 * @throws Error when the body exceeds `maxBytes`. The stream is cancelled
 *         first so the rest of the body is not left open.
 */
export async function readBoundedBody(response: Response, maxBytes: number): Promise<string> {
  if (!response.body) return "";

  const reader = response.body.getReader();
  const decoder = new TextDecoder();
  const chunks: string[] = [];
  let totalBytes = 0;

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    if (!value) continue;

    totalBytes += value.byteLength;
    if (totalBytes > maxBytes) {
      // Release the underlying stream; a failing cancel must not hide the
      // size-limit error.
      await reader.cancel().catch(() => undefined);
      throw new Error(`Response exceeded size limit of ${maxBytes} bytes.`);
    }

    chunks.push(decoder.decode(value, { stream: true }));
  }

  // Flush any bytes the decoder is still buffering.
  chunks.push(decoder.decode());
  return chunks.join("");
}
|
|
96
|
+
|
|
97
|
+
/**
 * Performs an HTTP request and returns its validated JSON body.
 *
 * Steps: fetch with a combined caller/timeout signal, reject non-OK
 * responses, read the body under `maxBytes`, parse it as JSON (an empty body
 * becomes `null`) and pass the result through `options.validate`.
 *
 * @param provider Provider name used in error messages.
 * @param url      Request URL.
 * @param options  Request settings, limits and the validator.
 * @returns Whatever `options.validate` returns.
 * @throws The result of `toProviderError` for any failure; invalid JSON and
 *         validator failures are reported as non-transient `ProviderError`s.
 */
export async function fetchJson<T>(
  provider: ProviderName,
  url: string,
  options: {
    method?: string;
    headers?: RequestHeaders;
    body?: string;
    signal: AbortSignal;
    timeoutMs: number;
    maxBytes: number;
    validate: (value: unknown) => T;
  },
): Promise<T> {
  try {
    const response = await fetch(url, {
      method: options.method ?? "GET",
      headers: options.headers,
      body: options.body,
      signal: buildRequestSignal(options.signal, options.timeoutMs),
    });

    if (!response.ok) {
      throw createHttpError(provider, response, response.statusText || "request failed");
    }

    const raw = await readBoundedBody(response, options.maxBytes);

    let payload: unknown = null;
    if (raw) {
      try {
        payload = JSON.parse(raw);
      } catch (cause) {
        throw new ProviderError({
          provider,
          message: `${provider} returned invalid JSON.`,
          transient: false,
          cause,
        });
      }
    }

    try {
      return options.validate(payload);
    } catch (cause) {
      // Prefer the validator's own message when it provides one.
      const detail = cause instanceof Error && cause.message ? cause.message : undefined;
      throw new ProviderError({
        provider,
        message: detail ?? `${provider} returned unexpected response shape.`,
        transient: false,
        cause,
      });
    }
  } catch (failure) {
    throw toProviderError(provider, failure, options.signal);
  }
}
|
|
153
|
+
|
|
154
|
+
/**
 * Performs an HTTP request and returns its body as text.
 *
 * The request uses a combined caller/timeout signal, non-OK responses are
 * rejected, and the body is read under `maxBytes`.
 *
 * @param provider Provider name used in error messages.
 * @param url      Request URL.
 * @param options  Request settings and limits.
 * @throws The result of `toProviderError` for any failure.
 */
export async function fetchText(
  provider: FetchProviderName,
  url: string,
  options: {
    method?: string;
    headers?: RequestHeaders;
    body?: string;
    signal: AbortSignal;
    timeoutMs: number;
    maxBytes: number;
  },
): Promise<string> {
  try {
    const requestSignal = buildRequestSignal(options.signal, options.timeoutMs);
    const response = await fetch(url, {
      method: options.method ?? "GET",
      headers: options.headers,
      body: options.body,
      signal: requestSignal,
    });

    if (response.ok) {
      return await readBoundedBody(response, options.maxBytes);
    }

    throw createHttpError(provider, response, response.statusText || "request failed");
  } catch (failure) {
    throw toProviderError(provider, failure, options.signal);
  }
}
|
|
183
|
+
|
|
184
|
+
/**
 * Normalizes an arbitrary thrown value into the error that should propagate.
 *
 * - A `ProviderError` is returned unchanged.
 * - An abort error while the caller's `signal` is aborted is returned as-is
 *   (or as a generic "Request aborted." error), so a cancellation is not
 *   reported as a provider failure.
 * - Any other abort error becomes a transient `ProviderError` with code
 *   "TIMEOUT".
 * - Everything else becomes a transient `ProviderError` wrapping the cause.
 *
 * @param provider Provider name used in messages.
 * @param error    The caught value.
 * @param signal   The caller's own abort signal, if any.
 */
export function toProviderError(
  provider: ProviderName,
  error: unknown,
  signal?: AbortSignal,
): Error {
  if (error instanceof ProviderError) {
    return error;
  }

  const asError = error instanceof Error ? error : undefined;

  if (isAbortError(error)) {
    if (signal?.aborted) {
      return asError ?? new Error("Request aborted.");
    }

    return new ProviderError({
      provider,
      message: `${provider} request timed out.`,
      transient: true,
      code: "TIMEOUT",
      cause: error,
    });
  }

  const detail = asError?.message;
  return new ProviderError({
    provider,
    message: detail ? `${provider} request failed: ${detail}` : `${provider} request failed.`,
    transient: true,
    cause: error,
  });
}
|
|
215
|
+
|
|
216
|
+
/**
 * Tells whether `error` is a `ProviderError` flagged as transient.
 * Any other value yields `false`.
 */
export function isTransientProviderError(error: unknown): boolean {
  return error instanceof ProviderError && error.transient;
}
|
|
219
|
+
|
|
220
|
+
/**
 * Collapses whitespace and shortens text to roughly `maxLen` characters.
 *
 * Runs of whitespace become a single space and the ends are trimmed. Text
 * that still exceeds `maxLen` is cut at the last space within the first
 * `maxLen + 1` characters, provided that space lies at or beyond 60% of
 * `maxLen`; otherwise it is cut at exactly `maxLen`. A cut result gets "..."
 * appended, so it can be up to three characters longer than `maxLen`.
 */
export function truncateSnippet(text: string, maxLen: number): string {
  const collapsed = text.replace(/\s+/g, " ").trim();
  if (collapsed.length <= maxLen) {
    return collapsed;
  }

  const wordBreak = collapsed.slice(0, maxLen + 1).lastIndexOf(" ");
  const minimumWordBreak = Math.floor(maxLen * 0.6);

  let cutAt = maxLen;
  if (wordBreak >= minimumWordBreak) {
    cutAt = wordBreak;
  }

  const kept = collapsed.slice(0, cutAt).trimEnd();
  return `${kept}...`;
}
|
|
229
|
+
|
|
230
|
+
/**
 * Converts a date string into ISO 8601 form (`Date.prototype.toISOString`).
 *
 * @returns The ISO string, or `undefined` when `input` is empty, missing or
 *          cannot be parsed by `Date`.
 */
export function normalizeIsoDate(input: string | undefined): string | undefined {
  if (!input) {
    return undefined;
  }

  const timestamp = new Date(input).getTime();
  return Number.isNaN(timestamp) ? undefined : new Date(timestamp).toISOString();
}
|
|
236
|
+
|
|
237
|
+
/**
 * Extracts the lower-cased hostname from a URL string.
 *
 * @returns The hostname, or `undefined` when `url` cannot be parsed.
 */
export function hostnameFromUrl(url: string): string | undefined {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return undefined;
  }
  return parsed.hostname.toLowerCase();
}
|
|
244
|
+
|
|
245
|
+
/**
 * Removes results whose URL was already seen and caps the list length.
 *
 * URLs are compared after trimming and lower-casing. Results with an empty
 * URL are skipped. The first occurrence wins and input order is preserved.
 *
 * @param results    Results to filter, in priority order.
 * @param maxResults Maximum number of results to return; zero or a negative
 *                   value yields an empty list.
 */
export function dedupeResultsByUrl(results: SearchResult[], maxResults: number): SearchResult[] {
  const deduped: SearchResult[] = [];
  // Guard the limit up front: the in-loop check only runs after a push and
  // would otherwise let one result through for a limit of zero.
  if (maxResults <= 0) return deduped;

  const seen = new Set<string>();

  for (const result of results) {
    const key = result.url.trim().toLowerCase();
    if (!key || seen.has(key)) continue;

    seen.add(key);
    deduped.push(result);
    if (deduped.length >= maxResults) break;
  }

  return deduped;
}
|
|
259
|
+
|
|
260
|
+
/**
 * Appends a `site:<domain>` operator to a search query, separated by one space.
 */
export function addSiteConstraint(query: string, domain: string): string {
  const siteOperator = `site:${domain}`;
  return [query, siteOperator].join(" ");
}
|
|
263
|
+
|
|
264
|
+
/**
 * Tells whether `error` represents an aborted request.
 *
 * Two error names count: "AbortError", raised when an `AbortSignal` is
 * aborted with the default reason, and "TimeoutError", which is the reason
 * used by signals created with `AbortSignal.timeout()`. Without the latter a
 * request that ran into its timeout would not be recognised as aborted.
 */
function isAbortError(error: unknown): boolean {
  // DOMException is checked separately for runtimes where it does not
  // inherit from Error.
  if (!(error instanceof Error) && !(error instanceof DOMException)) {
    return false;
  }
  return error.name === "AbortError" || error.name === "TimeoutError";
}
|