@counterposition/pi-web-search 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,131 @@
1
+ import type { FetchProvider } from "../types.js";
2
+ import {
3
+ MAX_RESPONSE_BYTES,
4
+ ProviderError,
5
+ TIMEOUTS,
6
+ buildRequestSignal,
7
+ createHttpError,
8
+ readBoundedBody,
9
+ toProviderError,
10
+ } from "../provider-utils.js";
11
+
12
// Base URL of the Jina reader endpoint; the page URL to fetch is appended directly after it.
+ const JINA_ENDPOINT = "https://r.jina.ai/";
13
+
14
/**
 * Creates the "jina" fetch provider.
 *
 * Each fetch first requests the page as JSON. If that attempt yields no
 * content, or fails with an error accepted by `shouldFallbackToText`, the
 * same URL is requested again as plain text. Every error that leaves this
 * function has been passed through `toProviderError`.
 *
 * @param apiKey Optional key forwarded to `buildHeaders`.
 * @returns A provider whose `fetch` resolves to the page content.
 * @throws The result of `toProviderError` for request failures and for an
 *   empty plain-text response.
 */
export function createJinaProvider(apiKey?: string | null): FetchProvider {
  return {
    name: "jina",
    async fetch(url: string, signal: AbortSignal): Promise<string> {
      const readerUrl = `${JINA_ENDPOINT}${url}`;

      // Attempt 1: JSON representation.
      let fromJson: string | undefined;
      try {
        fromJson = await fetchJinaContent(
          readerUrl,
          buildHeaders(apiKey, "application/json"),
          signal,
          true,
        );
      } catch (jsonError) {
        // Only errors accepted by shouldFallbackToText may fall through to attempt 2.
        if (!shouldFallbackToText(jsonError)) {
          throw toProviderError("jina", jsonError, signal);
        }
      }
      if (fromJson) return fromJson;

      // Attempt 2: plain-text representation.
      try {
        const fromText = await fetchJinaContent(
          readerUrl,
          buildHeaders(apiKey, "text/plain"),
          signal,
          false,
        );
        if (!fromText) {
          throw new ProviderError({
            provider: "jina",
            message: "jina returned an empty response.",
            transient: false,
          });
        }
        return fromText;
      } catch (textError) {
        throw toProviderError("jina", textError, signal);
      }
    },
  };
}
49
+
50
/**
 * Performs one GET against the Jina endpoint and extracts page content.
 *
 * @param targetUrl Fully built reader URL.
 * @param headers Request headers (see `buildHeaders`).
 * @param signal Caller signal, combined with `TIMEOUTS.fetchMs` via `buildRequestSignal`.
 * @param preferJson When true the body is parsed as JSON and content is taken
 *   from it; when false the body text itself is normalized and returned.
 * @returns The content, or `undefined` when nothing usable was found.
 * @throws The error built by `createHttpError` when the response is not ok.
 */
async function fetchJinaContent(
  targetUrl: string,
  headers: Record<string, string>,
  signal: AbortSignal,
  preferJson: boolean,
): Promise<string | undefined> {
  const requestSignal = buildRequestSignal(signal, TIMEOUTS.fetchMs);
  const response = await fetch(targetUrl, { method: "GET", headers, signal: requestSignal });

  if (!response.ok) {
    throw createHttpError("jina", response, response.statusText || "request failed");
  }

  const body = await readBoundedBody(response, MAX_RESPONSE_BYTES.fetch);

  if (!preferJson) {
    return normalizePageContent(body);
  }

  // JSON mode: an unparsable body or a payload without content both yield undefined.
  const payload = safeParseJson(body);
  if (!payload) return undefined;

  const extracted = extractJinaJsonContent(payload);
  return extracted ? extracted : undefined;
}
80
+
81
/**
 * Builds the request headers for a Jina call.
 *
 * @param apiKey Optional key; a bearer `Authorization` header is added only
 *   when the key is non-blank after trimming.
 * @param accept Value for the `Accept` header.
 * @returns A fresh header map.
 */
function buildHeaders(apiKey: string | null | undefined, accept: string): Record<string, string> {
  const token = apiKey?.trim();
  const base: Record<string, string> = {
    Accept: accept,
    "X-Retain-Images": "none",
  };

  return token ? { ...base, Authorization: `Bearer ${token}` } : base;
}
93
+
94
/**
 * Pulls page text out of a parsed Jina JSON payload.
 *
 * Looks under `data` for a non-blank string in `content`, then `markdown`,
 * and returns the first one found after normalization.
 *
 * @param value Parsed JSON of unknown shape.
 * @returns Normalized text, or `undefined` when neither field is usable.
 */
function extractJinaJsonContent(value: unknown): string | undefined {
  if (!isPlainObject(value)) return undefined;

  const data = value.data;
  if (!isPlainObject(data)) return undefined;

  // Field order is the priority order: `content` wins over `markdown`.
  for (const field of ["content", "markdown"] as const) {
    const candidate = data[field];
    if (typeof candidate === "string" && candidate.trim()) {
      return normalizePageContent(candidate);
    }
  }

  return undefined;
}
111
+
112
/**
 * Converts CRLF line endings to LF and trims surrounding whitespace.
 *
 * @param value Raw page text.
 * @returns The cleaned text, or `undefined` when nothing remains.
 */
function normalizePageContent(value: string): string | undefined {
  const cleaned = value.split("\r\n").join("\n").trim();
  return cleaned === "" ? undefined : cleaned;
}
116
+
117
/**
 * Parses JSON without throwing.
 *
 * @param body Text to parse.
 * @returns The parsed value, or `undefined` when `body` is not valid JSON.
 */
function safeParseJson(body: string): unknown {
  let parsed: unknown;
  try {
    parsed = JSON.parse(body);
  } catch {
    parsed = undefined;
  }
  return parsed;
}
124
+
125
/**
 * Decides whether a failed JSON request may be retried as plain text.
 *
 * @param error Whatever the JSON attempt threw.
 * @returns `true` only for a `ProviderError` whose `status` is 406 or 415.
 */
function shouldFallbackToText(error: unknown): boolean {
  if (!(error instanceof ProviderError)) return false;

  const { status } = error;
  return status === 406 || status === 415;
}
128
+
129
/**
 * Type guard for a non-null, non-array object.
 *
 * @param value Value to test.
 * @returns `true` when `value` has type "object", is not `null` and is not an array.
 */
function isPlainObject(value: unknown): value is Record<string, unknown> {
  if (value === null || typeof value !== "object") return false;
  return !Array.isArray(value);
}
@@ -0,0 +1,193 @@
1
+ import {
2
+ addSiteConstraint,
3
+ dedupeResultsByUrl,
4
+ fetchJson,
5
+ hostnameFromUrl,
6
+ MAX_RESPONSE_BYTES,
7
+ ProviderError,
8
+ TIMEOUTS,
9
+ toProviderError,
10
+ truncateSnippet,
11
+ } from "../provider-utils.js";
12
+ import type {
13
+ AppliedFilters,
14
+ ProviderSearchResponse,
15
+ SearchProvider,
16
+ SearchProviderArgs,
17
+ SearchResult,
18
+ } from "../types.js";
19
+
20
// Endpoint that searchSingleQuery POSTs every Serper search request to.
+ const SERPER_SEARCH_URL = "https://google.serper.dev/search";
21
// Provider identifier: exposed as the provider's `name` and passed to fetchJson, ProviderError and toProviderError.
+ const SERPER_PROVIDER_NAME: SearchProvider["name"] = "serper";
22
+
23
/**
 * Creates the "serper" search provider.
 *
 * The key is trimmed once and the trimmed value is both validated and sent,
 * so a key padded with whitespace is never transmitted verbatim.
 *
 * With more than one non-empty domain the search fans out through
 * `searchAcrossDomains`; otherwise a single query is issued, constrained to
 * the one domain when present.
 *
 * @param apiKey Serper API key.
 * @returns A provider advertising only the "search" capability.
 * @throws Error when the key is empty or whitespace-only.
 */
export function createSerperProvider(apiKey: string): SearchProvider {
  const trimmedKey = apiKey.trim();
  if (!trimmedKey) {
    throw new Error("Serper API key is required.");
  }

  return {
    name: SERPER_PROVIDER_NAME,
    capabilities: new Set(["search"]),
    async search(args: SearchProviderArgs): Promise<ProviderSearchResponse> {
      try {
        // Drop empty domain entries before choosing a strategy.
        const domains = args.domains?.filter(Boolean) ?? [];

        // `return await` keeps rejections inside this try so they are converted below.
        return domains.length > 1
          ? await searchAcrossDomains(trimmedKey, args, domains)
          : await searchSingleQuery(trimmedKey, args, domains[0]);
      } catch (error) {
        throw toProviderError(SERPER_PROVIDER_NAME, error, args.signal);
      }
    },
  };
}
46
+
47
+ export default createSerperProvider;
48
+
49
/**
 * Runs one Serper search, optionally restricted to a single domain.
 *
 * @param apiKey Key sent in the `X-API-KEY` header.
 * @param args Search arguments; `query`, `maxResults`, `freshness` and `signal` are read.
 * @param domain When given, the query is rewritten with `addSiteConstraint`.
 * @returns Mapped results plus a report of how filters were handled:
 *   freshness is always reported as "not_applied", a domain as "query_rewrite".
 *   `appliedFilters` is `undefined` when neither filter was requested.
 */
async function searchSingleQuery(
  apiKey: string,
  args: SearchProviderArgs,
  domain?: string,
): Promise<ProviderSearchResponse> {
  const payload = await fetchJson<SerperResponse>(SERPER_PROVIDER_NAME, SERPER_SEARCH_URL, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "X-API-KEY": apiKey,
    },
    body: JSON.stringify({
      q: domain ? addSiteConstraint(args.query, domain) : args.query,
      num: args.maxResults,
    }),
    signal: args.signal,
    timeoutMs: TIMEOUTS.searchBasicMs,
    maxBytes: MAX_RESPONSE_BYTES.search,
    validate: validateSerperResponse,
  });

  const appliedFilters: AppliedFilters = {
    ...(args.freshness ? { freshness: "not_applied" as const } : {}),
    ...(domain ? { domains: "query_rewrite" as const } : {}),
  };
  const reportsFilters = Object.keys(appliedFilters).length > 0;

  return {
    results: mapSerperResults(payload.organic, args.maxResults),
    appliedFilters: reportsFilters ? appliedFilters : undefined,
  };
}
87
+
88
/**
 * Approximates a multi-domain filter by running one query per domain and
 * merging the results.
 *
 * The per-domain queries do not depend on each other, so they are issued
 * concurrently. `Promise.all` resolves in the order of `domains`, and rejects
 * as soon as any single query fails.
 *
 * Results are interleaved round-robin (best hit of each domain, then second
 * best of each, ...) before being handed to `dedupeResultsByUrl` with
 * `args.maxResults`. Plain concatenation would let the first domain fill the
 * whole quota whenever it returns `maxResults` hits on its own.
 * NOTE(review): this assumes `dedupeResultsByUrl` keeps the earliest entries
 * up to the limit — confirm against provider-utils.
 *
 * @param apiKey Key forwarded to `searchSingleQuery`.
 * @param args Search arguments shared by every per-domain query.
 * @param domains Domains to query, one request each.
 * @returns Merged results, `domains: "fanout_merge"` (plus
 *   `freshness: "not_applied"` when freshness was requested) and de-duplicated notes.
 */
async function searchAcrossDomains(
  apiKey: string,
  args: SearchProviderArgs,
  domains: string[],
): Promise<ProviderSearchResponse> {
  const responses = await Promise.all(
    domains.map((domain) => searchSingleQuery(apiKey, args, domain)),
  );

  // Round-robin merge so every domain is represented before the cap applies.
  const interleaved: SearchResult[] = [];
  const deepest = Math.max(0, ...responses.map((response) => response.results.length));
  for (let rank = 0; rank < deepest; rank += 1) {
    for (const response of responses) {
      const hit = response.results[rank];
      if (hit) interleaved.push(hit);
    }
  }

  const notes = responses.flatMap((response) => response.notes ?? []);
  notes.push(
    `Domain filter was approximated by running one query per domain for ${domains.join(", ")}.`,
  );

  const appliedFilters: AppliedFilters = {
    domains: "fanout_merge",
  };
  if (args.freshness) {
    appliedFilters.freshness = "not_applied";
  }

  return {
    results: dedupeResultsByUrl(interleaved, args.maxResults),
    appliedFilters,
    // At least the fan-out note is always present, so no emptiness check is needed.
    notes: [...new Set(notes)],
  };
}
121
+
122
/**
 * Converts raw Serper entries into search results.
 *
 * Entries rejected by `parseSerperResult` are skipped. Collection stops once
 * the number of accepted results reaches `maxResults`; the check runs after
 * each accepted entry.
 *
 * @param results Raw entries; anything that is not an array yields `[]`.
 * @param maxResults Stop threshold for accepted results.
 * @returns Accepted results in their original order.
 */
function mapSerperResults(results: unknown[] | undefined, maxResults: number): SearchResult[] {
  const accepted: SearchResult[] = [];
  if (!Array.isArray(results)) return accepted;

  for (let index = 0; index < results.length; index += 1) {
    const candidate = parseSerperResult(results[index]);
    if (candidate === undefined) continue;

    accepted.push(candidate);
    if (accepted.length >= maxResults) break;
  }

  return accepted;
}
136
+
137
/**
 * Maps one raw Serper entry to a search result.
 *
 * The title comes from `title`; the URL from `link`, falling back to `url`;
 * the snippet from `snippet`, falling back to `description`, then "". The
 * snippet is passed through `truncateSnippet` with a limit of 300.
 *
 * @param item Raw entry of unknown shape.
 * @returns The result, or `undefined` when the entry is not an object or
 *   lacks a non-blank title or URL. `sourceDomain` is set only when
 *   `hostnameFromUrl` returns a truthy value.
 */
function parseSerperResult(item: unknown): SearchResult | undefined {
  if (!isPlainObject(item)) return undefined;

  const title = readString(item.title);
  const url = readString(item.link) ?? readString(item.url);
  if (title === undefined || url === undefined) return undefined;

  const rawSnippet = readString(item.snippet) ?? readString(item.description) ?? "";
  const parsed: SearchResult = {
    title,
    url,
    snippet: truncateSnippet(rawSnippet, 300),
  };

  const sourceDomain = hostnameFromUrl(url);
  if (sourceDomain) {
    parsed.sourceDomain = sourceDomain;
  }

  return parsed;
}
158
+
159
/**
 * Validates the top-level shape of a Serper response.
 *
 * @param value Parsed response body.
 * @returns An object holding only the `organic` array.
 * @throws ProviderError (non-transient, with `value` as cause) when `value`
 *   is not an object or its `organic` member is missing or not an array.
 */
function validateSerperResponse(value: unknown): SerperResponse {
  const organic = isPlainObject(value) && "organic" in value ? value.organic : undefined;

  if (!Array.isArray(organic)) {
    throw new ProviderError({
      provider: SERPER_PROVIDER_NAME,
      message: "Serper returned unexpected response shape.",
      transient: false,
      cause: value,
    });
  }

  return { organic };
}
182
+
183
// Shape returned by validateSerperResponse: only `organic` is kept, and its entries are still untyped.
+ type SerperResponse = {
184
+ organic: unknown[];
185
+ };
186
+
187
/**
 * Reads a trimmed, non-empty string from an untyped value.
 *
 * @param value Candidate value.
 * @returns The trimmed string, or `undefined` when `value` is not a string
 *   or is blank.
 */
function readString(value: unknown): string | undefined {
  if (typeof value !== "string") return undefined;

  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : undefined;
}
190
+
191
/**
 * Type guard: accepts any non-null object that is not an array.
 *
 * @param value Value to inspect.
 * @returns Whether `value` can be read as a string-keyed record.
 */
function isPlainObject(value: unknown): value is Record<string, unknown> {
  if (typeof value !== "object") return false;
  if (value === null) return false;
  return !Array.isArray(value);
}
@@ -0,0 +1,231 @@
1
+ import {
2
+ fetchJson,
3
+ hostnameFromUrl,
4
+ MAX_RESPONSE_BYTES,
5
+ normalizeIsoDate,
6
+ ProviderError,
7
+ TIMEOUTS,
8
+ truncateSnippet,
9
+ } from "../provider-utils.js";
10
+ import type {
11
+ AppliedFilters,
12
+ ProviderSearchResponse,
13
+ SearchCapability,
14
+ SearchProvider,
15
+ SearchProviderArgs,
16
+ SearchResult,
17
+ } from "../types.js";
18
+
19
// Literal unions for the `search_depth`, `topic` and `time_range` fields of TavilyRequestBody.
+ type TavilySearchDepth = "basic" | "advanced";
20
+ type TavilyTopic = "general" | "news" | "finance";
21
+ type TavilyTimeRange = "day" | "week" | "month" | "year";
22
+
23
// JSON body POSTed to the Tavily endpoint, as assembled by buildRequestBody.
// `include_answer` is always false; `time_range` and `include_domains` are set
// only when the caller supplied a freshness or a non-empty domain list.
+ type TavilyRequestBody = {
24
+ query: string;
25
+ topic: TavilyTopic;
26
+ search_depth: TavilySearchDepth;
27
+ max_results: number;
28
+ include_answer: false;
29
+ include_raw_content: false | "markdown";
30
+ time_range?: TavilyTimeRange;
31
+ include_domains?: string[];
32
+ };
33
+
34
// Part of the Tavily response kept by validateTavilyResponse. `results` has been
// checked to be an array or undefined; `answer` is passed through unchecked.
+ type TavilyResponse = {
35
+ results?: unknown;
36
+ answer?: unknown;
37
+ };
38
+
39
// Capability set exposed as `capabilities` by the provider built in createTavilyProvider.
// The same Set instance is shared by every provider created from this module.
+ const TAVILY_CAPABILITIES = new Set<SearchCapability>([
40
+ "search",
41
+ "content",
42
+ "semantic",
43
+ "freshness",
44
+ "domainFilter",
45
+ "resultDates",
46
+ ]);
47
+
48
// Endpoint that every Tavily search request in this file is POSTed to.
+ const TAVILY_ENDPOINT = "https://api.tavily.com/search";
49
+
50
/**
 * Creates the "tavily" search provider.
 *
 * The key is trimmed once and sent as a bearer token. The timeout is
 * `TIMEOUTS.searchThoroughMs` when content is requested and
 * `TIMEOUTS.searchBasicMs` otherwise.
 *
 * @param apiKey Tavily API key.
 * @returns A provider exposing `TAVILY_CAPABILITIES`.
 */
export function createTavilyProvider(apiKey: string): SearchProvider {
  const authorization = `Bearer ${apiKey.trim()}`;

  return {
    name: "tavily",
    capabilities: TAVILY_CAPABILITIES,
    async search(args: SearchProviderArgs): Promise<ProviderSearchResponse> {
      const timeoutMs = args.includeContent ? TIMEOUTS.searchThoroughMs : TIMEOUTS.searchBasicMs;

      const payload = await fetchJson<TavilyResponse>("tavily", TAVILY_ENDPOINT, {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          Authorization: authorization,
        },
        body: JSON.stringify(buildRequestBody(args)),
        signal: args.signal,
        timeoutMs,
        maxBytes: MAX_RESPONSE_BYTES.search,
        validate: validateTavilyResponse,
      });

      const results = normalizeResults(payload.results, args.includeContent);

      return {
        results,
        appliedFilters: buildAppliedFilters(args),
        notes: buildNotes(args, results),
      };
    },
  };
}
83
+
84
/**
 * Translates search arguments into a Tavily request body.
 *
 * Requesting content switches `search_depth` to "advanced" and
 * `include_raw_content` to "markdown". `time_range` and `include_domains`
 * are present only when freshness or a non-empty domain list was supplied.
 *
 * @param args Search arguments.
 * @returns The body to serialize.
 */
function buildRequestBody(args: SearchProviderArgs): TavilyRequestBody {
  const wantsContent = args.includeContent;
  const domains = args.domains;

  return {
    query: args.query,
    topic: chooseTopic(args.query, args.freshness),
    search_depth: wantsContent ? "advanced" : "basic",
    max_results: clampMaxResults(args.maxResults),
    include_answer: false,
    include_raw_content: wantsContent ? "markdown" : false,
    ...(args.freshness ? { time_range: args.freshness } : {}),
    ...(domains && domains.length > 0 ? { include_domains: domains } : {}),
  };
}
104
+
105
/**
 * Reports which requested filters were forwarded to Tavily.
 *
 * @param args Search arguments; only `freshness` and `domains` are read.
 * @returns "native" for each requested filter, or `undefined` when neither
 *   freshness nor a non-empty domain list was requested.
 */
function buildAppliedFilters(args: SearchProviderArgs): AppliedFilters | undefined {
  const { freshness, domains } = args;
  const hasDomains = (domains?.length ?? 0) > 0;

  if (!freshness && !hasDomains) return undefined;

  const applied: AppliedFilters = {};
  if (freshness) applied.freshness = "native";
  if (hasDomains) applied.domains = "native";
  return applied;
}
118
+
119
/**
 * Builds the informational notes attached to a Tavily response.
 *
 * A content note is added when content was requested. When freshness was
 * requested, a date note is added if at least one returned result has no
 * `publishedAt`. That matches the note's wording ("for all results"): it is
 * emitted for partial as well as total absence of dates, and never for an
 * empty result list.
 *
 * @param args Search arguments; `includeContent` and `freshness` are read.
 * @param results Normalized results of the search.
 * @returns The notes, or `undefined` when there are none.
 */
function buildNotes(args: SearchProviderArgs, results: SearchResult[]): string[] | undefined {
  const notes: string[] = [];

  if (args.includeContent) {
    notes.push("Tavily returned content-enriched search results.");
  }

  if (args.freshness && results.some((result) => !result.publishedAt)) {
    notes.push("Tavily did not return publish dates for all results.");
  }

  return notes.length > 0 ? notes : undefined;
}
132
+
133
/**
 * Picks the Tavily topic for a query.
 *
 * @param query Search query text.
 * @param freshness Requested freshness, if any.
 * @returns "news" only when freshness was requested and `looksNewsLike`
 *   accepts the query; "general" otherwise.
 */
function chooseTopic(query: string, freshness?: string): TavilyTopic {
  return freshness && looksNewsLike(query) ? "news" : "general";
}
140
+
141
/**
 * Checks a query for news-style keywords.
 *
 * @param query Search query text.
 * @returns Whether the query contains one of the keywords as a whole word,
 *   ignoring case.
 */
function looksNewsLike(query: string): boolean {
  const newsKeywords =
    /\b(latest|news|breaking|release|released|update|updated|today|yesterday|cve|vulnerability)\b/i;
  return newsKeywords.test(query);
}
146
+
147
/**
 * Clamps a requested result count to the range 1..20.
 *
 * @param value Requested count.
 * @returns 5 for non-finite input; otherwise the value truncated toward
 *   zero and limited to at least 1 and at most 20.
 */
function clampMaxResults(value: number): number {
  if (!Number.isFinite(value)) return 5;

  const whole = Math.trunc(value);
  if (whole < 1) return 1;
  if (whole > 20) return 20;
  return whole;
}
151
+
152
/**
 * Validates the top-level shape of a Tavily response.
 *
 * Shape violations are raised as a non-transient `ProviderError` carrying the
 * offending value as `cause`, so they surface with provider information
 * rather than as a bare `Error`. The message text is unchanged.
 *
 * @param value Parsed response body.
 * @returns `results` (an array or `undefined`) and `answer` (unchecked).
 * @throws ProviderError when `value` is not an object, or when `results` is
 *   present but not an array.
 */
function validateTavilyResponse(value: unknown): TavilyResponse {
  if (isPlainObject(value)) {
    const { results, answer } = value;
    // A missing `results` is tolerated; normalizeResults turns it into an empty list.
    if (results === undefined || Array.isArray(results)) {
      return { results, answer };
    }
  }

  throw new ProviderError({
    provider: "tavily",
    message: "Tavily returned unexpected response shape.",
    transient: false,
    cause: value,
  });
}
167
+
168
/**
 * Normalizes a raw Tavily result list.
 *
 * @param results Raw `results` value; anything that is not an array yields `[]`.
 * @param includeContent Forwarded to `normalizeResult`.
 * @returns The entries accepted by `normalizeResult`, in their original order.
 */
function normalizeResults(results: unknown, includeContent: boolean): SearchResult[] {
  if (!Array.isArray(results)) return [];

  return results.flatMap((item): SearchResult[] => {
    const accepted = normalizeResult(item, includeContent);
    return accepted ? [accepted] : [];
  });
}
180
+
181
/**
 * Maps one raw Tavily entry to a search result.
 *
 * The snippet is taken from `content`, falling back to `raw_content`, and
 * passed through `truncateSnippet` with a limit of 320; when that yields a
 * falsy value the placeholder "[No snippet available]" is used.
 *
 * @param item Raw entry of unknown shape.
 * @param includeContent When true, a non-blank `raw_content` is attached
 *   (trimmed) as `content`.
 * @returns The result, or `undefined` when the entry is not an object or
 *   lacks a non-blank string `title` or `url`. `sourceDomain` and
 *   `publishedAt` are set only when their helpers return a truthy value.
 */
function normalizeResult(item: unknown, includeContent: boolean): SearchResult | undefined {
  if (!isPlainObject(item)) return undefined;

  const title = typeof item.title === "string" ? item.title.trim() : "";
  const url = typeof item.url === "string" ? item.url.trim() : "";
  if (title === "" || url === "") return undefined;

  // Returns the untrimmed text when it is a non-blank string.
  const nonBlank = (candidate: unknown): string | undefined =>
    typeof candidate === "string" && candidate.trim() ? candidate : undefined;

  const rawContent = nonBlank(item.raw_content);
  const snippetSource = nonBlank(item.content) ?? rawContent ?? "";

  const sourceDomain = hostnameFromUrl(url);

  const result: SearchResult = {
    title,
    url,
    snippet: truncateSnippet(snippetSource, 320) || "[No snippet available]",
  };

  if (sourceDomain) {
    result.sourceDomain = sourceDomain;
  }

  const publishedAt =
    typeof item.published_date === "string" ? normalizeIsoDate(item.published_date) : undefined;
  if (publishedAt) {
    result.publishedAt = publishedAt;
  }

  if (includeContent && rawContent !== undefined) {
    result.content = rawContent.trim();
  }

  return result;
}
220
+
221
/**
 * Type guard separating keyed objects from primitives, `null` and arrays.
 *
 * @param value Value to classify.
 * @returns `true` for any non-null value of type "object" that is not an array.
 */
function isPlainObject(value: unknown): value is Record<string, unknown> {
  const isObjectLike = typeof value === "object" && value !== null;
  return isObjectLike ? !Array.isArray(value) : false;
}
224
+
225
/**
 * Builds a non-transient `ProviderError` attributed to "tavily".
 *
 * @param message Error message to carry.
 * @returns The constructed error; it is returned, not thrown.
 */
export function createTavilyConfigError(message: string): ProviderError {
  const details = {
    provider: "tavily" as const,
    message,
    transient: false,
  };
  return new ProviderError(details);
}
package/src/types.ts ADDED
@@ -0,0 +1,131 @@
1
// Identifiers a search provider may report as its `name` (see SearchProvider).
+ export type SearchProviderName = "brave" | "serper" | "tavily" | "exa";
2
+
3
// Identifiers a fetch provider may report as its `name` (see FetchProvider).
+ export type FetchProviderName = "jina" | "firecrawl";
4
+
5
// Any provider identifier, search or fetch.
+ export type ProviderName = SearchProviderName | FetchProviderName;
6
+
7
// Search depth levels; used for the requested and served depth fields of
// ResolvedSearchProviders and FormatSearchResultsArgs.
+ export type SearchDepth = "basic" | "thorough";
8
+
9
// Feature flags a search provider can advertise in its `capabilities` set.
+ export type SearchCapability =
10
+ | "search"
11
+ | "content"
12
+ | "semantic"
13
+ | "freshness"
14
+ | "domainFilter"
15
+ | "resultDates";
16
+
17
// Allowed values for the optional `freshness` search argument.
+ export type SearchFreshness = "day" | "week" | "month" | "year";
18
+
19
// One search hit. `title`, `url` and `snippet` are always present; the
// remaining fields are optional.
+ export interface SearchResult {
20
+ title: string;
21
+ url: string;
22
+ snippet: string;
23
+ sourceDomain?: string;
24
+ publishedAt?: string;
25
+ content?: string;
26
+ }
27
+
28
// Per-filter report of how a provider handled a requested filter. A field is
// omitted when there is nothing to report for that filter.
+ export interface AppliedFilters {
29
+ freshness?: "native" | "approximate" | "not_applied";
30
+ domains?: "native" | "query_rewrite" | "fanout_merge" | "not_applied";
31
+ }
32
+
33
// Value a SearchProvider's `search` resolves to: the results plus an optional
// filter report and optional free-text notes.
+ export interface ProviderSearchResponse {
34
+ results: SearchResult[];
35
+ appliedFilters?: AppliedFilters;
36
+ notes?: string[];
37
+ }
38
+
39
// A ProviderSearchResponse tagged with a search provider name
// (presumably the provider that served it — confirm against callers).
+ export interface SearchResponse extends ProviderSearchResponse {
40
+ provider: SearchProviderName;
41
+ }
42
+
43
// Fetch result carrying the same paging fields as PaginatedContent (offset,
// returnedChars, totalChars, nextOffset, hasMore) plus the URL and a fetch
// provider name.
// NOTE(review): presumably one character-offset page of a fetched document — confirm against callers.
+ export interface FetchResponse {
44
+ url: string;
45
+ content: string;
46
+ offset: number;
47
+ returnedChars: number;
48
+ totalChars: number;
49
+ nextOffset?: number;
50
+ hasMore: boolean;
51
+ provider: FetchProviderName;
52
+ }
53
+
54
// Arguments passed to SearchProvider.search. `freshness` and `domains` are
// optional filters; `signal` is the caller's abort signal.
+ export interface SearchProviderArgs {
55
+ query: string;
56
+ maxResults: number;
57
+ includeContent: boolean;
58
+ freshness?: SearchFreshness;
59
+ domains?: string[];
60
+ signal: AbortSignal;
61
+ }
62
+
63
// Contract implemented by each search provider: a name, a read-only
// capability set, and an asynchronous `search`.
+ export interface SearchProvider {
64
+ name: SearchProviderName;
65
+ capabilities: ReadonlySet<SearchCapability>;
66
+ search(args: SearchProviderArgs): Promise<ProviderSearchResponse>;
67
+ }
68
+
69
// Contract implemented by each fetch provider: a name and an asynchronous
// `fetch` that takes a URL and an abort signal and resolves to a string.
+ export interface FetchProvider {
70
+ name: FetchProviderName;
71
+ fetch(url: string, signal: AbortSignal): Promise<string>;
72
+ }
73
+
74
// Optional provider preferences; each may be absent or explicitly null.
// NOTE(review): how null differs from absent is not visible here — confirm in the config loader.
+ export interface WebSearchSettings {
75
+ preferredBasicProvider?: SearchProviderName | null;
76
+ preferredThoroughProvider?: SearchProviderName | null;
77
+ preferredFetchProvider?: FetchProviderName | null;
78
+ }
79
+
80
// Keys under which API keys are stored in LoadedConfig.apiKeys
// (presumably environment variable names — confirm in the config loader).
+ export type ApiKeyEnvName =
81
+ | "BRAVE_API_KEY"
82
+ | "SERPER_API_KEY"
83
+ | "TAVILY_API_KEY"
84
+ | "EXA_API_KEY"
85
+ | "JINA_API_KEY"
86
+ | "FIRECRAWL_API_KEY";
87
+
88
// Configuration bundle: the API keys that are present (any subset of
// ApiKeyEnvName), the settings, and a list of warning messages.
+ export interface LoadedConfig {
89
+ apiKeys: Partial<Record<ApiKeyEnvName, string>>;
90
+ settings: WebSearchSettings;
91
+ warnings: string[];
92
+ }
93
+
94
// Provider instances keyed by provider name; either map may hold any subset
// of the known names.
// NOTE(review): `hasAnySearchProvider` presumably mirrors whether `search` is non-empty — confirm where it is set.
+ export interface InitializedProviders {
95
+ search: Partial<Record<SearchProviderName, SearchProvider>>;
96
+ fetch: Partial<Record<FetchProviderName, FetchProvider>>;
97
+ hasAnySearchProvider: boolean;
98
+ }
99
+
100
// A list of search providers together with a served depth and notes.
// NOTE(review): presumably the outcome of provider selection, with `providers` in try order — confirm against the resolver.
+ export interface ResolvedSearchProviders {
101
+ providers: SearchProvider[];
102
+ servedDepth: SearchDepth;
103
+ notes: string[];
104
+ }
105
+
106
// A slice of text with paging metadata.
// NOTE(review): presumably `offset`/`nextOffset` are character positions and `nextOffset` is set only when `hasMore` is true — confirm against the paginator.
+ export interface PaginatedContent {
107
+ text: string;
108
+ offset: number;
109
+ returnedChars: number;
110
+ totalChars: number;
111
+ nextOffset?: number;
112
+ hasMore: boolean;
113
+ }
114
+
115
// Input bundle for formatting search results: the results and provider name,
// the requested and served depth, and the optional filters, filter report and notes.
// NOTE(review): the consuming formatter is not visible here.
+ export interface FormatSearchResultsArgs {
116
+ results: SearchResult[];
117
+ provider: SearchProviderName;
118
+ requestedDepth: SearchDepth;
119
+ servedDepth: SearchDepth;
120
+ freshness?: SearchFreshness;
121
+ domains?: string[];
122
+ appliedFilters?: AppliedFilters;
123
+ notes?: string[];
124
+ }
125
+
126
// Cached page record: URL, content, fetch provider name and a numeric fetch time.
// NOTE(review): the unit of `fetchedAt` (presumably epoch milliseconds) is not visible here — confirm where entries are written.
+ export interface PageCacheEntry {
127
+ url: string;
128
+ content: string;
129
+ provider: FetchProviderName;
130
+ fetchedAt: number;
131
+ }