@vertana/context-web 0.1.0-dev.11 → 0.1.0-dev.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -5
- package/dist/index.cjs +3 -1
- package/dist/index.d.cts +2 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.js +2 -1
- package/dist/search.cjs +221 -0
- package/dist/search.d.cts +21 -0
- package/dist/search.d.ts +21 -0
- package/dist/search.js +221 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -16,11 +16,13 @@ linked pages to provide additional context for translation.
|
|
|
16
16
|
Features
|
|
17
17
|
--------
|
|
18
18
|
|
|
19
|
-
-
|
|
19
|
+
- `fetchWebPage`: A passive context source that fetches a single URL
|
|
20
20
|
and extracts the main content using Mozilla's Readability algorithm.
|
|
21
|
-
-
|
|
21
|
+
- `fetchLinkedPages`: A required context source factory that extracts
|
|
22
22
|
all links from the source text and fetches their content.
|
|
23
|
-
-
|
|
23
|
+
- `searchWeb`: A passive context source that performs a web search
|
|
24
|
+
(DuckDuckGo Lite) and returns a list of results (title, URL, snippet).
|
|
25
|
+
- `extractLinks`: A utility function to extract URLs from text
|
|
24
26
|
in various formats (plain text, Markdown, HTML).
|
|
25
27
|
|
|
26
28
|
|
|
@@ -51,7 +53,7 @@ Usage
|
|
|
51
53
|
|
|
52
54
|
~~~~ typescript
|
|
53
55
|
import { translate } from "@vertana/facade";
|
|
54
|
-
import { fetchLinkedPages, fetchWebPage } from "@vertana/context-web";
|
|
56
|
+
import { fetchLinkedPages, fetchWebPage, searchWeb } from "@vertana/context-web";
|
|
55
57
|
import { openai } from "@ai-sdk/openai";
|
|
56
58
|
|
|
57
59
|
const text = `
|
|
@@ -63,7 +65,8 @@ const result = await translate(openai("gpt-4o"), "ko", text, {
|
|
|
63
65
|
contextSources: [
|
|
64
66
|
// Automatically fetch all links in the text
|
|
65
67
|
fetchLinkedPages({ text, mediaType: "text/plain" }),
|
|
66
|
-
// Allow LLM to fetch
|
|
68
|
+
// Allow LLM to search the web and fetch URLs on demand
|
|
69
|
+
searchWeb,
|
|
67
70
|
fetchWebPage,
|
|
68
71
|
],
|
|
69
72
|
});
|
package/dist/index.cjs
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
const require_extract_links = require('./extract-links.cjs');
|
|
2
2
|
const require_fetch = require('./fetch.cjs');
|
|
3
|
+
const require_search = require('./search.cjs');
|
|
3
4
|
|
|
4
5
|
exports.extractContent = require_fetch.extractContent;
|
|
5
6
|
exports.extractLinks = require_extract_links.extractLinks;
|
|
6
7
|
exports.fetchLinkedPages = require_fetch.fetchLinkedPages;
|
|
7
|
-
exports.fetchWebPage = require_fetch.fetchWebPage;
|
|
8
|
+
exports.fetchWebPage = require_fetch.fetchWebPage;
|
|
9
|
+
exports.searchWeb = require_search.searchWeb;
|
package/dist/index.d.cts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
1
|
import { MediaType, extractLinks } from "./extract-links.cjs";
|
|
2
2
|
import { ExtractedContent, FetchLinkedPagesOptions, extractContent, fetchLinkedPages, fetchWebPage } from "./fetch.cjs";
|
|
3
|
-
|
|
3
|
+
import { searchWeb } from "./search.cjs";
|
|
4
|
+
export { type ExtractedContent, type FetchLinkedPagesOptions, type MediaType, extractContent, extractLinks, fetchLinkedPages, fetchWebPage, searchWeb };
|
package/dist/index.d.ts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
1
|
import { MediaType, extractLinks } from "./extract-links.js";
|
|
2
2
|
import { ExtractedContent, FetchLinkedPagesOptions, extractContent, fetchLinkedPages, fetchWebPage } from "./fetch.js";
|
|
3
|
-
|
|
3
|
+
import { searchWeb } from "./search.js";
|
|
4
|
+
export { type ExtractedContent, type FetchLinkedPagesOptions, type MediaType, extractContent, extractLinks, fetchLinkedPages, fetchWebPage, searchWeb };
|
package/dist/index.js
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { extractLinks } from "./extract-links.js";
|
|
2
2
|
import { extractContent, fetchLinkedPages, fetchWebPage } from "./fetch.js";
|
|
3
|
+
import { searchWeb } from "./search.js";
|
|
3
4
|
|
|
4
|
-
export { extractContent, extractLinks, fetchLinkedPages, fetchWebPage };
|
|
5
|
+
export { extractContent, extractLinks, fetchLinkedPages, fetchWebPage, searchWeb };
|
package/dist/search.cjs
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
let _logtape_logtape = require("@logtape/logtape");
|
|
2
|
+
let zod = require("zod");
|
|
3
|
+
let htmlparser2 = require("htmlparser2");
|
|
4
|
+
|
|
5
|
+
//#region src/search.ts
|
|
6
|
+
// LogTape logger scoped to the ["vertana", "context-web", "search"] category.
const logger = (0, _logtape_logtape.getLogger)(["vertana", "context-web", "search"]);
|
|
11
|
+
/**
 * Resolves a DuckDuckGo redirect link (`//duckduckgo.com/l/?uddg=…`) to the
 * target URL it wraps.
 *
 * @param href The raw `href` attribute value of a result link.
 * @returns The wrapped target URL when `href` is a DuckDuckGo redirect; the
 *   trimmed `href` itself when it points anywhere else; or `null` when `href`
 *   is not an absolute URL or the redirect carries no valid target.
 */
function unwrapDuckDuckGoRedirectUrl(href) {
  const trimmed = href.trim();
  // A protocol-relative href (`//host/…`) has no scheme; prepend one so that
  // the URL constructor accepts it.
  const normalized = trimmed.startsWith("//") ? `https:${trimmed}` : trimmed;
  let parsed;
  try {
    parsed = new URL(normalized);
  } catch {
    return null;
  }
  if (!/(^|\.)duckduckgo\.com$/i.test(parsed.hostname)) return trimmed;
  if (parsed.pathname !== "/l/") return trimmed;
  // URLSearchParams.get() already percent-decodes the value.  Decoding it a
  // second time would corrupt targets containing escaped characters
  // (`%2520` → `%20` → space) and throw on a literal `%`.
  const target = parsed.searchParams.get("uddg");
  if (target == null || target.length === 0) return null;
  try {
    // Only used to validate that the target is an absolute URL.
    new URL(target);
  } catch {
    return null;
  }
  return target;
}
|
|
32
|
+
/**
 * Tells whether a parsed DOM node is a regular element.
 *
 * @param node A node from the parsed document tree.
 * @returns `true` when `node.type` is `"tag"`, otherwise `false`.
 */
function isElement(node) {
  const { type } = node;
  return type === "tag";
}
|
|
35
|
+
/**
 * Concatenates the text of a node and all of its descendants.
 *
 * @param node A node from the parsed document tree.
 * @returns The node's `data` for a text node, the joined text of all children
 *   for an element, and an empty string for any other node type.
 */
function getTextContent(node) {
  if (node.type === "text") return node.data;
  if (!isElement(node)) return "";
  const parts = [];
  for (const child of node.children) parts.push(getTextContent(child));
  return parts.join("");
}
|
|
40
|
+
/**
 * Checks whether an element's `class` attribute lists the given class name.
 *
 * @param element An element node with an `attribs` record.
 * @param className The class name to look for (compared as a whole token).
 * @returns `true` when one of the whitespace-separated class tokens equals
 *   `className`; `false` otherwise, including when there is no `class`
 *   attribute.
 */
function hasClass(element, className) {
  const classAttr = element.attribs.class;
  return classAttr != null && classAttr.split(/\s+/).some((token) => token === className);
}
|
|
45
|
+
/**
 * Collects every element with the given tag name, in document order.
 *
 * Only element nodes are descended into; matches nested inside other matches
 * are included as well.
 *
 * @param doc The parsed document (anything with a `children` array).
 * @param tagName The tag name to match against each element's `name`.
 * @returns The matching elements in pre-order (document) order.
 */
function collectElementsByTagName(doc, tagName) {
  const matches = [];
  // Explicit stack; children are pushed in reverse so that they are popped
  // (and therefore visited) in document order.
  const pending = [...doc.children].reverse();
  while (pending.length > 0) {
    const node = pending.pop();
    if (!isElement(node)) continue;
    if (node.name === tagName) matches.push(node);
    for (let i = node.children.length - 1; i >= 0; i--) {
      pending.push(node.children[i]);
    }
  }
  return matches;
}
|
|
56
|
+
/**
 * Finds the first `<a>` element carrying the given class, searching the node
 * itself and then its descendants depth-first.
 *
 * @param node The node to start searching from.
 * @param className The class the anchor must have.
 * @returns The first matching anchor element, or `null` when there is none.
 */
function findFirstAnchorWithClass(node, className) {
  if (!isElement(node)) return null;
  if (node.name === "a" && hasClass(node, className)) return node;
  for (const child of node.children) {
    const match = findFirstAnchorWithClass(child, className);
    if (match != null) return match;
  }
  return null;
}
|
|
64
|
+
/**
 * Finds the trimmed text of the first element carrying the given class,
 * searching the node itself and then its descendants depth-first.
 *
 * An element that has the class but only blank text yields `null` for that
 * element; its descendants are not searched.
 *
 * @param node The node to start searching from.
 * @param className The class the element must have.
 * @returns The non-empty trimmed text, or `null` when nothing was found.
 */
function findFirstTextByClass(node, className) {
  if (!isElement(node)) return null;
  if (hasClass(node, className)) {
    const text = getTextContent(node).trim();
    return text.length === 0 ? null : text;
  }
  for (const child of node.children) {
    const match = findFirstTextByClass(child, className);
    if (match != null) return match;
  }
  return null;
}
|
|
75
|
+
/**
 * Extracts search results from a DuckDuckGo Lite result page.
 *
 * The parser depends on only two structural facts:
 *
 *  -  A result begins at a `<tr>` containing an `a.result-link`.
 *  -  The snippet (`.result-snippet`) and display URL (`.link-text`) of that
 *     result are looked up in the same row and in the following `<tr>`
 *     elements, stopping at the next row that begins a result.
 *
 * Stopping at the next result row prevents a snippet from being attributed to
 * the wrong result when a result has none of its own.
 *
 * @param html DuckDuckGo Lite HTML.
 * @param options Parsing options; `maxResults` caps the number of results
 *   (default: 10).
 * @returns Parsed search results, each with `title`, `url`, and optional
 *   `snippet` and `displayUrl`.
 * @since 0.1.0
 */
function parseDuckDuckGoLiteResults(html, options = {}) {
  const limit = options.maxResults ?? 10;
  if (limit <= 0) return [];
  const dom = (0, htmlparser2.parseDocument)(html, {
    lowerCaseTags: true,
    lowerCaseAttributeNames: true
  });
  const rows = collectElementsByTagName(dom, "tr");
  const startsResult = (row) => findFirstAnchorWithClass(row, "result-link") != null;
  const results = [];
  for (const [index, row] of rows.entries()) {
    const anchor = findFirstAnchorWithClass(row, "result-link");
    if (anchor == null) continue;
    const title = getTextContent(anchor).trim();
    const href = anchor.attribs.href?.trim();
    if (title.length === 0 || href == null || href.length === 0) continue;
    let snippet = null;
    let displayUrl = null;
    // Scan this row and the ones after it, up to the next result row.
    for (let next = index; next < rows.length; next++) {
      const candidate = rows[next];
      if (next > index && startsResult(candidate)) break;
      if (snippet == null) snippet = findFirstTextByClass(candidate, "result-snippet");
      if (displayUrl == null) displayUrl = findFirstTextByClass(candidate, "link-text");
      if (snippet != null && displayUrl != null) break;
    }
    results.push({
      title,
      // Fall back to the raw href when the redirect cannot be unwrapped.
      url: unwrapDuckDuckGoRedirectUrl(href) ?? href,
      snippet: snippet ?? undefined,
      displayUrl: displayUrl ?? undefined
    });
    if (results.length >= limit) break;
  }
  return results;
}
|
|
125
|
+
/**
 * A passive context source that performs a web search using DuckDuckGo Lite.
 *
 * This source returns a list of search results (title, URL, snippet) and does
 * not fetch the target pages themselves.  Combine with {@link fetchWebPage} if
 * you want to retrieve a specific result in detail.
 *
 * Failures of the request itself (HTTP errors, network failures, and
 * cancellation) are reported through the returned `content` text and
 * `metadata.success === false` rather than thrown.
 *
 * @since 0.1.0
 */
const searchWeb = {
  name: "search-web",
  description: "Searches the web (DuckDuckGo Lite) and returns a list of results with titles, URLs, and snippets. Use this to quickly find relevant pages, then fetch a specific page separately if needed.",
  mode: "passive",
  parameters: zod.z.object({
    query: zod.z.string().min(1).describe("The search query keyword(s)"),
    maxResults: zod.z.number().int().positive().max(50).optional().describe("Maximum number of results to return (default: 10)"),
    region: zod.z.string().optional().describe("DuckDuckGo region (kl) parameter, e.g. 'kr-kr' or 'us-en'"),
    timeRange: zod.z.enum([
      "d",
      "w",
      "m",
      "y"
    ]).optional().describe("Time range filter (df): d=day, w=week, m=month, y=year")
  }),
  /**
   * Runs the search and formats the results as text.
   *
   * @param params Search parameters matching `parameters`.
   * @param options Optional settings; `options.signal` cancels the request.
   * @returns An object with the formatted `content` and a `metadata` record
   *   (`query`, `success`, plus `resultCount`/`urls`, `status`, or `aborted`
   *   depending on the outcome).
   */
  async gather(params, options) {
    const maxResults = params.maxResults ?? 10;
    const url = new URL("https://lite.duckduckgo.com/lite/");
    url.searchParams.set("q", params.query);
    const region = params.region?.trim();
    if (region != null && region.length > 0) url.searchParams.set("kl", region);
    if (params.timeRange != null) url.searchParams.set("df", params.timeRange);
    logger.debug("Searching DuckDuckGo Lite: {url}", { url: url.toString() });
    try {
      const response = await fetch(url, {
        signal: options?.signal,
        headers: {
          "User-Agent": "Mozilla/5.0 (compatible; Vertana/0.1; +https://vertana.org)",
          Accept: "text/html,application/xhtml+xml"
        }
      });
      if (!response.ok) return {
        content: `Failed to search the web. Status: ${response.status}`,
        metadata: {
          query: params.query,
          success: false,
          status: response.status
        }
      };
      const results = parseDuckDuckGoLiteResults(await response.text(), { maxResults });
      return {
        content: formatSearchResults(params.query, results),
        metadata: {
          query: params.query,
          resultCount: results.length,
          urls: results.map((r) => r.url),
          success: true
        }
      };
    } catch (error) {
      // fetch() rejects with the signal's abort *reason*.  That is an
      // "AbortError" only when abort() was called without an argument; a
      // timeout signal or a custom reason produces something else, so the
      // signal's own state is checked as well.
      const aborted = options?.signal?.aborted === true || (error instanceof Error && error.name === "AbortError");
      if (aborted) return {
        content: "Search aborted.",
        metadata: {
          query: params.query,
          success: false,
          aborted: true
        }
      };
      return {
        content: `Failed to search the web. Error: ${String(error)}`,
        metadata: {
          query: params.query,
          success: false
        }
      };
    }
  }
};
|
|
201
|
+
/**
 * Renders search results as a Markdown-like text block.
 *
 * Titles, display URLs, snippets, and the query are taken from HTML text or
 * caller input and may contain newlines or runs of whitespace; these are
 * collapsed to single spaces so that each heading and field stays on one line.
 *
 * @param query The search query the results belong to.
 * @param results The parsed results (`title`, `url`, optional `displayUrl`
 *   and `snippet`).
 * @returns The formatted text, or a "no results" message for an empty list.
 */
function formatSearchResults(query, results) {
  const singleLine = (text) => text.replace(/\s+/g, " ").trim();
  if (results.length === 0) return `No web search results found for: ${singleLine(query)}`;
  const lines = [`# Web search results: ${singleLine(query)}`, ""];
  results.forEach((result, index) => {
    // Blank line between consecutive results (but not after the last one).
    if (index > 0) lines.push("");
    lines.push(`## ${index + 1}. ${singleLine(result.title)}`);
    lines.push(`URL: ${result.url}`);
    if (result.displayUrl != null) lines.push(`Display: ${singleLine(result.displayUrl)}`);
    if (result.snippet != null) lines.push("", singleLine(result.snippet));
  });
  return lines.join("\n");
}
|
|
219
|
+
|
|
220
|
+
//#endregion
|
|
221
|
+
exports.searchWeb = searchWeb;
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { PassiveContextSource } from "@vertana/core/context";
|
|
2
|
+
|
|
3
|
+
//#region src/search.d.ts
|
|
4
|
+
/** Parameters accepted by the `searchWeb` context source. */
interface SearchWebParams {
|
|
5
|
+
/** The search query keyword(s). */
readonly query: string;
|
|
6
|
+
/** Maximum number of results to return. */
readonly maxResults?: number;
|
|
7
|
+
/** DuckDuckGo region code, e.g. `kr-kr` or `us-en`. */
readonly region?: string;
|
|
8
|
+
/** Time range filter: `d` = day, `w` = week, `m` = month, `y` = year. */
readonly timeRange?: "d" | "w" | "m" | "y";
|
|
9
|
+
}
|
|
10
|
+
/**
 * A passive context source that performs a web search using DuckDuckGo Lite.
 *
 * This source returns a list of search results (title, URL, snippet) and does
 * not fetch the target pages themselves. Combine with {@link fetchWebPage} if
 * you want to retrieve a specific result in detail.
 *
 * @since 0.1.0
 */
|
|
19
|
+
declare const searchWeb: PassiveContextSource<SearchWebParams>;
|
|
20
|
+
//#endregion
|
|
21
|
+
export { searchWeb };
|
package/dist/search.d.ts
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { PassiveContextSource } from "@vertana/core/context";
|
|
2
|
+
|
|
3
|
+
//#region src/search.d.ts
|
|
4
|
+
/** Parameters accepted by the `searchWeb` context source. */
interface SearchWebParams {
|
|
5
|
+
/** The search query keyword(s). */
readonly query: string;
|
|
6
|
+
/** Maximum number of results to return. */
readonly maxResults?: number;
|
|
7
|
+
/** DuckDuckGo region code, e.g. `kr-kr` or `us-en`. */
readonly region?: string;
|
|
8
|
+
/** Time range filter: `d` = day, `w` = week, `m` = month, `y` = year. */
readonly timeRange?: "d" | "w" | "m" | "y";
|
|
9
|
+
}
|
|
10
|
+
/**
 * A passive context source that performs a web search using DuckDuckGo Lite.
 *
 * This source returns a list of search results (title, URL, snippet) and does
 * not fetch the target pages themselves. Combine with {@link fetchWebPage} if
 * you want to retrieve a specific result in detail.
 *
 * @since 0.1.0
 */
|
|
19
|
+
declare const searchWeb: PassiveContextSource<SearchWebParams>;
|
|
20
|
+
//#endregion
|
|
21
|
+
export { searchWeb };
|
package/dist/search.js
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
import { getLogger } from "@logtape/logtape";
|
|
2
|
+
import { z } from "zod";
|
|
3
|
+
import { parseDocument } from "htmlparser2";
|
|
4
|
+
|
|
5
|
+
//#region src/search.ts
|
|
6
|
+
// LogTape logger scoped to the ["vertana", "context-web", "search"] category.
const logger = getLogger(["vertana", "context-web", "search"]);
|
|
11
|
+
/**
 * Resolves a DuckDuckGo redirect link (`//duckduckgo.com/l/?uddg=…`) to the
 * target URL it wraps.
 *
 * @param href The raw `href` attribute value of a result link.
 * @returns The wrapped target URL when `href` is a DuckDuckGo redirect; the
 *   trimmed `href` itself when it points anywhere else; or `null` when `href`
 *   is not an absolute URL or the redirect carries no valid target.
 */
function unwrapDuckDuckGoRedirectUrl(href) {
  const trimmed = href.trim();
  // A protocol-relative href (`//host/…`) has no scheme; prepend one so that
  // the URL constructor accepts it.
  const normalized = trimmed.startsWith("//") ? `https:${trimmed}` : trimmed;
  let parsed;
  try {
    parsed = new URL(normalized);
  } catch {
    return null;
  }
  if (!/(^|\.)duckduckgo\.com$/i.test(parsed.hostname)) return trimmed;
  if (parsed.pathname !== "/l/") return trimmed;
  // URLSearchParams.get() already percent-decodes the value.  Decoding it a
  // second time would corrupt targets containing escaped characters
  // (`%2520` → `%20` → space) and throw on a literal `%`.
  const target = parsed.searchParams.get("uddg");
  if (target == null || target.length === 0) return null;
  try {
    // Only used to validate that the target is an absolute URL.
    new URL(target);
  } catch {
    return null;
  }
  return target;
}
|
|
32
|
+
/**
 * Tells whether a parsed DOM node is a regular element.
 *
 * @param node A node from the parsed document tree.
 * @returns `true` when `node.type` is `"tag"`, otherwise `false`.
 */
function isElement(node) {
  const { type } = node;
  return type === "tag";
}
|
|
35
|
+
/**
 * Concatenates the text of a node and all of its descendants.
 *
 * @param node A node from the parsed document tree.
 * @returns The node's `data` for a text node, the joined text of all children
 *   for an element, and an empty string for any other node type.
 */
function getTextContent(node) {
  if (node.type === "text") return node.data;
  if (!isElement(node)) return "";
  const parts = [];
  for (const child of node.children) parts.push(getTextContent(child));
  return parts.join("");
}
|
|
40
|
+
/**
 * Checks whether an element's `class` attribute lists the given class name.
 *
 * @param element An element node with an `attribs` record.
 * @param className The class name to look for (compared as a whole token).
 * @returns `true` when one of the whitespace-separated class tokens equals
 *   `className`; `false` otherwise, including when there is no `class`
 *   attribute.
 */
function hasClass(element, className) {
  const classAttr = element.attribs.class;
  return classAttr != null && classAttr.split(/\s+/).some((token) => token === className);
}
|
|
45
|
+
/**
 * Collects every element with the given tag name, in document order.
 *
 * Only element nodes are descended into; matches nested inside other matches
 * are included as well.
 *
 * @param doc The parsed document (anything with a `children` array).
 * @param tagName The tag name to match against each element's `name`.
 * @returns The matching elements in pre-order (document) order.
 */
function collectElementsByTagName(doc, tagName) {
  const matches = [];
  // Explicit stack; children are pushed in reverse so that they are popped
  // (and therefore visited) in document order.
  const pending = [...doc.children].reverse();
  while (pending.length > 0) {
    const node = pending.pop();
    if (!isElement(node)) continue;
    if (node.name === tagName) matches.push(node);
    for (let i = node.children.length - 1; i >= 0; i--) {
      pending.push(node.children[i]);
    }
  }
  return matches;
}
|
|
56
|
+
/**
 * Finds the first `<a>` element carrying the given class, searching the node
 * itself and then its descendants depth-first.
 *
 * @param node The node to start searching from.
 * @param className The class the anchor must have.
 * @returns The first matching anchor element, or `null` when there is none.
 */
function findFirstAnchorWithClass(node, className) {
  if (!isElement(node)) return null;
  if (node.name === "a" && hasClass(node, className)) return node;
  for (const child of node.children) {
    const match = findFirstAnchorWithClass(child, className);
    if (match != null) return match;
  }
  return null;
}
|
|
64
|
+
/**
 * Finds the trimmed text of the first element carrying the given class,
 * searching the node itself and then its descendants depth-first.
 *
 * An element that has the class but only blank text yields `null` for that
 * element; its descendants are not searched.
 *
 * @param node The node to start searching from.
 * @param className The class the element must have.
 * @returns The non-empty trimmed text, or `null` when nothing was found.
 */
function findFirstTextByClass(node, className) {
  if (!isElement(node)) return null;
  if (hasClass(node, className)) {
    const text = getTextContent(node).trim();
    return text.length === 0 ? null : text;
  }
  for (const child of node.children) {
    const match = findFirstTextByClass(child, className);
    if (match != null) return match;
  }
  return null;
}
|
|
75
|
+
/**
 * Extracts search results from a DuckDuckGo Lite result page.
 *
 * The parser depends on only two structural facts:
 *
 *  -  A result begins at a `<tr>` containing an `a.result-link`.
 *  -  The snippet (`.result-snippet`) and display URL (`.link-text`) of that
 *     result are looked up in the same row and in the following `<tr>`
 *     elements, stopping at the next row that begins a result.
 *
 * Stopping at the next result row prevents a snippet from being attributed to
 * the wrong result when a result has none of its own.
 *
 * @param html DuckDuckGo Lite HTML.
 * @param options Parsing options; `maxResults` caps the number of results
 *   (default: 10).
 * @returns Parsed search results, each with `title`, `url`, and optional
 *   `snippet` and `displayUrl`.
 * @since 0.1.0
 */
function parseDuckDuckGoLiteResults(html, options = {}) {
  const limit = options.maxResults ?? 10;
  if (limit <= 0) return [];
  const dom = parseDocument(html, {
    lowerCaseTags: true,
    lowerCaseAttributeNames: true
  });
  const rows = collectElementsByTagName(dom, "tr");
  const startsResult = (row) => findFirstAnchorWithClass(row, "result-link") != null;
  const results = [];
  for (const [index, row] of rows.entries()) {
    const anchor = findFirstAnchorWithClass(row, "result-link");
    if (anchor == null) continue;
    const title = getTextContent(anchor).trim();
    const href = anchor.attribs.href?.trim();
    if (title.length === 0 || href == null || href.length === 0) continue;
    let snippet = null;
    let displayUrl = null;
    // Scan this row and the ones after it, up to the next result row.
    for (let next = index; next < rows.length; next++) {
      const candidate = rows[next];
      if (next > index && startsResult(candidate)) break;
      if (snippet == null) snippet = findFirstTextByClass(candidate, "result-snippet");
      if (displayUrl == null) displayUrl = findFirstTextByClass(candidate, "link-text");
      if (snippet != null && displayUrl != null) break;
    }
    results.push({
      title,
      // Fall back to the raw href when the redirect cannot be unwrapped.
      url: unwrapDuckDuckGoRedirectUrl(href) ?? href,
      snippet: snippet ?? undefined,
      displayUrl: displayUrl ?? undefined
    });
    if (results.length >= limit) break;
  }
  return results;
}
|
|
125
|
+
/**
 * A passive context source that performs a web search using DuckDuckGo Lite.
 *
 * This source returns a list of search results (title, URL, snippet) and does
 * not fetch the target pages themselves.  Combine with {@link fetchWebPage} if
 * you want to retrieve a specific result in detail.
 *
 * Failures of the request itself (HTTP errors, network failures, and
 * cancellation) are reported through the returned `content` text and
 * `metadata.success === false` rather than thrown.
 *
 * @since 0.1.0
 */
const searchWeb = {
  name: "search-web",
  description: "Searches the web (DuckDuckGo Lite) and returns a list of results with titles, URLs, and snippets. Use this to quickly find relevant pages, then fetch a specific page separately if needed.",
  mode: "passive",
  parameters: z.object({
    query: z.string().min(1).describe("The search query keyword(s)"),
    maxResults: z.number().int().positive().max(50).optional().describe("Maximum number of results to return (default: 10)"),
    region: z.string().optional().describe("DuckDuckGo region (kl) parameter, e.g. 'kr-kr' or 'us-en'"),
    timeRange: z.enum([
      "d",
      "w",
      "m",
      "y"
    ]).optional().describe("Time range filter (df): d=day, w=week, m=month, y=year")
  }),
  /**
   * Runs the search and formats the results as text.
   *
   * @param params Search parameters matching `parameters`.
   * @param options Optional settings; `options.signal` cancels the request.
   * @returns An object with the formatted `content` and a `metadata` record
   *   (`query`, `success`, plus `resultCount`/`urls`, `status`, or `aborted`
   *   depending on the outcome).
   */
  async gather(params, options) {
    const maxResults = params.maxResults ?? 10;
    const url = new URL("https://lite.duckduckgo.com/lite/");
    url.searchParams.set("q", params.query);
    const region = params.region?.trim();
    if (region != null && region.length > 0) url.searchParams.set("kl", region);
    if (params.timeRange != null) url.searchParams.set("df", params.timeRange);
    logger.debug("Searching DuckDuckGo Lite: {url}", { url: url.toString() });
    try {
      const response = await fetch(url, {
        signal: options?.signal,
        headers: {
          "User-Agent": "Mozilla/5.0 (compatible; Vertana/0.1; +https://vertana.org)",
          Accept: "text/html,application/xhtml+xml"
        }
      });
      if (!response.ok) return {
        content: `Failed to search the web. Status: ${response.status}`,
        metadata: {
          query: params.query,
          success: false,
          status: response.status
        }
      };
      const results = parseDuckDuckGoLiteResults(await response.text(), { maxResults });
      return {
        content: formatSearchResults(params.query, results),
        metadata: {
          query: params.query,
          resultCount: results.length,
          urls: results.map((r) => r.url),
          success: true
        }
      };
    } catch (error) {
      // fetch() rejects with the signal's abort *reason*.  That is an
      // "AbortError" only when abort() was called without an argument; a
      // timeout signal or a custom reason produces something else, so the
      // signal's own state is checked as well.
      const aborted = options?.signal?.aborted === true || (error instanceof Error && error.name === "AbortError");
      if (aborted) return {
        content: "Search aborted.",
        metadata: {
          query: params.query,
          success: false,
          aborted: true
        }
      };
      return {
        content: `Failed to search the web. Error: ${String(error)}`,
        metadata: {
          query: params.query,
          success: false
        }
      };
    }
  }
};
|
|
201
|
+
/**
 * Renders search results as a Markdown-like text block.
 *
 * Titles, display URLs, snippets, and the query are taken from HTML text or
 * caller input and may contain newlines or runs of whitespace; these are
 * collapsed to single spaces so that each heading and field stays on one line.
 *
 * @param query The search query the results belong to.
 * @param results The parsed results (`title`, `url`, optional `displayUrl`
 *   and `snippet`).
 * @returns The formatted text, or a "no results" message for an empty list.
 */
function formatSearchResults(query, results) {
  const singleLine = (text) => text.replace(/\s+/g, " ").trim();
  if (results.length === 0) return `No web search results found for: ${singleLine(query)}`;
  const lines = [`# Web search results: ${singleLine(query)}`, ""];
  results.forEach((result, index) => {
    // Blank line between consecutive results (but not after the last one).
    if (index > 0) lines.push("");
    lines.push(`## ${index + 1}. ${singleLine(result.title)}`);
    lines.push(`URL: ${result.url}`);
    if (result.displayUrl != null) lines.push(`Display: ${singleLine(result.displayUrl)}`);
    if (result.snippet != null) lines.push("", singleLine(result.snippet));
  });
  return lines.join("\n");
}
|
|
219
|
+
|
|
220
|
+
//#endregion
|
|
221
|
+
export { searchWeb };
|