@vertana/context-web 0.1.0-dev.11 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -16,11 +16,13 @@ linked pages to provide additional context for translation.
16
16
  Features
17
17
  --------
18
18
 
19
- - **fetchWebPage**: A passive context source that fetches a single URL
19
+ - `fetchWebPage`: A passive context source that fetches a single URL
20
20
  and extracts the main content using Mozilla's Readability algorithm.
21
- - **fetchLinkedPages**: A required context source factory that extracts
21
+ - `fetchLinkedPages`: A required context source factory that extracts
22
22
  all links from the source text and fetches their content.
23
- - **extractLinks**: A utility function to extract URLs from text
23
+ - `searchWeb`: A passive context source that performs a web search
24
+ (DuckDuckGo Lite) and returns a list of results (title, URL, snippet).
25
+ - `extractLinks`: A utility function to extract URLs from text
24
26
  in various formats (plain text, Markdown, HTML).
25
27
 
26
28
 
@@ -51,7 +53,7 @@ Usage
51
53
 
52
54
  ~~~~ typescript
53
55
  import { translate } from "@vertana/facade";
54
- import { fetchLinkedPages, fetchWebPage } from "@vertana/context-web";
56
+ import { fetchLinkedPages, fetchWebPage, searchWeb } from "@vertana/context-web";
55
57
  import { openai } from "@ai-sdk/openai";
56
58
 
57
59
  const text = `
@@ -63,7 +65,8 @@ const result = await translate(openai("gpt-4o"), "ko", text, {
63
65
  contextSources: [
64
66
  // Automatically fetch all links in the text
65
67
  fetchLinkedPages({ text, mediaType: "text/plain" }),
66
- // Allow LLM to fetch additional URLs on demand
68
+ // Allow LLM to search the web and fetch URLs on demand
69
+ searchWeb,
67
70
  fetchWebPage,
68
71
  ],
69
72
  });
package/dist/index.cjs CHANGED
@@ -1,7 +1,9 @@
1
1
  const require_extract_links = require('./extract-links.cjs');
2
2
  const require_fetch = require('./fetch.cjs');
3
+ const require_search = require('./search.cjs');
3
4
 
4
5
  exports.extractContent = require_fetch.extractContent;
5
6
  exports.extractLinks = require_extract_links.extractLinks;
6
7
  exports.fetchLinkedPages = require_fetch.fetchLinkedPages;
7
- exports.fetchWebPage = require_fetch.fetchWebPage;
8
+ exports.fetchWebPage = require_fetch.fetchWebPage;
9
+ exports.searchWeb = require_search.searchWeb;
package/dist/index.d.cts CHANGED
@@ -1,3 +1,4 @@
1
1
  import { MediaType, extractLinks } from "./extract-links.cjs";
2
2
  import { ExtractedContent, FetchLinkedPagesOptions, extractContent, fetchLinkedPages, fetchWebPage } from "./fetch.cjs";
3
- export { type ExtractedContent, type FetchLinkedPagesOptions, type MediaType, extractContent, extractLinks, fetchLinkedPages, fetchWebPage };
3
+ import { searchWeb } from "./search.cjs";
4
+ export { type ExtractedContent, type FetchLinkedPagesOptions, type MediaType, extractContent, extractLinks, fetchLinkedPages, fetchWebPage, searchWeb };
package/dist/index.d.ts CHANGED
@@ -1,3 +1,4 @@
1
1
  import { MediaType, extractLinks } from "./extract-links.js";
2
2
  import { ExtractedContent, FetchLinkedPagesOptions, extractContent, fetchLinkedPages, fetchWebPage } from "./fetch.js";
3
- export { type ExtractedContent, type FetchLinkedPagesOptions, type MediaType, extractContent, extractLinks, fetchLinkedPages, fetchWebPage };
3
+ import { searchWeb } from "./search.js";
4
+ export { type ExtractedContent, type FetchLinkedPagesOptions, type MediaType, extractContent, extractLinks, fetchLinkedPages, fetchWebPage, searchWeb };
package/dist/index.js CHANGED
@@ -1,4 +1,5 @@
1
1
  import { extractLinks } from "./extract-links.js";
2
2
  import { extractContent, fetchLinkedPages, fetchWebPage } from "./fetch.js";
3
+ import { searchWeb } from "./search.js";
3
4
 
4
- export { extractContent, extractLinks, fetchLinkedPages, fetchWebPage };
5
+ export { extractContent, extractLinks, fetchLinkedPages, fetchWebPage, searchWeb };
@@ -0,0 +1,221 @@
1
+ let _logtape_logtape = require("@logtape/logtape");
2
+ let zod = require("zod");
3
+ let htmlparser2 = require("htmlparser2");
4
+
5
+ //#region src/search.ts
6
+ const logger = (0, _logtape_logtape.getLogger)([
7
+ "vertana",
8
+ "context-web",
9
+ "search"
10
+ ]);
11
/**
 * Resolves a DuckDuckGo redirect link (`…duckduckgo.com/l/?uddg=<target>`) to
 * the target URL it wraps.
 *
 * @param href Raw `href` attribute value.  Protocol-relative links
 *             (`//duckduckgo.com/…`) are interpreted as `https:`.
 * @returns The wrapped target URL for a redirect link; the trimmed input
 *          unchanged when it is an absolute URL that is not a DuckDuckGo
 *          `/l/` redirect; or `null` when the input is not an absolute URL
 *          or the redirect carries no valid absolute target.
 */
function unwrapDuckDuckGoRedirectUrl(href) {
  const trimmed = href.trim();
  const normalized = trimmed.startsWith("//") ? `https:${trimmed}` : trimmed;
  let parsed;
  try {
    parsed = new URL(normalized);
  } catch {
    return null;
  }
  if (!/(^|\.)duckduckgo\.com$/i.test(parsed.hostname)) return trimmed;
  if (parsed.pathname !== "/l/") return trimmed;
  // URLSearchParams.get() has already percent-decoded the value once.
  // Decoding it a second time would rewrite escapes that belong to the target
  // itself (e.g. `%20` or `%26`) and throws on a literal `%`.
  const target = parsed.searchParams.get("uddg");
  if (target == null || target.length === 0) return null;
  try {
    // Validation only: reject targets that are not absolute URLs.
    new URL(target);
  } catch {
    return null;
  }
  return target;
}
32
/**
 * Tells whether a parsed DOM node is a regular tag element.
 *
 * @param node A node from the parsed document.
 * @returns `true` when the node's `type` is `"tag"`.
 */
function isElement(node) {
  const { type } = node;
  return type === "tag";
}
35
/**
 * Concatenates the text found in a node and all of its descendants.
 *
 * @param node A node from the parsed document.
 * @returns The `data` of a text node, the joined text of a tag element's
 *          children, or an empty string for any other node type.
 */
function getTextContent(node) {
  if (node.type === "text") return node.data;
  if (!isElement(node)) return "";
  const parts = [];
  for (const child of node.children) parts.push(getTextContent(child));
  return parts.join("");
}
40
/**
 * Checks whether an element's `class` attribute lists the given class name.
 *
 * @param element An element exposing an `attribs` map.
 * @param className The class name to look for (exact, whitespace-separated token).
 * @returns `true` when the class is present; `false` when it is absent or
 *          the element has no `class` attribute.
 */
function hasClass(element, className) {
  const classAttr = element.attribs.class;
  return classAttr != null && classAttr.split(/\s+/).includes(className);
}
45
/**
 * Gathers every tag element with the given name, in document order.
 *
 * Only tag elements are descended into; other node types (and anything
 * nested inside them) are skipped.
 *
 * @param doc The parsed document (anything exposing `children`).
 * @param tagName The tag name to match.
 * @returns The matching elements, parents before their descendants.
 */
function collectElementsByTagName(doc, tagName) {
  const matches = [];
  // Explicit stack; children are pushed in reverse so they pop in document order.
  const pending = [...doc.children].reverse();
  while (pending.length > 0) {
    const node = pending.pop();
    if (!isElement(node)) continue;
    if (node.name === tagName) matches.push(node);
    for (let i = node.children.length - 1; i >= 0; i--) pending.push(node.children[i]);
  }
  return matches;
}
56
/**
 * Depth-first search for the first `<a>` element carrying the given class.
 *
 * @param node The node to start from (it is itself a candidate).
 * @param className The class the anchor must have.
 * @returns The first matching anchor element, or `null` when there is none.
 */
function findFirstAnchorWithClass(node, className) {
  if (!isElement(node)) return null;
  if (node.name === "a" && hasClass(node, className)) return node;
  for (const child of node.children) {
    const match = findFirstAnchorWithClass(child, className);
    if (match != null) return match;
  }
  return null;
}
64
/**
 * Depth-first search for the first element carrying the given class and
 * returns its trimmed text.
 *
 * An element that has the class ends the search of its own subtree: if its
 * text is empty the result for that subtree is `null`, and its descendants
 * are not examined.
 *
 * @param node The node to start from (it is itself a candidate).
 * @param className The class to look for.
 * @returns The trimmed, non-empty text of the first match, or `null`.
 */
function findFirstTextByClass(node, className) {
  if (!isElement(node)) return null;
  if (hasClass(node, className)) {
    const content = getTextContent(node).trim();
    return content.length === 0 ? null : content;
  }
  for (const child of node.children) {
    const nested = findFirstTextByClass(child, className);
    if (nested != null) return nested;
  }
  return null;
}
75
/**
 * Extracts search results from a DuckDuckGo Lite result page.
 *
 * The parser leans on as little page structure as possible:
 *
 *  -  A result begins at any `<tr>` containing an `a.result-link`.
 *  -  The snippet (`.result-snippet`) and display URL (`.link-text`) are
 *     looked up in that row and in the rows after it, stopping at the row
 *     where the next result begins.
 *
 * That makes it tolerant of small markup changes while never borrowing a
 * snippet that belongs to the following result.
 *
 * @param html DuckDuckGo Lite HTML.
 * @param options Parsing options (`maxResults`, default 10).
 * @returns Parsed search results, at most `maxResults` of them.
 * @since 0.1.0
 */
function parseDuckDuckGoLiteResults(html, options = {}) {
  const limit = options.maxResults ?? 10;
  if (limit <= 0) return [];
  const { parseDocument } = htmlparser2;
  const dom = parseDocument(html, {
    lowerCaseTags: true,
    lowerCaseAttributeNames: true
  });
  const rows = collectElementsByTagName(dom, "tr");
  const startsResult = (row) => findFirstAnchorWithClass(row, "result-link") != null;
  const parsed = [];
  for (const [start, row] of rows.entries()) {
    if (parsed.length >= limit) break;
    const anchor = findFirstAnchorWithClass(row, "result-link");
    if (anchor == null) continue;
    const title = getTextContent(anchor).trim();
    const href = anchor.attribs.href?.trim();
    if (title.length === 0 || href == null || href.length === 0) continue;
    // Fall back to the raw href when the redirect cannot be unwrapped.
    const url = unwrapDuckDuckGoRedirectUrl(href) ?? href;
    let snippet = findFirstTextByClass(row, "result-snippet");
    let displayUrl = findFirstTextByClass(row, "link-text");
    // Keep looking in the following rows until both are found or the next
    // result begins.
    for (let next = start + 1; next < rows.length && (snippet == null || displayUrl == null); next++) {
      const follower = rows[next];
      if (startsResult(follower)) break;
      snippet ??= findFirstTextByClass(follower, "result-snippet");
      displayUrl ??= findFirstTextByClass(follower, "link-text");
    }
    parsed.push({
      title,
      url,
      snippet: snippet ?? undefined,
      displayUrl: displayUrl ?? undefined
    });
  }
  return parsed;
}
125
/**
 * Passive context source that runs a web search through DuckDuckGo Lite.
 *
 * Only the result listing (title, URL, snippet) is returned; the pages the
 * results point to are not downloaded.  Pair it with {@link fetchWebPage}
 * when a particular result needs to be read in full.
 *
 * HTTP errors, aborts and other fetch failures are reported through the
 * returned `content`/`metadata` (with `success: false`) rather than thrown.
 *
 * @since 0.1.0
 */
const searchWeb = {
  name: "search-web",
  description: "Searches the web (DuckDuckGo Lite) and returns a list of results with titles, URLs, and snippets. Use this to quickly find relevant pages, then fetch a specific page separately if needed.",
  mode: "passive",
  parameters: zod.z.object({
    query: zod.z.string().min(1).describe("The search query keyword(s)"),
    maxResults: zod.z.number().int().positive().max(50).optional().describe("Maximum number of results to return (default: 10)"),
    region: zod.z.string().optional().describe("DuckDuckGo region (kl) parameter, e.g. 'kr-kr' or 'us-en'"),
    timeRange: zod.z.enum(["d", "w", "m", "y"]).optional().describe("Time range filter (df): d=day, w=week, m=month, y=year")
  }),
  async gather(params, options) {
    const { query, timeRange } = params;
    const maxResults = params.maxResults ?? 10;
    const endpoint = new URL("https://lite.duckduckgo.com/lite/");
    endpoint.searchParams.set("q", query);
    // A blank region is treated the same as no region at all.
    const region = params.region?.trim();
    if (region != null && region.length > 0) endpoint.searchParams.set("kl", region);
    if (timeRange != null) endpoint.searchParams.set("df", timeRange);
    logger.debug("Searching DuckDuckGo Lite: {url}", { url: endpoint.toString() });
    try {
      const response = await fetch(endpoint, {
        signal: options?.signal,
        headers: {
          "User-Agent": "Mozilla/5.0 (compatible; Vertana/0.1; +https://vertana.org)",
          Accept: "text/html,application/xhtml+xml"
        }
      });
      if (!response.ok) {
        return {
          content: `Failed to search the web. Status: ${response.status}`,
          metadata: {
            query,
            success: false,
            status: response.status
          }
        };
      }
      const html = await response.text();
      const results = parseDuckDuckGoLiteResults(html, { maxResults });
      return {
        content: formatSearchResults(query, results),
        metadata: {
          query,
          resultCount: results.length,
          urls: results.map((result) => result.url),
          success: true
        }
      };
    } catch (error) {
      const wasAborted = error instanceof Error && error.name === "AbortError";
      if (wasAborted) {
        return {
          content: "Search aborted.",
          metadata: {
            query,
            success: false,
            aborted: true
          }
        };
      }
      return {
        content: `Failed to search the web. Error: ${String(error)}`,
        metadata: {
          query,
          success: false
        }
      };
    }
  }
};
201
/**
 * Renders search results as a Markdown-style text listing.
 *
 * @param query The search query, echoed in the heading.
 * @param results Parsed results (`title`, `url`, optional `displayUrl` and
 *                `snippet`).
 * @returns A heading followed by one numbered section per result, sections
 *          separated by a blank line; or a "no results" sentence when
 *          `results` is empty.
 */
function formatSearchResults(query, results) {
  if (results.length === 0) return `No web search results found for: ${query}`;
  const lastIndex = results.length - 1;
  const sections = results.flatMap((entry, index) => {
    const section = [`## ${index + 1}. ${entry.title}`, `URL: ${entry.url}`];
    if (entry.displayUrl != null) section.push(`Display: ${entry.displayUrl}`);
    if (entry.snippet != null) section.push("", entry.snippet);
    // Blank separator between results, but not after the final one.
    if (index !== lastIndex) section.push("");
    return section;
  });
  return [`# Web search results: ${query}`, "", ...sections].join("\n");
}
219
+
220
+ //#endregion
221
+ exports.searchWeb = searchWeb;
@@ -0,0 +1,21 @@
1
+ import { PassiveContextSource } from "@vertana/core/context";
2
+
3
+ //#region src/search.d.ts
4
+ interface SearchWebParams {
5
+ readonly query: string;
6
+ readonly maxResults?: number;
7
+ readonly region?: string;
8
+ readonly timeRange?: "d" | "w" | "m" | "y";
9
+ }
10
+ /**
11
+ * A passive context source that performs a web search using DuckDuckGo Lite.
12
+ *
13
+ * This source returns a list of search results (title, URL, snippet) and does
14
+ * not fetch the target pages themselves. Combine with {@link fetchWebPage} if
15
+ * you want to retrieve a specific result in detail.
16
+ *
17
+ * @since 0.1.0
18
+ */
19
+ declare const searchWeb: PassiveContextSource<SearchWebParams>;
20
+ //#endregion
21
+ export { searchWeb };
@@ -0,0 +1,21 @@
1
+ import { PassiveContextSource } from "@vertana/core/context";
2
+
3
+ //#region src/search.d.ts
4
+ interface SearchWebParams {
5
+ readonly query: string;
6
+ readonly maxResults?: number;
7
+ readonly region?: string;
8
+ readonly timeRange?: "d" | "w" | "m" | "y";
9
+ }
10
+ /**
11
+ * A passive context source that performs a web search using DuckDuckGo Lite.
12
+ *
13
+ * This source returns a list of search results (title, URL, snippet) and does
14
+ * not fetch the target pages themselves. Combine with {@link fetchWebPage} if
15
+ * you want to retrieve a specific result in detail.
16
+ *
17
+ * @since 0.1.0
18
+ */
19
+ declare const searchWeb: PassiveContextSource<SearchWebParams>;
20
+ //#endregion
21
+ export { searchWeb };
package/dist/search.js ADDED
@@ -0,0 +1,221 @@
1
+ import { getLogger } from "@logtape/logtape";
2
+ import { z } from "zod";
3
+ import { parseDocument } from "htmlparser2";
4
+
5
+ //#region src/search.ts
6
+ const logger = getLogger([
7
+ "vertana",
8
+ "context-web",
9
+ "search"
10
+ ]);
11
/**
 * Resolves a DuckDuckGo redirect link (`…duckduckgo.com/l/?uddg=<target>`) to
 * the target URL it wraps.
 *
 * @param href Raw `href` attribute value.  Protocol-relative links
 *             (`//duckduckgo.com/…`) are interpreted as `https:`.
 * @returns The wrapped target URL for a redirect link; the trimmed input
 *          unchanged when it is an absolute URL that is not a DuckDuckGo
 *          `/l/` redirect; or `null` when the input is not an absolute URL
 *          or the redirect carries no valid absolute target.
 */
function unwrapDuckDuckGoRedirectUrl(href) {
  const trimmed = href.trim();
  const normalized = trimmed.startsWith("//") ? `https:${trimmed}` : trimmed;
  let parsed;
  try {
    parsed = new URL(normalized);
  } catch {
    return null;
  }
  if (!/(^|\.)duckduckgo\.com$/i.test(parsed.hostname)) return trimmed;
  if (parsed.pathname !== "/l/") return trimmed;
  // URLSearchParams.get() has already percent-decoded the value once.
  // Decoding it a second time would rewrite escapes that belong to the target
  // itself (e.g. `%20` or `%26`) and throws on a literal `%`.
  const target = parsed.searchParams.get("uddg");
  if (target == null || target.length === 0) return null;
  try {
    // Validation only: reject targets that are not absolute URLs.
    new URL(target);
  } catch {
    return null;
  }
  return target;
}
32
/**
 * Tells whether a parsed DOM node is a regular tag element.
 *
 * @param node A node from the parsed document.
 * @returns `true` when the node's `type` is `"tag"`.
 */
function isElement(node) {
  const { type } = node;
  return type === "tag";
}
35
/**
 * Concatenates the text found in a node and all of its descendants.
 *
 * @param node A node from the parsed document.
 * @returns The `data` of a text node, the joined text of a tag element's
 *          children, or an empty string for any other node type.
 */
function getTextContent(node) {
  if (node.type === "text") return node.data;
  if (!isElement(node)) return "";
  const parts = [];
  for (const child of node.children) parts.push(getTextContent(child));
  return parts.join("");
}
40
/**
 * Checks whether an element's `class` attribute lists the given class name.
 *
 * @param element An element exposing an `attribs` map.
 * @param className The class name to look for (exact, whitespace-separated token).
 * @returns `true` when the class is present; `false` when it is absent or
 *          the element has no `class` attribute.
 */
function hasClass(element, className) {
  const classAttr = element.attribs.class;
  return classAttr != null && classAttr.split(/\s+/).includes(className);
}
45
/**
 * Gathers every tag element with the given name, in document order.
 *
 * Only tag elements are descended into; other node types (and anything
 * nested inside them) are skipped.
 *
 * @param doc The parsed document (anything exposing `children`).
 * @param tagName The tag name to match.
 * @returns The matching elements, parents before their descendants.
 */
function collectElementsByTagName(doc, tagName) {
  const matches = [];
  // Explicit stack; children are pushed in reverse so they pop in document order.
  const pending = [...doc.children].reverse();
  while (pending.length > 0) {
    const node = pending.pop();
    if (!isElement(node)) continue;
    if (node.name === tagName) matches.push(node);
    for (let i = node.children.length - 1; i >= 0; i--) pending.push(node.children[i]);
  }
  return matches;
}
56
/**
 * Depth-first search for the first `<a>` element carrying the given class.
 *
 * @param node The node to start from (it is itself a candidate).
 * @param className The class the anchor must have.
 * @returns The first matching anchor element, or `null` when there is none.
 */
function findFirstAnchorWithClass(node, className) {
  if (!isElement(node)) return null;
  if (node.name === "a" && hasClass(node, className)) return node;
  for (const child of node.children) {
    const match = findFirstAnchorWithClass(child, className);
    if (match != null) return match;
  }
  return null;
}
64
/**
 * Depth-first search for the first element carrying the given class and
 * returns its trimmed text.
 *
 * An element that has the class ends the search of its own subtree: if its
 * text is empty the result for that subtree is `null`, and its descendants
 * are not examined.
 *
 * @param node The node to start from (it is itself a candidate).
 * @param className The class to look for.
 * @returns The trimmed, non-empty text of the first match, or `null`.
 */
function findFirstTextByClass(node, className) {
  if (!isElement(node)) return null;
  if (hasClass(node, className)) {
    const content = getTextContent(node).trim();
    return content.length === 0 ? null : content;
  }
  for (const child of node.children) {
    const nested = findFirstTextByClass(child, className);
    if (nested != null) return nested;
  }
  return null;
}
75
/**
 * Extracts search results from a DuckDuckGo Lite result page.
 *
 * The parser leans on as little page structure as possible:
 *
 *  -  A result begins at any `<tr>` containing an `a.result-link`.
 *  -  The snippet (`.result-snippet`) and display URL (`.link-text`) are
 *     looked up in that row and in the rows after it, stopping at the row
 *     where the next result begins.
 *
 * That makes it tolerant of small markup changes while never borrowing a
 * snippet that belongs to the following result.
 *
 * @param html DuckDuckGo Lite HTML.
 * @param options Parsing options (`maxResults`, default 10).
 * @returns Parsed search results, at most `maxResults` of them.
 * @since 0.1.0
 */
function parseDuckDuckGoLiteResults(html, options = {}) {
  const limit = options.maxResults ?? 10;
  if (limit <= 0) return [];
  const dom = parseDocument(html, {
    lowerCaseTags: true,
    lowerCaseAttributeNames: true
  });
  const rows = collectElementsByTagName(dom, "tr");
  const startsResult = (row) => findFirstAnchorWithClass(row, "result-link") != null;
  const parsed = [];
  for (const [start, row] of rows.entries()) {
    if (parsed.length >= limit) break;
    const anchor = findFirstAnchorWithClass(row, "result-link");
    if (anchor == null) continue;
    const title = getTextContent(anchor).trim();
    const href = anchor.attribs.href?.trim();
    if (title.length === 0 || href == null || href.length === 0) continue;
    // Fall back to the raw href when the redirect cannot be unwrapped.
    const url = unwrapDuckDuckGoRedirectUrl(href) ?? href;
    let snippet = findFirstTextByClass(row, "result-snippet");
    let displayUrl = findFirstTextByClass(row, "link-text");
    // Keep looking in the following rows until both are found or the next
    // result begins.
    for (let next = start + 1; next < rows.length && (snippet == null || displayUrl == null); next++) {
      const follower = rows[next];
      if (startsResult(follower)) break;
      snippet ??= findFirstTextByClass(follower, "result-snippet");
      displayUrl ??= findFirstTextByClass(follower, "link-text");
    }
    parsed.push({
      title,
      url,
      snippet: snippet ?? undefined,
      displayUrl: displayUrl ?? undefined
    });
  }
  return parsed;
}
125
/**
 * Passive context source that runs a web search through DuckDuckGo Lite.
 *
 * Only the result listing (title, URL, snippet) is returned; the pages the
 * results point to are not downloaded.  Pair it with {@link fetchWebPage}
 * when a particular result needs to be read in full.
 *
 * HTTP errors, aborts and other fetch failures are reported through the
 * returned `content`/`metadata` (with `success: false`) rather than thrown.
 *
 * @since 0.1.0
 */
const searchWeb = {
  name: "search-web",
  description: "Searches the web (DuckDuckGo Lite) and returns a list of results with titles, URLs, and snippets. Use this to quickly find relevant pages, then fetch a specific page separately if needed.",
  mode: "passive",
  parameters: z.object({
    query: z.string().min(1).describe("The search query keyword(s)"),
    maxResults: z.number().int().positive().max(50).optional().describe("Maximum number of results to return (default: 10)"),
    region: z.string().optional().describe("DuckDuckGo region (kl) parameter, e.g. 'kr-kr' or 'us-en'"),
    timeRange: z.enum(["d", "w", "m", "y"]).optional().describe("Time range filter (df): d=day, w=week, m=month, y=year")
  }),
  async gather(params, options) {
    const { query, timeRange } = params;
    const maxResults = params.maxResults ?? 10;
    const endpoint = new URL("https://lite.duckduckgo.com/lite/");
    endpoint.searchParams.set("q", query);
    // A blank region is treated the same as no region at all.
    const region = params.region?.trim();
    if (region != null && region.length > 0) endpoint.searchParams.set("kl", region);
    if (timeRange != null) endpoint.searchParams.set("df", timeRange);
    logger.debug("Searching DuckDuckGo Lite: {url}", { url: endpoint.toString() });
    try {
      const response = await fetch(endpoint, {
        signal: options?.signal,
        headers: {
          "User-Agent": "Mozilla/5.0 (compatible; Vertana/0.1; +https://vertana.org)",
          Accept: "text/html,application/xhtml+xml"
        }
      });
      if (!response.ok) {
        return {
          content: `Failed to search the web. Status: ${response.status}`,
          metadata: {
            query,
            success: false,
            status: response.status
          }
        };
      }
      const html = await response.text();
      const results = parseDuckDuckGoLiteResults(html, { maxResults });
      return {
        content: formatSearchResults(query, results),
        metadata: {
          query,
          resultCount: results.length,
          urls: results.map((result) => result.url),
          success: true
        }
      };
    } catch (error) {
      const wasAborted = error instanceof Error && error.name === "AbortError";
      if (wasAborted) {
        return {
          content: "Search aborted.",
          metadata: {
            query,
            success: false,
            aborted: true
          }
        };
      }
      return {
        content: `Failed to search the web. Error: ${String(error)}`,
        metadata: {
          query,
          success: false
        }
      };
    }
  }
};
201
/**
 * Renders search results as a Markdown-style text listing.
 *
 * @param query The search query, echoed in the heading.
 * @param results Parsed results (`title`, `url`, optional `displayUrl` and
 *                `snippet`).
 * @returns A heading followed by one numbered section per result, sections
 *          separated by a blank line; or a "no results" sentence when
 *          `results` is empty.
 */
function formatSearchResults(query, results) {
  if (results.length === 0) return `No web search results found for: ${query}`;
  const lastIndex = results.length - 1;
  const sections = results.flatMap((entry, index) => {
    const section = [`## ${index + 1}. ${entry.title}`, `URL: ${entry.url}`];
    if (entry.displayUrl != null) section.push(`Display: ${entry.displayUrl}`);
    if (entry.snippet != null) section.push("", entry.snippet);
    // Blank separator between results, but not after the final one.
    if (index !== lastIndex) section.push("");
    return section;
  });
  return [`# Web search results: ${query}`, "", ...sections].join("\n");
}
219
+
220
+ //#endregion
221
+ export { searchWeb };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vertana/context-web",
3
- "version": "0.1.0-dev.11+812bc132",
3
+ "version": "0.1.0",
4
4
  "description": "Web context gathering for Vertana - fetch and extract content from linked pages",
5
5
  "keywords": [
6
6
  "LLM",