npm - @vertana/context-web - Versions diffs - 0.1.0-dev.11 → 0.1.0 - Mend

@vertana/context-web 0.1.0-dev.11 → 0.1.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (10) hide show

package/README.md CHANGED Viewed

@@ -16,11 +16,13 @@ linked pages to provide additional context for translation.
 Features
 --------
- -  **fetchWebPage**: A passive context source that fetches a single URL
+ -  `fetchWebPage`: A passive context source that fetches a single URL
     and extracts the main content using Mozilla's Readability algorithm.
- -  **fetchLinkedPages**: A required context source factory that extracts
+ -  `fetchLinkedPages`: A required context source factory that extracts
     all links from the source text and fetches their content.
- -  **extractLinks**: A utility function to extract URLs from text
+ -  `searchWeb`: A passive context source that performs a web search
+    (DuckDuckGo Lite) and returns a list of results (title, URL, snippet).
+ -  `extractLinks`: A utility function to extract URLs from text
     in various formats (plain text, Markdown, HTML).
@@ -51,7 +53,7 @@ Usage
 ~~~~ typescript
 import { translate } from "@vertana/facade";
-import { fetchLinkedPages, fetchWebPage } from "@vertana/context-web";
+import { fetchLinkedPages, fetchWebPage, searchWeb } from "@vertana/context-web";
 import { openai } from "@ai-sdk/openai";
 const text = `
@@ -63,7 +65,8 @@ const result = await translate(openai("gpt-4o"), "ko", text, {
   contextSources: [
     // Automatically fetch all links in the text
     fetchLinkedPages({ text, mediaType: "text/plain" }),
-    // Allow LLM to fetch additional URLs on demand
+    // Allow LLM to search the web and fetch URLs on demand
+    searchWeb,
     fetchWebPage,
   ],
 });

package/dist/index.cjs CHANGED Viewed

@@ -1,7 +1,9 @@
 const require_extract_links = require('./extract-links.cjs');
 const require_fetch = require('./fetch.cjs');
+const require_search = require('./search.cjs');
 exports.extractContent = require_fetch.extractContent;
 exports.extractLinks = require_extract_links.extractLinks;
 exports.fetchLinkedPages = require_fetch.fetchLinkedPages;
-exports.fetchWebPage = require_fetch.fetchWebPage;
+exports.fetchWebPage = require_fetch.fetchWebPage;
+exports.searchWeb = require_search.searchWeb;

package/dist/index.d.cts CHANGED Viewed

@@ -1,3 +1,4 @@
 import { MediaType, extractLinks } from "./extract-links.cjs";
 import { ExtractedContent, FetchLinkedPagesOptions, extractContent, fetchLinkedPages, fetchWebPage } from "./fetch.cjs";
-export { type ExtractedContent, type FetchLinkedPagesOptions, type MediaType, extractContent, extractLinks, fetchLinkedPages, fetchWebPage };
+import { searchWeb } from "./search.cjs";
+export { type ExtractedContent, type FetchLinkedPagesOptions, type MediaType, extractContent, extractLinks, fetchLinkedPages, fetchWebPage, searchWeb };

package/dist/index.d.ts CHANGED Viewed

@@ -1,3 +1,4 @@
 import { MediaType, extractLinks } from "./extract-links.js";
 import { ExtractedContent, FetchLinkedPagesOptions, extractContent, fetchLinkedPages, fetchWebPage } from "./fetch.js";
-export { type ExtractedContent, type FetchLinkedPagesOptions, type MediaType, extractContent, extractLinks, fetchLinkedPages, fetchWebPage };
+import { searchWeb } from "./search.js";
+export { type ExtractedContent, type FetchLinkedPagesOptions, type MediaType, extractContent, extractLinks, fetchLinkedPages, fetchWebPage, searchWeb };

package/dist/index.js CHANGED Viewed

@@ -1,4 +1,5 @@
 import { extractLinks } from "./extract-links.js";
 import { extractContent, fetchLinkedPages, fetchWebPage } from "./fetch.js";
+import { searchWeb } from "./search.js";
-export { extractContent, extractLinks, fetchLinkedPages, fetchWebPage };
+export { extractContent, extractLinks, fetchLinkedPages, fetchWebPage, searchWeb };

package/dist/search.cjs ADDED Viewed

@@ -0,0 +1,221 @@
+let _logtape_logtape = require("@logtape/logtape");
+let zod = require("zod");
+let htmlparser2 = require("htmlparser2");
+//#region src/search.ts
+const logger = (0, _logtape_logtape.getLogger)([ // module-level logger; its category path is the array below
+	"vertana",
+	"context-web",
+	"search"
+]);
+function unwrapDuckDuckGoRedirectUrl(href) {
+	const trimmed = href.trim();
+	const normalized = trimmed.startsWith("//") ? `https:${trimmed}` : trimmed;
+	let parsed;
+	try {
+		parsed = new URL(normalized);
+	} catch {
+		return null;
+	}
+	if (!/(^|\.)duckduckgo\.com$/i.test(parsed.hostname)) return trimmed;
+	if (parsed.pathname !== "/l/") return trimmed;
+	const raw = parsed.searchParams.get("uddg");
+	if (raw == null || raw.length === 0) return null;
+	try {
+		const decoded = decodeURIComponent(raw);
+		new URL(decoded);
+		return decoded;
+	} catch {
+		return null;
+	}
+}
+function isElement(node) { // True when `node.type` is "tag"; every other node type is treated as a non-element here.
+	return node.type === "tag";
+}
+function getTextContent(node) { // Concatenates the `data` of every text node at or below `node`.
+	if (node.type === "text") return node.data;
+	if (isElement(node)) return node.children.map(getTextContent).join(""); // recurse through children in order
+	return ""; // a node that is neither text nor a tag contributes nothing
+}
+function hasClass(element, className) { // Tests whether `className` is one of the whitespace-separated tokens of the `class` attribute.
+	const classes = element.attribs.class;
+	if (classes == null) return false; // element has no `class` attribute
+	return classes.split(/\s+/).includes(className); // whole-token match, not a substring match
+}
+function collectElementsByTagName(doc, tagName) { // Returns every element below `doc` whose `name` equals `tagName`, depth-first.
+	const results = [];
+	function visit(node) {
+		if (isElement(node)) { // only "tag" nodes are inspected or descended into
+			if (node.name === tagName) results.push(node); // a match is recorded before its descendants
+			for (const child of node.children) visit(child); // matches nested inside a match are collected too
+		}
+	}
+	for (const child of doc.children) visit(child); // `doc` itself is never tested, only its descendants
+	return results;
+}
+function findFirstAnchorWithClass(node, className) { // Depth-first search for the first `<a>` carrying `className`; `node` itself is a candidate.
+	if (isElement(node) && node.name === "a" && hasClass(node, className)) return node;
+	if (isElement(node)) for (const child of node.children) {
+		const found = findFirstAnchorWithClass(child, className);
+		if (found != null) return found; // stop at the first match
+	}
+	return null; // no matching anchor in this subtree
+}
+function findFirstTextByClass(node, className) { // Trimmed text of the first element carrying `className` whose text is non-blank, else null.
+	if (isElement(node) && hasClass(node, className)) {
+		const text = getTextContent(node).trim();
+		return text.length > 0 ? text : null; // a blank match yields null, so the caller's loop moves on to later siblings
+	}
+	if (isElement(node)) for (const child of node.children) {
+		const found = findFirstTextByClass(child, className);
+		if (found != null) return found;
+	}
+	return null;
+}
+/**
+* Parses DuckDuckGo Lite search result HTML.
+*
+* This parser intentionally relies on minimal semantics:
+* - Each result starts at a `<tr>` that contains an `a.result-link`.
+* - Additional data (snippet, display URL) is searched within subsequent `<tr>`
+*   siblings until the next result starts.
+*
+* This keeps the parser resistant to minor structure changes while avoiding
+* accidentally attaching a snippet from the next result.
+*
+* Rows whose anchor has an empty title or a missing/empty `href` are skipped.
+* The snippet is taken from the first non-blank `.result-snippet` element and
+* the display URL from the first non-blank `.link-text` element.
+*
+* @param html DuckDuckGo Lite HTML.
+* @param options Parsing options. `options.maxResults` caps the number of
+*   results (default: 10); a value of 0 or less yields an empty array.
+* @returns Parsed search results in the order their rows were collected. Each
+*   entry has `title` and `url`, plus `snippet` and `displayUrl`, which are
+*   `undefined` when not found.
+* @since 0.1.0
+*/
+function parseDuckDuckGoLiteResults(html, options = {}) {
+	const maxResults = options.maxResults ?? 10;
+	if (maxResults <= 0) return [];
+	const rows = collectElementsByTagName((0, htmlparser2.parseDocument)(html, {
+		lowerCaseTags: true,
+		lowerCaseAttributeNames: true
+	}), "tr");
+	const results = [];
+	for (let rowIndex = 0; rowIndex < rows.length; rowIndex++) {
+		const row = rows[rowIndex];
+		const anchor = findFirstAnchorWithClass(row, "result-link");
+		if (anchor == null) continue; // this row does not start a result
+		const title = getTextContent(anchor).trim();
+		const href = anchor.attribs.href?.trim();
+		if (title.length === 0 || href == null || href.length === 0) continue;
+		const url = unwrapDuckDuckGoRedirectUrl(href) ?? href; // keep the raw href when it cannot be unwrapped
+		let snippet = null;
+		let displayUrl = null;
+		for (let j = rowIndex; j < rows.length; j++) { // scan this row, then the rows after it
+			if (j !== rowIndex && findFirstAnchorWithClass(rows[j], "result-link") != null) break; // the next result starts here
+			snippet ??= findFirstTextByClass(rows[j], "result-snippet");
+			displayUrl ??= findFirstTextByClass(rows[j], "link-text");
+			if (snippet != null && displayUrl != null) break; // both found, nothing left to look for
+		}
+		results.push({
+			title,
+			url,
+			snippet: snippet ?? void 0,
+			displayUrl: displayUrl ?? void 0
+		});
+		if (results.length >= maxResults) break;
+	}
+	return results;
+}
+/**
+* A passive context source that performs a web search using DuckDuckGo Lite.
+*
+* This source returns a list of search results (title, URL, snippet) and does
+* not fetch the target pages themselves. Combine with {@link fetchWebPage} if
+* you want to retrieve a specific result in detail.
+*
+* `gather` sends one request to `https://lite.duckduckgo.com/lite/` and
+* resolves to `{ content, metadata }`. A non-OK HTTP status, an abort, or any
+* other error thrown while fetching or parsing is reported through
+* `metadata.success === false` and a message in `content`.
+*
+* @since 0.1.0
+*/
+const searchWeb = {
+	name: "search-web",
+	description: "Searches the web (DuckDuckGo Lite) and returns a list of results with titles, URLs, and snippets. Use this to quickly find relevant pages, then fetch a specific page separately if needed.",
+	mode: "passive",
+	parameters: zod.z.object({
+		query: zod.z.string().min(1).describe("The search query keyword(s)"),
+		maxResults: zod.z.number().int().positive().max(50).optional().describe("Maximum number of results to return (default: 10)"),
+		region: zod.z.string().optional().describe("DuckDuckGo region (kl) parameter, e.g. 'kr-kr' or 'us-en'"),
+		timeRange: zod.z.enum([
+			"d",
+			"w",
+			"m",
+			"y"
+		]).optional().describe("Time range filter (df): d=day, w=week, m=month, y=year")
+	}),
+	async gather(params, options) {
+		const maxResults = params.maxResults ?? 10;
+		const url = new URL("https://lite.duckduckgo.com/lite/");
+		url.searchParams.set("q", params.query);
+		if (params.region != null && params.region.trim().length > 0) url.searchParams.set("kl", params.region.trim()); // a blank region is ignored
+		if (params.timeRange != null) url.searchParams.set("df", params.timeRange);
+		logger.debug("Searching DuckDuckGo Lite: {url}", { url: url.toString() });
+		try {
+			const response = await fetch(url, {
+				signal: options?.signal,
+				headers: {
+					"User-Agent": "Mozilla/5.0 (compatible; Vertana/0.1; +https://vertana.org)",
+					Accept: "text/html,application/xhtml+xml"
+				}
+			});
+			if (!response.ok) return { // HTTP failure is reported as a result, not thrown
+				content: `Failed to search the web. Status: ${response.status}`,
+				metadata: {
+					query: params.query,
+					success: false,
+					status: response.status
+				}
+			};
+			const results = parseDuckDuckGoLiteResults(await response.text(), { maxResults });
+			return {
+				content: formatSearchResults(params.query, results),
+				metadata: {
+					query: params.query,
+					resultCount: results.length,
+					urls: results.map((r) => r.url),
+					success: true
+				}
+			};
+		} catch (error) {
+			if (error instanceof Error && error.name === "AbortError") return { // only errors named "AbortError" are reported as aborts
+				content: "Search aborted.",
+				metadata: {
+					query: params.query,
+					success: false,
+					aborted: true
+				}
+			};
+			return { // any other thrown value is stringified into the message
+				content: `Failed to search the web. Error: ${String(error)}`,
+				metadata: {
+					query: params.query,
+					success: false
+				}
+			};
+		}
+	}
+};
+function formatSearchResults(query, results) { // Renders results as text: one "#" heading, then a "##" section per result.
+	if (results.length === 0) return `No web search results found for: ${query}`; // empty list gets a one-line message, no heading
+	const lines = [];
+	lines.push(`# Web search results: ${query}`);
+	lines.push("");
+	for (let i = 0; i < results.length; i++) {
+		const result = results[i];
+		lines.push(`## ${i + 1}. ${result.title}`); // numbering is 1-based
+		lines.push(`URL: ${result.url}`);
+		if (result.displayUrl != null) lines.push(`Display: ${result.displayUrl}`);
+		if (result.snippet != null) {
+			lines.push(""); // blank line between the URL lines and the snippet
+			lines.push(result.snippet);
+		}
+		if (i !== results.length - 1) lines.push(""); // blank separator between results, none after the last
+	}
+	return lines.join("\n");
+}
+//#endregion
+exports.searchWeb = searchWeb;

package/dist/search.d.cts ADDED Viewed

@@ -0,0 +1,21 @@
+import { PassiveContextSource } from "@vertana/core/context";
+//#region src/search.d.ts
+interface SearchWebParams { // Parameter object type used by the `searchWeb` declaration in this file.
+  readonly query: string; // the search query keyword(s)
+  readonly maxResults?: number; // optional maximum number of results to return
+  readonly region?: string; // optional DuckDuckGo region, e.g. "kr-kr" or "us-en"
+  readonly timeRange?: "d" | "w" | "m" | "y"; // optional time range filter: d=day, w=week, m=month, y=year
+}
+/**
+ * A passive context source that performs a web search using DuckDuckGo Lite.
+ *
+ * This source returns a list of search results (title, URL, snippet) and does
+ * not fetch the target pages themselves. Combine with {@link fetchWebPage} if
+ * you want to retrieve a specific result in detail.
+ *
+ * Its parameters are typed by {@link SearchWebParams}.
+ *
+ * @since 0.1.0
+ */
+declare const searchWeb: PassiveContextSource<SearchWebParams>;
+//#endregion
+export { searchWeb };

package/dist/search.d.ts ADDED Viewed

@@ -0,0 +1,21 @@
+import { PassiveContextSource } from "@vertana/core/context";
+//#region src/search.d.ts
+interface SearchWebParams { // Parameter object type used by the `searchWeb` declaration in this file.
+  readonly query: string; // the search query keyword(s)
+  readonly maxResults?: number; // optional maximum number of results to return
+  readonly region?: string; // optional DuckDuckGo region, e.g. "kr-kr" or "us-en"
+  readonly timeRange?: "d" | "w" | "m" | "y"; // optional time range filter: d=day, w=week, m=month, y=year
+}
+/**
+ * A passive context source that performs a web search using DuckDuckGo Lite.
+ *
+ * This source returns a list of search results (title, URL, snippet) and does
+ * not fetch the target pages themselves. Combine with {@link fetchWebPage} if
+ * you want to retrieve a specific result in detail.
+ *
+ * Its parameters are typed by {@link SearchWebParams}.
+ *
+ * @since 0.1.0
+ */
+declare const searchWeb: PassiveContextSource<SearchWebParams>;
+//#endregion
+export { searchWeb };

package/dist/search.js ADDED Viewed

@@ -0,0 +1,221 @@
+import { getLogger } from "@logtape/logtape";
+import { z } from "zod";
+import { parseDocument } from "htmlparser2";
+//#region src/search.ts
+const logger = getLogger([ // module-level logger; its category path is the array below
+	"vertana",
+	"context-web",
+	"search"
+]);
+function unwrapDuckDuckGoRedirectUrl(href) {
+	const trimmed = href.trim();
+	const normalized = trimmed.startsWith("//") ? `https:${trimmed}` : trimmed;
+	let parsed;
+	try {
+		parsed = new URL(normalized);
+	} catch {
+		return null;
+	}
+	if (!/(^|\.)duckduckgo\.com$/i.test(parsed.hostname)) return trimmed;
+	if (parsed.pathname !== "/l/") return trimmed;
+	const raw = parsed.searchParams.get("uddg");
+	if (raw == null || raw.length === 0) return null;
+	try {
+		const decoded = decodeURIComponent(raw);
+		new URL(decoded);
+		return decoded;
+	} catch {
+		return null;
+	}
+}
+function isElement(node) { // True when `node.type` is "tag"; every other node type is treated as a non-element here.
+	return node.type === "tag";
+}
+function getTextContent(node) { // Concatenates the `data` of every text node at or below `node`.
+	if (node.type === "text") return node.data;
+	if (isElement(node)) return node.children.map(getTextContent).join(""); // recurse through children in order
+	return ""; // a node that is neither text nor a tag contributes nothing
+}
+function hasClass(element, className) { // Tests whether `className` is one of the whitespace-separated tokens of the `class` attribute.
+	const classes = element.attribs.class;
+	if (classes == null) return false; // element has no `class` attribute
+	return classes.split(/\s+/).includes(className); // whole-token match, not a substring match
+}
+function collectElementsByTagName(doc, tagName) { // Returns every element below `doc` whose `name` equals `tagName`, depth-first.
+	const results = [];
+	function visit(node) {
+		if (isElement(node)) { // only "tag" nodes are inspected or descended into
+			if (node.name === tagName) results.push(node); // a match is recorded before its descendants
+			for (const child of node.children) visit(child); // matches nested inside a match are collected too
+		}
+	}
+	for (const child of doc.children) visit(child); // `doc` itself is never tested, only its descendants
+	return results;
+}
+function findFirstAnchorWithClass(node, className) { // Depth-first search for the first `<a>` carrying `className`; `node` itself is a candidate.
+	if (isElement(node) && node.name === "a" && hasClass(node, className)) return node;
+	if (isElement(node)) for (const child of node.children) {
+		const found = findFirstAnchorWithClass(child, className);
+		if (found != null) return found; // stop at the first match
+	}
+	return null; // no matching anchor in this subtree
+}
+function findFirstTextByClass(node, className) { // Trimmed text of the first element carrying `className` whose text is non-blank, else null.
+	if (isElement(node) && hasClass(node, className)) {
+		const text = getTextContent(node).trim();
+		return text.length > 0 ? text : null; // a blank match yields null, so the caller's loop moves on to later siblings
+	}
+	if (isElement(node)) for (const child of node.children) {
+		const found = findFirstTextByClass(child, className);
+		if (found != null) return found;
+	}
+	return null;
+}
+/**
+* Parses DuckDuckGo Lite search result HTML.
+*
+* This parser intentionally relies on minimal semantics:
+* - Each result starts at a `<tr>` that contains an `a.result-link`.
+* - Additional data (snippet, display URL) is searched within subsequent `<tr>`
+*   siblings until the next result starts.
+*
+* This keeps the parser resistant to minor structure changes while avoiding
+* accidentally attaching a snippet from the next result.
+*
+* Rows whose anchor has an empty title or a missing/empty `href` are skipped.
+* The snippet is taken from the first non-blank `.result-snippet` element and
+* the display URL from the first non-blank `.link-text` element.
+*
+* @param html DuckDuckGo Lite HTML.
+* @param options Parsing options. `options.maxResults` caps the number of
+*   results (default: 10); a value of 0 or less yields an empty array.
+* @returns Parsed search results in the order their rows were collected. Each
+*   entry has `title` and `url`, plus `snippet` and `displayUrl`, which are
+*   `undefined` when not found.
+* @since 0.1.0
+*/
+function parseDuckDuckGoLiteResults(html, options = {}) {
+	const maxResults = options.maxResults ?? 10;
+	if (maxResults <= 0) return [];
+	const rows = collectElementsByTagName(parseDocument(html, {
+		lowerCaseTags: true,
+		lowerCaseAttributeNames: true
+	}), "tr");
+	const results = [];
+	for (let rowIndex = 0; rowIndex < rows.length; rowIndex++) {
+		const row = rows[rowIndex];
+		const anchor = findFirstAnchorWithClass(row, "result-link");
+		if (anchor == null) continue; // this row does not start a result
+		const title = getTextContent(anchor).trim();
+		const href = anchor.attribs.href?.trim();
+		if (title.length === 0 || href == null || href.length === 0) continue;
+		const url = unwrapDuckDuckGoRedirectUrl(href) ?? href; // keep the raw href when it cannot be unwrapped
+		let snippet = null;
+		let displayUrl = null;
+		for (let j = rowIndex; j < rows.length; j++) { // scan this row, then the rows after it
+			if (j !== rowIndex && findFirstAnchorWithClass(rows[j], "result-link") != null) break; // the next result starts here
+			snippet ??= findFirstTextByClass(rows[j], "result-snippet");
+			displayUrl ??= findFirstTextByClass(rows[j], "link-text");
+			if (snippet != null && displayUrl != null) break; // both found, nothing left to look for
+		}
+		results.push({
+			title,
+			url,
+			snippet: snippet ?? void 0,
+			displayUrl: displayUrl ?? void 0
+		});
+		if (results.length >= maxResults) break;
+	}
+	return results;
+}
+/**
+* A passive context source that performs a web search using DuckDuckGo Lite.
+*
+* This source returns a list of search results (title, URL, snippet) and does
+* not fetch the target pages themselves. Combine with {@link fetchWebPage} if
+* you want to retrieve a specific result in detail.
+*
+* `gather` sends one request to `https://lite.duckduckgo.com/lite/` and
+* resolves to `{ content, metadata }`. A non-OK HTTP status, an abort, or any
+* other error thrown while fetching or parsing is reported through
+* `metadata.success === false` and a message in `content`.
+*
+* @since 0.1.0
+*/
+const searchWeb = {
+	name: "search-web",
+	description: "Searches the web (DuckDuckGo Lite) and returns a list of results with titles, URLs, and snippets. Use this to quickly find relevant pages, then fetch a specific page separately if needed.",
+	mode: "passive",
+	parameters: z.object({
+		query: z.string().min(1).describe("The search query keyword(s)"),
+		maxResults: z.number().int().positive().max(50).optional().describe("Maximum number of results to return (default: 10)"),
+		region: z.string().optional().describe("DuckDuckGo region (kl) parameter, e.g. 'kr-kr' or 'us-en'"),
+		timeRange: z.enum([
+			"d",
+			"w",
+			"m",
+			"y"
+		]).optional().describe("Time range filter (df): d=day, w=week, m=month, y=year")
+	}),
+	async gather(params, options) {
+		const maxResults = params.maxResults ?? 10;
+		const url = new URL("https://lite.duckduckgo.com/lite/");
+		url.searchParams.set("q", params.query);
+		if (params.region != null && params.region.trim().length > 0) url.searchParams.set("kl", params.region.trim()); // a blank region is ignored
+		if (params.timeRange != null) url.searchParams.set("df", params.timeRange);
+		logger.debug("Searching DuckDuckGo Lite: {url}", { url: url.toString() });
+		try {
+			const response = await fetch(url, {
+				signal: options?.signal,
+				headers: {
+					"User-Agent": "Mozilla/5.0 (compatible; Vertana/0.1; +https://vertana.org)",
+					Accept: "text/html,application/xhtml+xml"
+				}
+			});
+			if (!response.ok) return { // HTTP failure is reported as a result, not thrown
+				content: `Failed to search the web. Status: ${response.status}`,
+				metadata: {
+					query: params.query,
+					success: false,
+					status: response.status
+				}
+			};
+			const results = parseDuckDuckGoLiteResults(await response.text(), { maxResults });
+			return {
+				content: formatSearchResults(params.query, results),
+				metadata: {
+					query: params.query,
+					resultCount: results.length,
+					urls: results.map((r) => r.url),
+					success: true
+				}
+			};
+		} catch (error) {
+			if (error instanceof Error && error.name === "AbortError") return { // only errors named "AbortError" are reported as aborts
+				content: "Search aborted.",
+				metadata: {
+					query: params.query,
+					success: false,
+					aborted: true
+				}
+			};
+			return { // any other thrown value is stringified into the message
+				content: `Failed to search the web. Error: ${String(error)}`,
+				metadata: {
+					query: params.query,
+					success: false
+				}
+			};
+		}
+	}
+};
+function formatSearchResults(query, results) { // Renders results as text: one "#" heading, then a "##" section per result.
+	if (results.length === 0) return `No web search results found for: ${query}`; // empty list gets a one-line message, no heading
+	const lines = [];
+	lines.push(`# Web search results: ${query}`);
+	lines.push("");
+	for (let i = 0; i < results.length; i++) {
+		const result = results[i];
+		lines.push(`## ${i + 1}. ${result.title}`); // numbering is 1-based
+		lines.push(`URL: ${result.url}`);
+		if (result.displayUrl != null) lines.push(`Display: ${result.displayUrl}`);
+		if (result.snippet != null) {
+			lines.push(""); // blank line between the URL lines and the snippet
+			lines.push(result.snippet);
+		}
+		if (i !== results.length - 1) lines.push(""); // blank separator between results, none after the last
+	}
+	return lines.join("\n");
+}
+//#endregion
+export { searchWeb };

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@vertana/context-web",
-  "version": "0.1.0-dev.11+812bc132",
+  "version": "0.1.0",
   "description": "Web context gathering for Vertana - fetch and extract content from linked pages",
   "keywords": [
     "LLM",