@vertana/context-web 0.1.0-dev.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ MIT License
2
+
3
+ Copyright 2025 Hong Minhee
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
6
+ this software and associated documentation files (the "Software"), to deal in
7
+ the Software without restriction, including without limitation the rights to
8
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9
+ the Software, and to permit persons to whom the Software is furnished to do so,
10
+ subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17
+ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18
+ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,76 @@
1
+ # @vertana/context-web
2
+
3
+ [![JSR][JSR badge]][JSR]
4
+ [![npm][npm badge]][npm]
5
+
6
+ Web context gathering for [Vertana] — fetch and extract content from
7
+ linked pages to provide additional context for translation.
8
+
9
+ [JSR]: https://jsr.io/@vertana/context-web
10
+ [JSR badge]: https://jsr.io/badges/@vertana/context-web
11
+ [npm]: https://www.npmjs.com/package/@vertana/context-web
12
+ [npm badge]: https://img.shields.io/npm/v/@vertana/context-web
13
+ [Vertana]: https://vertana.org/
14
+
15
+
16
+ Features
17
+ --------
18
+
19
+ - **fetchWebPage**: A passive context source that fetches a single URL
20
+ and extracts the main content using Mozilla's Readability algorithm.
21
+ - **fetchLinkedPages**: A required context source factory that extracts
22
+ all links from the source text and fetches their content.
23
+ - **extractLinks**: A utility function to extract URLs from text
24
+ in various formats (plain text, Markdown, HTML).
25
+
26
+
27
+ Installation
28
+ ------------
29
+
30
+ ### Deno
31
+
32
+ ~~~~ bash
33
+ deno add jsr:@vertana/context-web
34
+ ~~~~
35
+
36
+ ### npm
37
+
38
+ ~~~~ bash
39
+ npm add @vertana/context-web
40
+ ~~~~
41
+
42
+ ### pnpm
43
+
44
+ ~~~~ bash
45
+ pnpm add @vertana/context-web
46
+ ~~~~
47
+
48
+
49
+ Usage
50
+ -----
51
+
52
+ ~~~~ typescript
53
+ import { translate } from "@vertana/facade";
54
+ import { fetchLinkedPages, fetchWebPage } from "@vertana/context-web";
55
+ import { openai } from "@ai-sdk/openai";
56
+
57
+ const text = `
58
+ Check out this article: https://example.com/article
59
+ It explains the concept in detail.
60
+ `;
61
+
62
+ const result = await translate(openai("gpt-4o"), "ko", text, {
63
+ contextSources: [
64
+ // Automatically fetch all links in the text
65
+ fetchLinkedPages({ text, mediaType: "text/plain" }),
66
+ // Allow LLM to fetch additional URLs on demand
67
+ fetchWebPage,
68
+ ],
69
+ });
70
+ ~~~~
71
+
72
+
73
+ License
74
+ -------
75
+
76
+ [MIT License](./LICENSE)
@@ -0,0 +1,117 @@
1
+ let htmlparser2 = require("htmlparser2");
2
+
3
+ //#region src/extract-links.ts
4
/**
 * Extracts URLs from text based on the media type.
 *
 * @param text The text to extract URLs from.
 * @param mediaType The media type of the text.
 * @returns An array of unique URLs found in the text.
 * @since 0.1.0
 */
function extractLinks(text, mediaType) {
  // Dispatch on media type; an unrecognized type yields undefined, just as
  // the (default-less) switch it replaces did.
  const extractor = {
    "text/plain": extractFromPlainText,
    "text/markdown": extractFromMarkdown,
    "text/html": extractFromHtml
  }[mediaType];
  return extractor?.(text);
}
19
/**
 * URL pattern for plain-text extraction: matches http:// and https:// runs
 * up to (but not including) whitespace or common delimiter characters.
 */
const URL_PATTERN = /https?:\/\/[^\s<>"')\]]+/g;
/**
 * Punctuation stripped from the end of a matched URL, since it usually
 * belongs to the surrounding sentence rather than the link itself.
 */
const TRAILING_PUNCTUATION = /[.,;:!?)]+$/;
28
/**
 * Extracts URLs from plain text: scan for http(s) links, strip trailing
 * punctuation, validate, and de-duplicate while preserving first-seen order.
 */
function extractFromPlainText(text) {
  const found = new Set();
  for (const raw of text.match(URL_PATTERN) ?? []) {
    const url = raw.replace(TRAILING_PUNCTUATION, "");
    if (isValidUrl(url)) found.add(url);
  }
  return [...found];
}
41
/**
 * Patterns for the Markdown link syntaxes that may carry URLs, plus a
 * pattern for code spans/fences that must be ignored during extraction.
 */
const MARKDOWN_INLINE_LINK = /\[([^\]]*)\]\(([^)]+)\)/g;
const MARKDOWN_REFERENCE_LINK = /^\[([^\]]+)\]:\s*(\S+)/gm;
const MARKDOWN_AUTOLINK = /<(https?:\/\/[^>]+)>/g;
const MARKDOWN_CODE_BLOCK = /```[\s\S]*?```|`[^`]+`/g;
48
/**
 * Extracts URLs from Markdown text. Inline links, reference-style link
 * definitions, autolinks, and bare URLs are all considered; code spans and
 * fenced code blocks are ignored.
 */
function extractFromMarkdown(text) {
  const stripped = text.replace(MARKDOWN_CODE_BLOCK, "");
  const found = new Set();
  const addIfValid = (url) => {
    if (isValidUrl(url)) found.add(url);
  };
  // matchAll iterates over a stateless clone of each /g regex, so no
  // explicit lastIndex resets are needed.
  for (const m of stripped.matchAll(MARKDOWN_INLINE_LINK)) addIfValid(m[2]);
  for (const m of stripped.matchAll(MARKDOWN_REFERENCE_LINK)) addIfValid(m[2]);
  for (const m of stripped.matchAll(MARKDOWN_AUTOLINK)) addIfValid(m[1]);
  for (const url of extractFromPlainText(stripped)) found.add(url);
  return [...found];
}
74
/**
 * Reports whether a parsed htmlparser2 node is an element (tag) node.
 */
function isElement(node) {
  return node.type === "tag";
}
80
/**
 * Extracts URLs from HTML by walking the parsed DOM in document order and
 * collecting valid href attributes from anchor elements.
 */
function extractFromHtml(html) {
  const doc = (0, htmlparser2.parseDocument)(html, {
    lowerCaseTags: true,
    lowerCaseAttributeNames: true
  });
  const found = new Set();
  // Explicit work list, processed front-first with children re-inserted at
  // the front, which reproduces a depth-first pre-order traversal.
  const pending = [...doc.children];
  while (pending.length > 0) {
    const node = pending.shift();
    if (!isElement(node)) continue;
    if (node.name === "a") {
      const href = node.attribs.href;
      if (href != null && isValidUrl(href)) found.add(href);
    }
    pending.unshift(...node.children);
  }
  return [...found];
}
101
/**
 * Checks whether a URL is acceptable for extraction: non-empty, not a bare
 * fragment, http(s)-schemed, and parseable by the URL constructor.
 */
function isValidUrl(url) {
  if (url === "" || url === "#" || !/^https?:\/\//i.test(url)) return false;
  try {
    // URL instances are always truthy; construction failing is the signal.
    return Boolean(new URL(url));
  } catch {
    return false;
  }
}
115
+
116
+ //#endregion
117
+ exports.extractLinks = extractLinks;
@@ -0,0 +1,18 @@
1
+ //#region src/extract-links.d.ts
2
+ /**
3
+ * Supported media types for link extraction.
4
+ *
5
+ * @since 0.1.0
6
+ */
7
+ type MediaType = "text/plain" | "text/markdown" | "text/html";
8
+ /**
9
+ * Extracts URLs from text based on the media type.
10
+ *
11
+ * @param text The text to extract URLs from.
12
+ * @param mediaType The media type of the text.
13
+ * @returns An array of unique URLs found in the text.
14
+ * @since 0.1.0
15
+ */
16
+ declare function extractLinks(text: string, mediaType: MediaType): readonly string[];
17
+ //#endregion
18
+ export { MediaType, extractLinks };
@@ -0,0 +1,18 @@
1
+ //#region src/extract-links.d.ts
2
+ /**
3
+ * Supported media types for link extraction.
4
+ *
5
+ * @since 0.1.0
6
+ */
7
+ type MediaType = "text/plain" | "text/markdown" | "text/html";
8
+ /**
9
+ * Extracts URLs from text based on the media type.
10
+ *
11
+ * @param text The text to extract URLs from.
12
+ * @param mediaType The media type of the text.
13
+ * @returns An array of unique URLs found in the text.
14
+ * @since 0.1.0
15
+ */
16
+ declare function extractLinks(text: string, mediaType: MediaType): readonly string[];
17
+ //#endregion
18
+ export { MediaType, extractLinks };
@@ -0,0 +1,117 @@
1
+ import { parseDocument } from "htmlparser2";
2
+
3
+ //#region src/extract-links.ts
4
/**
 * Extracts URLs from text based on the media type.
 *
 * @param text The text to extract URLs from.
 * @param mediaType The media type of the text.
 * @returns An array of unique URLs found in the text.
 * @since 0.1.0
 */
function extractLinks(text, mediaType) {
  // Dispatch on media type; an unrecognized type yields undefined, just as
  // the (default-less) switch it replaces did.
  const extractor = {
    "text/plain": extractFromPlainText,
    "text/markdown": extractFromMarkdown,
    "text/html": extractFromHtml
  }[mediaType];
  return extractor?.(text);
}
/**
 * URL pattern for plain-text extraction: matches http:// and https:// runs
 * up to (but not including) whitespace or common delimiter characters.
 */
const URL_PATTERN = /https?:\/\/[^\s<>"')\]]+/g;
/**
 * Punctuation stripped from the end of a matched URL, since it usually
 * belongs to the surrounding sentence rather than the link itself.
 */
const TRAILING_PUNCTUATION = /[.,;:!?)]+$/;
/**
 * Extracts URLs from plain text: scan for http(s) links, strip trailing
 * punctuation, validate, and de-duplicate while preserving first-seen order.
 */
function extractFromPlainText(text) {
  const found = new Set();
  for (const raw of text.match(URL_PATTERN) ?? []) {
    const url = raw.replace(TRAILING_PUNCTUATION, "");
    if (isValidUrl(url)) found.add(url);
  }
  return [...found];
}
/**
 * Patterns for the Markdown link syntaxes that may carry URLs, plus a
 * pattern for code spans/fences that must be ignored during extraction.
 */
const MARKDOWN_INLINE_LINK = /\[([^\]]*)\]\(([^)]+)\)/g;
const MARKDOWN_REFERENCE_LINK = /^\[([^\]]+)\]:\s*(\S+)/gm;
const MARKDOWN_AUTOLINK = /<(https?:\/\/[^>]+)>/g;
const MARKDOWN_CODE_BLOCK = /```[\s\S]*?```|`[^`]+`/g;
/**
 * Extracts URLs from Markdown text. Inline links, reference-style link
 * definitions, autolinks, and bare URLs are all considered; code spans and
 * fenced code blocks are ignored.
 */
function extractFromMarkdown(text) {
  const stripped = text.replace(MARKDOWN_CODE_BLOCK, "");
  const found = new Set();
  const addIfValid = (url) => {
    if (isValidUrl(url)) found.add(url);
  };
  // matchAll iterates over a stateless clone of each /g regex, so no
  // explicit lastIndex resets are needed.
  for (const m of stripped.matchAll(MARKDOWN_INLINE_LINK)) addIfValid(m[2]);
  for (const m of stripped.matchAll(MARKDOWN_REFERENCE_LINK)) addIfValid(m[2]);
  for (const m of stripped.matchAll(MARKDOWN_AUTOLINK)) addIfValid(m[1]);
  for (const url of extractFromPlainText(stripped)) found.add(url);
  return [...found];
}
/**
 * Reports whether a parsed htmlparser2 node is an element (tag) node.
 */
function isElement(node) {
  return node.type === "tag";
}
/**
 * Extracts URLs from HTML by walking the parsed DOM in document order and
 * collecting valid href attributes from anchor elements.
 */
function extractFromHtml(html) {
  const doc = parseDocument(html, {
    lowerCaseTags: true,
    lowerCaseAttributeNames: true
  });
  const found = new Set();
  // Explicit work list, processed front-first with children re-inserted at
  // the front, which reproduces a depth-first pre-order traversal.
  const pending = [...doc.children];
  while (pending.length > 0) {
    const node = pending.shift();
    if (!isElement(node)) continue;
    if (node.name === "a") {
      const href = node.attribs.href;
      if (href != null && isValidUrl(href)) found.add(href);
    }
    pending.unshift(...node.children);
  }
  return [...found];
}
/**
 * Checks whether a URL is acceptable for extraction: non-empty, not a bare
 * fragment, http(s)-schemed, and parseable by the URL constructor.
 */
function isValidUrl(url) {
  if (url === "" || url === "#" || !/^https?:\/\//i.test(url)) return false;
  try {
    // URL instances are always truthy; construction failing is the signal.
    return Boolean(new URL(url));
  } catch {
    return false;
  }
}
115
+
116
+ //#endregion
117
+ export { extractLinks };
package/dist/fetch.cjs ADDED
@@ -0,0 +1,235 @@
1
+ const require_extract_links = require('./extract-links.cjs');
2
+ let _logtape_logtape = require("@logtape/logtape");
3
+ let _mozilla_readability = require("@mozilla/readability");
4
+ let linkedom = require("linkedom");
5
+ let zod = require("zod");
6
+
7
+ //#region src/fetch.ts
8
+ const logger = (0, _logtape_logtape.getLogger)([
9
+ "vertana",
10
+ "context-web",
11
+ "fetch"
12
+ ]);
13
/**
 * Extracts the main content from an HTML page using Mozilla's Readability.
 *
 * @param html The HTML content to extract from.
 * @param url The URL of the page (used for resolving relative links).
 * @returns The extracted content, or null if extraction failed.
 * @since 0.1.0
 */
function extractContent(html, url) {
  const { document } = (0, linkedom.parseHTML)(html, "text/html");
  // Inject a <base> element so relative links resolve against the page's
  // own URL during Readability parsing.
  const base = document.createElement("base");
  base.href = url;
  document.head.appendChild(base);
  const article = new _mozilla_readability.Readability(document).parse();
  if (article == null) return null;
  const title = article.title ?? "";
  const content = article.textContent ?? "";
  const isEmpty = title.length === 0 && content.length === 0;
  return isEmpty ? null : {
    title,
    content,
    byline: article.byline ?? void 0,
    excerpt: article.excerpt ?? void 0
  };
}
38
/**
 * Fetches a URL and extracts its main content.
 *
 * @param url The URL to fetch.
 * @param options Fetch options.
 * @returns The extracted content, or null if fetch or extraction failed.
 */
async function fetchAndExtract(url, options) {
  const timeout = options?.timeout ?? 1e4;
  logger.debug("Fetching URL: {url}...", { url });
  const controller = new AbortController();
  // Abort after the timeout, and propagate an abort from the caller's
  // signal (if any) to our controller.
  const timeoutId = setTimeout(() => controller.abort(), timeout);
  const onAbort = () => controller.abort();
  if (options?.signal != null) {
    if (options.signal.aborted) controller.abort();
    else options.signal.addEventListener("abort", onAbort, { once: true });
  }
  try {
    const response = await fetch(url, {
      signal: controller.signal,
      headers: {
        "User-Agent": "Mozilla/5.0 (compatible; Vertana/0.1; +https://vertana.org)",
        Accept: "text/html,application/xhtml+xml"
      }
    });
    // Clear the timer as soon as headers arrive (as before); the finally
    // block below additionally covers the error paths, where the timer
    // previously leaked because clearTimeout was never reached.
    clearTimeout(timeoutId);
    if (!response.ok) {
      logger.warn("Failed to fetch URL: {url}, status: {status}", {
        url,
        status: response.status
      });
      return null;
    }
    const contentType = response.headers.get("content-type");
    if (contentType != null && !contentType.includes("text/html")) {
      logger.debug("Skipping non-HTML content: {url}, type: {contentType}", {
        url,
        contentType
      });
      return null;
    }
    const content = extractContent(await response.text(), url);
    if (content == null) {
      logger.debug("Failed to extract content from: {url}", { url });
      return null;
    }
    logger.debug("Extracted content from: {url}, title: {title}", {
      url,
      title: content.title
    });
    return content;
  } catch (error) {
    if (error instanceof Error && error.name === "AbortError") logger.debug("Fetch aborted for: {url}", { url });
    else logger.warn("Error fetching URL: {url}, error: {error}", {
      url,
      error: String(error)
    });
    return null;
  } finally {
    // Release the timer and the abort listener on every exit path.
    clearTimeout(timeoutId);
    options?.signal?.removeEventListener("abort", onAbort);
  }
}
94
/**
 * A passive context source that fetches a single web page and extracts
 * its main content.
 *
 * This source is exposed as a tool that the LLM can call when it needs
 * to fetch additional context from a specific URL.
 *
 * @example
 * ```typescript
 * import { translate } from "@vertana/facade";
 * import { fetchWebPage } from "@vertana/context-web";
 *
 * const result = await translate(model, "ko", text, {
 *   contextSources: [fetchWebPage],
 * });
 * ```
 *
 * @since 0.1.0
 */
const fetchWebPage = {
  name: "fetch-web-page",
  description: "Fetches a web page and extracts its main content. Use this when you need additional context from a linked article or page.",
  mode: "passive",
  parameters: zod.z.object({ url: zod.z.string().url().describe("The URL of the web page to fetch") }),
  async gather(params, options) {
    const { url } = params;
    const content = await fetchAndExtract(url, { signal: options?.signal });
    if (content == null) {
      // Report the failure as tool output rather than throwing, so the
      // LLM can react to it.
      return {
        content: `Failed to fetch or extract content from: ${url}`,
        metadata: { url, success: false }
      };
    }
    return {
      content: formatContent(content, url),
      metadata: { url, title: content.title, success: true }
    };
  }
};
137
/**
 * Creates a required context source that extracts all links from the given
 * text and fetches their content.
 *
 * This source is invoked automatically before translation begins, providing
 * context from all linked pages.
 *
 * @param options Options for the context source.
 * @returns A required context source.
 *
 * @example
 * ```typescript
 * import { translate } from "@vertana/facade";
 * import { fetchLinkedPages } from "@vertana/context-web";
 *
 * const text = "Check out https://example.com for details.";
 * const result = await translate(model, "ko", text, {
 *   contextSources: [
 *     fetchLinkedPages({ text, mediaType: "text/plain" }),
 *   ],
 * });
 * ```
 *
 * @since 0.1.0
 */
function fetchLinkedPages(options) {
  const maxLinks = options.maxLinks ?? 10;
  const timeout = options.timeout ?? 1e4;
  // Links are extracted eagerly, at source-creation time.
  const linksToFetch = require_extract_links.extractLinks(options.text, options.mediaType).slice(0, maxLinks);
  const emptyResult = (linkCount) => ({
    content: "",
    metadata: { linkCount, fetchedCount: 0 }
  });
  return {
    name: "fetch-linked-pages",
    description: `Fetches content from ${linksToFetch.length} linked page(s) to provide additional context for translation.`,
    mode: "required",
    async gather(gatherOptions) {
      if (linksToFetch.length === 0) {
        logger.debug("No links to fetch.");
        return emptyResult(0);
      }
      logger.info("Fetching {count} linked page(s)...", { count: linksToFetch.length });
      const results = [];
      // Pages are fetched sequentially so an abort is honored between
      // requests.
      for (const url of linksToFetch) {
        gatherOptions?.signal?.throwIfAborted();
        const content = await fetchAndExtract(url, {
          signal: gatherOptions?.signal,
          timeout
        });
        if (content != null) results.push({ url, content });
      }
      if (results.length === 0) {
        logger.debug("No content could be extracted from any linked pages.");
        return emptyResult(linksToFetch.length);
      }
      logger.info("Successfully extracted content from {count} of {total} page(s).", {
        count: results.length,
        total: linksToFetch.length
      });
      return {
        content: results.map(({ url, content }) => formatContent(content, url)).join("\n\n---\n\n"),
        metadata: {
          linkCount: linksToFetch.length,
          fetchedCount: results.length,
          urls: results.map((r) => r.url)
        }
      };
    }
  };
}
219
/**
 * Formats extracted content for inclusion in the translation context: a
 * Markdown heading, the source URL, an optional author line, then the body
 * text separated by a blank line.
 */
function formatContent(content, url) {
  const header = [`# ${content.title}`, `Source: ${url}`];
  if (content.byline != null) header.push(`Author: ${content.byline}`);
  return [...header, "", content.content].join("\n");
}
231
+
232
+ //#endregion
233
+ exports.extractContent = extractContent;
234
+ exports.fetchLinkedPages = fetchLinkedPages;
235
+ exports.fetchWebPage = fetchWebPage;
@@ -0,0 +1,121 @@
1
+ import { MediaType } from "./extract-links.cjs";
2
+ import { PassiveContextSource, RequiredContextSource } from "@vertana/core/context";
3
+
4
+ //#region src/fetch.d.ts
5
+
6
+ /**
7
+ * Result of extracting content from a web page.
8
+ *
9
+ * @since 0.1.0
10
+ */
11
+ interface ExtractedContent {
12
+ /**
13
+ * The title of the article.
14
+ */
15
+ readonly title: string;
16
+ /**
17
+ * The extracted main content as plain text.
18
+ */
19
+ readonly content: string;
20
+ /**
21
+ * The byline (author) if available.
22
+ */
23
+ readonly byline?: string;
24
+ /**
25
+ * The excerpt if available.
26
+ */
27
+ readonly excerpt?: string;
28
+ }
29
+ /**
30
+ * Extracts the main content from an HTML page using Mozilla's Readability.
31
+ *
32
+ * @param html The HTML content to extract from.
33
+ * @param url The URL of the page (used for resolving relative links).
34
+ * @returns The extracted content, or null if extraction failed.
35
+ * @since 0.1.0
36
+ */
37
+ declare function extractContent(html: string, url: string): ExtractedContent | null;
38
+ /**
39
+ * Parameters for the fetchWebPage context source.
40
+ */
41
+ interface FetchWebPageParams {
42
+ /**
43
+ * The URL to fetch.
44
+ */
45
+ readonly url: string;
46
+ }
47
+ /**
48
+ * A passive context source that fetches a single web page and extracts
49
+ * its main content.
50
+ *
51
+ * This source is exposed as a tool that the LLM can call when it needs
52
+ * to fetch additional context from a specific URL.
53
+ *
54
+ * @example
55
+ * ```typescript
56
+ * import { translate } from "@vertana/facade";
57
+ * import { fetchWebPage } from "@vertana/context-web";
58
+ *
59
+ * const result = await translate(model, "ko", text, {
60
+ * contextSources: [fetchWebPage],
61
+ * });
62
+ * ```
63
+ *
64
+ * @since 0.1.0
65
+ */
66
+ declare const fetchWebPage: PassiveContextSource<FetchWebPageParams>;
67
+ /**
68
+ * Options for creating a fetchLinkedPages context source.
69
+ *
70
+ * @since 0.1.0
71
+ */
72
+ interface FetchLinkedPagesOptions {
73
+ /**
74
+ * The text to extract links from.
75
+ */
76
+ readonly text: string;
77
+ /**
78
+ * The media type of the text.
79
+ */
80
+ readonly mediaType: MediaType;
81
+ /**
82
+ * Maximum number of links to fetch.
83
+ *
84
+ * @default 10
85
+ */
86
+ readonly maxLinks?: number;
87
+ /**
88
+ * Timeout for each fetch request in milliseconds.
89
+ *
90
+ * @default 10000
91
+ */
92
+ readonly timeout?: number;
93
+ }
94
+ /**
95
+ * Creates a required context source that extracts all links from the given
96
+ * text and fetches their content.
97
+ *
98
+ * This source is invoked automatically before translation begins, providing
99
+ * context from all linked pages.
100
+ *
101
+ * @param options Options for the context source.
102
+ * @returns A required context source.
103
+ *
104
+ * @example
105
+ * ```typescript
106
+ * import { translate } from "@vertana/facade";
107
+ * import { fetchLinkedPages } from "@vertana/context-web";
108
+ *
109
+ * const text = "Check out https://example.com for details.";
110
+ * const result = await translate(model, "ko", text, {
111
+ * contextSources: [
112
+ * fetchLinkedPages({ text, mediaType: "text/plain" }),
113
+ * ],
114
+ * });
115
+ * ```
116
+ *
117
+ * @since 0.1.0
118
+ */
119
+ declare function fetchLinkedPages(options: FetchLinkedPagesOptions): RequiredContextSource;
120
+ //#endregion
121
+ export { ExtractedContent, FetchLinkedPagesOptions, extractContent, fetchLinkedPages, fetchWebPage };
@@ -0,0 +1,121 @@
1
+ import { MediaType } from "./extract-links.js";
2
+ import { PassiveContextSource, RequiredContextSource } from "@vertana/core/context";
3
+
4
+ //#region src/fetch.d.ts
5
+
6
+ /**
7
+ * Result of extracting content from a web page.
8
+ *
9
+ * @since 0.1.0
10
+ */
11
+ interface ExtractedContent {
12
+ /**
13
+ * The title of the article.
14
+ */
15
+ readonly title: string;
16
+ /**
17
+ * The extracted main content as plain text.
18
+ */
19
+ readonly content: string;
20
+ /**
21
+ * The byline (author) if available.
22
+ */
23
+ readonly byline?: string;
24
+ /**
25
+ * The excerpt if available.
26
+ */
27
+ readonly excerpt?: string;
28
+ }
29
+ /**
30
+ * Extracts the main content from an HTML page using Mozilla's Readability.
31
+ *
32
+ * @param html The HTML content to extract from.
33
+ * @param url The URL of the page (used for resolving relative links).
34
+ * @returns The extracted content, or null if extraction failed.
35
+ * @since 0.1.0
36
+ */
37
+ declare function extractContent(html: string, url: string): ExtractedContent | null;
38
+ /**
39
+ * Parameters for the fetchWebPage context source.
40
+ */
41
+ interface FetchWebPageParams {
42
+ /**
43
+ * The URL to fetch.
44
+ */
45
+ readonly url: string;
46
+ }
47
+ /**
48
+ * A passive context source that fetches a single web page and extracts
49
+ * its main content.
50
+ *
51
+ * This source is exposed as a tool that the LLM can call when it needs
52
+ * to fetch additional context from a specific URL.
53
+ *
54
+ * @example
55
+ * ```typescript
56
+ * import { translate } from "@vertana/facade";
57
+ * import { fetchWebPage } from "@vertana/context-web";
58
+ *
59
+ * const result = await translate(model, "ko", text, {
60
+ * contextSources: [fetchWebPage],
61
+ * });
62
+ * ```
63
+ *
64
+ * @since 0.1.0
65
+ */
66
+ declare const fetchWebPage: PassiveContextSource<FetchWebPageParams>;
67
+ /**
68
+ * Options for creating a fetchLinkedPages context source.
69
+ *
70
+ * @since 0.1.0
71
+ */
72
+ interface FetchLinkedPagesOptions {
73
+ /**
74
+ * The text to extract links from.
75
+ */
76
+ readonly text: string;
77
+ /**
78
+ * The media type of the text.
79
+ */
80
+ readonly mediaType: MediaType;
81
+ /**
82
+ * Maximum number of links to fetch.
83
+ *
84
+ * @default 10
85
+ */
86
+ readonly maxLinks?: number;
87
+ /**
88
+ * Timeout for each fetch request in milliseconds.
89
+ *
90
+ * @default 10000
91
+ */
92
+ readonly timeout?: number;
93
+ }
94
+ /**
95
+ * Creates a required context source that extracts all links from the given
96
+ * text and fetches their content.
97
+ *
98
+ * This source is invoked automatically before translation begins, providing
99
+ * context from all linked pages.
100
+ *
101
+ * @param options Options for the context source.
102
+ * @returns A required context source.
103
+ *
104
+ * @example
105
+ * ```typescript
106
+ * import { translate } from "@vertana/facade";
107
+ * import { fetchLinkedPages } from "@vertana/context-web";
108
+ *
109
+ * const text = "Check out https://example.com for details.";
110
+ * const result = await translate(model, "ko", text, {
111
+ * contextSources: [
112
+ * fetchLinkedPages({ text, mediaType: "text/plain" }),
113
+ * ],
114
+ * });
115
+ * ```
116
+ *
117
+ * @since 0.1.0
118
+ */
119
+ declare function fetchLinkedPages(options: FetchLinkedPagesOptions): RequiredContextSource;
120
+ //#endregion
121
+ export { ExtractedContent, FetchLinkedPagesOptions, extractContent, fetchLinkedPages, fetchWebPage };
package/dist/fetch.js ADDED
@@ -0,0 +1,233 @@
1
+ import { extractLinks } from "./extract-links.js";
2
+ import { getLogger } from "@logtape/logtape";
3
+ import { Readability } from "@mozilla/readability";
4
+ import { parseHTML } from "linkedom";
5
+ import { z } from "zod";
6
+
7
+ //#region src/fetch.ts
8
+ const logger = getLogger([
9
+ "vertana",
10
+ "context-web",
11
+ "fetch"
12
+ ]);
13
/**
 * Extracts the main content from an HTML page using Mozilla's Readability.
 *
 * @param html The HTML content to extract from.
 * @param url The URL of the page (used for resolving relative links).
 * @returns The extracted content, or null if extraction failed.
 * @since 0.1.0
 */
function extractContent(html, url) {
  const { document } = parseHTML(html, "text/html");
  // Inject a <base> element so relative links resolve against the page's
  // own URL during Readability parsing.
  const base = document.createElement("base");
  base.href = url;
  document.head.appendChild(base);
  const article = new Readability(document).parse();
  if (article == null) return null;
  const title = article.title ?? "";
  const content = article.textContent ?? "";
  const isEmpty = title.length === 0 && content.length === 0;
  return isEmpty ? null : {
    title,
    content,
    byline: article.byline ?? void 0,
    excerpt: article.excerpt ?? void 0
  };
}
/**
 * Fetches a URL and extracts its main content.
 *
 * @param url The URL to fetch.
 * @param options Fetch options.
 * @returns The extracted content, or null if fetch or extraction failed.
 */
async function fetchAndExtract(url, options) {
  const timeout = options?.timeout ?? 1e4;
  logger.debug("Fetching URL: {url}...", { url });
  const controller = new AbortController();
  // Abort after the timeout, and propagate an abort from the caller's
  // signal (if any) to our controller.
  const timeoutId = setTimeout(() => controller.abort(), timeout);
  const onAbort = () => controller.abort();
  if (options?.signal != null) {
    if (options.signal.aborted) controller.abort();
    else options.signal.addEventListener("abort", onAbort, { once: true });
  }
  try {
    const response = await fetch(url, {
      signal: controller.signal,
      headers: {
        "User-Agent": "Mozilla/5.0 (compatible; Vertana/0.1; +https://vertana.org)",
        Accept: "text/html,application/xhtml+xml"
      }
    });
    // Clear the timer as soon as headers arrive (as before); the finally
    // block below additionally covers the error paths, where the timer
    // previously leaked because clearTimeout was never reached.
    clearTimeout(timeoutId);
    if (!response.ok) {
      logger.warn("Failed to fetch URL: {url}, status: {status}", {
        url,
        status: response.status
      });
      return null;
    }
    const contentType = response.headers.get("content-type");
    if (contentType != null && !contentType.includes("text/html")) {
      logger.debug("Skipping non-HTML content: {url}, type: {contentType}", {
        url,
        contentType
      });
      return null;
    }
    const content = extractContent(await response.text(), url);
    if (content == null) {
      logger.debug("Failed to extract content from: {url}", { url });
      return null;
    }
    logger.debug("Extracted content from: {url}, title: {title}", {
      url,
      title: content.title
    });
    return content;
  } catch (error) {
    if (error instanceof Error && error.name === "AbortError") logger.debug("Fetch aborted for: {url}", { url });
    else logger.warn("Error fetching URL: {url}, error: {error}", {
      url,
      error: String(error)
    });
    return null;
  } finally {
    // Release the timer and the abort listener on every exit path.
    clearTimeout(timeoutId);
    options?.signal?.removeEventListener("abort", onAbort);
  }
}
/**
 * A passive context source that fetches a single web page and extracts
 * its main content.
 *
 * This source is exposed as a tool that the LLM can call when it needs
 * to fetch additional context from a specific URL.
 *
 * @example
 * ```typescript
 * import { translate } from "@vertana/facade";
 * import { fetchWebPage } from "@vertana/context-web";
 *
 * const result = await translate(model, "ko", text, {
 *   contextSources: [fetchWebPage],
 * });
 * ```
 *
 * @since 0.1.0
 */
const fetchWebPage = {
  name: "fetch-web-page",
  description: "Fetches a web page and extracts its main content. Use this when you need additional context from a linked article or page.",
  mode: "passive",
  parameters: z.object({ url: z.string().url().describe("The URL of the web page to fetch") }),
  async gather(params, options) {
    const { url } = params;
    const content = await fetchAndExtract(url, { signal: options?.signal });
    if (content == null) {
      // Report the failure as tool output rather than throwing, so the
      // LLM can react to it.
      return {
        content: `Failed to fetch or extract content from: ${url}`,
        metadata: { url, success: false }
      };
    }
    return {
      content: formatContent(content, url),
      metadata: { url, title: content.title, success: true }
    };
  }
};
/**
 * Creates a required context source that extracts all links from the given
 * text and fetches their content.
 *
 * This source is invoked automatically before translation begins, providing
 * context from all linked pages.
 *
 * @param options Options for the context source.
 * @returns A required context source.
 *
 * @example
 * ```typescript
 * import { translate } from "@vertana/facade";
 * import { fetchLinkedPages } from "@vertana/context-web";
 *
 * const text = "Check out https://example.com for details.";
 * const result = await translate(model, "ko", text, {
 *   contextSources: [
 *     fetchLinkedPages({ text, mediaType: "text/plain" }),
 *   ],
 * });
 * ```
 *
 * @since 0.1.0
 */
function fetchLinkedPages(options) {
  const maxLinks = options.maxLinks ?? 10;
  const timeout = options.timeout ?? 1e4;
  // Links are extracted eagerly, at source-creation time.
  const linksToFetch = extractLinks(options.text, options.mediaType).slice(0, maxLinks);
  const emptyResult = (linkCount) => ({
    content: "",
    metadata: { linkCount, fetchedCount: 0 }
  });
  return {
    name: "fetch-linked-pages",
    description: `Fetches content from ${linksToFetch.length} linked page(s) to provide additional context for translation.`,
    mode: "required",
    async gather(gatherOptions) {
      if (linksToFetch.length === 0) {
        logger.debug("No links to fetch.");
        return emptyResult(0);
      }
      logger.info("Fetching {count} linked page(s)...", { count: linksToFetch.length });
      const results = [];
      // Pages are fetched sequentially so an abort is honored between
      // requests.
      for (const url of linksToFetch) {
        gatherOptions?.signal?.throwIfAborted();
        const content = await fetchAndExtract(url, {
          signal: gatherOptions?.signal,
          timeout
        });
        if (content != null) results.push({ url, content });
      }
      if (results.length === 0) {
        logger.debug("No content could be extracted from any linked pages.");
        return emptyResult(linksToFetch.length);
      }
      logger.info("Successfully extracted content from {count} of {total} page(s).", {
        count: results.length,
        total: linksToFetch.length
      });
      return {
        content: results.map(({ url, content }) => formatContent(content, url)).join("\n\n---\n\n"),
        metadata: {
          linkCount: linksToFetch.length,
          fetchedCount: results.length,
          urls: results.map((r) => r.url)
        }
      };
    }
  };
}
/**
 * Formats extracted content for inclusion in the translation context: a
 * Markdown heading, the source URL, an optional author line, then the body
 * text separated by a blank line.
 */
function formatContent(content, url) {
  const header = [`# ${content.title}`, `Source: ${url}`];
  if (content.byline != null) header.push(`Author: ${content.byline}`);
  return [...header, "", content.content].join("\n");
}
231
+
232
+ //#endregion
233
+ export { extractContent, fetchLinkedPages, fetchWebPage };
package/dist/index.cjs ADDED
@@ -0,0 +1,7 @@
1
// CommonJS barrel: re-export the public API of the two submodules.
// Destructuring reads the same properties eagerly at require time,
// matching the original per-property assignments.
const { extractLinks } = require('./extract-links.cjs');
const { extractContent, fetchLinkedPages, fetchWebPage } = require('./fetch.cjs');

exports.extractContent = extractContent;
exports.extractLinks = extractLinks;
exports.fetchLinkedPages = fetchLinkedPages;
exports.fetchWebPage = fetchWebPage;
@@ -0,0 +1,3 @@
1
+ import { MediaType, extractLinks } from "./extract-links.cjs";
2
+ import { ExtractedContent, FetchLinkedPagesOptions, extractContent, fetchLinkedPages, fetchWebPage } from "./fetch.cjs";
3
+ export { type ExtractedContent, type FetchLinkedPagesOptions, type MediaType, extractContent, extractLinks, fetchLinkedPages, fetchWebPage };
@@ -0,0 +1,3 @@
1
+ import { MediaType, extractLinks } from "./extract-links.js";
2
+ import { ExtractedContent, FetchLinkedPagesOptions, extractContent, fetchLinkedPages, fetchWebPage } from "./fetch.js";
3
+ export { type ExtractedContent, type FetchLinkedPagesOptions, type MediaType, extractContent, extractLinks, fetchLinkedPages, fetchWebPage };
package/dist/index.js ADDED
@@ -0,0 +1,4 @@
1
// ESM barrel: re-export the public API without creating local bindings.
// `export … from` is observationally identical to import-then-export
// for these named, side-effect-free modules.
export { extractLinks } from "./extract-links.js";
export { extractContent, fetchLinkedPages, fetchWebPage } from "./fetch.js";
package/package.json ADDED
@@ -0,0 +1,92 @@
1
+ {
2
+ "name": "@vertana/context-web",
3
+ "version": "0.1.0-dev.1",
4
+ "description": "Web context gathering for Vertana - fetch and extract content from linked pages",
5
+ "keywords": [
6
+ "LLM",
7
+ "translation",
8
+ "context",
9
+ "web",
10
+ "readability"
11
+ ],
12
+ "license": "MIT",
13
+ "author": {
14
+ "name": "Hong Minhee",
15
+ "email": "hong@minhee.org",
16
+ "url": "https://hongminhee.org/"
17
+ },
18
+ "homepage": "https://vertana.org/",
19
+ "repository": {
20
+ "type": "git",
21
+ "url": "git+https://github.com/dahlia/vertana.git",
22
+ "directory": "packages/context-web"
23
+ },
24
+ "bugs": {
25
+ "url": "https://github.com/dahlia/vertana/issues"
26
+ },
27
+ "funding": [
28
+ "https://github.com/sponsors/dahlia"
29
+ ],
30
+ "engines": {
31
+ "node": ">=20.0.0",
32
+ "bun": ">=1.2.0",
33
+ "deno": ">=2.3.0"
34
+ },
35
+ "files": [
36
+ "dist/",
37
+ "package.json",
38
+ "README.md"
39
+ ],
40
+ "type": "module",
41
+ "module": "./dist/index.js",
42
+ "main": "./dist/index.cjs",
43
+ "types": "./dist/index.d.ts",
44
+ "exports": {
45
+ ".": {
46
+ "types": {
47
+ "require": "./dist/index.d.cts",
48
+ "import": "./dist/index.d.ts"
49
+ },
50
+ "require": "./dist/index.cjs",
51
+ "import": "./dist/index.js"
52
+ },
53
+ "./fetch": {
54
+ "types": {
55
+ "require": "./dist/fetch.d.cts",
56
+ "import": "./dist/fetch.d.ts"
57
+ },
58
+ "require": "./dist/fetch.cjs",
59
+ "import": "./dist/fetch.js"
60
+ },
61
+ "./extract-links": {
62
+ "types": {
63
+ "require": "./dist/extract-links.d.cts",
64
+ "import": "./dist/extract-links.d.ts"
65
+ },
66
+ "require": "./dist/extract-links.cjs",
67
+ "import": "./dist/extract-links.js"
68
+ }
69
+ },
70
+ "sideEffects": false,
71
+ "dependencies": {
72
+ "@logtape/logtape": "^1.3.5",
73
+ "@mozilla/readability": "^0.6.0",
74
+ "@vertana/core": "",
75
+ "htmlparser2": "^10.0.0",
76
+ "linkedom": "^0.18.12",
77
+ "zod": "4.2.1"
78
+ },
79
+ "devDependencies": {
80
+ "@types/node": "^20.19.9",
81
+ "tsdown": "^0.18.3",
82
+ "typescript": "^5.9.3"
83
+ },
84
+ "scripts": {
85
+ "build": "tsdown",
86
+ "prepublish": "tsdown",
87
+ "test": "tsdown && node --experimental-transform-types --test --test-concurrency=4",
88
+ "test:bun": "tsdown && bun test",
89
+ "test:deno": "deno test --allow-env --allow-net",
90
+ "test-all": "tsdown && node --experimental-transform-types --test && bun test && deno test"
91
+ }
92
+ }