@vertana/context-web 0.1.0-dev.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +20 -0
- package/README.md +76 -0
- package/dist/extract-links.cjs +117 -0
- package/dist/extract-links.d.cts +18 -0
- package/dist/extract-links.d.ts +18 -0
- package/dist/extract-links.js +117 -0
- package/dist/fetch.cjs +235 -0
- package/dist/fetch.d.cts +121 -0
- package/dist/fetch.d.ts +121 -0
- package/dist/fetch.js +233 -0
- package/dist/index.cjs +7 -0
- package/dist/index.d.cts +3 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.js +4 -0
- package/package.json +92 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright 2025 Hong Minhee
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
|
6
|
+
this software and associated documentation files (the "Software"), to deal in
|
|
7
|
+
the Software without restriction, including without limitation the rights to
|
|
8
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
|
9
|
+
the Software, and to permit persons to whom the Software is furnished to do so,
|
|
10
|
+
subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
|
17
|
+
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
|
18
|
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
|
19
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
20
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# @vertana/context-web
|
|
2
|
+
|
|
3
|
+
[![JSR][JSR badge]][JSR]
|
|
4
|
+
[![npm][npm badge]][npm]
|
|
5
|
+
|
|
6
|
+
Web context gathering for [Vertana] — fetch and extract content from
|
|
7
|
+
linked pages to provide additional context for translation.
|
|
8
|
+
|
|
9
|
+
[JSR]: https://jsr.io/@vertana/context-web
|
|
10
|
+
[JSR badge]: https://jsr.io/badges/@vertana/context-web
|
|
11
|
+
[npm]: https://www.npmjs.com/package/@vertana/context-web
|
|
12
|
+
[npm badge]: https://img.shields.io/npm/v/@vertana/context-web
|
|
13
|
+
[Vertana]: https://vertana.org/
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
Features
|
|
17
|
+
--------
|
|
18
|
+
|
|
19
|
+
- **fetchWebPage**: A passive context source that fetches a single URL
|
|
20
|
+
and extracts the main content using Mozilla's Readability algorithm.
|
|
21
|
+
- **fetchLinkedPages**: A required context source factory that extracts
|
|
22
|
+
all links from the source text and fetches their content.
|
|
23
|
+
- **extractLinks**: A utility function to extract URLs from text
|
|
24
|
+
in various formats (plain text, Markdown, HTML).
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
Installation
|
|
28
|
+
------------
|
|
29
|
+
|
|
30
|
+
### Deno
|
|
31
|
+
|
|
32
|
+
~~~~ bash
|
|
33
|
+
deno add jsr:@vertana/context-web
|
|
34
|
+
~~~~
|
|
35
|
+
|
|
36
|
+
### npm
|
|
37
|
+
|
|
38
|
+
~~~~ bash
|
|
39
|
+
npm add @vertana/context-web
|
|
40
|
+
~~~~
|
|
41
|
+
|
|
42
|
+
### pnpm
|
|
43
|
+
|
|
44
|
+
~~~~ bash
|
|
45
|
+
pnpm add @vertana/context-web
|
|
46
|
+
~~~~
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
Usage
|
|
50
|
+
-----
|
|
51
|
+
|
|
52
|
+
~~~~ typescript
|
|
53
|
+
import { translate } from "@vertana/facade";
|
|
54
|
+
import { fetchLinkedPages, fetchWebPage } from "@vertana/context-web";
|
|
55
|
+
import { openai } from "@ai-sdk/openai";
|
|
56
|
+
|
|
57
|
+
const text = `
|
|
58
|
+
Check out this article: https://example.com/article
|
|
59
|
+
It explains the concept in detail.
|
|
60
|
+
`;
|
|
61
|
+
|
|
62
|
+
const result = await translate(openai("gpt-4o"), "ko", text, {
|
|
63
|
+
contextSources: [
|
|
64
|
+
// Automatically fetch all links in the text
|
|
65
|
+
fetchLinkedPages({ text, mediaType: "text/plain" }),
|
|
66
|
+
// Allow LLM to fetch additional URLs on demand
|
|
67
|
+
fetchWebPage,
|
|
68
|
+
],
|
|
69
|
+
});
|
|
70
|
+
~~~~
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
License
|
|
74
|
+
-------
|
|
75
|
+
|
|
76
|
+
[MIT License](../../LICENSE)
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
let htmlparser2 = require("htmlparser2");

//#region src/extract-links.ts
/**
* Extracts URLs from text based on the media type.
*
* @param text The text to extract URLs from.
* @param mediaType The media type of the text: "text/plain",
*                  "text/markdown", or "text/html".
* @returns An array of unique URLs found in the text.
* @throws {TypeError} If the media type is not one of the supported values.
* @since 0.1.0
*/
function extractLinks(text, mediaType) {
	switch (mediaType) {
		case "text/plain": return extractFromPlainText(text);
		case "text/markdown": return extractFromMarkdown(text);
		case "text/html": return extractFromHtml(text);
		// The TypeScript source is exhaustive, but plain-JS callers can pass
		// anything; fail loudly instead of silently returning undefined.
		default: throw new TypeError(`Unsupported media type: ${mediaType}`);
	}
}
/**
* URL pattern for plain text extraction.
* Matches http:// and https:// URLs.
*/
const URL_PATTERN = /https?:\/\/[^\s<>"')\]]+/g;
/**
* Characters that should be trimmed from the end of URLs.
*/
const TRAILING_PUNCTUATION = /[.,;:!?)]+$/;
/**
* Extracts URLs from plain text.
*/
function extractFromPlainText(text) {
	const matches = text.match(URL_PATTERN);
	if (matches == null) return [];
	const urls = /* @__PURE__ */ new Set();
	for (const match of matches) {
		// Strip sentence punctuation that the greedy pattern swallowed.
		const cleanUrl = match.replace(TRAILING_PUNCTUATION, "");
		if (isValidUrl(cleanUrl)) urls.add(cleanUrl);
	}
	return [...urls];
}
/**
* Markdown link patterns.
*/
const MARKDOWN_INLINE_LINK = /\[([^\]]*)\]\(([^)]+)\)/g;
const MARKDOWN_REFERENCE_LINK = /^\[([^\]]+)\]:\s*(\S+)/gm;
const MARKDOWN_AUTOLINK = /<(https?:\/\/[^>]+)>/g;
const MARKDOWN_CODE_BLOCK = /```[\s\S]*?```|`[^`]+`/g;
/**
* Extracts URLs from Markdown text.
*/
function extractFromMarkdown(text) {
	// Remove code spans/fences first so URLs inside code are not extracted.
	const textWithoutCode = text.replace(MARKDOWN_CODE_BLOCK, "");
	const urls = /* @__PURE__ */ new Set();
	let match;
	MARKDOWN_INLINE_LINK.lastIndex = 0;
	while ((match = MARKDOWN_INLINE_LINK.exec(textWithoutCode)) != null) {
		// The destination may be followed by a link title, e.g.
		// [text](https://example.com "Title"); keep only the URL part.
		const url = match[2].trim().split(/\s+/)[0];
		if (isValidUrl(url)) urls.add(url);
	}
	MARKDOWN_REFERENCE_LINK.lastIndex = 0;
	while ((match = MARKDOWN_REFERENCE_LINK.exec(textWithoutCode)) != null) {
		const url = match[2];
		if (isValidUrl(url)) urls.add(url);
	}
	MARKDOWN_AUTOLINK.lastIndex = 0;
	while ((match = MARKDOWN_AUTOLINK.exec(textWithoutCode)) != null) {
		const url = match[1];
		if (isValidUrl(url)) urls.add(url);
	}
	// Bare URLs that are not part of any Markdown link syntax.
	const bareUrls = extractFromPlainText(textWithoutCode);
	for (const url of bareUrls) urls.add(url);
	return [...urls];
}
/**
* Determines if a node is an element.
*/
function isElement(node) {
	return node.type === "tag";
}
/**
* Extracts URLs from HTML.
*/
function extractFromHtml(html) {
	const doc = (0, htmlparser2.parseDocument)(html, {
		lowerCaseTags: true,
		lowerCaseAttributeNames: true
	});
	const urls = /* @__PURE__ */ new Set();
	function traverse(node) {
		if (isElement(node)) {
			if (node.name === "a") {
				const href = node.attribs.href;
				if (href != null && isValidUrl(href)) urls.add(href);
			}
			for (const child of node.children) traverse(child);
		}
	}
	for (const child of doc.children) traverse(child);
	return [...urls];
}
/**
* Checks if a URL is valid for extraction.
* Only allows http:// and https:// URLs.
*/
function isValidUrl(url) {
	if (url.length === 0 || url === "#") return false;
	if (!/^https?:\/\//i.test(url)) return false;
	try {
		new URL(url);
		return true;
	} catch {
		return false;
	}
}

//#endregion
exports.extractLinks = extractLinks;
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
//#region src/extract-links.d.ts
/**
 * Supported media types for link extraction.
 *
 * @since 0.1.0
 */
type MediaType = "text/plain" | "text/markdown" | "text/html";
/**
 * Extracts URLs from text based on the media type.
 *
 * Plain text is scanned for bare http(s) URLs; Markdown is additionally
 * scanned for inline, reference, and autolink syntax (URLs inside code
 * spans/fences are ignored); HTML is scanned for `<a href>` targets.
 * Only absolute http:// and https:// URLs are returned, deduplicated.
 *
 * @param text The text to extract URLs from.
 * @param mediaType The media type of the text.
 * @returns An array of unique URLs found in the text.
 * @since 0.1.0
 */
declare function extractLinks(text: string, mediaType: MediaType): readonly string[];
//#endregion
export { MediaType, extractLinks };
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
//#region src/extract-links.d.ts
/**
 * Supported media types for link extraction.
 *
 * @since 0.1.0
 */
type MediaType = "text/plain" | "text/markdown" | "text/html";
/**
 * Extracts URLs from text based on the media type.
 *
 * Plain text is scanned for bare http(s) URLs; Markdown is additionally
 * scanned for inline, reference, and autolink syntax (URLs inside code
 * spans/fences are ignored); HTML is scanned for `<a href>` targets.
 * Only absolute http:// and https:// URLs are returned, deduplicated.
 *
 * @param text The text to extract URLs from.
 * @param mediaType The media type of the text.
 * @returns An array of unique URLs found in the text.
 * @since 0.1.0
 */
declare function extractLinks(text: string, mediaType: MediaType): readonly string[];
//#endregion
export { MediaType, extractLinks };
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import { parseDocument } from "htmlparser2";

//#region src/extract-links.ts
/**
* Extracts URLs from text based on the media type.
*
* @param text The text to extract URLs from.
* @param mediaType The media type of the text: "text/plain",
*                  "text/markdown", or "text/html".
* @returns An array of unique URLs found in the text.
* @throws {TypeError} If the media type is not one of the supported values.
* @since 0.1.0
*/
function extractLinks(text, mediaType) {
	switch (mediaType) {
		case "text/plain": return extractFromPlainText(text);
		case "text/markdown": return extractFromMarkdown(text);
		case "text/html": return extractFromHtml(text);
		// The TypeScript source is exhaustive, but plain-JS callers can pass
		// anything; fail loudly instead of silently returning undefined.
		default: throw new TypeError(`Unsupported media type: ${mediaType}`);
	}
}
/**
* URL pattern for plain text extraction.
* Matches http:// and https:// URLs.
*/
const URL_PATTERN = /https?:\/\/[^\s<>"')\]]+/g;
/**
* Characters that should be trimmed from the end of URLs.
*/
const TRAILING_PUNCTUATION = /[.,;:!?)]+$/;
/**
* Extracts URLs from plain text.
*/
function extractFromPlainText(text) {
	const matches = text.match(URL_PATTERN);
	if (matches == null) return [];
	const urls = /* @__PURE__ */ new Set();
	for (const match of matches) {
		// Strip sentence punctuation that the greedy pattern swallowed.
		const cleanUrl = match.replace(TRAILING_PUNCTUATION, "");
		if (isValidUrl(cleanUrl)) urls.add(cleanUrl);
	}
	return [...urls];
}
/**
* Markdown link patterns.
*/
const MARKDOWN_INLINE_LINK = /\[([^\]]*)\]\(([^)]+)\)/g;
const MARKDOWN_REFERENCE_LINK = /^\[([^\]]+)\]:\s*(\S+)/gm;
const MARKDOWN_AUTOLINK = /<(https?:\/\/[^>]+)>/g;
const MARKDOWN_CODE_BLOCK = /```[\s\S]*?```|`[^`]+`/g;
/**
* Extracts URLs from Markdown text.
*/
function extractFromMarkdown(text) {
	// Remove code spans/fences first so URLs inside code are not extracted.
	const textWithoutCode = text.replace(MARKDOWN_CODE_BLOCK, "");
	const urls = /* @__PURE__ */ new Set();
	let match;
	MARKDOWN_INLINE_LINK.lastIndex = 0;
	while ((match = MARKDOWN_INLINE_LINK.exec(textWithoutCode)) != null) {
		// The destination may be followed by a link title, e.g.
		// [text](https://example.com "Title"); keep only the URL part.
		const url = match[2].trim().split(/\s+/)[0];
		if (isValidUrl(url)) urls.add(url);
	}
	MARKDOWN_REFERENCE_LINK.lastIndex = 0;
	while ((match = MARKDOWN_REFERENCE_LINK.exec(textWithoutCode)) != null) {
		const url = match[2];
		if (isValidUrl(url)) urls.add(url);
	}
	MARKDOWN_AUTOLINK.lastIndex = 0;
	while ((match = MARKDOWN_AUTOLINK.exec(textWithoutCode)) != null) {
		const url = match[1];
		if (isValidUrl(url)) urls.add(url);
	}
	// Bare URLs that are not part of any Markdown link syntax.
	const bareUrls = extractFromPlainText(textWithoutCode);
	for (const url of bareUrls) urls.add(url);
	return [...urls];
}
/**
* Determines if a node is an element.
*/
function isElement(node) {
	return node.type === "tag";
}
/**
* Extracts URLs from HTML.
*/
function extractFromHtml(html) {
	const doc = parseDocument(html, {
		lowerCaseTags: true,
		lowerCaseAttributeNames: true
	});
	const urls = /* @__PURE__ */ new Set();
	function traverse(node) {
		if (isElement(node)) {
			if (node.name === "a") {
				const href = node.attribs.href;
				if (href != null && isValidUrl(href)) urls.add(href);
			}
			for (const child of node.children) traverse(child);
		}
	}
	for (const child of doc.children) traverse(child);
	return [...urls];
}
/**
* Checks if a URL is valid for extraction.
* Only allows http:// and https:// URLs.
*/
function isValidUrl(url) {
	if (url.length === 0 || url === "#") return false;
	if (!/^https?:\/\//i.test(url)) return false;
	try {
		new URL(url);
		return true;
	} catch {
		return false;
	}
}

//#endregion
export { extractLinks };
|
package/dist/fetch.cjs
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
const require_extract_links = require('./extract-links.cjs');
let _logtape_logtape = require("@logtape/logtape");
let _mozilla_readability = require("@mozilla/readability");
let linkedom = require("linkedom");
let zod = require("zod");

//#region src/fetch.ts
const logger = (0, _logtape_logtape.getLogger)([
	"vertana",
	"context-web",
	"fetch"
]);
/**
* Extracts the main content from an HTML page using Mozilla's Readability.
*
* @param html The HTML content to extract from.
* @param url The URL of the page (used for resolving relative links).
* @returns The extracted content, or null if extraction failed.
* @since 0.1.0
*/
function extractContent(html, url) {
	const document = (0, linkedom.parseHTML)(html, "text/html").document;
	// Insert a <base> element so Readability resolves relative links
	// against the page's own URL.
	const baseElement = document.createElement("base");
	baseElement.href = url;
	document.head.appendChild(baseElement);
	const article = new _mozilla_readability.Readability(document).parse();
	if (article == null) return null;
	const title = article.title ?? "";
	const content = article.textContent ?? "";
	if (title.length === 0 && content.length === 0) return null;
	return {
		title,
		content,
		byline: article.byline ?? void 0,
		excerpt: article.excerpt ?? void 0
	};
}
/**
* Fetches a URL and extracts its main content.
*
* Never throws: all fetch/extraction failures (including aborts and
* timeouts) are logged and reported as null.
*
* @param url The URL to fetch.
* @param options Fetch options (optional `signal` and `timeout` in ms;
*                timeout defaults to 10000).
* @returns The extracted content, or null if fetch or extraction failed.
*/
async function fetchAndExtract(url, options) {
	const timeout = options?.timeout ?? 1e4;
	logger.debug("Fetching URL: {url}...", { url });
	const controller = new AbortController();
	const timeoutId = setTimeout(() => controller.abort(), timeout);
	const onAbort = () => controller.abort();
	if (options?.signal != null) {
		// An already-aborted signal never fires "abort" again, so it must be
		// checked explicitly; otherwise an external abort would be ignored.
		if (options.signal.aborted) controller.abort();
		else options.signal.addEventListener("abort", onAbort, { once: true });
	}
	try {
		const response = await fetch(url, {
			signal: controller.signal,
			headers: {
				"User-Agent": "Mozilla/5.0 (compatible; Vertana/0.1; +https://vertana.org)",
				Accept: "text/html,application/xhtml+xml"
			}
		});
		if (!response.ok) {
			logger.warn("Failed to fetch URL: {url}, status: {status}", {
				url,
				status: response.status
			});
			return null;
		}
		const contentType = response.headers.get("content-type");
		// Accept both HTML and XHTML, matching the Accept header sent above.
		if (contentType != null && !contentType.includes("text/html") && !contentType.includes("application/xhtml+xml")) {
			logger.debug("Skipping non-HTML content: {url}, type: {contentType}", {
				url,
				contentType
			});
			return null;
		}
		const content = extractContent(await response.text(), url);
		if (content == null) {
			logger.debug("Failed to extract content from: {url}", { url });
			return null;
		}
		logger.debug("Extracted content from: {url}, title: {title}", {
			url,
			title: content.title
		});
		return content;
	} catch (error) {
		if (error instanceof Error && error.name === "AbortError") logger.debug("Fetch aborted for: {url}", { url });
		else logger.warn("Error fetching URL: {url}, error: {error}", {
			url,
			error: String(error)
		});
		return null;
	} finally {
		// Always release the timer and the external-abort listener — the
		// previous code leaked both whenever fetch() threw.
		clearTimeout(timeoutId);
		options?.signal?.removeEventListener("abort", onAbort);
	}
}
/**
* A passive context source that fetches a single web page and extracts
* its main content.
*
* This source is exposed as a tool that the LLM can call when it needs
* to fetch additional context from a specific URL.
*
* @example
* ```typescript
* import { translate } from "@vertana/facade";
* import { fetchWebPage } from "@vertana/context-web";
*
* const result = await translate(model, "ko", text, {
*   contextSources: [fetchWebPage],
* });
* ```
*
* @since 0.1.0
*/
const fetchWebPage = {
	name: "fetch-web-page",
	description: "Fetches a web page and extracts its main content. Use this when you need additional context from a linked article or page.",
	mode: "passive",
	parameters: zod.z.object({ url: zod.z.string().url().describe("The URL of the web page to fetch") }),
	async gather(params, options) {
		const content = await fetchAndExtract(params.url, { signal: options?.signal });
		if (content == null) return {
			content: `Failed to fetch or extract content from: ${params.url}`,
			metadata: {
				url: params.url,
				success: false
			}
		};
		return {
			content: formatContent(content, params.url),
			metadata: {
				url: params.url,
				title: content.title,
				success: true
			}
		};
	}
};
/**
* Creates a required context source that extracts all links from the given
* text and fetches their content.
*
* This source is invoked automatically before translation begins, providing
* context from all linked pages.
*
* @param options Options for the context source.
* @returns A required context source.
*
* @example
* ```typescript
* import { translate } from "@vertana/facade";
* import { fetchLinkedPages } from "@vertana/context-web";
*
* const text = "Check out https://example.com for details.";
* const result = await translate(model, "ko", text, {
*   contextSources: [
*     fetchLinkedPages({ text, mediaType: "text/plain" }),
*   ],
* });
* ```
*
* @since 0.1.0
*/
function fetchLinkedPages(options) {
	const maxLinks = options.maxLinks ?? 10;
	const timeout = options.timeout ?? 1e4;
	// Links are extracted eagerly at factory time, capped at maxLinks.
	const linksToFetch = require_extract_links.extractLinks(options.text, options.mediaType).slice(0, maxLinks);
	return {
		name: "fetch-linked-pages",
		description: `Fetches content from ${linksToFetch.length} linked page(s) to provide additional context for translation.`,
		mode: "required",
		async gather(gatherOptions) {
			if (linksToFetch.length === 0) {
				logger.debug("No links to fetch.");
				return {
					content: "",
					metadata: {
						linkCount: 0,
						fetchedCount: 0
					}
				};
			}
			logger.info("Fetching {count} linked page(s)...", { count: linksToFetch.length });
			const results = [];
			// Fetch sequentially so a caller-supplied abort takes effect
			// between requests and remote hosts are not hammered in parallel.
			for (const url of linksToFetch) {
				gatherOptions?.signal?.throwIfAborted();
				const content = await fetchAndExtract(url, {
					signal: gatherOptions?.signal,
					timeout
				});
				if (content != null) results.push({
					url,
					content
				});
			}
			if (results.length === 0) {
				logger.debug("No content could be extracted from any linked pages.");
				return {
					content: "",
					metadata: {
						linkCount: linksToFetch.length,
						fetchedCount: 0
					}
				};
			}
			logger.info("Successfully extracted content from {count} of {total} page(s).", {
				count: results.length,
				total: linksToFetch.length
			});
			return {
				content: results.map(({ url, content }) => formatContent(content, url)).join("\n\n---\n\n"),
				metadata: {
					linkCount: linksToFetch.length,
					fetchedCount: results.length,
					urls: results.map((r) => r.url)
				}
			};
		}
	};
}
/**
* Formats extracted content for inclusion in the translation context.
*/
function formatContent(content, url) {
	const parts = [];
	parts.push(`# ${content.title}`);
	parts.push(`Source: ${url}`);
	if (content.byline != null) parts.push(`Author: ${content.byline}`);
	parts.push("");
	parts.push(content.content);
	return parts.join("\n");
}

//#endregion
exports.extractContent = extractContent;
exports.fetchLinkedPages = fetchLinkedPages;
exports.fetchWebPage = fetchWebPage;
|
package/dist/fetch.d.cts
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
import { MediaType } from "./extract-links.cjs";
import { PassiveContextSource, RequiredContextSource } from "@vertana/core/context";

//#region src/fetch.d.ts

/**
 * Result of extracting content from a web page.
 *
 * @since 0.1.0
 */
interface ExtractedContent {
	/**
	 * The title of the article.
	 */
	readonly title: string;
	/**
	 * The extracted main content as plain text.
	 */
	readonly content: string;
	/**
	 * The byline (author) if available.
	 */
	readonly byline?: string;
	/**
	 * The excerpt if available.
	 */
	readonly excerpt?: string;
}
/**
 * Extracts the main content from an HTML page using Mozilla's Readability.
 *
 * @param html The HTML content to extract from.
 * @param url The URL of the page (used for resolving relative links).
 * @returns The extracted content, or null if extraction failed.
 * @since 0.1.0
 */
declare function extractContent(html: string, url: string): ExtractedContent | null;
/**
 * Parameters for the fetchWebPage context source.
 */
interface FetchWebPageParams {
	/**
	 * The URL to fetch.
	 */
	readonly url: string;
}
/**
 * A passive context source that fetches a single web page and extracts
 * its main content.
 *
 * This source is exposed as a tool that the LLM can call when it needs
 * to fetch additional context from a specific URL.
 *
 * @example
 * ```typescript
 * import { translate } from "@vertana/facade";
 * import { fetchWebPage } from "@vertana/context-web";
 *
 * const result = await translate(model, "ko", text, {
 *   contextSources: [fetchWebPage],
 * });
 * ```
 *
 * @since 0.1.0
 */
declare const fetchWebPage: PassiveContextSource<FetchWebPageParams>;
/**
 * Options for creating a fetchLinkedPages context source.
 *
 * @since 0.1.0
 */
interface FetchLinkedPagesOptions {
	/**
	 * The text to extract links from.
	 */
	readonly text: string;
	/**
	 * The media type of the text.
	 */
	readonly mediaType: MediaType;
	/**
	 * Maximum number of links to fetch.
	 *
	 * @default 10
	 */
	readonly maxLinks?: number;
	/**
	 * Timeout for each fetch request in milliseconds.
	 *
	 * @default 10000
	 */
	readonly timeout?: number;
}
/**
 * Creates a required context source that extracts all links from the given
 * text and fetches their content.
 *
 * This source is invoked automatically before translation begins, providing
 * context from all linked pages. Links are extracted eagerly when the
 * factory is called, so the returned source's description reflects the
 * number of links found in the given text.
 *
 * @param options Options for the context source.
 * @returns A required context source.
 *
 * @example
 * ```typescript
 * import { translate } from "@vertana/facade";
 * import { fetchLinkedPages } from "@vertana/context-web";
 *
 * const text = "Check out https://example.com for details.";
 * const result = await translate(model, "ko", text, {
 *   contextSources: [
 *     fetchLinkedPages({ text, mediaType: "text/plain" }),
 *   ],
 * });
 * ```
 *
 * @since 0.1.0
 */
declare function fetchLinkedPages(options: FetchLinkedPagesOptions): RequiredContextSource;
//#endregion
export { ExtractedContent, FetchLinkedPagesOptions, extractContent, fetchLinkedPages, fetchWebPage };
|
package/dist/fetch.d.ts
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
import { MediaType } from "./extract-links.js";
import { PassiveContextSource, RequiredContextSource } from "@vertana/core/context";

//#region src/fetch.d.ts

/**
 * Result of extracting content from a web page.
 *
 * @since 0.1.0
 */
interface ExtractedContent {
	/**
	 * The title of the article.
	 */
	readonly title: string;
	/**
	 * The extracted main content as plain text.
	 */
	readonly content: string;
	/**
	 * The byline (author) if available.
	 */
	readonly byline?: string;
	/**
	 * The excerpt if available.
	 */
	readonly excerpt?: string;
}
/**
 * Extracts the main content from an HTML page using Mozilla's Readability.
 *
 * @param html The HTML content to extract from.
 * @param url The URL of the page (used for resolving relative links).
 * @returns The extracted content, or null if extraction failed.
 * @since 0.1.0
 */
declare function extractContent(html: string, url: string): ExtractedContent | null;
/**
 * Parameters for the fetchWebPage context source.
 */
interface FetchWebPageParams {
	/**
	 * The URL to fetch.
	 */
	readonly url: string;
}
/**
 * A passive context source that fetches a single web page and extracts
 * its main content.
 *
 * This source is exposed as a tool that the LLM can call when it needs
 * to fetch additional context from a specific URL.
 *
 * @example
 * ```typescript
 * import { translate } from "@vertana/facade";
 * import { fetchWebPage } from "@vertana/context-web";
 *
 * const result = await translate(model, "ko", text, {
 *   contextSources: [fetchWebPage],
 * });
 * ```
 *
 * @since 0.1.0
 */
declare const fetchWebPage: PassiveContextSource<FetchWebPageParams>;
/**
 * Options for creating a fetchLinkedPages context source.
 *
 * @since 0.1.0
 */
interface FetchLinkedPagesOptions {
	/**
	 * The text to extract links from.
	 */
	readonly text: string;
	/**
	 * The media type of the text.
	 */
	readonly mediaType: MediaType;
	/**
	 * Maximum number of links to fetch.
	 *
	 * @default 10
	 */
	readonly maxLinks?: number;
	/**
	 * Timeout for each fetch request in milliseconds.
	 *
	 * @default 10000
	 */
	readonly timeout?: number;
}
/**
 * Creates a required context source that extracts all links from the given
 * text and fetches their content.
 *
 * This source is invoked automatically before translation begins, providing
 * context from all linked pages. Links are extracted eagerly when the
 * factory is called, so the returned source's description reflects the
 * number of links found in the given text.
 *
 * @param options Options for the context source.
 * @returns A required context source.
 *
 * @example
 * ```typescript
 * import { translate } from "@vertana/facade";
 * import { fetchLinkedPages } from "@vertana/context-web";
 *
 * const text = "Check out https://example.com for details.";
 * const result = await translate(model, "ko", text, {
 *   contextSources: [
 *     fetchLinkedPages({ text, mediaType: "text/plain" }),
 *   ],
 * });
 * ```
 *
 * @since 0.1.0
 */
declare function fetchLinkedPages(options: FetchLinkedPagesOptions): RequiredContextSource;
//#endregion
export { ExtractedContent, FetchLinkedPagesOptions, extractContent, fetchLinkedPages, fetchWebPage };
|
package/dist/fetch.js
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
import { extractLinks } from "./extract-links.js";
|
|
2
|
+
import { getLogger } from "@logtape/logtape";
|
|
3
|
+
import { Readability } from "@mozilla/readability";
|
|
4
|
+
import { parseHTML } from "linkedom";
|
|
5
|
+
import { z } from "zod";
|
|
6
|
+
|
|
7
|
+
//#region src/fetch.ts
// Module-scoped logger for this file, under the
// ["vertana", "context-web", "fetch"] category hierarchy.
const logger = getLogger([
  "vertana",
  "context-web",
  "fetch"
]);
|
|
13
|
+
/**
 * Extracts the main content from an HTML page using Mozilla's Readability.
 *
 * @param html The HTML content to extract from.
 * @param url The URL of the page (used for resolving relative links).
 * @returns The extracted content, or null if extraction failed.
 * @since 0.1.0
 */
function extractContent(html, url) {
  // Build a DOM and inject a <base> element so that relative links inside
  // the document resolve against the page's own URL.
  const { document } = parseHTML(html, "text/html");
  const base = document.createElement("base");
  base.href = url;
  document.head.appendChild(base);
  // Readability mutates the document while isolating the article content.
  const article = new Readability(document).parse();
  if (article == null) return null;
  const title = article.title ?? "";
  const textContent = article.textContent ?? "";
  // An extraction with neither a title nor any text is treated as a failure.
  if (title === "" && textContent === "") return null;
  return {
    title,
    content: textContent,
    byline: article.byline ?? undefined,
    excerpt: article.excerpt ?? undefined
  };
}
|
|
38
|
+
/**
 * Fetches a URL and extracts its main content.
 *
 * @param url The URL to fetch.
 * @param options Fetch options (`timeout` in milliseconds, default 10000,
 *   and an optional external `signal` that also aborts the request).
 * @returns The extracted content, or null if fetch or extraction failed.
 */
async function fetchAndExtract(url, options) {
  const timeout = options?.timeout ?? 1e4;
  logger.debug("Fetching URL: {url}...", { url });
  const controller = new AbortController();
  // The timer is cleared in the finally block below so it cannot leak when
  // fetch() throws (the original only cleared it on the success path).
  const timeoutId = setTimeout(() => controller.abort(), timeout);
  const onAbort = () => controller.abort();
  if (options?.signal != null) {
    // Honor a signal that was already aborted before this call; a plain
    // addEventListener would never fire in that case.
    if (options.signal.aborted) controller.abort();
    else options.signal.addEventListener("abort", onAbort, { once: true });
  }
  try {
    const response = await fetch(url, {
      signal: controller.signal,
      headers: {
        "User-Agent": "Mozilla/5.0 (compatible; Vertana/0.1; +https://vertana.org)",
        Accept: "text/html,application/xhtml+xml"
      }
    });
    if (!response.ok) {
      logger.warn("Failed to fetch URL: {url}, status: {status}", {
        url,
        status: response.status
      });
      return null;
    }
    const contentType = response.headers.get("content-type");
    if (contentType != null && !contentType.includes("text/html")) {
      logger.debug("Skipping non-HTML content: {url}, type: {contentType}", {
        url,
        contentType
      });
      return null;
    }
    const content = extractContent(await response.text(), url);
    if (content == null) {
      logger.debug("Failed to extract content from: {url}", { url });
      return null;
    }
    logger.debug("Extracted content from: {url}, title: {title}", {
      url,
      title: content.title
    });
    return content;
  } catch (error) {
    if (error instanceof Error && error.name === "AbortError") logger.debug("Fetch aborted for: {url}", { url });
    else logger.warn("Error fetching URL: {url}, error: {error}", {
      url,
      error: String(error)
    });
    return null;
  } finally {
    // Release the timeout timer and detach from the caller's signal on
    // every exit path, so long-lived signals do not accumulate listeners.
    clearTimeout(timeoutId);
    options?.signal?.removeEventListener("abort", onAbort);
  }
}
|
|
94
|
+
/**
 * A passive context source that fetches a single web page and extracts
 * its main content.
 *
 * Because it is passive, this source is surfaced to the LLM as a tool;
 * the model invokes it on demand when it wants extra context from a
 * particular URL.
 *
 * @example
 * ```typescript
 * import { translate } from "@vertana/facade";
 * import { fetchWebPage } from "@vertana/context-web";
 *
 * const result = await translate(model, "ko", text, {
 *   contextSources: [fetchWebPage],
 * });
 * ```
 *
 * @since 0.1.0
 */
const fetchWebPage = {
  name: "fetch-web-page",
  description: "Fetches a web page and extracts its main content. Use this when you need additional context from a linked article or page.",
  mode: "passive",
  parameters: z.object({ url: z.string().url().describe("The URL of the web page to fetch") }),
  async gather(params, options) {
    const extracted = await fetchAndExtract(params.url, { signal: options?.signal });
    // Guard clause: report the failure to the model instead of throwing.
    if (extracted == null) {
      return {
        content: `Failed to fetch or extract content from: ${params.url}`,
        metadata: {
          url: params.url,
          success: false
        }
      };
    }
    return {
      content: formatContent(extracted, params.url),
      metadata: {
        url: params.url,
        title: extracted.title,
        success: true
      }
    };
  }
};
|
|
137
|
+
/**
 * Creates a required context source that extracts all links from the given
 * text and fetches their content.
 *
 * This source is invoked automatically before translation begins, providing
 * context from all linked pages.  Pages are fetched concurrently, each with
 * its own timeout; pages that fail to fetch or extract are skipped.
 *
 * @param options Options for the context source.
 * @returns A required context source.
 *
 * @example
 * ```typescript
 * import { translate } from "@vertana/facade";
 * import { fetchLinkedPages } from "@vertana/context-web";
 *
 * const text = "Check out https://example.com for details.";
 * const result = await translate(model, "ko", text, {
 *   contextSources: [
 *     fetchLinkedPages({ text, mediaType: "text/plain" }),
 *   ],
 * });
 * ```
 *
 * @since 0.1.0
 */
function fetchLinkedPages(options) {
  const maxLinks = options.maxLinks ?? 10;
  const timeout = options.timeout ?? 1e4;
  // Links are extracted eagerly so the source description can report the count.
  const linksToFetch = extractLinks(options.text, options.mediaType).slice(0, maxLinks);
  return {
    name: "fetch-linked-pages",
    description: `Fetches content from ${linksToFetch.length} linked page(s) to provide additional context for translation.`,
    mode: "required",
    async gather(gatherOptions) {
      if (linksToFetch.length === 0) {
        logger.debug("No links to fetch.");
        return {
          content: "",
          metadata: {
            linkCount: 0,
            fetchedCount: 0
          }
        };
      }
      logger.info("Fetching {count} linked page(s)...", { count: linksToFetch.length });
      gatherOptions?.signal?.throwIfAborted();
      // Fetch all pages concurrently instead of one at a time (previously the
      // worst case was maxLinks sequential timeouts).  Promise.all preserves
      // input order, and fetchAndExtract never rejects — it resolves to null
      // on any failure — so one bad page cannot fail the whole batch.
      const pages = await Promise.all(linksToFetch.map((url) => fetchAndExtract(url, {
        signal: gatherOptions?.signal,
        timeout
      })));
      // Surface cancellation as an abort error rather than an empty result.
      gatherOptions?.signal?.throwIfAborted();
      const results = [];
      for (let i = 0; i < linksToFetch.length; i++) {
        const content = pages[i];
        if (content != null) results.push({
          url: linksToFetch[i],
          content
        });
      }
      if (results.length === 0) {
        logger.debug("No content could be extracted from any linked pages.");
        return {
          content: "",
          metadata: {
            linkCount: linksToFetch.length,
            fetchedCount: 0
          }
        };
      }
      logger.info("Successfully extracted content from {count} of {total} page(s).", {
        count: results.length,
        total: linksToFetch.length
      });
      return {
        content: results.map(({ url, content }) => formatContent(content, url)).join("\n\n---\n\n"),
        metadata: {
          linkCount: linksToFetch.length,
          fetchedCount: results.length,
          urls: results.map((r) => r.url)
        }
      };
    }
  };
}
|
|
219
|
+
/**
 * Formats extracted content for inclusion in the translation context:
 * a Markdown-style header (title, source URL, optional author) followed
 * by a blank line and the extracted body text.
 */
function formatContent(content, url) {
  const header = [`# ${content.title}`, `Source: ${url}`];
  if (content.byline != null) header.push(`Author: ${content.byline}`);
  // Blank separator line between the header block and the body.
  return `${header.join("\n")}\n\n${content.content}`;
}
|
|
231
|
+
|
|
232
|
+
//#endregion
|
|
233
|
+
export { extractContent, fetchLinkedPages, fetchWebPage };
|
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
// CommonJS entry point: re-exports the public API of both submodules
// (extract-links and fetch) from the package root.
const require_extract_links = require('./extract-links.cjs');
const require_fetch = require('./fetch.cjs');

exports.extractContent = require_fetch.extractContent;
exports.extractLinks = require_extract_links.extractLinks;
exports.fetchLinkedPages = require_fetch.fetchLinkedPages;
exports.fetchWebPage = require_fetch.fetchWebPage;
|
package/dist/index.d.cts
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
import { MediaType, extractLinks } from "./extract-links.cjs";
|
|
2
|
+
import { ExtractedContent, FetchLinkedPagesOptions, extractContent, fetchLinkedPages, fetchWebPage } from "./fetch.cjs";
|
|
3
|
+
export { type ExtractedContent, type FetchLinkedPagesOptions, type MediaType, extractContent, extractLinks, fetchLinkedPages, fetchWebPage };
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
import { MediaType, extractLinks } from "./extract-links.js";
|
|
2
|
+
import { ExtractedContent, FetchLinkedPagesOptions, extractContent, fetchLinkedPages, fetchWebPage } from "./fetch.js";
|
|
3
|
+
export { type ExtractedContent, type FetchLinkedPagesOptions, type MediaType, extractContent, extractLinks, fetchLinkedPages, fetchWebPage };
|
package/dist/index.js
ADDED
package/package.json
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@vertana/context-web",
|
|
3
|
+
"version": "0.1.0-dev.1",
|
|
4
|
+
"description": "Web context gathering for Vertana - fetch and extract content from linked pages",
|
|
5
|
+
"keywords": [
|
|
6
|
+
"LLM",
|
|
7
|
+
"translation",
|
|
8
|
+
"context",
|
|
9
|
+
"web",
|
|
10
|
+
"readability"
|
|
11
|
+
],
|
|
12
|
+
"license": "MIT",
|
|
13
|
+
"author": {
|
|
14
|
+
"name": "Hong Minhee",
|
|
15
|
+
"email": "hong@minhee.org",
|
|
16
|
+
"url": "https://hongminhee.org/"
|
|
17
|
+
},
|
|
18
|
+
"homepage": "https://vertana.org/",
|
|
19
|
+
"repository": {
|
|
20
|
+
"type": "git",
|
|
21
|
+
"url": "git+https://github.com/dahlia/vertana.git",
|
|
22
|
+
"directory": "packages/context-web"
|
|
23
|
+
},
|
|
24
|
+
"bugs": {
|
|
25
|
+
"url": "https://github.com/dahlia/vertana/issues"
|
|
26
|
+
},
|
|
27
|
+
"funding": [
|
|
28
|
+
"https://github.com/sponsors/dahlia"
|
|
29
|
+
],
|
|
30
|
+
"engines": {
|
|
31
|
+
"node": ">=20.0.0",
|
|
32
|
+
"bun": ">=1.2.0",
|
|
33
|
+
"deno": ">=2.3.0"
|
|
34
|
+
},
|
|
35
|
+
"files": [
|
|
36
|
+
"dist/",
|
|
37
|
+
"package.json",
|
|
38
|
+
"README.md"
|
|
39
|
+
],
|
|
40
|
+
"type": "module",
|
|
41
|
+
"module": "./dist/index.js",
|
|
42
|
+
"main": "./dist/index.cjs",
|
|
43
|
+
"types": "./dist/index.d.ts",
|
|
44
|
+
"exports": {
|
|
45
|
+
".": {
|
|
46
|
+
"types": {
|
|
47
|
+
"require": "./dist/index.d.cts",
|
|
48
|
+
"import": "./dist/index.d.ts"
|
|
49
|
+
},
|
|
50
|
+
"require": "./dist/index.cjs",
|
|
51
|
+
"import": "./dist/index.js"
|
|
52
|
+
},
|
|
53
|
+
"./fetch": {
|
|
54
|
+
"types": {
|
|
55
|
+
"require": "./dist/fetch.d.cts",
|
|
56
|
+
"import": "./dist/fetch.d.ts"
|
|
57
|
+
},
|
|
58
|
+
"require": "./dist/fetch.cjs",
|
|
59
|
+
"import": "./dist/fetch.js"
|
|
60
|
+
},
|
|
61
|
+
"./extract-links": {
|
|
62
|
+
"types": {
|
|
63
|
+
"require": "./dist/extract-links.d.cts",
|
|
64
|
+
"import": "./dist/extract-links.d.ts"
|
|
65
|
+
},
|
|
66
|
+
"require": "./dist/extract-links.cjs",
|
|
67
|
+
"import": "./dist/extract-links.js"
|
|
68
|
+
}
|
|
69
|
+
},
|
|
70
|
+
"sideEffects": false,
|
|
71
|
+
"dependencies": {
|
|
72
|
+
"@logtape/logtape": "^1.3.5",
|
|
73
|
+
"@mozilla/readability": "^0.6.0",
|
|
74
|
+
"@vertana/core": "",
|
|
75
|
+
"htmlparser2": "^10.0.0",
|
|
76
|
+
"linkedom": "^0.18.12",
|
|
77
|
+
"zod": "4.2.1"
|
|
78
|
+
},
|
|
79
|
+
"devDependencies": {
|
|
80
|
+
"@types/node": "^20.19.9",
|
|
81
|
+
"tsdown": "^0.18.3",
|
|
82
|
+
"typescript": "^5.9.3"
|
|
83
|
+
},
|
|
84
|
+
"scripts": {
|
|
85
|
+
"build": "tsdown",
|
|
86
|
+
"prepublish": "tsdown",
|
|
87
|
+
"test": "tsdown && node --experimental-transform-types --test --test-concurrency=4",
|
|
88
|
+
"test:bun": "tsdown && bun test",
|
|
89
|
+
"test:deno": "deno test --allow-env --allow-net",
|
|
90
|
+
"test-all": "tsdown && node --experimental-transform-types --test && bun test && deno test"
|
|
91
|
+
}
|
|
92
|
+
}
|