@memoryblock/plugin-fetch-webpage 0.1.0-beta
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/index.d.ts +11 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +135 -0
- package/dist/index.js.map +1 -0
- package/package.json +17 -0
- package/src/index.ts +161 -0
- package/tsconfig.json +10 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 memoryblock
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { ToolExecutionResult, ToolContext, ToolDefinition } from 'memoryblock';
|
|
2
|
+
/**
 * Declared shape of the `fetchWebpageTool` export: a `definition` of type
 * `ToolDefinition` plus an `execute` method that takes a parameter record and
 * a `ToolContext` and resolves to a `ToolExecutionResult`.
 */
export declare const fetchWebpageTool: {
|
|
3
|
+
definition: ToolDefinition;
|
|
4
|
+
execute(params: Record<string, unknown>, _context: ToolContext): Promise<ToolExecutionResult>;
|
|
5
|
+
};
|
|
6
|
+
/**
 * Declared shape of the `tools` export: an array whose elements each carry a
 * `definition` and an `execute` method. Exported as an array for registry
 * plugin loading.
 */
|
|
7
|
+
export declare const tools: {
|
|
8
|
+
definition: ToolDefinition;
|
|
9
|
+
execute(params: Record<string, unknown>, _context: ToolContext): Promise<ToolExecutionResult>;
|
|
10
|
+
}[];
|
|
11
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,mBAAmB,EAAE,WAAW,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AA4FpF,eAAO,MAAM,gBAAgB;;oBAEH,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,YAAY,WAAW,GAAG,OAAO,CAAC,mBAAmB,CAAC;CA+DtG,CAAC;AAEF,mDAAmD;AACnD,eAAO,MAAM,KAAK;;oBAlEQ,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,YAAY,WAAW,GAAG,OAAO,CAAC,mBAAmB,CAAC;GAkEhE,CAAC"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* fetch_webpage — extract readable content from a URL.
|
|
3
|
+
*
|
|
4
|
+
* Zero external dependencies:
|
|
5
|
+
* - Uses Node.js built-in fetch() (available since Node 18)
|
|
6
|
+
* - HTML→text via regex (no cheerio, no jsdom)
|
|
7
|
+
*
|
|
8
|
+
* Cost-efficient: truncates output at 8000 chars for token control.
|
|
9
|
+
*/
|
|
10
|
+
/** Maximum number of characters of page text kept before the output is truncated. */
const MAX_CONTENT_LENGTH = 8000;
|
|
11
|
+
/** Delay in milliseconds after which the request's AbortController is triggered. */
const FETCH_TIMEOUT = 15000;
|
|
12
|
+
/** Named entities decoded by decodeHtmlEntities; anything else is left untouched. */
const NAMED_HTML_ENTITIES = new Map([
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['apos', "'"],
    ['nbsp', ' '],
]);

/**
 * Decode named, decimal and hexadecimal character references in one pass.
 * A single pass means already-decoded output is never decoded a second time.
 */
function decodeHtmlEntities(text) {
    return text.replace(/&(?:#(\d+)|#x([0-9a-f]+)|([a-z]+));/gi, (entity, dec, hex, name) => {
        if (name !== undefined) {
            // Map lookup (not a plain object) so names like "constructor" cannot hit the prototype.
            return NAMED_HTML_ENTITIES.get(name.toLowerCase()) ?? entity;
        }
        const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? '', 16);
        // String.fromCodePoint throws on out-of-range values; keep the raw text instead.
        const valid = Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10ffff;
        return valid ? String.fromCodePoint(codePoint) : entity;
    });
}

/**
 * Strip HTML to readable text content.
 *
 * Removes script, style, nav, header, footer, aside and noscript elements and
 * comments, renders headings as "## ..." lines, links as "text (href)" and list
 * items as bullets, drops all remaining tags, decodes character references and
 * normalises whitespace (single spaces, trimmed lines, at most one blank line).
 *
 * @param html Raw HTML markup.
 * @returns The extracted text, trimmed; an empty string for empty input.
 */
function htmlToText(html) {
    // Normalise line endings so the blank-line collapsing below sees plain "\n".
    let text = html.replace(/\r\n?/g, '\n');

    // Drop non-content elements together with everything inside them.
    text = text.replace(/<(script|style|nav|header|footer|aside|noscript)(?=[\s>/])[^>]*>[\s\S]*?<\/\1\s*>/gi, '');
    text = text.replace(/<!--[\s\S]*?-->/g, '');

    // Headings and links need both their opening and closing tags, so they are
    // rewritten before any closing tag is turned into a newline.
    text = text.replace(/<h[1-6]\b[^>]*>([\s\S]*?)<\/h[1-6]\s*>/gi, '\n\n## $1\n\n');
    text = text.replace(/<a\s[^>]*?href\s*=\s*(["'])(.*?)\1[^>]*>([\s\S]*?)<\/a\s*>/gi, '$3 ($2)');

    // Block-level boundaries become line breaks.
    text = text.replace(/<br\s*\/?>/gi, '\n');
    text = text.replace(/<\/p\s*>/gi, '\n\n');
    text = text.replace(/<\/(?:div|li)\s*>/gi, '\n');
    // The lookahead keeps <link ...> from being mistaken for a list item.
    text = text.replace(/<li(?=[\s>])[^>]*>/gi, '• ');

    // Strip remaining tags, then decode entities so "&lt;b&gt;" survives as literal text.
    text = text.replace(/<[^>]+>/g, '');
    text = decodeHtmlEntities(text);

    // Whitespace clean-up. Lines are trimmed with a class that excludes "\n";
    // using \s here would swallow the newlines and glue paragraphs together.
    text = text.replace(/[ \t]+/g, ' ');
    text = text.replace(/^[^\S\n]+|[^\S\n]+$/gm, '');
    text = text.replace(/\n{3,}/g, '\n\n');

    return text.trim();
}
|
|
50
|
+
/**
 * Extract the page title from HTML.
 *
 * @param html Raw HTML markup.
 * @returns The contents of the first <title> element with whitespace runs
 *          collapsed to single spaces, or '(no title)' when the element is
 *          missing, empty or whitespace-only.
 */
function extractTitle(html) {
    const raw = /<title\b[^>]*>([\s\S]*?)<\/title>/i.exec(html)?.[1] ?? '';
    const title = raw.trim().replace(/\s+/g, ' ');
    // An empty title would otherwise render as a bare "# " heading downstream.
    return title === '' ? '(no title)' : title;
}
|
|
55
|
+
/**
 * Extract the meta description.
 *
 * Looks at each <meta> tag, picks the first one whose `name` attribute is
 * "description" (case-insensitive) and that has a quoted `content` attribute,
 * and returns that content. Attribute order does not matter, and the closing
 * quote must be the same character as the opening one, so apostrophes inside a
 * double-quoted value are preserved.
 *
 * @param html Raw HTML markup.
 * @returns The trimmed description, or '' when none is found.
 */
function extractDescription(html) {
    // Quoted attribute values are consumed whole so a ">" inside them does not end the tag.
    const metaTags = html.match(/<meta\b(?:"[^"]*"|'[^']*'|[^'">])*>/gi) ?? [];
    for (const tag of metaTags) {
        if (!/\sname\s*=\s*(["'])description\1/i.test(tag)) {
            continue;
        }
        const content = /\scontent\s*=\s*(["'])([\s\S]*?)\1/i.exec(tag)?.[2];
        if (content !== undefined) {
            return content.trim();
        }
    }
    return '';
}
|
|
60
|
+
/**
 * Metadata describing the fetch_webpage tool: its name, a description, and the
 * schema of its parameters (a single required string `url`, no additional
 * properties).
 */
const fetchWebpageDefinition = {
    name: 'fetch_webpage',
    description: 'Fetch a webpage and extract its text content. ' +
        'Returns the page title, description, and cleaned text. ' +
        'Useful for reading articles, documentation, and reference pages. ' +
        // Derived from the constant so the advertised cap cannot drift from the real one.
        `Output is capped at ${MAX_CONTENT_LENGTH} characters for token efficiency.`,
    parameters: {
        type: 'object',
        properties: {
            url: { type: 'string', description: 'The URL to fetch.' },
        },
        required: ['url'],
        additionalProperties: false,
    },
    // NOTE(review): presumably tells the host no user confirmation is needed — confirm against memoryblock.
    requiresApproval: false,
};
|
|
76
|
+
/**
 * The fetch_webpage tool: its definition plus the `execute` implementation.
 */
export const fetchWebpageTool = {
    definition: fetchWebpageDefinition,

    /**
     * Fetch `params.url` and return its readable content.
     *
     * HTML responses are reduced to a title, optional description, source line
     * and extracted text; any other content type is returned as-is. In both
     * cases the text is cut at MAX_CONTENT_LENGTH characters. Failures are
     * reported through `isError: true` rather than by throwing.
     *
     * NOTE(review): any http(s) URL is requested, including loopback or
     * private-network hosts, and the definition sets requiresApproval to false.
     * Consider host filtering if untrusted input can reach this tool.
     *
     * @param params Tool arguments; `url` must be a string starting with http:// or https://.
     * @param _context Unused.
     * @returns An object with the result text in `content` and an `isError` flag.
     */
    async execute(params, _context) {
        const url = params.url;

        // Check the type first: calling string methods on a non-string would throw outside the try.
        if (typeof url !== 'string' || !/^https?:\/\//i.test(url)) {
            return { content: 'Invalid URL. Must start with http:// or https://.', isError: true };
        }

        const controller = new AbortController();
        const timeout = setTimeout(() => controller.abort(), FETCH_TIMEOUT);

        try {
            const response = await fetch(url, {
                signal: controller.signal,
                headers: {
                    'User-Agent': 'memoryblock/0.1.0 (AI assistant web reader)',
                    'Accept': 'text/html,application/xhtml+xml,text/plain',
                },
            });

            if (!response.ok) {
                return {
                    content: `Fetch failed: HTTP ${response.status} ${response.statusText}`,
                    isError: true,
                };
            }

            const contentType = (response.headers.get('content-type') ?? '').toLowerCase();
            // The timer is still armed here, so a stalled body download is aborted too.
            const body = await response.text();

            // If it's plain text or JSON, return as-is (truncated)
            if (!contentType.includes('html')) {
                const truncated = body.slice(0, MAX_CONTENT_LENGTH);
                return {
                    content: truncated + (body.length > MAX_CONTENT_LENGTH ? '\n...(truncated)' : ''),
                    isError: false,
                };
            }

            const title = extractTitle(body);
            const desc = extractDescription(body);
            const text = htmlToText(body);

            const header = [`# ${title}`, desc ? `> ${desc}` : '', `Source: ${url}`]
                .filter(Boolean)
                .join('\n');
            const excerpt = text.length > MAX_CONTENT_LENGTH
                ? `${text.slice(0, MAX_CONTENT_LENGTH)}\n\n...(truncated — ${text.length} chars total)`
                : text;

            // The blank separator is added explicitly; filtering falsy parts used to drop it.
            return { content: excerpt ? `${header}\n\n${excerpt}` : header, isError: false };
        } catch (err) {
            // Only our own timer aborts this controller, so the flag identifies a timeout reliably.
            if (controller.signal.aborted) {
                return { content: `Fetch timed out after ${FETCH_TIMEOUT / 1000}s: ${url}`, isError: true };
            }
            const message = err instanceof Error ? err.message : String(err);
            return { content: `Fetch failed: ${message}`, isError: true };
        } finally {
            // Runs on every path so a rejected fetch cannot leave a live timer behind.
            clearTimeout(timeout);
        }
    },
};
|
|
133
|
+
/**
 * All tools provided by this plugin, exported as an array for registry plugin
 * loading. Currently holds a single entry, `fetchWebpageTool`.
 */
|
|
134
|
+
export const tools = [fetchWebpageTool];
|
|
135
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAEA;;;;;;;;GAQG;AAEH,MAAM,kBAAkB,GAAG,IAAI,CAAC;AAChC,MAAM,aAAa,GAAG,KAAK,CAAC;AAE5B;;;;GAIG;AACH,SAAS,UAAU,CAAC,IAAY;IAC5B,IAAI,IAAI,GAAG,IAAI,CAAC;IAEhB,4DAA4D;IAC5D,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,wEAAwE,EAAE,EAAE,CAAC,CAAC;IAElG,uBAAuB;IACvB,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC;IAE5C,gDAAgD;IAChD,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,cAAc,EAAE,IAAI,CAAC,CAAC;IAC1C,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IACvC,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,WAAW,EAAE,IAAI,CAAC,CAAC;IACvC,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,cAAc,EAAE,MAAM,CAAC,CAAC;IAC5C,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,IAAI,CAAC,CAAC;IAEtC,uCAAuC;IACvC,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,gCAAgC,EAAE,WAAW,CAAC,CAAC;IACnE,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,yCAAyC,EAAE,SAAS,CAAC,CAAC;IAC1E,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,aAAa,EAAE,IAAI,CAAC,CAAC;IAEzC,4BAA4B;IAC5B,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC;IAEpC,8BAA8B;IAC9B,IAAI,GAAG,IAAI;SACN,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC;SACtB,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC;SACrB,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC;SACrB,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;SACvB,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC;SACtB,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;SACvB,OAAO,CAAC,WAAW,EAAE,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE,CAAC,MAAM,CAAC,YAAY,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAE5E,sBAAsB;IACtB,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC,CAAC,CAAS,iCAAiC;IAC9E,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC,CAAO,6BAA6B;IAC3E,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,aAAa,EAAE,EAAE,CAAC,CAAC,CAAO,iBAAiB;IAE/D,OAAO,IAAI,CAAC,IAAI,EAAE,CAAC;AACvB,CAAC;AAED,oCAAoC;AACpC,SAAS,YAAY,CAAC,IAAY;IAC9B,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,kCAAkC,CAAC,CAAC;IAC7D,OAAO,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC;AACvE,CAAC;AAED,gCAAgC;AAChC,SAAS,kBAAkB,CAAC,IAAY;IACpC,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,0EAA0E,CAAC,CAAC;IACrG,OAAO,KAAK,CAAC,CAAC,CAAC,
KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;AACxC,CAAC;AAED,MAAM,sBAAsB,GAAmB;IAC3C,IAAI,EAAE,eAAe;IACrB,WAAW,EACP,gDAAgD;QAChD,yDAAyD;QACzD,mEAAmE;QACnE,2DAA2D;IAC/D,UAAU,EAAE;QACR,IAAI,EAAE,QAAQ;QACd,UAAU,EAAE;YACR,GAAG,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,mBAAmB,EAAE;SAC5D;QACD,QAAQ,EAAE,CAAC,KAAK,CAAC;QACjB,oBAAoB,EAAE,KAAK;KAC9B;IACD,gBAAgB,EAAE,KAAK;CAC1B,CAAC;AAEF,MAAM,CAAC,MAAM,gBAAgB,GAAG;IAC5B,UAAU,EAAE,sBAAsB;IAClC,KAAK,CAAC,OAAO,CAAC,MAA+B,EAAE,QAAqB;QAChE,MAAM,GAAG,GAAG,MAAM,CAAC,GAAa,CAAC;QAEjC,IAAI,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,MAAM,CAAC,EAAE,CAAC;YAClC,OAAO,EAAE,OAAO,EAAE,mDAAmD,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC;QAC3F,CAAC;QAED,IAAI,CAAC;YACD,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;YACzC,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,aAAa,CAAC,CAAC;YAEpE,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;gBAC9B,MAAM,EAAE,UAAU,CAAC,MAAM;gBACzB,OAAO,EAAE;oBACL,YAAY,EAAE,6CAA6C;oBAC3D,QAAQ,EAAE,4CAA4C;iBACzD;aACJ,CAAC,CAAC;YAEH,YAAY,CAAC,OAAO,CAAC,CAAC;YAEtB,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;gBACf,OAAO;oBACH,OAAO,EAAE,sBAAsB,QAAQ,CAAC,MAAM,IAAI,QAAQ,CAAC,UAAU,EAAE;oBACvE,OAAO,EAAE,IAAI;iBAChB,CAAC;YACN,CAAC;YAED,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;YAC/D,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YAEnC,uDAAuD;YACvD,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;gBAChC,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,kBAAkB,CAAC,CAAC;gBACpD,OAAO;oBACH,OAAO,EAAE,SAAS,GAAG,CAAC,IAAI,CAAC,MAAM,GAAG,kBAAkB,CAAC,CAAC,CAAC,kBAAkB,CAAC,CAAC,CAAC,EAAE,CAAC;oBACjF,OAAO,EAAE,KAAK;iBACjB,CAAC;YACN,CAAC;YAED,MAAM,KAAK,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,IAAI,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAC;YACtC,MAAM,IAAI,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC;YAC9B,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,kBAAkB,CAAC,CAAC;YAEpD,MAAM,KAAK,GAAG;gBACV,KAAK,KAAK,EAAE;gBACZ,IAAI,CAAC,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE;gBACvB,WAAW,GAAG,EAAE;gBAChB,EAAE;gBACF,SAAS;gBACT,IAAI,CAAC,MAAM,GAAG,kBAAkB,CAAC,CAAC,CAAC,qBAAqB
,IAAI,CAAC,MAAM,eAAe,CAAC,CAAC,CAAC,EAAE;aAC1F,CAAC;YAEF,OAAO,EAAE,OAAO,EAAE,KAAK,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC;QACzE,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACX,MAAM,OAAO,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;YACjE,IAAI,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;gBAC5B,OAAO,EAAE,OAAO,EAAE,yBAAyB,aAAa,GAAG,IAAI,MAAM,GAAG,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC;YAChG,CAAC;YACD,OAAO,EAAE,OAAO,EAAE,iBAAiB,OAAO,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC;QAClE,CAAC;IACL,CAAC;CACJ,CAAC;AAEF,mDAAmD;AACnD,MAAM,CAAC,MAAM,KAAK,GAAG,CAAC,gBAAgB,CAAC,CAAC"}
|
package/package.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@memoryblock/plugin-fetch-webpage",
|
|
3
|
+
"version": "0.1.0-beta",
|
|
4
|
+
"type": "module",
|
|
5
|
+
"exports": {
|
|
6
|
+
".": {
|
|
7
|
+
"types": "./dist/index.d.ts",
|
|
8
|
+
"import": "./dist/index.js"
|
|
9
|
+
}
|
|
10
|
+
},
|
|
11
|
+
"dependencies": {
|
|
12
|
+
"memoryblock": "0.1.0-beta"
|
|
13
|
+
},
|
|
14
|
+
"scripts": {
|
|
15
|
+
"build": "tsc -p tsconfig.json"
|
|
16
|
+
}
|
|
17
|
+
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
import type { ToolExecutionResult, ToolContext, ToolDefinition } from 'memoryblock';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* fetch_webpage — extract readable content from a URL.
|
|
5
|
+
*
|
|
6
|
+
* Zero external dependencies:
|
|
7
|
+
* - Uses Node.js built-in fetch() (available since Node 18)
|
|
8
|
+
* - HTML→text via regex (no cheerio, no jsdom)
|
|
9
|
+
*
|
|
10
|
+
* Cost-efficient: truncates output at 8000 chars for token control.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
/** Maximum number of characters of page text kept before the output is truncated. */
const MAX_CONTENT_LENGTH = 8000;
|
|
14
|
+
/** Delay in milliseconds after which the request's AbortController is triggered. */
const FETCH_TIMEOUT = 15000;
|
|
15
|
+
|
|
16
|
+
/** Named entities decoded by decodeHtmlEntities; anything else is left untouched. */
const NAMED_HTML_ENTITIES = new Map<string, string>([
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['apos', "'"],
    ['nbsp', ' '],
]);

/**
 * Decode named, decimal and hexadecimal character references in one pass.
 * A single pass means already-decoded output is never decoded a second time.
 */
function decodeHtmlEntities(text: string): string {
    return text.replace(
        /&(?:#(\d+)|#x([0-9a-f]+)|([a-z]+));/gi,
        (entity: string, dec?: string, hex?: string, name?: string): string => {
            if (name !== undefined) {
                // Map lookup (not a plain object) so names like "constructor" cannot hit the prototype.
                return NAMED_HTML_ENTITIES.get(name.toLowerCase()) ?? entity;
            }
            const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? '', 16);
            // String.fromCodePoint throws on out-of-range values; keep the raw text instead.
            const valid = Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10ffff;
            return valid ? String.fromCodePoint(codePoint) : entity;
        },
    );
}

/**
 * Strip HTML to readable text content.
 *
 * Removes script, style, nav, header, footer, aside and noscript elements and
 * comments, renders headings as "## ..." lines, links as "text (href)" and list
 * items as bullets, drops all remaining tags, decodes character references and
 * normalises whitespace (single spaces, trimmed lines, at most one blank line).
 *
 * @param html Raw HTML markup.
 * @returns The extracted text, trimmed; an empty string for empty input.
 */
function htmlToText(html: string): string {
    // Normalise line endings so the blank-line collapsing below sees plain "\n".
    let text = html.replace(/\r\n?/g, '\n');

    // Drop non-content elements together with everything inside them.
    text = text.replace(/<(script|style|nav|header|footer|aside|noscript)(?=[\s>/])[^>]*>[\s\S]*?<\/\1\s*>/gi, '');
    text = text.replace(/<!--[\s\S]*?-->/g, '');

    // Headings and links need both their opening and closing tags, so they are
    // rewritten before any closing tag is turned into a newline.
    text = text.replace(/<h[1-6]\b[^>]*>([\s\S]*?)<\/h[1-6]\s*>/gi, '\n\n## $1\n\n');
    text = text.replace(/<a\s[^>]*?href\s*=\s*(["'])(.*?)\1[^>]*>([\s\S]*?)<\/a\s*>/gi, '$3 ($2)');

    // Block-level boundaries become line breaks.
    text = text.replace(/<br\s*\/?>/gi, '\n');
    text = text.replace(/<\/p\s*>/gi, '\n\n');
    text = text.replace(/<\/(?:div|li)\s*>/gi, '\n');
    // The lookahead keeps <link ...> from being mistaken for a list item.
    text = text.replace(/<li(?=[\s>])[^>]*>/gi, '• ');

    // Strip remaining tags, then decode entities so "&lt;b&gt;" survives as literal text.
    text = text.replace(/<[^>]+>/g, '');
    text = decodeHtmlEntities(text);

    // Whitespace clean-up. Lines are trimmed with a class that excludes "\n";
    // using \s here would swallow the newlines and glue paragraphs together.
    text = text.replace(/[ \t]+/g, ' ');
    text = text.replace(/^[^\S\n]+|[^\S\n]+$/gm, '');
    text = text.replace(/\n{3,}/g, '\n\n');

    return text.trim();
}
|
|
62
|
+
|
|
63
|
+
/**
 * Extract the page title from HTML.
 *
 * @param html Raw HTML markup.
 * @returns The contents of the first <title> element with whitespace runs
 *          collapsed to single spaces, or '(no title)' when the element is
 *          missing, empty or whitespace-only.
 */
function extractTitle(html: string): string {
    const raw = /<title\b[^>]*>([\s\S]*?)<\/title>/i.exec(html)?.[1] ?? '';
    const title = raw.trim().replace(/\s+/g, ' ');
    // An empty title would otherwise render as a bare "# " heading downstream.
    return title === '' ? '(no title)' : title;
}
|
|
68
|
+
|
|
69
|
+
/**
 * Extract the meta description.
 *
 * Looks at each <meta> tag, picks the first one whose `name` attribute is
 * "description" (case-insensitive) and that has a quoted `content` attribute,
 * and returns that content. Attribute order does not matter, and the closing
 * quote must be the same character as the opening one, so apostrophes inside a
 * double-quoted value are preserved.
 *
 * @param html Raw HTML markup.
 * @returns The trimmed description, or '' when none is found.
 */
function extractDescription(html: string): string {
    // Quoted attribute values are consumed whole so a ">" inside them does not end the tag.
    const metaTags = html.match(/<meta\b(?:"[^"]*"|'[^']*'|[^'">])*>/gi) ?? [];
    for (const tag of metaTags) {
        if (!/\sname\s*=\s*(["'])description\1/i.test(tag)) {
            continue;
        }
        const content = /\scontent\s*=\s*(["'])([\s\S]*?)\1/i.exec(tag)?.[2];
        if (content !== undefined) {
            return content.trim();
        }
    }
    return '';
}
|
|
74
|
+
|
|
75
|
+
/**
 * Metadata describing the fetch_webpage tool: its name, a description, and the
 * schema of its parameters (a single required string `url`, no additional
 * properties).
 */
const fetchWebpageDefinition: ToolDefinition = {
    name: 'fetch_webpage',
    description:
        'Fetch a webpage and extract its text content. ' +
        'Returns the page title, description, and cleaned text. ' +
        'Useful for reading articles, documentation, and reference pages. ' +
        // Derived from the constant so the advertised cap cannot drift from the real one.
        `Output is capped at ${MAX_CONTENT_LENGTH} characters for token efficiency.`,
    parameters: {
        type: 'object',
        properties: {
            url: { type: 'string', description: 'The URL to fetch.' },
        },
        required: ['url'],
        additionalProperties: false,
    },
    // NOTE(review): presumably tells the host no user confirmation is needed — confirm against memoryblock.
    requiresApproval: false,
};
|
|
92
|
+
|
|
93
|
+
/**
 * The fetch_webpage tool: its definition plus the `execute` implementation.
 */
export const fetchWebpageTool = {
    definition: fetchWebpageDefinition,

    /**
     * Fetch `params.url` and return its readable content.
     *
     * HTML responses are reduced to a title, optional description, source line
     * and extracted text; any other content type is returned as-is. In both
     * cases the text is cut at MAX_CONTENT_LENGTH characters. Failures are
     * reported through `isError: true` rather than by throwing.
     *
     * NOTE(review): any http(s) URL is requested, including loopback or
     * private-network hosts, and the definition sets requiresApproval to false.
     * Consider host filtering if untrusted input can reach this tool.
     *
     * @param params Tool arguments; `url` must be a string starting with http:// or https://.
     * @param _context Unused.
     * @returns An object with the result text in `content` and an `isError` flag.
     */
    async execute(params: Record<string, unknown>, _context: ToolContext): Promise<ToolExecutionResult> {
        const url = params.url;

        // Narrow instead of asserting: a non-string would otherwise throw outside the try.
        if (typeof url !== 'string' || !/^https?:\/\//i.test(url)) {
            return { content: 'Invalid URL. Must start with http:// or https://.', isError: true };
        }

        const controller = new AbortController();
        const timeout = setTimeout(() => controller.abort(), FETCH_TIMEOUT);

        try {
            const response = await fetch(url, {
                signal: controller.signal,
                headers: {
                    'User-Agent': 'memoryblock/0.1.0 (AI assistant web reader)',
                    'Accept': 'text/html,application/xhtml+xml,text/plain',
                },
            });

            if (!response.ok) {
                return {
                    content: `Fetch failed: HTTP ${response.status} ${response.statusText}`,
                    isError: true,
                };
            }

            const contentType = (response.headers.get('content-type') ?? '').toLowerCase();
            // The timer is still armed here, so a stalled body download is aborted too.
            const body = await response.text();

            // If it's plain text or JSON, return as-is (truncated)
            if (!contentType.includes('html')) {
                const truncated = body.slice(0, MAX_CONTENT_LENGTH);
                return {
                    content: truncated + (body.length > MAX_CONTENT_LENGTH ? '\n...(truncated)' : ''),
                    isError: false,
                };
            }

            const title = extractTitle(body);
            const desc = extractDescription(body);
            const text = htmlToText(body);

            const header = [`# ${title}`, desc ? `> ${desc}` : '', `Source: ${url}`]
                .filter(Boolean)
                .join('\n');
            const excerpt =
                text.length > MAX_CONTENT_LENGTH
                    ? `${text.slice(0, MAX_CONTENT_LENGTH)}\n\n...(truncated — ${text.length} chars total)`
                    : text;

            // The blank separator is added explicitly; filtering falsy parts used to drop it.
            return { content: excerpt ? `${header}\n\n${excerpt}` : header, isError: false };
        } catch (err) {
            // Only our own timer aborts this controller, so the flag identifies a timeout reliably.
            if (controller.signal.aborted) {
                return { content: `Fetch timed out after ${FETCH_TIMEOUT / 1000}s: ${url}`, isError: true };
            }
            const message = err instanceof Error ? err.message : String(err);
            return { content: `Fetch failed: ${message}`, isError: true };
        } finally {
            // Runs on every path so a rejected fetch cannot leave a live timer behind.
            clearTimeout(timeout);
        }
    },
};
|
|
159
|
+
|
|
160
|
+
/**
 * All tools provided by this plugin, exported as an array for registry plugin
 * loading. Currently holds a single entry, `fetchWebpageTool`.
 */
|
|
161
|
+
export const tools = [fetchWebpageTool];
|