@j0hanz/superfetch 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +327 -0
- package/dist/config/index.d.ts +30 -0
- package/dist/config/index.d.ts.map +1 -0
- package/dist/config/index.js +42 -0
- package/dist/config/index.js.map +1 -0
- package/dist/errors/app-error.d.ts +71 -0
- package/dist/errors/app-error.d.ts.map +1 -0
- package/dist/errors/app-error.js +103 -0
- package/dist/errors/app-error.js.map +1 -0
- package/dist/errors/index.d.ts +2 -0
- package/dist/errors/index.d.ts.map +1 -0
- package/dist/errors/index.js +2 -0
- package/dist/errors/index.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +179 -0
- package/dist/index.js.map +1 -0
- package/dist/middleware/error-handler.d.ts +7 -0
- package/dist/middleware/error-handler.d.ts.map +1 -0
- package/dist/middleware/error-handler.js +37 -0
- package/dist/middleware/error-handler.js.map +1 -0
- package/dist/middleware/rate-limiter.d.ts +33 -0
- package/dist/middleware/rate-limiter.d.ts.map +1 -0
- package/dist/middleware/rate-limiter.js +100 -0
- package/dist/middleware/rate-limiter.js.map +1 -0
- package/dist/prompts/index.d.ts +6 -0
- package/dist/prompts/index.d.ts.map +1 -0
- package/dist/prompts/index.js +81 -0
- package/dist/prompts/index.js.map +1 -0
- package/dist/resources/index.d.ts +6 -0
- package/dist/resources/index.d.ts.map +1 -0
- package/dist/resources/index.js +44 -0
- package/dist/resources/index.js.map +1 -0
- package/dist/server.d.ts +8 -0
- package/dist/server.d.ts.map +1 -0
- package/dist/server.js +39 -0
- package/dist/server.js.map +1 -0
- package/dist/services/cache.d.ts +16 -0
- package/dist/services/cache.d.ts.map +1 -0
- package/dist/services/cache.js +63 -0
- package/dist/services/cache.js.map +1 -0
- package/dist/services/cache.service.d.ts +52 -0
- package/dist/services/cache.service.d.ts.map +1 -0
- package/dist/services/cache.service.js +113 -0
- package/dist/services/cache.service.js.map +1 -0
- package/dist/services/extractor.d.ts +32 -0
- package/dist/services/extractor.d.ts.map +1 -0
- package/dist/services/extractor.js +97 -0
- package/dist/services/extractor.js.map +1 -0
- package/dist/services/extractor.service.d.ts +18 -0
- package/dist/services/extractor.service.d.ts.map +1 -0
- package/dist/services/extractor.service.js +75 -0
- package/dist/services/extractor.service.js.map +1 -0
- package/dist/services/fetcher.d.ts +9 -0
- package/dist/services/fetcher.d.ts.map +1 -0
- package/dist/services/fetcher.js +100 -0
- package/dist/services/fetcher.js.map +1 -0
- package/dist/services/fetcher.service.d.ts +18 -0
- package/dist/services/fetcher.service.d.ts.map +1 -0
- package/dist/services/fetcher.service.js +122 -0
- package/dist/services/fetcher.service.js.map +1 -0
- package/dist/services/logger.d.ts +5 -0
- package/dist/services/logger.d.ts.map +1 -0
- package/dist/services/logger.js +48 -0
- package/dist/services/logger.js.map +1 -0
- package/dist/services/logger.service.d.ts +5 -0
- package/dist/services/logger.service.d.ts.map +1 -0
- package/dist/services/logger.service.js +57 -0
- package/dist/services/logger.service.js.map +1 -0
- package/dist/services/parser.d.ts +6 -0
- package/dist/services/parser.d.ts.map +1 -0
- package/dist/services/parser.js +152 -0
- package/dist/services/parser.js.map +1 -0
- package/dist/services/parser.service.d.ts +42 -0
- package/dist/services/parser.service.d.ts.map +1 -0
- package/dist/services/parser.service.js +209 -0
- package/dist/services/parser.service.js.map +1 -0
- package/dist/tools/handlers/fetch-links.tool.d.ts +20 -0
- package/dist/tools/handlers/fetch-links.tool.d.ts.map +1 -0
- package/dist/tools/handlers/fetch-links.tool.js +91 -0
- package/dist/tools/handlers/fetch-links.tool.js.map +1 -0
- package/dist/tools/handlers/fetch-markdown.tool.d.ts +17 -0
- package/dist/tools/handlers/fetch-markdown.tool.d.ts.map +1 -0
- package/dist/tools/handlers/fetch-markdown.tool.js +99 -0
- package/dist/tools/handlers/fetch-markdown.tool.js.map +1 -0
- package/dist/tools/handlers/fetch-url.tool.d.ts +17 -0
- package/dist/tools/handlers/fetch-url.tool.d.ts.map +1 -0
- package/dist/tools/handlers/fetch-url.tool.js +103 -0
- package/dist/tools/handlers/fetch-url.tool.js.map +1 -0
- package/dist/tools/index.d.ts +7 -0
- package/dist/tools/index.d.ts.map +1 -0
- package/dist/tools/index.js +83 -0
- package/dist/tools/index.js.map +1 -0
- package/dist/transformers/jsonl.transformer.d.ts +4 -0
- package/dist/transformers/jsonl.transformer.d.ts.map +1 -0
- package/dist/transformers/jsonl.transformer.js +42 -0
- package/dist/transformers/jsonl.transformer.js.map +1 -0
- package/dist/transformers/markdown.transformer.d.ts +4 -0
- package/dist/transformers/markdown.transformer.d.ts.map +1 -0
- package/dist/transformers/markdown.transformer.js +104 -0
- package/dist/transformers/markdown.transformer.js.map +1 -0
- package/dist/types/content.types.d.ts +63 -0
- package/dist/types/content.types.d.ts.map +1 -0
- package/dist/types/content.types.js +2 -0
- package/dist/types/content.types.js.map +1 -0
- package/dist/types/index.d.ts +3 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +3 -0
- package/dist/types/index.js.map +1 -0
- package/dist/types/schemas.d.ts +22 -0
- package/dist/types/schemas.d.ts.map +1 -0
- package/dist/types/schemas.js +5 -0
- package/dist/types/schemas.js.map +1 -0
- package/dist/utils/sanitizer.d.ts +9 -0
- package/dist/utils/sanitizer.d.ts.map +1 -0
- package/dist/utils/sanitizer.js +19 -0
- package/dist/utils/sanitizer.js.map +1 -0
- package/dist/utils/url-validator.d.ts +10 -0
- package/dist/utils/url-validator.d.ts.map +1 -0
- package/dist/utils/url-validator.js +69 -0
- package/dist/utils/url-validator.js.map +1 -0
- package/package.json +80 -0
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
import * as cheerio from 'cheerio';
|
|
2
|
+
import { sanitizeText } from '../utils/sanitizer.js';
|
|
3
|
+
import { config } from '../config/index.js';
|
|
4
|
+
class ParserService {
|
|
5
|
+
/**
|
|
6
|
+
* Parses HTML content and extracts semantic blocks
|
|
7
|
+
*/
|
|
8
|
+
parseHtml(html) {
|
|
9
|
+
const $ = cheerio.load(html);
|
|
10
|
+
const blocks = [];
|
|
11
|
+
// Remove script, style, and other non-content elements
|
|
12
|
+
$('script, style, noscript, iframe, svg').remove();
|
|
13
|
+
// Parse the body content
|
|
14
|
+
$('body')
|
|
15
|
+
.find('h1, h2, h3, h4, h5, h6, p, ul, ol, pre, code, table, img')
|
|
16
|
+
.each((_, element) => {
|
|
17
|
+
const block = this.parseElement($, element);
|
|
18
|
+
if (block) {
|
|
19
|
+
blocks.push(block);
|
|
20
|
+
}
|
|
21
|
+
});
|
|
22
|
+
return this.filterBlocks(blocks);
|
|
23
|
+
}
|
|
24
|
+
/**
|
|
25
|
+
* Parses a single element into a content block
|
|
26
|
+
*/
|
|
27
|
+
parseElement($, node) {
|
|
28
|
+
// Check if node is an Element with tagName property
|
|
29
|
+
if (!('tagName' in node) || typeof node.tagName !== 'string') {
|
|
30
|
+
return null;
|
|
31
|
+
}
|
|
32
|
+
const element = node;
|
|
33
|
+
const tagName = element.tagName.toLowerCase();
|
|
34
|
+
switch (tagName) {
|
|
35
|
+
case 'h1':
|
|
36
|
+
case 'h2':
|
|
37
|
+
case 'h3':
|
|
38
|
+
case 'h4':
|
|
39
|
+
case 'h5':
|
|
40
|
+
case 'h6':
|
|
41
|
+
return this.parseHeading($, element);
|
|
42
|
+
case 'p':
|
|
43
|
+
return this.parseParagraph($, element);
|
|
44
|
+
case 'ul':
|
|
45
|
+
case 'ol':
|
|
46
|
+
return this.parseList($, element);
|
|
47
|
+
case 'pre':
|
|
48
|
+
case 'code':
|
|
49
|
+
return this.parseCode($, element);
|
|
50
|
+
case 'table':
|
|
51
|
+
return this.parseTable($, element);
|
|
52
|
+
case 'img':
|
|
53
|
+
return this.parseImage($, element);
|
|
54
|
+
default:
|
|
55
|
+
return null;
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
/**
|
|
59
|
+
* Parses a heading element
|
|
60
|
+
*/
|
|
61
|
+
parseHeading($, element) {
|
|
62
|
+
const text = sanitizeText($(element).text());
|
|
63
|
+
if (!text) {
|
|
64
|
+
return null;
|
|
65
|
+
}
|
|
66
|
+
const level = parseInt(element.tagName.substring(1), 10);
|
|
67
|
+
return {
|
|
68
|
+
type: 'heading',
|
|
69
|
+
level,
|
|
70
|
+
text,
|
|
71
|
+
};
|
|
72
|
+
}
|
|
73
|
+
/**
|
|
74
|
+
* Parses a paragraph element
|
|
75
|
+
*/
|
|
76
|
+
parseParagraph($, element) {
|
|
77
|
+
const text = sanitizeText($(element).text());
|
|
78
|
+
if (!text || text.length < config.extraction.minParagraphLength) {
|
|
79
|
+
// Skip very short paragraphs
|
|
80
|
+
return null;
|
|
81
|
+
}
|
|
82
|
+
return {
|
|
83
|
+
type: 'paragraph',
|
|
84
|
+
text,
|
|
85
|
+
};
|
|
86
|
+
}
|
|
87
|
+
/**
|
|
88
|
+
* Parses a list element
|
|
89
|
+
*/
|
|
90
|
+
parseList($, element) {
|
|
91
|
+
const items = [];
|
|
92
|
+
$(element)
|
|
93
|
+
.find('li')
|
|
94
|
+
.each((_, li) => {
|
|
95
|
+
const text = sanitizeText($(li).text());
|
|
96
|
+
if (text) {
|
|
97
|
+
items.push(text);
|
|
98
|
+
}
|
|
99
|
+
});
|
|
100
|
+
if (items.length === 0) {
|
|
101
|
+
return null;
|
|
102
|
+
}
|
|
103
|
+
const ordered = element.tagName.toLowerCase() === 'ol';
|
|
104
|
+
return {
|
|
105
|
+
type: 'list',
|
|
106
|
+
ordered,
|
|
107
|
+
items,
|
|
108
|
+
};
|
|
109
|
+
}
|
|
110
|
+
/**
|
|
111
|
+
* Parses a code element
|
|
112
|
+
*/
|
|
113
|
+
parseCode($, element) {
|
|
114
|
+
const text = $(element).text().trim();
|
|
115
|
+
if (!text) {
|
|
116
|
+
return null;
|
|
117
|
+
}
|
|
118
|
+
// Try to detect language from class name
|
|
119
|
+
const className = $(element).attr('class') || '';
|
|
120
|
+
const languageMatch = className.match(/language-(\w+)/);
|
|
121
|
+
const language = languageMatch ? languageMatch[1] : undefined;
|
|
122
|
+
return {
|
|
123
|
+
type: 'code',
|
|
124
|
+
language,
|
|
125
|
+
text,
|
|
126
|
+
};
|
|
127
|
+
}
|
|
128
|
+
/**
|
|
129
|
+
* Parses a table element
|
|
130
|
+
*/
|
|
131
|
+
parseTable($, element) {
|
|
132
|
+
const headers = [];
|
|
133
|
+
const rows = [];
|
|
134
|
+
// Extract headers from thead or first row
|
|
135
|
+
const $table = $(element);
|
|
136
|
+
$table.find('thead th, thead td').each((_, cell) => {
|
|
137
|
+
headers.push(sanitizeText($(cell).text()));
|
|
138
|
+
});
|
|
139
|
+
// If no thead, try first tr
|
|
140
|
+
if (headers.length === 0) {
|
|
141
|
+
$table
|
|
142
|
+
.find('tr')
|
|
143
|
+
.first()
|
|
144
|
+
.find('th, td')
|
|
145
|
+
.each((_, cell) => {
|
|
146
|
+
headers.push(sanitizeText($(cell).text()));
|
|
147
|
+
});
|
|
148
|
+
}
|
|
149
|
+
// Extract body rows
|
|
150
|
+
const rowsSelector = headers.length > 0 ? 'tbody tr, tr:not(:first)' : 'tbody tr, tr';
|
|
151
|
+
$table.find(rowsSelector).each((_, row) => {
|
|
152
|
+
const cells = [];
|
|
153
|
+
$(row)
|
|
154
|
+
.find('td, th')
|
|
155
|
+
.each((_, cell) => {
|
|
156
|
+
cells.push(sanitizeText($(cell).text()));
|
|
157
|
+
});
|
|
158
|
+
if (cells.length > 0) {
|
|
159
|
+
rows.push(cells);
|
|
160
|
+
}
|
|
161
|
+
});
|
|
162
|
+
if (rows.length === 0) {
|
|
163
|
+
return null;
|
|
164
|
+
}
|
|
165
|
+
return {
|
|
166
|
+
type: 'table',
|
|
167
|
+
headers: headers.length > 0 ? headers : undefined,
|
|
168
|
+
rows,
|
|
169
|
+
};
|
|
170
|
+
}
|
|
171
|
+
/**
|
|
172
|
+
* Parses an image element
|
|
173
|
+
*/
|
|
174
|
+
parseImage($, element) {
|
|
175
|
+
const src = $(element).attr('src');
|
|
176
|
+
if (!src) {
|
|
177
|
+
return null;
|
|
178
|
+
}
|
|
179
|
+
const alt = $(element).attr('alt') || undefined;
|
|
180
|
+
return {
|
|
181
|
+
type: 'image',
|
|
182
|
+
src,
|
|
183
|
+
alt,
|
|
184
|
+
};
|
|
185
|
+
}
|
|
186
|
+
/**
|
|
187
|
+
* Filters out empty or invalid blocks
|
|
188
|
+
*/
|
|
189
|
+
filterBlocks(blocks) {
|
|
190
|
+
return blocks.filter((block) => {
|
|
191
|
+
if (block.type === 'paragraph') {
|
|
192
|
+
return block.text.length > 0;
|
|
193
|
+
}
|
|
194
|
+
if (block.type === 'heading') {
|
|
195
|
+
return block.text.length > 0;
|
|
196
|
+
}
|
|
197
|
+
if (block.type === 'list') {
|
|
198
|
+
return block.items.length > 0;
|
|
199
|
+
}
|
|
200
|
+
if (block.type === 'code') {
|
|
201
|
+
return block.text.length > 0;
|
|
202
|
+
}
|
|
203
|
+
return true;
|
|
204
|
+
});
|
|
205
|
+
}
|
|
206
|
+
}
|
|
207
|
+
// Singleton instance
|
|
208
|
+
export const parserService = new ParserService();
|
|
209
|
+
//# sourceMappingURL=parser.service.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"parser.service.js","sourceRoot":"","sources":["../../src/services/parser.service.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AAGnC,OAAO,EAAE,YAAY,EAAE,MAAM,uBAAuB,CAAC;AACrD,OAAO,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAW5C,MAAM,aAAa;IACjB;;OAEG;IACH,SAAS,CAAC,IAAY;QACpB,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC7B,MAAM,MAAM,GAAwB,EAAE,CAAC;QAEvC,uDAAuD;QACvD,CAAC,CAAC,sCAAsC,CAAC,CAAC,MAAM,EAAE,CAAC;QAEnD,yBAAyB;QACzB,CAAC,CAAC,MAAM,CAAC;aACN,IAAI,CAAC,0DAA0D,CAAC;aAChE,IAAI,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;YACnB,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;YAC5C,IAAI,KAAK,EAAE,CAAC;gBACV,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACrB,CAAC;QACH,CAAC,CAAC,CAAC;QAEL,OAAO,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;IACnC,CAAC;IAED;;OAEG;IACK,YAAY,CAAC,CAAa,EAAE,IAAa;QAC/C,oDAAoD;QACpD,IAAI,CAAC,CAAC,SAAS,IAAI,IAAI,CAAC,IAAI,OAAO,IAAI,CAAC,OAAO,KAAK,QAAQ,EAAE,CAAC;YAC7D,OAAO,IAAI,CAAC;QACd,CAAC;QACD,MAAM,OAAO,GAAG,IAAI,CAAC;QACrB,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC;QAE9C,QAAQ,OAAO,EAAE,CAAC;YAChB,KAAK,IAAI,CAAC;YACV,KAAK,IAAI,CAAC;YACV,KAAK,IAAI,CAAC;YACV,KAAK,IAAI,CAAC;YACV,KAAK,IAAI,CAAC;YACV,KAAK,IAAI;gBACP,OAAO,IAAI,CAAC,YAAY,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;YAEvC,KAAK,GAAG;gBACN,OAAO,IAAI,CAAC,cAAc,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;YAEzC,KAAK,IAAI,CAAC;YACV,KAAK,IAAI;gBACP,OAAO,IAAI,CAAC,SAAS,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;YAEpC,KAAK,KAAK,CAAC;YACX,KAAK,MAAM;gBACT,OAAO,IAAI,CAAC,SAAS,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;YAEpC,KAAK,OAAO;gBACV,OAAO,IAAI,CAAC,UAAU,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;YAErC,KAAK,KAAK;gBACR,OAAO,IAAI,CAAC,UAAU,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;YAErC;gBACE,OAAO,IAAI,CAAC;QAChB,CAAC;IACH,CAAC;IAED;;OAEG;IACK,YAAY,CAAC,CAAa,EAAE,OAAgB;QAClD,MAAM,IAAI,GAAG,YAAY,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;QAC7C,IAAI,CAAC,IAAI,EAAE,CAAC;YACV,OAAO,IAAI,CAAC;QACd,CAAC;QAED,MAAM,KAAK,GAAG,QAAQ,CAAC,OAAO,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QAEzD,OAAO;YACL,IAAI,EAAE,SAAS;YACf,KAAK;YACL,IA
AI;SACL,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,cAAc,CACpB,CAAa,EACb,OAAgB;QAEhB,MAAM,IAAI,GAAG,YAAY,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;QAC7C,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC,UAAU,CAAC,kBAAkB,EAAE,CAAC;YAChE,6BAA6B;YAC7B,OAAO,IAAI,CAAC;QACd,CAAC;QAED,OAAO;YACL,IAAI,EAAE,WAAW;YACjB,IAAI;SACL,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,SAAS,CAAC,CAAa,EAAE,OAAgB;QAC/C,MAAM,KAAK,GAAa,EAAE,CAAC;QAC3B,CAAC,CAAC,OAAO,CAAC;aACP,IAAI,CAAC,IAAI,CAAC;aACV,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;YACd,MAAM,IAAI,GAAG,YAAY,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;YACxC,IAAI,IAAI,EAAE,CAAC;gBACT,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACnB,CAAC;QACH,CAAC,CAAC,CAAC;QAEL,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACvB,OAAO,IAAI,CAAC;QACd,CAAC;QAED,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,WAAW,EAAE,KAAK,IAAI,CAAC;QAEvD,OAAO;YACL,IAAI,EAAE,MAAM;YACZ,OAAO;YACP,KAAK;SACN,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,SAAS,CAAC,CAAa,EAAE,OAAgB;QAC/C,MAAM,IAAI,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;QACtC,IAAI,CAAC,IAAI,EAAE,CAAC;YACV,OAAO,IAAI,CAAC;QACd,CAAC;QAED,yCAAyC;QACzC,MAAM,SAAS,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;QACjD,MAAM,aAAa,GAAG,SAAS,CAAC,KAAK,CAAC,gBAAgB,CAAC,CAAC;QACxD,MAAM,QAAQ,GAAG,aAAa,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;QAE9D,OAAO;YACL,IAAI,EAAE,MAAM;YACZ,QAAQ;YACR,IAAI;SACL,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,UAAU,CAAC,CAAa,EAAE,OAAgB;QAChD,MAAM,OAAO,GAAa,EAAE,CAAC;QAC7B,MAAM,IAAI,GAAe,EAAE,CAAC;QAE5B,0CAA0C;QAC1C,MAAM,MAAM,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC;QAC1B,MAAM,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE;YACjD,OAAO,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;QAC7C,CAAC,CAAC,CAAC;QAEH,4BAA4B;QAC5B,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACzB,MAAM;iBACH,IAAI,CAAC,IAAI,CAAC;iBACV,KAAK,EAAE;iBACP,IAAI,CAAC,QAAQ,CAAC;iBACd,IAAI,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE;gBAChB,OAAO,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;YAC7C,
CAAC,CAAC,CAAC;QACP,CAAC;QAED,oBAAoB;QACpB,MAAM,YAAY,GAChB,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,0BAA0B,CAAC,CAAC,CAAC,cAAc,CAAC;QACnE,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,GAAG,EAAE,EAAE;YACxC,MAAM,KAAK,GAAa,EAAE,CAAC;YAC3B,CAAC,CAAC,GAAG,CAAC;iBACH,IAAI,CAAC,QAAQ,CAAC;iBACd,IAAI,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE;gBAChB,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;YAC3C,CAAC,CAAC,CAAC;YACL,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACrB,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACnB,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACtB,OAAO,IAAI,CAAC;QACd,CAAC;QAED,OAAO;YACL,IAAI,EAAE,OAAO;YACb,OAAO,EAAE,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,SAAS;YACjD,IAAI;SACL,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,UAAU,CAAC,CAAa,EAAE,OAAgB;QAChD,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACnC,IAAI,CAAC,GAAG,EAAE,CAAC;YACT,OAAO,IAAI,CAAC;QACd,CAAC;QAED,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,SAAS,CAAC;QAEhD,OAAO;YACL,IAAI,EAAE,OAAO;YACb,GAAG;YACH,GAAG;SACJ,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,YAAY,CAAC,MAA2B;QAC9C,OAAO,MAAM,CAAC,MAAM,CAAC,CAAC,KAAK,EAAE,EAAE;YAC7B,IAAI,KAAK,CAAC,IAAI,KAAK,WAAW,EAAE,CAAC;gBAC/B,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC;YAC/B,CAAC;YACD,IAAI,KAAK,CAAC,IAAI,KAAK,SAAS,EAAE,CAAC;gBAC7B,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC;YAC/B,CAAC;YACD,IAAI,KAAK,CAAC,IAAI,KAAK,MAAM,EAAE,CAAC;gBAC1B,OAAO,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC;YAChC,CAAC;YACD,IAAI,KAAK,CAAC,IAAI,KAAK,MAAM,EAAE,CAAC;gBAC1B,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC;YAC/B,CAAC;YACD,OAAO,IAAI,CAAC;QACd,CAAC,CAAC,CAAC;IACL,CAAC;CACF;AAED,qBAAqB;AACrB,MAAM,CAAC,MAAM,aAAa,GAAG,IAAI,aAAa,EAAE,CAAC"}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import type { FetchLinksInput } from '../../types/index.js';
|
|
2
|
+
export declare const FETCH_LINKS_TOOL_NAME = "fetch-links";
|
|
3
|
+
export declare const FETCH_LINKS_TOOL_DESCRIPTION = "Extracts all hyperlinks from a webpage with anchor text and type classification";
|
|
4
|
+
/**
|
|
5
|
+
* Tool handler for extracting links from a URL
|
|
6
|
+
*/
|
|
7
|
+
export declare function fetchLinksToolHandler(input: FetchLinksInput): Promise<{
|
|
8
|
+
content: {
|
|
9
|
+
type: "text";
|
|
10
|
+
text: string;
|
|
11
|
+
}[];
|
|
12
|
+
isError?: undefined;
|
|
13
|
+
} | {
|
|
14
|
+
content: {
|
|
15
|
+
type: "text";
|
|
16
|
+
text: string;
|
|
17
|
+
}[];
|
|
18
|
+
isError: boolean;
|
|
19
|
+
}>;
|
|
20
|
+
//# sourceMappingURL=fetch-links.tool.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fetch-links.tool.d.ts","sourceRoot":"","sources":["../../../src/tools/handlers/fetch-links.tool.ts"],"names":[],"mappings":"AAOA,OAAO,KAAK,EAAE,eAAe,EAAiB,MAAM,sBAAsB,CAAC;AAE3E,eAAO,MAAM,qBAAqB,gBAAgB,CAAC;AACnD,eAAO,MAAM,4BAA4B,oFAC0C,CAAC;AAmDpF;;GAEG;AACH,wBAAsB,qBAAqB,CAAC,KAAK,EAAE,eAAe;;;;;;;;;;;;GA+CjE"}
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
import { validateAndNormalizeUrl, isInternalUrl, } from '../../utils/url-validator.js';
|
|
2
|
+
import { fetchUrlWithRetry } from '../../services/fetcher.js';
|
|
3
|
+
import * as cache from '../../services/cache.js';
|
|
4
|
+
import * as cheerio from 'cheerio';
|
|
5
|
+
export const FETCH_LINKS_TOOL_NAME = 'fetch-links';
|
|
6
|
+
export const FETCH_LINKS_TOOL_DESCRIPTION = 'Extracts all hyperlinks from a webpage with anchor text and type classification';
|
|
7
|
+
/**
 * Extracts links from HTML, filtering by type and deduplicating.
 *
 * Every `<a href>` is resolved against `baseUrl` into an absolute URL.
 * Fragment-only links, `javascript:` links and hrefs that cannot be parsed
 * as a URL are skipped. Each absolute URL is reported at most once.
 *
 * @param html Raw HTML markup of the page.
 * @param baseUrl URL of the page, used to resolve relative hrefs and to
 *                classify links as internal or external.
 * @param options `includeInternal` / `includeExternal` select which link
 *                types are returned.
 * @returns Objects of the form `{ href, text, type }`; `text` falls back to
 *          the absolute URL when the anchor has no text.
 */
function extractLinksFromHtml(html, baseUrl, options) {
    const $ = cheerio.load(html);
    const links = [];
    const seenUrls = new Set();
    $('a[href]').each((_, element) => {
        const $anchor = $(element);
        const rawHref = $anchor.attr('href');
        // URL parsing ignores surrounding whitespace, so the checks below must too
        const href = rawHref ? rawHref.trim() : '';
        const text = $anchor.text().trim();
        // Skip invalid hrefs
        if (!href || href.startsWith('#')) {
            return;
        }
        try {
            const resolved = new URL(href, baseUrl);
            // Check the parsed scheme instead of the raw string: the URL parser
            // lower-cases schemes, so "JavaScript:" and " javascript:" are caught too.
            if (resolved.protocol === 'javascript:') {
                return;
            }
            const absoluteUrl = resolved.href;
            // Skip duplicates
            if (seenUrls.has(absoluteUrl)) {
                return;
            }
            seenUrls.add(absoluteUrl);
            const type = isInternalUrl(absoluteUrl, baseUrl) ? 'internal' : 'external';
            // Filter based on options
            if (type === 'internal' && !options.includeInternal)
                return;
            if (type === 'external' && !options.includeExternal)
                return;
            links.push({
                href: absoluteUrl,
                text: text || absoluteUrl,
                type,
            });
        }
        catch {
            // Skip invalid URLs silently
        }
    });
    return links;
}
|
|
46
|
+
/**
 * Tool handler for extracting links from a URL.
 *
 * Validates the URL, serves a cached result when one exists, otherwise
 * fetches the page, extracts its links and caches the serialized output.
 * Failures are reported as a JSON error payload with `isError: true`
 * instead of being thrown.
 *
 * @param input Tool input: `url`, plus optional `includeInternal` and
 *              `includeExternal` flags (both default to `true`).
 * @returns A tool result whose single text item is the JSON output.
 */
export async function fetchLinksToolHandler(input) {
    try {
        const url = validateAndNormalizeUrl(input.url);
        const includeInternal = input.includeInternal ?? true;
        const includeExternal = input.includeExternal ?? true;
        // The cached payload is already filtered, so the filter flags must be part
        // of the cache key; otherwise a request with different flags would receive
        // another request's link set. The default combination keeps the plain
        // 'links' namespace.
        // NOTE(review): assumes createCacheKey accepts any namespace string - confirm.
        let cacheNamespace = 'links';
        if (!includeInternal || !includeExternal) {
            cacheNamespace = `links:internal=${Boolean(includeInternal)}:external=${Boolean(includeExternal)}`;
        }
        const cacheKey = cache.createCacheKey(cacheNamespace, url);
        const cached = cache.get(cacheKey);
        if (cached) {
            return {
                content: [{ type: 'text', text: cached.content }],
            };
        }
        const html = await fetchUrlWithRetry(url);
        // Extract links
        const links = extractLinksFromHtml(html, url, {
            includeInternal,
            includeExternal,
        });
        const output = {
            url,
            linkCount: links.length,
            links,
        };
        const outputText = JSON.stringify(output, null, 2);
        cache.set(cacheKey, outputText);
        return {
            content: [{ type: 'text', text: outputText }],
        };
    }
    catch (error) {
        return {
            content: [
                {
                    type: 'text',
                    text: JSON.stringify({
                        error: `Failed to extract links: ${error instanceof Error ? error.message : 'Unknown error'}`,
                        url: input.url,
                    }),
                },
            ],
            isError: true,
        };
    }
}
|
|
91
|
+
//# sourceMappingURL=fetch-links.tool.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fetch-links.tool.js","sourceRoot":"","sources":["../../../src/tools/handlers/fetch-links.tool.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,uBAAuB,EACvB,aAAa,GACd,MAAM,8BAA8B,CAAC;AACtC,OAAO,EAAE,iBAAiB,EAAE,MAAM,2BAA2B,CAAC;AAC9D,OAAO,KAAK,KAAK,MAAM,yBAAyB,CAAC;AACjD,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AAGnC,MAAM,CAAC,MAAM,qBAAqB,GAAG,aAAa,CAAC;AACnD,MAAM,CAAC,MAAM,4BAA4B,GACvC,iFAAiF,CAAC;AAEpF;;GAEG;AACH,SAAS,oBAAoB,CAC3B,IAAY,EACZ,OAAe,EACf,OAA+D;IAE/D,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC7B,MAAM,KAAK,GAAoB,EAAE,CAAC;IAClC,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAU,CAAC;IAEnC,CAAC,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;QAC/B,MAAM,IAAI,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACrC,MAAM,IAAI,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;QAEtC,qBAAqB;QACrB,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,IAAI,CAAC,UAAU,CAAC,aAAa,CAAC,EAAE,CAAC;YACpE,OAAO;QACT,CAAC;QAED,IAAI,CAAC;YACH,MAAM,WAAW,GAAG,IAAI,GAAG,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC,IAAI,CAAC;YAEhD,kBAAkB;YAClB,IAAI,QAAQ,CAAC,GAAG,CAAC,WAAW,CAAC,EAAE,CAAC;gBAC9B,OAAO;YACT,CAAC;YACD,QAAQ,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC;YAE1B,MAAM,IAAI,GAAG,aAAa,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,UAAU,CAAC;YAE3E,0BAA0B;YAC1B,IAAI,IAAI,KAAK,UAAU,IAAI,CAAC,OAAO,CAAC,eAAe;gBAAE,OAAO;YAC5D,IAAI,IAAI,KAAK,UAAU,IAAI,CAAC,OAAO,CAAC,eAAe;gBAAE,OAAO;YAE5D,KAAK,CAAC,IAAI,CAAC;gBACT,IAAI,EAAE,WAAW;gBACjB,IAAI,EAAE,IAAI,IAAI,WAAW;gBACzB,IAAI;aACL,CAAC,CAAC;QACL,CAAC;QAAC,MAAM,CAAC;YACP,6BAA6B;QAC/B,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,qBAAqB,CAAC,KAAsB;IAChE,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,uBAAuB,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC/C,MAAM,QAAQ,GAAG,KAAK,CAAC,cAAc,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;QAEpD,MAAM,MAAM,GAAG,KAAK,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;QACnC,IAAI,MAAM,EAAE,CAAC;YACX,OAAO;gBACL,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAe,EAAE,IAAI,EAAE,MAAM,CAAC,OAAO,EAAE,CAAC;aAC3D,CAAC;QACJ,CAAC;QAED,MAAM,IAAI,G
AAG,MAAM,iBAAiB,CAAC,GAAG,CAAC,CAAC;QAE1C,gBAAgB;QAChB,MAAM,KAAK,GAAG,oBAAoB,CAAC,IAAI,EAAE,GAAG,EAAE;YAC5C,eAAe,EAAE,KAAK,CAAC,eAAe,IAAI,IAAI;YAC9C,eAAe,EAAE,KAAK,CAAC,eAAe,IAAI,IAAI;SAC/C,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG;YACb,GAAG;YACH,SAAS,EAAE,KAAK,CAAC,MAAM;YACvB,KAAK;SACN,CAAC;QAEF,MAAM,UAAU,GAAG,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;QAEnD,KAAK,CAAC,GAAG,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;QAEhC,OAAO;YACL,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAe,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC;SACvD,CAAC;IACJ,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO;YACL,OAAO,EAAE;gBACP;oBACE,IAAI,EAAE,MAAe;oBACrB,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;wBACnB,KAAK,EAAE,4BAA4B,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,EAAE;wBAC7F,GAAG,EAAE,KAAK,CAAC,GAAG;qBACf,CAAC;iBACH;aACF;YACD,OAAO,EAAE,IAAI;SACd,CAAC;IACJ,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import type { FetchMarkdownInput } from '../../types/index.js';
|
|
2
|
+
export declare const FETCH_MARKDOWN_TOOL_NAME = "fetch-markdown";
|
|
3
|
+
export declare const FETCH_MARKDOWN_TOOL_DESCRIPTION = "Fetches a webpage and converts it to clean Markdown format with optional frontmatter";
|
|
4
|
+
export declare function fetchMarkdownToolHandler(input: FetchMarkdownInput): Promise<{
|
|
5
|
+
content: {
|
|
6
|
+
type: "text";
|
|
7
|
+
text: string;
|
|
8
|
+
}[];
|
|
9
|
+
isError?: undefined;
|
|
10
|
+
} | {
|
|
11
|
+
content: {
|
|
12
|
+
type: "text";
|
|
13
|
+
text: string;
|
|
14
|
+
}[];
|
|
15
|
+
isError: boolean;
|
|
16
|
+
}>;
|
|
17
|
+
//# sourceMappingURL=fetch-markdown.tool.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fetch-markdown.tool.d.ts","sourceRoot":"","sources":["../../../src/tools/handlers/fetch-markdown.tool.ts"],"names":[],"mappings":"AAMA,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,sBAAsB,CAAC;AAE/D,eAAO,MAAM,wBAAwB,mBAAmB,CAAC;AACzD,eAAO,MAAM,+BAA+B,yFAC4C,CAAC;AA+CzF,wBAAsB,wBAAwB,CAAC,KAAK,EAAE,kBAAkB;;;;;;;;;;;;GA+DvE"}
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
import { validateAndNormalizeUrl } from '../../utils/url-validator.js';
|
|
2
|
+
import { fetchUrlWithRetry } from '../../services/fetcher.js';
|
|
3
|
+
import { extractContent } from '../../services/extractor.js';
|
|
4
|
+
import { htmlToMarkdown } from '../../transformers/markdown.transformer.js';
|
|
5
|
+
import * as cache from '../../services/cache.js';
|
|
6
|
+
import { config } from '../../config/index.js';
|
|
7
|
+
export const FETCH_MARKDOWN_TOOL_NAME = 'fetch-markdown';
|
|
8
|
+
export const FETCH_MARKDOWN_TOOL_DESCRIPTION = 'Fetches a webpage and converts it to clean Markdown format with optional frontmatter';
|
|
9
|
+
/**
 * Converts a fetched page to Markdown.
 *
 * When main-content extraction is enabled both in `options` and in the
 * configuration, and an article was extracted, only the article content is
 * converted. Otherwise the full HTML is converted. A metadata record is
 * passed to the converter only when metadata is enabled both in `options`
 * and in the configuration.
 *
 * @param html Raw HTML of the page.
 * @param url URL the page was fetched from; recorded in the metadata.
 * @param options `extractMainContent` and `includeMetadata` flags.
 * @returns `{ markdown, title }` for the converted page.
 */
function extractAndConvertToMarkdown(html, url, options) {
    // Use the optimized extractContent that parses JSDOM only once
    const extraction = extractContent(html, url);
    const article = extraction.article;
    const pageMeta = extraction.metadata;
    const useArticle = options.extractMainContent && config.extraction.extractMainContent && article;
    let frontmatter;
    if (!useArticle) {
        // Fallback: convert full HTML
        if (options.includeMetadata && config.extraction.includeMetadata) {
            frontmatter = {
                type: 'metadata',
                title: pageMeta.title,
                description: pageMeta.description,
                author: pageMeta.author,
                url,
                fetchedAt: new Date().toISOString(),
            };
        }
        const fullPageMarkdown = htmlToMarkdown(html, frontmatter);
        return {
            markdown: fullPageMarkdown,
            title: pageMeta.title,
        };
    }
    if (options.includeMetadata && config.extraction.includeMetadata) {
        frontmatter = {
            type: 'metadata',
            title: article.title,
            author: article.byline,
            url,
            fetchedAt: new Date().toISOString(),
        };
    }
    const articleMarkdown = htmlToMarkdown(article.content, frontmatter);
    return {
        markdown: articleMarkdown,
        title: article.title,
    };
}
|
|
43
|
+
/**
 * Tool handler that fetches a webpage and returns it as Markdown.
 *
 * Validates the URL, serves cached Markdown when available, otherwise
 * fetches the page, converts it and caches the Markdown. Failures are
 * reported as a JSON error payload with `isError: true` instead of being
 * thrown.
 *
 * @param input Tool input: `url`, plus optional `extractMainContent` and
 *              `includeMetadata` flags (both default to `true`).
 * @returns A tool result whose single text item is the JSON output.
 */
export async function fetchMarkdownToolHandler(input) {
    try {
        const url = validateAndNormalizeUrl(input.url);
        const extractMainContent = input.extractMainContent ?? true;
        const includeMetadata = input.includeMetadata ?? true;
        // The generated Markdown depends on both flags, so they must be part of
        // the cache key; otherwise a request with different flags would receive
        // Markdown produced for another request. The default combination keeps
        // the plain 'markdown' namespace.
        // NOTE(review): assumes createCacheKey accepts any namespace string - confirm.
        let cacheNamespace = 'markdown';
        if (!extractMainContent || !includeMetadata) {
            cacheNamespace = `markdown:main=${Boolean(extractMainContent)}:meta=${Boolean(includeMetadata)}`;
        }
        const cacheKey = cache.createCacheKey(cacheNamespace, url);
        const cached = cache.get(cacheKey);
        if (cached) {
            return {
                content: [
                    {
                        type: 'text',
                        text: JSON.stringify({
                            url,
                            cached: true,
                            fetchedAt: cached.fetchedAt,
                            markdown: cached.content,
                        }),
                    },
                ],
            };
        }
        const html = await fetchUrlWithRetry(url);
        const { markdown, title } = extractAndConvertToMarkdown(html, url, {
            extractMainContent,
            includeMetadata,
        });
        cache.set(cacheKey, markdown);
        return {
            content: [
                {
                    type: 'text',
                    text: JSON.stringify({
                        url,
                        title,
                        fetchedAt: new Date().toISOString(),
                        markdown,
                        cached: false,
                    }, null, 2),
                },
            ],
        };
    }
    catch (error) {
        return {
            content: [
                {
                    type: 'text',
                    text: JSON.stringify({
                        error: `Failed to fetch markdown: ${error instanceof Error ? error.message : 'Unknown error'}`,
                        url: input.url,
                    }),
                },
            ],
            isError: true,
        };
    }
}
|
|
99
|
+
//# sourceMappingURL=fetch-markdown.tool.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fetch-markdown.tool.js","sourceRoot":"","sources":["../../../src/tools/handlers/fetch-markdown.tool.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,uBAAuB,EAAE,MAAM,8BAA8B,CAAC;AACvE,OAAO,EAAE,iBAAiB,EAAE,MAAM,2BAA2B,CAAC;AAC9D,OAAO,EAAE,cAAc,EAAE,MAAM,6BAA6B,CAAC;AAC7D,OAAO,EAAE,cAAc,EAAE,MAAM,4CAA4C,CAAC;AAC5E,OAAO,KAAK,KAAK,MAAM,yBAAyB,CAAC;AACjD,OAAO,EAAE,MAAM,EAAE,MAAM,uBAAuB,CAAC;AAG/C,MAAM,CAAC,MAAM,wBAAwB,GAAG,gBAAgB,CAAC;AACzD,MAAM,CAAC,MAAM,+BAA+B,GAC1C,sFAAsF,CAAC;AAEzF,SAAS,2BAA2B,CAClC,IAAY,EACZ,GAAW,EACX,OAAkE;IAElE,+DAA+D;IAC/D,MAAM,EAAE,OAAO,EAAE,QAAQ,EAAE,aAAa,EAAE,GAAG,cAAc,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;IAEvE,IAAI,OAAO,CAAC,kBAAkB,IAAI,MAAM,CAAC,UAAU,CAAC,kBAAkB,IAAI,OAAO,EAAE,CAAC;QAClF,MAAM,QAAQ,GACZ,OAAO,CAAC,eAAe,IAAI,MAAM,CAAC,UAAU,CAAC,eAAe;YAC1D,CAAC,CAAC;gBACE,IAAI,EAAE,UAAmB;gBACzB,KAAK,EAAE,OAAO,CAAC,KAAK;gBACpB,MAAM,EAAE,OAAO,CAAC,MAAM;gBACtB,GAAG;gBACH,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;aACpC;YACH,CAAC,CAAC,SAAS,CAAC;QAEhB,OAAO;YACL,QAAQ,EAAE,cAAc,CAAC,OAAO,CAAC,OAAO,EAAE,QAAQ,CAAC;YACnD,KAAK,EAAE,OAAO,CAAC,KAAK;SACrB,CAAC;IACJ,CAAC;IAED,8BAA8B;IAC9B,MAAM,QAAQ,GACZ,OAAO,CAAC,eAAe,IAAI,MAAM,CAAC,UAAU,CAAC,eAAe;QAC1D,CAAC,CAAC;YACE,IAAI,EAAE,UAAmB;YACzB,KAAK,EAAE,aAAa,CAAC,KAAK;YAC1B,WAAW,EAAE,aAAa,CAAC,WAAW;YACtC,MAAM,EAAE,aAAa,CAAC,MAAM;YAC5B,GAAG;YACH,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACpC;QACH,CAAC,CAAC,SAAS,CAAC;IAEhB,OAAO;QACL,QAAQ,EAAE,cAAc,CAAC,IAAI,EAAE,QAAQ,CAAC;QACxC,KAAK,EAAE,aAAa,CAAC,KAAK;KAC3B,CAAC;AACJ,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,wBAAwB,CAAC,KAAyB;IACtE,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,uBAAuB,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC/C,MAAM,QAAQ,GAAG,KAAK,CAAC,cAAc,CAAC,UAAU,EAAE,GAAG,CAAC,CAAC;QAEvD,MAAM,MAAM,GAAG,KAAK,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;QACnC,IAAI,MAAM,EAAE,CAAC;YACX,OAAO;gBACL,OAAO,EAAE;oBACP;wBACE,IAAI,EAAE,MAAe;wBACrB,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;4BACnB,GAAG;4BACH,MAAM,EAAE,IAAI;4BACZ,SAAS,EAAE,MAAM,CAAC,SAAS;4BAC3B,QAAQ,EAAE,MAAM,CAAC,OAAO;yBACzB,CAAC;qBACH;iBACF;aACF,CAAC;QACJ,CAAC;QAE
D,MAAM,IAAI,GAAG,MAAM,iBAAiB,CAAC,GAAG,CAAC,CAAC;QAE1C,MAAM,EAAE,QAAQ,EAAE,KAAK,EAAE,GAAG,2BAA2B,CAAC,IAAI,EAAE,GAAG,EAAE;YACjE,kBAAkB,EAAE,KAAK,CAAC,kBAAkB,IAAI,IAAI;YACpD,eAAe,EAAE,KAAK,CAAC,eAAe,IAAI,IAAI;SAC/C,CAAC,CAAC;QAEH,KAAK,CAAC,GAAG,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;QAE9B,OAAO;YACL,OAAO,EAAE;gBACP;oBACE,IAAI,EAAE,MAAe;oBACrB,IAAI,EAAE,IAAI,CAAC,SAAS,CAClB;wBACE,GAAG;wBACH,KAAK;wBACL,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;wBACnC,QAAQ;wBACR,MAAM,EAAE,KAAK;qBACd,EACD,IAAI,EACJ,CAAC,CACF;iBACF;aACF;SACF,CAAC;IACJ,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO;YACL,OAAO,EAAE;gBACP;oBACE,IAAI,EAAE,MAAe;oBACrB,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;wBACnB,KAAK,EAAE,6BAA6B,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,EAAE;wBAC9F,GAAG,EAAE,KAAK,CAAC,GAAG;qBACf,CAAC;iBACH;aACF;YACD,OAAO,EAAE,IAAI;SACd,CAAC;IACJ,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import type { FetchUrlInput } from '../../types/index.js';
|
|
2
|
+
/** Identifier string of the fetch-url tool. */
export declare const FETCH_URL_TOOL_NAME = "fetch-url";
|
|
3
|
+
/** Description text that accompanies the fetch-url tool. */
export declare const FETCH_URL_TOOL_DESCRIPTION = "Fetches a webpage and converts it to AI-readable JSONL format with semantic content blocks";
|
|
4
|
+
/**
 * Type declaration for the fetch-url tool handler.
 *
 * The returned promise resolves to one of two shapes, both carrying a
 * `content` array of `{ type: "text"; text: string }` items:
 * - a result without `isError`, or
 * - a result with a boolean `isError` (the compiled handler in this package
 *   sets it to `true` when it catches an error while processing the URL).
 *
 * @param input - Tool arguments, typed as `FetchUrlInput`.
 */
export declare function fetchUrlToolHandler(input: FetchUrlInput): Promise<{
|
|
5
|
+
content: {
|
|
6
|
+
type: "text";
|
|
7
|
+
text: string;
|
|
8
|
+
}[];
|
|
9
|
+
isError?: undefined;
|
|
10
|
+
} | {
|
|
11
|
+
content: {
|
|
12
|
+
type: "text";
|
|
13
|
+
text: string;
|
|
14
|
+
}[];
|
|
15
|
+
isError: boolean;
|
|
16
|
+
}>;
|
|
17
|
+
//# sourceMappingURL=fetch-url.tool.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fetch-url.tool.d.ts","sourceRoot":"","sources":["../../../src/tools/handlers/fetch-url.tool.ts"],"names":[],"mappings":"AAOA,OAAO,KAAK,EACV,aAAa,EAGd,MAAM,sBAAsB,CAAC;AAE9B,eAAO,MAAM,mBAAmB,cAAc,CAAC;AAC/C,eAAO,MAAM,0BAA0B,+FACuD,CAAC;AAkD/F,wBAAsB,mBAAmB,CAAC,KAAK,EAAE,aAAa;;;;;;;;;;;;GAwE7D"}
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
import { validateAndNormalizeUrl } from '../../utils/url-validator.js';
|
|
2
|
+
import { fetchUrlWithRetry } from '../../services/fetcher.js';
|
|
3
|
+
import { extractContent } from '../../services/extractor.js';
|
|
4
|
+
import { parseHtml } from '../../services/parser.js';
|
|
5
|
+
import { toJsonl } from '../../transformers/jsonl.transformer.js';
|
|
6
|
+
import * as cache from '../../services/cache.js';
|
|
7
|
+
import { config } from '../../config/index.js';
|
|
8
|
+
// Identifier string of the fetch-url tool.
export const FETCH_URL_TOOL_NAME = 'fetch-url';
|
|
9
|
+
// Description text that accompanies the fetch-url tool.
export const FETCH_URL_TOOL_DESCRIPTION = 'Fetches a webpage and converts it to AI-readable JSONL format with semantic content blocks';
|
|
10
|
+
/**
 * Turns raw HTML into content blocks plus an optional metadata record.
 *
 * `extractContent` is called once and yields both the main-content article
 * (may be absent) and the page-level metadata. The article body is parsed when
 * main-content extraction is enabled by the caller AND by `config.extraction`
 * AND an article was found; otherwise the full HTML is parsed.
 *
 * Both paths now produce the same metadata shape: the page description is
 * always carried over, and an empty article title/byline falls back to the
 * page-level title/author instead of being reported as missing.
 *
 * @param html - Raw HTML of the fetched page.
 * @param url - URL the HTML was fetched from; copied into the metadata record.
 * @param options - `{ extractMainContent, includeMetadata }` flags from the caller.
 * @returns `{ contentBlocks, metadata, title }`; `metadata` is `undefined` when
 *   metadata is disabled by the caller or by `config.extraction`.
 */
function extractContentFromHtml(html, url, options) {
    // Single extraction pass provides both the article and the page metadata.
    const { article, metadata: extractedMeta } = extractContent(html, url);
    const pageMeta = extractedMeta ?? {};
    const useArticle = Boolean(options.extractMainContent && config.extraction.extractMainContent && article);
    // Prefer article-level values, but never lose information the page metadata has.
    const title = useArticle ? article.title || pageMeta.title : pageMeta.title;
    const author = useArticle ? article.byline || pageMeta.author : pageMeta.author;
    const contentBlocks = parseHtml(useArticle ? article.content : html);
    const metadata = options.includeMetadata && config.extraction.includeMetadata
        ? {
            type: 'metadata',
            title,
            description: pageMeta.description,
            author,
            url,
            fetchedAt: new Date().toISOString(),
        }
        : undefined;
    return { contentBlocks, metadata, title };
}
|
|
40
|
+
/**
 * Applies the caller's `maxContentLength` to a JSONL payload.
 *
 * @param content - JSONL text; non-string values are returned untouched.
 * @param maxLength - Character limit; a falsy value means "no limit".
 * @returns The content unchanged, or its first `maxLength` characters followed
 *   by the `\n...[truncated]` marker.
 */
function truncateJsonlContent(content, maxLength) {
    if (!maxLength || typeof content !== 'string' || content.length <= maxLength) {
        return content;
    }
    return content.substring(0, maxLength) + '\n...[truncated]';
}
/**
 * Tool handler for `fetch-url`: validates the URL, fetches the page, converts
 * it to JSONL and wraps the outcome in a text content item.
 *
 * Caching rules:
 * - The cache key is derived from the URL alone, so only requests that use the
 *   default extraction options and send no custom headers read from or write
 *   to the cache. Any other request always fetches fresh and leaves the cache
 *   untouched, which prevents one variant from being served for another and
 *   keeps header-dependent responses out of the shared cache.
 * - The cache stores the full JSONL; `maxContentLength` is applied when the
 *   response is built (for both cached and fresh results), so a small limit on
 *   one call never truncates what later calls receive.
 *
 * @param input - Tool arguments: `url`, optional `customHeaders`,
 *   `extractMainContent`, `includeMetadata` (both default to `true`) and
 *   `maxContentLength`.
 * @returns A result with a single text item holding a JSON document. On any
 *   thrown error the JSON carries `error` and the original `url`, and the
 *   result has `isError: true`; the handler itself does not throw.
 */
export async function fetchUrlToolHandler(input) {
    try {
        const url = validateAndNormalizeUrl(input.url);
        const extractMainContent = input.extractMainContent ?? true;
        const includeMetadata = input.includeMetadata ?? true;
        const hasCustomHeaders = input.customHeaders != null && Object.keys(input.customHeaders).length > 0;
        // Only the default variant of a page may share the URL-keyed cache entry.
        const cacheable = extractMainContent && includeMetadata && !hasCustomHeaders;
        const cacheKey = cacheable ? cache.createCacheKey('url', url) : undefined;
        const cached = cacheable ? cache.get(cacheKey) : undefined;
        if (cached) {
            return {
                content: [
                    {
                        type: 'text',
                        text: JSON.stringify({
                            url,
                            cached: true,
                            fetchedAt: cached.fetchedAt,
                            content: truncateJsonlContent(cached.content, input.maxContentLength),
                        }),
                    },
                ],
            };
        }
        const html = await fetchUrlWithRetry(url, input.customHeaders);
        const { contentBlocks, metadata, title } = extractContentFromHtml(html, url, {
            extractMainContent,
            includeMetadata,
        });
        const fullContent = toJsonl(contentBlocks, metadata);
        if (cacheable) {
            // Store the untruncated payload; the length limit is per request.
            cache.set(cacheKey, fullContent);
        }
        const jsonlContent = truncateJsonlContent(fullContent, input.maxContentLength);
        return {
            content: [
                {
                    type: 'text',
                    text: JSON.stringify({
                        url,
                        title,
                        contentBlocks: contentBlocks.length,
                        fetchedAt: new Date().toISOString(),
                        format: 'jsonl',
                        content: jsonlContent,
                        cached: false,
                    }, null, 2),
                },
            ],
        };
    }
    catch (error) {
        return {
            content: [
                {
                    type: 'text',
                    text: JSON.stringify({
                        error: `Failed to fetch URL: ${error instanceof Error ? error.message : 'Unknown error'}`,
                        url: input.url,
                    }),
                },
            ],
            isError: true,
        };
    }
}
|
|
103
|
+
//# sourceMappingURL=fetch-url.tool.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fetch-url.tool.js","sourceRoot":"","sources":["../../../src/tools/handlers/fetch-url.tool.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,uBAAuB,EAAE,MAAM,8BAA8B,CAAC;AACvE,OAAO,EAAE,iBAAiB,EAAE,MAAM,2BAA2B,CAAC;AAC9D,OAAO,EAAE,cAAc,EAAE,MAAM,6BAA6B,CAAC;AAC7D,OAAO,EAAE,SAAS,EAAE,MAAM,0BAA0B,CAAC;AACrD,OAAO,EAAE,OAAO,EAAE,MAAM,yCAAyC,CAAC;AAClE,OAAO,KAAK,KAAK,MAAM,yBAAyB,CAAC;AACjD,OAAO,EAAE,MAAM,EAAE,MAAM,uBAAuB,CAAC;AAO/C,MAAM,CAAC,MAAM,mBAAmB,GAAG,WAAW,CAAC;AAC/C,MAAM,CAAC,MAAM,0BAA0B,GACrC,4FAA4F,CAAC;AAQ/F,SAAS,sBAAsB,CAC7B,IAAY,EACZ,GAAW,EACX,OAAkE;IAElE,+DAA+D;IAC/D,MAAM,EAAE,OAAO,EAAE,QAAQ,EAAE,aAAa,EAAE,GAAG,cAAc,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;IAEvE,IAAI,OAAO,CAAC,kBAAkB,IAAI,MAAM,CAAC,UAAU,CAAC,kBAAkB,IAAI,OAAO,EAAE,CAAC;QAClF,MAAM,aAAa,GAAG,SAAS,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QACjD,MAAM,QAAQ,GACZ,OAAO,CAAC,eAAe,IAAI,MAAM,CAAC,UAAU,CAAC,eAAe;YAC1D,CAAC,CAAC;gBACE,IAAI,EAAE,UAAmB;gBACzB,KAAK,EAAE,OAAO,CAAC,KAAK;gBACpB,MAAM,EAAE,OAAO,CAAC,MAAM;gBACtB,GAAG;gBACH,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;aACpC;YACH,CAAC,CAAC,SAAS,CAAC;QAEhB,OAAO,EAAE,aAAa,EAAE,QAAQ,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,CAAC;IAC3D,CAAC;IAED,qCAAqC;IACrC,MAAM,aAAa,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAEtC,MAAM,QAAQ,GACZ,OAAO,CAAC,eAAe,IAAI,MAAM,CAAC,UAAU,CAAC,eAAe;QAC1D,CAAC,CAAC;YACE,IAAI,EAAE,UAAmB;YACzB,KAAK,EAAE,aAAa,CAAC,KAAK;YAC1B,WAAW,EAAE,aAAa,CAAC,WAAW;YACtC,MAAM,EAAE,aAAa,CAAC,MAAM;YAC5B,GAAG;YACH,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACpC;QACH,CAAC,CAAC,SAAS,CAAC;IAEhB,OAAO,EAAE,aAAa,EAAE,QAAQ,EAAE,KAAK,EAAE,aAAa,CAAC,KAAK,EAAE,CAAC;AACjE,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,mBAAmB,CAAC,KAAoB;IAC5D,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,uBAAuB,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC/C,MAAM,QAAQ,GAAG,KAAK,CAAC,cAAc,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;QAElD,MAAM,MAAM,GAAG,KAAK,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;QACnC,IAAI,MAAM,EAAE,CAAC;YACX,OAAO;gBACL,OAAO,EAAE;oBACP;wBACE,IAAI,EAAE,MAAe;wBACrB,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;4BACnB,GAAG;4BACH,MAAM,EAAE,IAAI;4BACZ,SAAS,EAAE,MAAM,CAAC,SAAS;4BAC3B,O
AAO,EAAE,MAAM,CAAC,OAAO;yBACxB,CAAC;qBACH;iBACF;aACF,CAAC;QACJ,CAAC;QAED,MAAM,IAAI,GAAG,MAAM,iBAAiB,CAAC,GAAG,EAAE,KAAK,CAAC,aAAa,CAAC,CAAC;QAE/D,MAAM,EAAE,aAAa,EAAE,QAAQ,EAAE,KAAK,EAAE,GAAG,sBAAsB,CAAC,IAAI,EAAE,GAAG,EAAE;YAC3E,kBAAkB,EAAE,KAAK,CAAC,kBAAkB,IAAI,IAAI;YACpD,eAAe,EAAE,KAAK,CAAC,eAAe,IAAI,IAAI;SAC/C,CAAC,CAAC;QAEH,IAAI,YAAY,GAAG,OAAO,CAAC,aAAa,EAAE,QAAQ,CAAC,CAAC;QAEpD,IAAI,KAAK,CAAC,gBAAgB,IAAI,YAAY,CAAC,MAAM,GAAG,KAAK,CAAC,gBAAgB,EAAE,CAAC;YAC3E,YAAY;gBACV,YAAY,CAAC,SAAS,CAAC,CAAC,EAAE,KAAK,CAAC,gBAAgB,CAAC,GAAG,kBAAkB,CAAC;QAC3E,CAAC;QAED,KAAK,CAAC,GAAG,CAAC,QAAQ,EAAE,YAAY,CAAC,CAAC;QAElC,OAAO;YACL,OAAO,EAAE;gBACP;oBACE,IAAI,EAAE,MAAe;oBACrB,IAAI,EAAE,IAAI,CAAC,SAAS,CAClB;wBACE,GAAG;wBACH,KAAK;wBACL,aAAa,EAAE,aAAa,CAAC,MAAM;wBACnC,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;wBACnC,MAAM,EAAE,OAAO;wBACf,OAAO,EAAE,YAAY;wBACrB,MAAM,EAAE,KAAK;qBACd,EACD,IAAI,EACJ,CAAC,CACF;iBACF;aACF;SACF,CAAC;IACJ,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO;YACL,OAAO,EAAE;gBACP;oBACE,IAAI,EAAE,MAAe;oBACrB,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;wBACnB,KAAK,EAAE,wBAAwB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,EAAE;wBACzF,GAAG,EAAE,KAAK,CAAC,GAAG;qBACf,CAAC;iBACH;aACF;YACD,OAAO,EAAE,IAAI;SACd,CAAC;IACJ,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
|
2
|
+
/**
|
|
3
|
+
* Registers all tools with the MCP server using the modern McpServer API
|
|
4
|
+
* Tools are registered with Zod schemas for automatic validation
 *
 * @param server - McpServer instance the tools are registered on.
|
|
5
|
+
*/
|
|
6
|
+
export declare function registerTools(server: McpServer): void;
|
|
7
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/tools/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,yCAAyC,CAAC;AA2EzE;;;GAGG;AACH,wBAAgB,aAAa,CAAC,MAAM,EAAE,SAAS,GAAG,IAAI,CAiCrD"}
|