searchfetch 3.0.1 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.js +117 -2
- package/package.json +1 -1
- package/templates/docker-hub.json +41 -0
- package/templates/docs-page.json +17 -6
- package/templates/docs-rs.json +47 -0
- package/templates/duckduckgo-search.json +1 -1
- package/templates/google-search.json +1 -1
package/index.js
CHANGED
|
@@ -408,6 +408,100 @@ async function fetchHtmlWithRetry(url, template, blockMedia) {
|
|
|
408
408
|
throw lastError;
|
|
409
409
|
}
|
|
410
410
|
|
|
411
|
+
// === SOURCE MARKDOWN DETECTION ===========================================
|
|
412
|
+
|
|
413
|
+
/**
 * Heuristically decide whether a piece of text looks like raw markdown.
 *
 * Text containing more than three HTML-looking opening tags is treated as
 * HTML and rejected; otherwise a single markdown signal (heading, link,
 * code fence, bullet, bold, or blockquote) is enough to accept it.
 *
 * @param {string} text - Candidate text; falsy values are rejected.
 * @returns {boolean} True when the text looks like markdown.
 */
function isMarkdownContent(text) {
  if (!text) {
    return false;
  }

  // Count opening-tag-like sequences; closing tags (`</x>`) do not match.
  const tagMatches = text.match(/<\w+[^>]*>/g);
  const tagTotal = tagMatches === null ? 0 : tagMatches.length;
  if (tagTotal > 3) {
    return false;
  }

  const markdownSignals = [
    /^#{1,6}\s+\S/m, // ATX heading
    /\[.+?\]\(.+?\)/, // inline link
    /```\w*\n/, // fenced code block opener
    /^\s*[-*+]\s+\S/m, // bullet list item
    /\*\*[^*]+\*\*/, // bold text
    /^>\s+\S/m, // blockquote
  ];

  return markdownSignals.some((signal) => signal.test(text));
}
|
|
430
|
+
|
|
431
|
+
/**
 * Clean up fetched markdown source before it is returned to the caller.
 *
 * Blanks out every line that starts with `@twoslash-cache:`, collapses runs
 * of three or more newlines down to a single blank line, and trims
 * surrounding whitespace.
 *
 * @param {string} content - Raw markdown text.
 * @returns {string} The cleaned markdown.
 */
function stripSourceMarkdown(content) {
  // The cache line is emptied, not removed; the newline collapse below
  // absorbs the blank line that may be left behind.
  const withoutCacheLines = content.replace(/^@twoslash-cache:.*$/gm, "");
  const collapsed = withoutCacheLines.replace(/\n{3,}/g, "\n\n");
  return collapsed.trim();
}
|
|
437
|
+
|
|
438
|
+
/**
 * Build the URL of a page's markdown source from a template's `source_url`.
 *
 * The `"{url}.md"` template is special-cased: `.md` is appended to the path
 * portion of the URL (after trailing slashes are removed), and any query
 * string or fragment is kept after it. Any other template has its first
 * `{url}` placeholder replaced with the page URL verbatim.
 *
 * @param {string} sourceTemplate - Template string containing `{url}`.
 * @param {string} url - The page URL being fetched.
 * @returns {string} The resolved source URL.
 */
function resolveSourceUrl(sourceTemplate, url) {
  if (sourceTemplate === "{url}.md") {
    // Split off "?query" / "#fragment" so ".md" lands on the path instead
    // of being glued onto the end of a query value.
    const cut = url.search(/[?#]/);
    const base = cut === -1 ? url : url.slice(0, cut);
    const suffix = cut === -1 ? "" : url.slice(cut);
    const trimmed = base.replace(/\/+$/, "");

    // A bare origin ("https://docs.example.com/") must not become
    // "https://docs.example.com.md", which is a different host. Request the
    // root index document so the fetch stays on the same site.
    if (/^[a-z][a-z0-9+.-]*:\/\/[^/]+$/i.test(trimmed)) {
      return `${trimmed}/index.md${suffix}`;
    }
    return `${trimmed}.md${suffix}`;
  }

  // Use a replacer function so "$&", "$$", "$1", etc. inside the URL are
  // inserted literally rather than interpreted as replacement patterns.
  return sourceTemplate.replace("{url}", () => url);
}
|
|
444
|
+
|
|
445
|
+
/**
 * Best-effort fetch of a page's raw markdown source.
 *
 * Opens `sourceUrl` in a fresh browser context (with the template's cookies,
 * if any), reads the document body's text, cleans it with
 * `stripSourceMarkdown`, and returns it only when `isMarkdownContent`
 * accepts it.
 *
 * @param {string} sourceUrl - URL expected to serve markdown.
 * @param {object} [template] - Template; `cookies` and `block_resources`
 *   are read when present.
 * @param {boolean} blockMedia - When truthy, abort requests whose resource
 *   type is in `template.block_resources` (default: image, media, font).
 * @returns {Promise<string|null>} The markdown text, or null when
 *   navigation fails, the response status is >= 400, no text could be read,
 *   or the text does not look like markdown.
 */
async function fetchSourceMarkdown(sourceUrl, template, blockMedia) {
  const browser = await browserManager.getBrowser();
  const context = await browser.newContext();

  try {
    if (template && template.cookies && template.cookies.length > 0) {
      await context.addCookies(template.cookies);
    }

    const page = await context.newPage();
    try {
      if (blockMedia) {
        const blockedTypes =
          template && template.block_resources
            ? template.block_resources
            : ["image", "media", "font"];
        if (blockedTypes.length > 0) {
          await page.route("**/*", (route) => {
            const type = route.request().resourceType();
            const settled = blockedTypes.includes(type)
              ? route.abort()
              : route.continue();
            // A request can still be in flight when the page is closed;
            // handle that rejection here so it never becomes an unhandled
            // promise rejection.
            return settled.catch(() => {});
          });
        }
      }

      let response;
      try {
        response = await page.goto(sourceUrl, {
          waitUntil: "domcontentloaded",
          timeout: 10000,
        });
      } catch (_) {
        // Navigation failure (timeout, DNS, aborted load) means no source.
        return null;
      }

      if (response && response.status() >= 400) return null;

      let text;
      try {
        text = await page.evaluate(
          "() => document.body?.innerText || document.body?.textContent || ''",
        );
      } catch (_) {
        // Fall back to the serialized document; the markdown check below
        // decides whether it is usable.
        text = await page.content();
      }

      if (!text || typeof text !== "string") return null;

      text = stripSourceMarkdown(text.trim());
      return isMarkdownContent(text) ? text : null;
    } finally {
      // A failure while closing must not replace an already computed result.
      await page.close().catch(() => {});
    }
  } catch (_) {
    return null;
  } finally {
    // Same reasoning: a close failure should not escape to the caller.
    await context.close().catch(() => {});
  }
}
|
|
504
|
+
|
|
411
505
|
// === HTML CLEANUP ========================================================
|
|
412
506
|
|
|
413
507
|
const DEFAULT_REMOVE_SELECTORS = [
|
|
@@ -1095,10 +1189,31 @@ server.registerTool(
|
|
|
1095
1189
|
template = getTemplateByName(templateParam);
|
|
1096
1190
|
}
|
|
1097
1191
|
|
|
1098
|
-
// 2.
|
|
1192
|
+
// 2. Try source markdown if template specifies source_url
|
|
1193
|
+
let sourceMd = null;
|
|
1194
|
+
if (template && template.source_url) {
|
|
1195
|
+
const sourceUrl = resolveSourceUrl(template.source_url, url);
|
|
1196
|
+
sourceMd = await fetchSourceMarkdown(sourceUrl, template, block_media);
|
|
1197
|
+
}
|
|
1198
|
+
|
|
1199
|
+
if (sourceMd !== null) {
|
|
1200
|
+
const totalLength = sourceMd.length;
|
|
1201
|
+
const paginated = sourceMd.substring(start_index, start_index + max_length);
|
|
1202
|
+
let metadata =
|
|
1203
|
+
`\n\n---\n[webfetch: template="${template ? template.name : "auto"}" (source markdown), ` +
|
|
1204
|
+
`showing characters ${start_index} to ${start_index + paginated.length} of ${totalLength} total.`;
|
|
1205
|
+
if (start_index + max_length < totalLength) {
|
|
1206
|
+
metadata +=
|
|
1207
|
+
` Use start_index=${start_index + max_length} to read more.`;
|
|
1208
|
+
}
|
|
1209
|
+
metadata += "]";
|
|
1210
|
+
return { content: [{ type: "text", text: paginated + metadata }] };
|
|
1211
|
+
}
|
|
1212
|
+
|
|
1213
|
+
// 3. Fetch
|
|
1099
1214
|
const html = await fetchHtmlWithRetry(url, template, block_media);
|
|
1100
1215
|
|
|
1101
|
-
//
|
|
1216
|
+
// 4. Extract and compose
|
|
1102
1217
|
const $ = cheerio.load(html);
|
|
1103
1218
|
|
|
1104
1219
|
if (template) {
|
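package/package.json
CHANGED
(hunk not captured in this extract; the summary above lists it as +1 -1)
package/templates/docker-hub.json
ADDED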
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "docker-hub",
|
|
3
|
+
"description": "Docker Hub image page — official and community images with description and README",
|
|
4
|
+
"order": 10,
|
|
5
|
+
"url_patterns": [
|
|
6
|
+
"^https?://hub\\.docker\\.com/_/[^/]+/?$",
|
|
7
|
+
"^https?://hub\\.docker\\.com/r/[^/]+/[^/]+/?$"
|
|
8
|
+
],
|
|
9
|
+
"remove": [
|
|
10
|
+
"script",
|
|
11
|
+
"style",
|
|
12
|
+
"svg",
|
|
13
|
+
"nav",
|
|
14
|
+
"footer",
|
|
15
|
+
"header",
|
|
16
|
+
".styles_sidebar__",
|
|
17
|
+
".styles_chatbot__",
|
|
18
|
+
"[data-testid='chatbot']"
|
|
19
|
+
],
|
|
20
|
+
"sections": [
|
|
21
|
+
{
|
|
22
|
+
"name": "Image",
|
|
23
|
+
"selector": "h1",
|
|
24
|
+
"format": "text",
|
|
25
|
+
"required": false
|
|
26
|
+
},
|
|
27
|
+
{
|
|
28
|
+
"name": "Description",
|
|
29
|
+
"selector": "meta[name='description']",
|
|
30
|
+
"format": "attribute",
|
|
31
|
+
"attribute": "content",
|
|
32
|
+
"required": false
|
|
33
|
+
},
|
|
34
|
+
{
|
|
35
|
+
"name": "Content",
|
|
36
|
+
"selector": "#readme-more-content-wrapper, article, [data-testid='description'], main, .MuiContainer-root",
|
|
37
|
+
"format": "markdown",
|
|
38
|
+
"required": false
|
|
39
|
+
}
|
|
40
|
+
]
|
|
41
|
+
}
|
package/templates/docs-page.json
CHANGED
|
@@ -1,29 +1,40 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "docs-page",
|
|
3
|
-
"description": "Documentation page (ReadTheDocs, Sphinx, etc.)",
|
|
4
|
-
"order":
|
|
3
|
+
"description": "Documentation page (ReadTheDocs, Sphinx, Mintlify, VitePress, Docusaurus, Nextra, etc.)",
|
|
4
|
+
"order": 9,
|
|
5
5
|
"url_patterns": [
|
|
6
6
|
"^https?://[^/]+\\.readthedocs\\.io/.*",
|
|
7
|
-
"^https?://[^/]+\\.rtfd\\.io/.*"
|
|
7
|
+
"^https?://[^/]+\\.rtfd\\.io/.*",
|
|
8
|
+
"^https?://docs\\.mintlify\\.com/.*",
|
|
9
|
+
"^https?://[^/]+\\.mintlify\\.(?:com|app|dev)/.*",
|
|
10
|
+
"^https?://[^/]+/docs(?:/.*)?$",
|
|
11
|
+
"^https?://[^/]+/doc(?:/.*)?$",
|
|
12
|
+
"^https?://docs\\.[^/]+\\.[^/]+/.*"
|
|
8
13
|
],
|
|
14
|
+
"source_url": "{url}.md",
|
|
9
15
|
"remove": [
|
|
10
16
|
"script",
|
|
11
17
|
"style",
|
|
12
18
|
"svg",
|
|
13
19
|
"nav",
|
|
14
20
|
"footer",
|
|
15
|
-
".sphinxsidebar"
|
|
21
|
+
".sphinxsidebar",
|
|
22
|
+
".sidebar",
|
|
23
|
+
"aside",
|
|
24
|
+
".nextra-toc",
|
|
25
|
+
".table-of-contents",
|
|
26
|
+
".pagination-nav"
|
|
16
27
|
],
|
|
17
28
|
"sections": [
|
|
18
29
|
{
|
|
19
30
|
"name": "Title",
|
|
20
|
-
"selector": "h1",
|
|
31
|
+
"selector": "h1, h2",
|
|
21
32
|
"format": "text",
|
|
22
33
|
"required": false
|
|
23
34
|
},
|
|
24
35
|
{
|
|
25
36
|
"name": "Content",
|
|
26
|
-
"selector": "[role='main'], .document, .rst-content, article",
|
|
37
|
+
"selector": "[role='main'], .document, .rst-content, article, .content, .markdown-body, .prose, main, .nextra-content, .doc-content",
|
|
27
38
|
"format": "markdown",
|
|
28
39
|
"required": false
|
|
29
40
|
}
|
|
package/templates/docs-rs.json
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "docs-rs",
|
|
3
|
+
"description": "docs.rs rustdoc documentation page — crate docs, struct/enum/fn items",
|
|
4
|
+
"order": 6,
|
|
5
|
+
"url_patterns": [
|
|
6
|
+
"^https?://docs\\.rs/[^/]+/[^/]+/[^/]+/?$",
|
|
7
|
+
"^https?://docs\\.rs/[^/]+/[^/]+/[^/]+/index\\.html$",
|
|
8
|
+
"^https?://docs\\.rs/[^/]+/[^/]+/[^/]+/.+\\.html$"
|
|
9
|
+
],
|
|
10
|
+
"remove": [
|
|
11
|
+
"script",
|
|
12
|
+
"style",
|
|
13
|
+
"svg",
|
|
14
|
+
"nav.sidebar",
|
|
15
|
+
"nav.sub",
|
|
16
|
+
"footer",
|
|
17
|
+
".search-form",
|
|
18
|
+
".rustdoc-version-dialog",
|
|
19
|
+
".mobile-topbar",
|
|
20
|
+
".out-of-band",
|
|
21
|
+
"#source-sidebar",
|
|
22
|
+
"#theme-picker",
|
|
23
|
+
"#settings-menu",
|
|
24
|
+
".since",
|
|
25
|
+
".srclink"
|
|
26
|
+
],
|
|
27
|
+
"sections": [
|
|
28
|
+
{
|
|
29
|
+
"name": "Crate",
|
|
30
|
+
"selector": "h1.crate-title, .crate-title h1, .in-band .crate-name",
|
|
31
|
+
"format": "text",
|
|
32
|
+
"required": false
|
|
33
|
+
},
|
|
34
|
+
{
|
|
35
|
+
"name": "Title",
|
|
36
|
+
"selector": "h1.fqn, .in-band h1, h1",
|
|
37
|
+
"format": "text",
|
|
38
|
+
"required": false
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
"name": "Content",
|
|
42
|
+
"selector": "#main-content, main, .main, .docblock",
|
|
43
|
+
"format": "markdown",
|
|
44
|
+
"required": false
|
|
45
|
+
}
|
|
46
|
+
]
|
|
47
|
+
}
|