searchfetch 3.0.0 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.js +159 -8
- package/package.json +1 -1
- package/templates/docker-hub.json +41 -0
- package/templates/docs-page.json +17 -6
- package/templates/docs-rs.json +47 -0
- package/templates/duckduckgo-search.json +1 -1
- package/templates/google-search.json +1 -1
package/index.js
CHANGED
|
@@ -244,6 +244,33 @@ function mapSearchParams(engine, query, region, safeSearch) {
|
|
|
244
244
|
|
|
245
245
|
// === FETCH ===============================================================
|
|
246
246
|
|
|
247
|
+
// Total number of fetchHtml attempts (first try included) that fetchHtmlWithRetry makes before giving up.
const FETCH_MAX_ATTEMPTS = 2;
|
|
248
|
+
// Default wait, in milliseconds, before retrying after an HTTP 429 when no usable Retry-After value is available.
const HTTP_429_RETRY_DELAY_MS = 2000;
|
|
249
|
+
|
|
250
|
+
/**
 * Pause for a given amount of time.
 *
 * @param {number} ms - Delay in milliseconds, handed straight to setTimeout.
 * @returns {Promise<void>} Promise that fulfils (with no value) once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
253
|
+
|
|
254
|
+
/**
 * Convert a `Retry-After` header value into a wait time in milliseconds.
 *
 * The header may carry either a number of seconds or an HTTP date. Both forms
 * are supported and the result is clamped to the range [0, maxMs].
 *
 * @param {string|null|undefined} value - Raw header value.
 * @param {number} [fallbackMs=HTTP_429_RETRY_DELAY_MS] - Delay returned when the
 *   header is missing, blank, negative, or unparseable.
 * @param {number} [maxMs=30000] - Upper bound applied to the computed delay.
 * @returns {number} Delay in milliseconds.
 */
function parseRetryAfterMs(value, fallbackMs = HTTP_429_RETRY_DELAY_MS, maxMs = 30000) {
  if (!value) return fallbackMs;

  // Number("   ") is 0, so a blank header would otherwise mean "retry immediately".
  const raw = String(value).trim();
  if (raw === "") return fallbackMs;

  const seconds = Number(raw);
  if (Number.isFinite(seconds)) {
    // A numeric value is never a date: negative numbers are invalid, not a timestamp.
    return seconds >= 0 ? Math.min(seconds * 1000, maxMs) : fallbackMs;
  }

  const dateMs = Date.parse(raw);
  if (Number.isFinite(dateMs)) {
    // Dates in the past mean "retry now"; dates far in the future are capped.
    return Math.min(Math.max(dateMs - Date.now(), 0), maxMs);
  }

  return fallbackMs;
}
|
|
266
|
+
|
|
267
|
+
/**
 * Build the Error thrown when a fetch is answered with a blocking HTTP status.
 *
 * @param {number} status - HTTP status code of the response.
 * @param {string} url - URL that was being fetched; embedded in the message.
 * @param {number|null} [retryAfterMs=null] - Suggested wait before retrying, if any.
 * @returns {Error} Error carrying `httpStatus` and `retryAfterMs` properties.
 */
function makeHttpStatusError(status, url, retryAfterMs = null) {
  const failure = new Error(`Access denied: HTTP ${status} when fetching ${url}`);
  return Object.assign(failure, { httpStatus: status, retryAfterMs });
}
|
|
273
|
+
|
|
247
274
|
function isAccessDenied($) {
|
|
248
275
|
const title = ($("title").text() || "").toLowerCase();
|
|
249
276
|
const bodyText = ($("body").text() || "").replace(/\s+/g, " ").trim().toLowerCase();
|
|
@@ -325,8 +352,10 @@ async function fetchHtml(url, template, blockMedia) {
|
|
|
325
352
|
if (response) {
|
|
326
353
|
const status = response.status();
|
|
327
354
|
if ([401, 403, 429].includes(status)) {
|
|
328
|
-
throw
|
|
329
|
-
|
|
355
|
+
throw makeHttpStatusError(
|
|
356
|
+
status,
|
|
357
|
+
url,
|
|
358
|
+
status === 429 ? parseRetryAfterMs(response.headers()["retry-after"]) : null,
|
|
330
359
|
);
|
|
331
360
|
}
|
|
332
361
|
}
|
|
@@ -352,18 +381,25 @@ async function fetchHtml(url, template, blockMedia) {
|
|
|
352
381
|
|
|
353
382
|
async function fetchHtmlWithRetry(url, template, blockMedia) {
|
|
354
383
|
let lastError;
|
|
355
|
-
for (let attempt = 0; attempt <
|
|
384
|
+
for (let attempt = 0; attempt < FETCH_MAX_ATTEMPTS; attempt++) {
|
|
356
385
|
try {
|
|
357
386
|
return await fetchHtml(url, template, blockMedia);
|
|
358
387
|
} catch (err) {
|
|
359
388
|
lastError = err;
|
|
360
389
|
if (
|
|
361
|
-
attempt
|
|
390
|
+
attempt < FETCH_MAX_ATTEMPTS - 1 &&
|
|
391
|
+
err.httpStatus === 429
|
|
392
|
+
) {
|
|
393
|
+
await sleep(err.retryAfterMs ?? HTTP_429_RETRY_DELAY_MS);
|
|
394
|
+
continue;
|
|
395
|
+
}
|
|
396
|
+
if (
|
|
397
|
+
attempt < FETCH_MAX_ATTEMPTS - 1 &&
|
|
362
398
|
(err.message.includes("net::") ||
|
|
363
399
|
err.message.includes("ERR_") ||
|
|
364
400
|
err.message.includes("Navigation failed"))
|
|
365
401
|
) {
|
|
366
|
-
|
|
402
|
+
await sleep(500);
|
|
367
403
|
continue;
|
|
368
404
|
}
|
|
369
405
|
throw err;
|
|
@@ -372,6 +408,100 @@ async function fetchHtmlWithRetry(url, template, blockMedia) {
|
|
|
372
408
|
throw lastError;
|
|
373
409
|
}
|
|
374
410
|
|
|
411
|
+
// === SOURCE MARKDOWN DETECTION ===========================================
|
|
412
|
+
|
|
413
|
+
/**
 * Heuristically decide whether a piece of text is Markdown source.
 *
 * Text containing more than three HTML opening tags is rejected; otherwise a
 * single Markdown marker (heading, link, code fence, list item, bold text or
 * block quote) is enough to accept it.
 *
 * @param {string} text - Candidate text.
 * @returns {boolean} True when the text looks like Markdown.
 */
function isMarkdownContent(text) {
  if (!text) {
    return false;
  }

  const openingTags = text.match(/<\w+[^>]*>/g) || [];
  if (openingTags.length > 3) {
    return false;
  }

  const markdownSignals = [
    /^#{1,6}\s+\S/m, // ATX heading
    /\[.+?\]\(.+?\)/, // inline link
    /```\w*\n/, // fenced code block opener
    /^\s*[-*+]\s+\S/m, // bullet list item
    /\*\*[^*]+\*\*/, // bold text
    /^>\s+\S/m, // block quote
  ];
  return markdownSignals.some((signal) => signal.test(text));
}
|
|
430
|
+
|
|
431
|
+
/**
 * Tidy raw Markdown source before it is returned to the caller.
 *
 * Drops `@twoslash-cache:` lines, squeezes runs of three or more newlines down
 * to a single blank line, and trims surrounding whitespace.
 *
 * @param {string} content - Raw Markdown text.
 * @returns {string} Cleaned Markdown text.
 */
function stripSourceMarkdown(content) {
  const withoutCacheLines = content.replace(/^@twoslash-cache:.*$/gm, "");
  const collapsed = withoutCacheLines.replace(/\n{3,}/g, "\n\n");
  return collapsed.trim();
}
|
|
437
|
+
|
|
438
|
+
/**
 * Build the URL of a page's Markdown source from a template's `source_url`.
 *
 * For the special template `"{url}.md"` the `.md` suffix is appended to the
 * URL *path* (trailing slashes removed), so query strings are preserved and
 * the host is never altered. A URL with an empty path maps to `/index.md`
 * as a best-effort guess, rather than turning `https://host/` into the
 * unrelated host `https://host.md`. The fragment is dropped because it is
 * never sent to the server.
 *
 * Any other template has every `{url}` placeholder replaced literally.
 *
 * @param {string} sourceTemplate - Template string containing `{url}`.
 * @param {string} url - Page URL requested by the caller.
 * @returns {string} URL to try for the Markdown source.
 */
function resolveSourceUrl(sourceTemplate, url) {
  if (sourceTemplate !== "{url}.md") {
    // split/join substitutes literally; String.replace would interpret "$&", "$1", ... in url.
    return sourceTemplate.split("{url}").join(url);
  }

  let parsed;
  try {
    parsed = new URL(url);
  } catch (_) {
    // Not an absolute URL: keep the plain string behaviour.
    return `${url.replace(/\/+$/, "")}.md`;
  }

  const path = parsed.pathname.replace(/\/+$/, "");
  parsed.pathname = path === "" ? "/index.md" : `${path}.md`;
  parsed.hash = "";
  return parsed.href;
}
|
|
444
|
+
|
|
445
|
+
/**
 * Best-effort fetch of a page's raw Markdown source through the shared browser.
 *
 * Opens a fresh browser context (with the template's cookies, if any),
 * navigates to `sourceUrl`, reads the body text and returns it only when it
 * looks like Markdown. Any navigation failure, HTTP status >= 400, empty body
 * or non-Markdown content yields `null` so the caller can use another path.
 *
 * Errors raised while acquiring the browser or creating the context are not
 * caught here and propagate to the caller.
 *
 * @param {string} sourceUrl - URL expected to serve Markdown.
 * @param {object|null} template - Optional template; `cookies` and
 *   `block_resources` are read from it when present.
 * @param {boolean} blockMedia - When true, abort requests whose resource type
 *   is listed in `template.block_resources` (default: image, media, font).
 * @returns {Promise<string|null>} Cleaned Markdown text, or null.
 */
async function fetchSourceMarkdown(sourceUrl, template, blockMedia) {
  const browser = await browserManager.getBrowser();
  const context = await browser.newContext();

  try {
    if (template && template.cookies && template.cookies.length > 0) {
      await context.addCookies(template.cookies);
    }

    const page = await context.newPage();

    if (blockMedia) {
      const blockedTypes =
        template && template.block_resources
          ? template.block_resources
          : ["image", "media", "font"];
      if (blockedTypes.length > 0) {
        await page.route("**/*", (route) => {
          const type = route.request().resourceType();
          const settled = blockedTypes.includes(type) ? route.abort() : route.continue();
          // Requests still in flight when the page closes make these calls reject;
          // swallow that so it never surfaces as an unhandled rejection.
          return Promise.resolve(settled).catch(() => {});
        });
      }
    }

    let response;
    try {
      response = await page.goto(sourceUrl, {
        waitUntil: "domcontentloaded",
        timeout: 10000,
      });
    } catch (_) {
      return null;
    }

    if (response && response.status() >= 400) return null;

    let text;
    try {
      text = await page.evaluate(
        "() => document.body?.innerText || document.body?.textContent || ''",
      );
    } catch (_) {
      // Fall back to the serialized document when the body text cannot be read.
      text = await page.content();
    }

    if (!text || typeof text !== "string") return null;

    text = stripSourceMarkdown(text.trim());
    return isMarkdownContent(text) ? text : null;
  } catch (_) {
    return null;
  } finally {
    // Closing the context also closes its pages. A cleanup failure must not
    // discard a good result or turn this best-effort helper into a rejection.
    try {
      await context.close();
    } catch (_) {
      /* ignore cleanup errors */
    }
  }
}
|
|
504
|
+
|
|
375
505
|
// === HTML CLEANUP ========================================================
|
|
376
506
|
|
|
377
507
|
const DEFAULT_REMOVE_SELECTORS = [
|
|
@@ -920,7 +1050,7 @@ function resolveSearchTemplate(engine, query, region, safeSearch) {
|
|
|
920
1050
|
|
|
921
1051
|
// === MCP SERVER & TOOLS ==================================================
|
|
922
1052
|
|
|
923
|
-
const server = new McpServer({ name: "searchfetch", version: "3.0.
|
|
1053
|
+
// NOTE(review): this release is published as 3.1.0 (see the diff title), yet the server reports version 3.0.1 — confirm whether the two are meant to match.
const server = new McpServer({ name: "searchfetch", version: "3.0.1" });
|
|
924
1054
|
|
|
925
1055
|
// --- websearch tool ---
|
|
926
1056
|
|
|
@@ -1059,10 +1189,31 @@ server.registerTool(
|
|
|
1059
1189
|
template = getTemplateByName(templateParam);
|
|
1060
1190
|
}
|
|
1061
1191
|
|
|
1062
|
-
// 2.
|
|
1192
|
+
// 2. Try source markdown if template specifies source_url
|
|
1193
|
+
let sourceMd = null;
|
|
1194
|
+
if (template && template.source_url) {
|
|
1195
|
+
const sourceUrl = resolveSourceUrl(template.source_url, url);
|
|
1196
|
+
sourceMd = await fetchSourceMarkdown(sourceUrl, template, block_media);
|
|
1197
|
+
}
|
|
1198
|
+
|
|
1199
|
+
if (sourceMd !== null) {
|
|
1200
|
+
const totalLength = sourceMd.length;
|
|
1201
|
+
const paginated = sourceMd.substring(start_index, start_index + max_length);
|
|
1202
|
+
let metadata =
|
|
1203
|
+
`\n\n---\n[webfetch: template="${template ? template.name : "auto"}" (source markdown), ` +
|
|
1204
|
+
`showing characters ${start_index} to ${start_index + paginated.length} of ${totalLength} total.`;
|
|
1205
|
+
if (start_index + max_length < totalLength) {
|
|
1206
|
+
metadata +=
|
|
1207
|
+
` Use start_index=${start_index + max_length} to read more.`;
|
|
1208
|
+
}
|
|
1209
|
+
metadata += "]";
|
|
1210
|
+
return { content: [{ type: "text", text: paginated + metadata }] };
|
|
1211
|
+
}
|
|
1212
|
+
|
|
1213
|
+
// 3. Fetch
|
|
1063
1214
|
const html = await fetchHtmlWithRetry(url, template, block_media);
|
|
1064
1215
|
|
|
1065
|
-
//
|
|
1216
|
+
// 4. Extract and compose
|
|
1066
1217
|
const $ = cheerio.load(html);
|
|
1067
1218
|
|
|
1068
1219
|
if (template) {
|
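package/package.json
CHANGED
(The one-line package.json change, +1 −1, was not captured in this listing.)
package/templates/docker-hub.json
ADDED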
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "docker-hub",
|
|
3
|
+
"description": "Docker Hub image page — official and community images with description and README",
|
|
4
|
+
"order": 10,
|
|
5
|
+
"url_patterns": [
|
|
6
|
+
"^https?://hub\\.docker\\.com/_/[^/]+/?$",
|
|
7
|
+
"^https?://hub\\.docker\\.com/r/[^/]+/[^/]+/?$"
|
|
8
|
+
],
|
|
9
|
+
"remove": [
|
|
10
|
+
"script",
|
|
11
|
+
"style",
|
|
12
|
+
"svg",
|
|
13
|
+
"nav",
|
|
14
|
+
"footer",
|
|
15
|
+
"header",
|
|
16
|
+
".styles_sidebar__",
|
|
17
|
+
".styles_chatbot__",
|
|
18
|
+
"[data-testid='chatbot']"
|
|
19
|
+
],
|
|
20
|
+
"sections": [
|
|
21
|
+
{
|
|
22
|
+
"name": "Image",
|
|
23
|
+
"selector": "h1",
|
|
24
|
+
"format": "text",
|
|
25
|
+
"required": false
|
|
26
|
+
},
|
|
27
|
+
{
|
|
28
|
+
"name": "Description",
|
|
29
|
+
"selector": "meta[name='description']",
|
|
30
|
+
"format": "attribute",
|
|
31
|
+
"attribute": "content",
|
|
32
|
+
"required": false
|
|
33
|
+
},
|
|
34
|
+
{
|
|
35
|
+
"name": "Content",
|
|
36
|
+
"selector": "#readme-more-content-wrapper, article, [data-testid='description'], main, .MuiContainer-root",
|
|
37
|
+
"format": "markdown",
|
|
38
|
+
"required": false
|
|
39
|
+
}
|
|
40
|
+
]
|
|
41
|
+
}
|
package/templates/docs-page.json
CHANGED
|
@@ -1,29 +1,40 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "docs-page",
|
|
3
|
-
"description": "Documentation page (ReadTheDocs, Sphinx, etc.)",
|
|
4
|
-
"order":
|
|
3
|
+
"description": "Documentation page (ReadTheDocs, Sphinx, Mintlify, VitePress, Docusaurus, Nextra, etc.)",
|
|
4
|
+
"order": 9,
|
|
5
5
|
"url_patterns": [
|
|
6
6
|
"^https?://[^/]+\\.readthedocs\\.io/.*",
|
|
7
|
-
"^https?://[^/]+\\.rtfd\\.io/.*"
|
|
7
|
+
"^https?://[^/]+\\.rtfd\\.io/.*",
|
|
8
|
+
"^https?://docs\\.mintlify\\.com/.*",
|
|
9
|
+
"^https?://[^/]+\\.mintlify\\.(?:com|app|dev)/.*",
|
|
10
|
+
"^https?://[^/]+/docs(?:/.*)?$",
|
|
11
|
+
"^https?://[^/]+/doc(?:/.*)?$",
|
|
12
|
+
"^https?://docs\\.[^/]+\\.[^/]+/.*"
|
|
8
13
|
],
|
|
14
|
+
"source_url": "{url}.md",
|
|
9
15
|
"remove": [
|
|
10
16
|
"script",
|
|
11
17
|
"style",
|
|
12
18
|
"svg",
|
|
13
19
|
"nav",
|
|
14
20
|
"footer",
|
|
15
|
-
".sphinxsidebar"
|
|
21
|
+
".sphinxsidebar",
|
|
22
|
+
".sidebar",
|
|
23
|
+
"aside",
|
|
24
|
+
".nextra-toc",
|
|
25
|
+
".table-of-contents",
|
|
26
|
+
".pagination-nav"
|
|
16
27
|
],
|
|
17
28
|
"sections": [
|
|
18
29
|
{
|
|
19
30
|
"name": "Title",
|
|
20
|
-
"selector": "h1",
|
|
31
|
+
"selector": "h1, h2",
|
|
21
32
|
"format": "text",
|
|
22
33
|
"required": false
|
|
23
34
|
},
|
|
24
35
|
{
|
|
25
36
|
"name": "Content",
|
|
26
|
-
"selector": "[role='main'], .document, .rst-content, article",
|
|
37
|
+
"selector": "[role='main'], .document, .rst-content, article, .content, .markdown-body, .prose, main, .nextra-content, .doc-content",
|
|
27
38
|
"format": "markdown",
|
|
28
39
|
"required": false
|
|
29
40
|
}
|
|
package/templates/docs-rs.json
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "docs-rs",
|
|
3
|
+
"description": "docs.rs rustdoc documentation page — crate docs, struct/enum/fn items",
|
|
4
|
+
"order": 6,
|
|
5
|
+
"url_patterns": [
|
|
6
|
+
"^https?://docs\\.rs/[^/]+/[^/]+/[^/]+/?$",
|
|
7
|
+
"^https?://docs\\.rs/[^/]+/[^/]+/[^/]+/index\\.html$",
|
|
8
|
+
"^https?://docs\\.rs/[^/]+/[^/]+/[^/]+/.+\\.html$"
|
|
9
|
+
],
|
|
10
|
+
"remove": [
|
|
11
|
+
"script",
|
|
12
|
+
"style",
|
|
13
|
+
"svg",
|
|
14
|
+
"nav.sidebar",
|
|
15
|
+
"nav.sub",
|
|
16
|
+
"footer",
|
|
17
|
+
".search-form",
|
|
18
|
+
".rustdoc-version-dialog",
|
|
19
|
+
".mobile-topbar",
|
|
20
|
+
".out-of-band",
|
|
21
|
+
"#source-sidebar",
|
|
22
|
+
"#theme-picker",
|
|
23
|
+
"#settings-menu",
|
|
24
|
+
".since",
|
|
25
|
+
".srclink"
|
|
26
|
+
],
|
|
27
|
+
"sections": [
|
|
28
|
+
{
|
|
29
|
+
"name": "Crate",
|
|
30
|
+
"selector": "h1.crate-title, .crate-title h1, .in-band .crate-name",
|
|
31
|
+
"format": "text",
|
|
32
|
+
"required": false
|
|
33
|
+
},
|
|
34
|
+
{
|
|
35
|
+
"name": "Title",
|
|
36
|
+
"selector": "h1.fqn, .in-band h1, h1",
|
|
37
|
+
"format": "text",
|
|
38
|
+
"required": false
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
"name": "Content",
|
|
42
|
+
"selector": "#main-content, main, .main, .docblock",
|
|
43
|
+
"format": "markdown",
|
|
44
|
+
"required": false
|
|
45
|
+
}
|
|
46
|
+
]
|
|
47
|
+
}
|