searchfetch 3.0.1 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.js CHANGED
@@ -408,6 +408,100 @@ async function fetchHtmlWithRetry(url, template, blockMedia) {
408
408
  throw lastError;
409
409
  }
410
410
 
411
+ // === SOURCE MARKDOWN DETECTION ===========================================
412
+
413
/**
 * Heuristically decide whether a text payload is Markdown source rather than
 * an HTML document.
 *
 * The text is rejected when it contains more than three HTML-looking tags;
 * otherwise it is accepted as soon as one common Markdown construct is found
 * (ATX heading, inline link, fenced code block, bullet list item, bold span,
 * or block quote).
 *
 * @param {unknown} text - Candidate content; anything that is not a non-empty
 *   string yields `false`.
 * @returns {boolean} `true` when the text looks like Markdown.
 */
function isMarkdownContent(text) {
  if (typeof text !== "string" || text.length === 0) return false;

  // Angle brackets inside code samples (generics such as `Array<string>`,
  // JSX snippets, ...) are not page markup. Drop fenced and inline code before
  // counting tags so code-heavy Markdown is not mistaken for HTML.
  const prose = text
    .replace(/```[\s\S]*?```/g, "")
    .replace(/~~~[\s\S]*?~~~/g, "")
    .replace(/`[^`\n]*`/g, "");
  const htmlTagCount = (prose.match(/<\w+[^>]*>/g) || []).length;
  if (htmlTagCount > 3) return false;

  // None of these patterns carries the `g`/`y` flag, so `.test` is stateless.
  const markdownPatterns = [
    /^#{1,6}\s+\S/m, // ATX heading
    /\[.+?\]\(.+?\)/, // inline link
    /```\w*\n/, // fenced code block opener
    /^\s*[-*+]\s+\S/m, // bullet list item
    /\*\*[^*]+\*\*/, // bold span
    /^>\s+\S/m, // block quote
  ];
  return markdownPatterns.some((pattern) => pattern.test(text));
}
430
+
431
/**
 * Clean up fetched Markdown source before it is returned to the caller.
 *
 * Removes `@twoslash-cache:` lines, collapses runs of three or more newlines
 * into a single blank line, and trims surrounding whitespace.
 *
 * @param {unknown} content - Raw Markdown text.
 * @returns {string} The cleaned text, or `""` when `content` is not a string.
 */
function stripSourceMarkdown(content) {
  if (typeof content !== "string") return "";

  return content
    // Normalise CRLF / lone CR first; otherwise the blank-line collapse below
    // never matches documents that use Windows line endings.
    .replace(/\r\n?/g, "\n")
    .replace(/^@twoslash-cache:.*$/gm, "")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}
437
+
438
/**
 * Build the URL of a page's raw Markdown source from a template.
 *
 * The special template `"{url}.md"` appends `.md` to the page path. Any other
 * template has its first `{url}` placeholder replaced by the page URL.
 *
 * @param {string} sourceTemplate - Template string containing `{url}`.
 * @param {string} url - URL of the page being fetched.
 * @returns {string} The resolved source URL.
 */
function resolveSourceUrl(sourceTemplate, url) {
  if (sourceTemplate === "{url}.md") {
    // The fragment is never sent to the server, and the query string must stay
    // after the path, so split both off before touching the path.
    const hashAt = url.indexOf("#");
    const withoutHash = hashAt === -1 ? url : url.slice(0, hashAt);
    const queryAt = withoutHash.indexOf("?");
    const query = queryAt === -1 ? "" : withoutHash.slice(queryAt);
    const path = queryAt === -1 ? withoutHash : withoutHash.slice(0, queryAt);
    const base = path.replace(/\/+$/, "");

    // For an origin-only URL, appending ".md" directly would change the host
    // name ("https://docs.example.com" -> "https://docs.example.com.md").
    // Request an index document on the same host instead; the "/index.md"
    // name is a best guess, and a miss simply yields an HTTP error there.
    const isBareOrigin = /^[a-z][a-z\d+.-]*:\/\/[^/]+$/i.test(base);
    return `${base}${isBareOrigin ? "/index" : ""}.md${query}`;
  }

  // A replacer function keeps `$&`, `$1`, ... inside the URL from being
  // interpreted as special replacement patterns.
  return sourceTemplate.replace("{url}", () => url);
}
444
+
445
/**
 * Try to download the raw Markdown source of a page through the shared
 * browser.
 *
 * This is a best-effort probe: navigation failures, HTTP error statuses,
 * HTML responses and non-Markdown bodies all resolve to `null` so the caller
 * can fall back to its regular HTML pipeline.
 *
 * @param {string} sourceUrl - URL expected to serve Markdown.
 * @param {object|null|undefined} template - Optional template; its `cookies`
 *   are added to the browser context and its `block_resources` list overrides
 *   the default blocked resource types.
 * @param {boolean} blockMedia - When truthy, requests whose resource type is
 *   in the blocked list are aborted.
 * @returns {Promise<string|null>} Cleaned Markdown text, or `null` when no
 *   usable Markdown was obtained.
 */
async function fetchSourceMarkdown(sourceUrl, template, blockMedia) {
  const browser = await browserManager.getBrowser();
  const context = await browser.newContext();

  try {
    if (template && template.cookies && template.cookies.length > 0) {
      await context.addCookies(template.cookies);
    }

    const page = await context.newPage();
    try {
      if (blockMedia) {
        const blockedTypes =
          template && template.block_resources
            ? template.block_resources
            : ["image", "media", "font"];
        if (blockedTypes.length > 0) {
          await page.route("**/*", async (route) => {
            try {
              if (blockedTypes.includes(route.request().resourceType())) {
                await route.abort();
              } else {
                await route.continue();
              }
            } catch (_) {
              // The page may already be closed while requests are still in
              // flight; awaiting here keeps that from surfacing as an
              // unhandled promise rejection.
            }
          });
        }
      }

      let response;
      try {
        response = await page.goto(sourceUrl, {
          waitUntil: "domcontentloaded",
          timeout: 10000,
        });
      } catch (_) {
        // Timeouts, DNS failures, etc.: treat as "no source available".
        return null;
      }

      if (response) {
        if (response.status() >= 400) return null;

        // A rendered HTML page (SPA fallback, soft 404, ...) has no tags left
        // in its innerText, so it could pass the Markdown heuristic. Raw
        // Markdown is not expected to be served as HTML, so bail out early.
        const contentType = String(
          response.headers()["content-type"] || "",
        ).toLowerCase();
        if (
          contentType.includes("text/html") ||
          contentType.includes("application/xhtml")
        ) {
          return null;
        }
      }

      let text;
      try {
        // Pass a real function: when given a string, evaluate() treats it as
        // an expression, so arrow-function source text would produce a
        // function object instead of the page text.
        text = await page.evaluate(
          () => document.body?.innerText || document.body?.textContent || "",
        );
      } catch (_) {
        text = await page.content();
      }

      if (!text || typeof text !== "string") return null;

      text = stripSourceMarkdown(text.trim());
      return isMarkdownContent(text) ? text : null;
    } finally {
      // Cleanup is best-effort; a close failure must not discard the result.
      await page.close().catch(() => {});
    }
  } catch (_) {
    return null;
  } finally {
    // Same as above: never let cleanup turn the probe into a rejection.
    await context.close().catch(() => {});
  }
}
504
+
411
505
  // === HTML CLEANUP ========================================================
412
506
 
413
507
  const DEFAULT_REMOVE_SELECTORS = [
@@ -1095,10 +1189,31 @@ server.registerTool(
1095
1189
  template = getTemplateByName(templateParam);
1096
1190
  }
1097
1191
 
1098
- // 2. Fetch
1192
+ // 2. Try source markdown if template specifies source_url
1193
+ let sourceMd = null;
1194
+ if (template && template.source_url) {
1195
+ const sourceUrl = resolveSourceUrl(template.source_url, url);
1196
+ sourceMd = await fetchSourceMarkdown(sourceUrl, template, block_media);
1197
+ }
1198
+
1199
+ if (sourceMd !== null) {
1200
+ const totalLength = sourceMd.length;
1201
+ const paginated = sourceMd.substring(start_index, start_index + max_length);
1202
+ let metadata =
1203
+ `\n\n---\n[webfetch: template="${template ? template.name : "auto"}" (source markdown), ` +
1204
+ `showing characters ${start_index} to ${start_index + paginated.length} of ${totalLength} total.`;
1205
+ if (start_index + max_length < totalLength) {
1206
+ metadata +=
1207
+ ` Use start_index=${start_index + max_length} to read more.`;
1208
+ }
1209
+ metadata += "]";
1210
+ return { content: [{ type: "text", text: paginated + metadata }] };
1211
+ }
1212
+
1213
+ // 3. Fetch
1099
1214
  const html = await fetchHtmlWithRetry(url, template, block_media);
1100
1215
 
1101
- // 3. Extract and compose
1216
+ // 4. Extract and compose
1102
1217
  const $ = cheerio.load(html);
1103
1218
 
1104
1219
  if (template) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "searchfetch",
3
- "version": "3.0.1",
3
+ "version": "3.1.0",
4
4
  "description": "Fault-tolerant MCP Server for Stealth Web Search and Fetching",
5
5
  "type": "module",
6
6
  "bin": {
@@ -0,0 +1,41 @@
1
+ {
2
+ "name": "docker-hub",
3
+ "description": "Docker Hub image page — official and community images with description and README",
4
+ "order": 10,
5
+ "url_patterns": [
6
+ "^https?://hub\\.docker\\.com/_/[^/]+/?$",
7
+ "^https?://hub\\.docker\\.com/r/[^/]+/[^/]+/?$"
8
+ ],
9
+ "remove": [
10
+ "script",
11
+ "style",
12
+ "svg",
13
+ "nav",
14
+ "footer",
15
+ "header",
16
+ ".styles_sidebar__",
17
+ ".styles_chatbot__",
18
+ "[data-testid='chatbot']"
19
+ ],
20
+ "sections": [
21
+ {
22
+ "name": "Image",
23
+ "selector": "h1",
24
+ "format": "text",
25
+ "required": false
26
+ },
27
+ {
28
+ "name": "Description",
29
+ "selector": "meta[name='description']",
30
+ "format": "attribute",
31
+ "attribute": "content",
32
+ "required": false
33
+ },
34
+ {
35
+ "name": "Content",
36
+ "selector": "#readme-more-content-wrapper, article, [data-testid='description'], main, .MuiContainer-root",
37
+ "format": "markdown",
38
+ "required": false
39
+ }
40
+ ]
41
+ }
@@ -1,29 +1,40 @@
1
1
  {
2
2
  "name": "docs-page",
3
- "description": "Documentation page (ReadTheDocs, Sphinx, etc.)",
4
- "order": 8,
3
+ "description": "Documentation page (ReadTheDocs, Sphinx, Mintlify, VitePress, Docusaurus, Nextra, etc.)",
4
+ "order": 9,
5
5
  "url_patterns": [
6
6
  "^https?://[^/]+\\.readthedocs\\.io/.*",
7
- "^https?://[^/]+\\.rtfd\\.io/.*"
7
+ "^https?://[^/]+\\.rtfd\\.io/.*",
8
+ "^https?://docs\\.mintlify\\.com/.*",
9
+ "^https?://[^/]+\\.mintlify\\.(?:com|app|dev)/.*",
10
+ "^https?://[^/]+/docs(?:/.*)?$",
11
+ "^https?://[^/]+/doc(?:/.*)?$",
12
+ "^https?://docs\\.[^/]+\\.[^/]+/.*"
8
13
  ],
14
+ "source_url": "{url}.md",
9
15
  "remove": [
10
16
  "script",
11
17
  "style",
12
18
  "svg",
13
19
  "nav",
14
20
  "footer",
15
- ".sphinxsidebar"
21
+ ".sphinxsidebar",
22
+ ".sidebar",
23
+ "aside",
24
+ ".nextra-toc",
25
+ ".table-of-contents",
26
+ ".pagination-nav"
16
27
  ],
17
28
  "sections": [
18
29
  {
19
30
  "name": "Title",
20
- "selector": "h1",
31
+ "selector": "h1, h2",
21
32
  "format": "text",
22
33
  "required": false
23
34
  },
24
35
  {
25
36
  "name": "Content",
26
- "selector": "[role='main'], .document, .rst-content, article",
37
+ "selector": "[role='main'], .document, .rst-content, article, .content, .markdown-body, .prose, main, .nextra-content, .doc-content",
27
38
  "format": "markdown",
28
39
  "required": false
29
40
  }
@@ -0,0 +1,47 @@
1
+ {
2
+ "name": "docs-rs",
3
+ "description": "docs.rs rustdoc documentation page — crate docs, struct/enum/fn items",
4
+ "order": 6,
5
+ "url_patterns": [
6
+ "^https?://docs\\.rs/[^/]+/[^/]+/[^/]+/?$",
7
+ "^https?://docs\\.rs/[^/]+/[^/]+/[^/]+/index\\.html$",
8
+ "^https?://docs\\.rs/[^/]+/[^/]+/[^/]+/.+\\.html$"
9
+ ],
10
+ "remove": [
11
+ "script",
12
+ "style",
13
+ "svg",
14
+ "nav.sidebar",
15
+ "nav.sub",
16
+ "footer",
17
+ ".search-form",
18
+ ".rustdoc-version-dialog",
19
+ ".mobile-topbar",
20
+ ".out-of-band",
21
+ "#source-sidebar",
22
+ "#theme-picker",
23
+ "#settings-menu",
24
+ ".since",
25
+ ".srclink"
26
+ ],
27
+ "sections": [
28
+ {
29
+ "name": "Crate",
30
+ "selector": "h1.crate-title, .crate-title h1, .in-band .crate-name",
31
+ "format": "text",
32
+ "required": false
33
+ },
34
+ {
35
+ "name": "Title",
36
+ "selector": "h1.fqn, .in-band h1, h1",
37
+ "format": "text",
38
+ "required": false
39
+ },
40
+ {
41
+ "name": "Content",
42
+ "selector": "#main-content, main, .main, .docblock",
43
+ "format": "markdown",
44
+ "required": false
45
+ }
46
+ ]
47
+ }
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "duckduckgo-search",
3
3
  "description": "DuckDuckGo HTML search results",
4
- "order": 7,
4
+ "order": 8,
5
5
  "url_patterns": [
6
6
  "^https?://html\\.duckduckgo\\.com/html/\\?"
7
7
  ],
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "google-search",
3
3
  "description": "Google web search results",
4
- "order": 6,
4
+ "order": 7,
5
5
  "url_patterns": [
6
6
  "^https?://(www\\.)?google\\.[^/]+/search\\?"
7
7
  ],