searchfetch 3.0.0 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.js CHANGED
@@ -244,6 +244,33 @@ function mapSearchParams(engine, query, region, safeSearch) {
244
244
 
245
245
  // === FETCH ===============================================================
246
246
 
247
/** Upper bound on how many times a page fetch is tried (the first try included). */
const FETCH_MAX_ATTEMPTS = 2;

/** Wait, in milliseconds, applied before retrying an HTTP 429 when no usable Retry-After value is available. */
const HTTP_429_RETRY_DELAY_MS = 2000;
249
+
250
/**
 * Pause for the given duration.
 *
 * @param {number} ms - Delay handed to `setTimeout`, in milliseconds.
 * @returns {Promise<void>} Promise that resolves (with no value) once the timer fires.
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
253
+
254
/**
 * Convert a `Retry-After` header value into a wait time in milliseconds.
 *
 * Two header forms are understood: a non-negative decimal number of seconds,
 * and a date string that `Date.parse` can read. The result is always capped
 * at 30 seconds. Missing, blank, or unparseable values yield
 * `HTTP_429_RETRY_DELAY_MS`.
 *
 * @param {string|number|null|undefined} value - Raw header value.
 * @returns {number} Delay in milliseconds, between 0 and 30000.
 */
function parseRetryAfterMs(value) {
  const MAX_DELAY_MS = 30000;

  if (!value) return HTTP_429_RETRY_DELAY_MS;

  // Trim first so a whitespace-only header is treated as absent instead of
  // being coerced to 0 seconds.
  const raw = String(value).trim();
  if (raw === "") return HTTP_429_RETRY_DELAY_MS;

  // Only plain decimal seconds count as a delay; Number() alone would also
  // accept hex ("0x10") and exponent ("1e3") spellings.
  if (/^\d+(?:\.\d+)?$/.test(raw)) {
    return Math.min(Number(raw) * 1000, MAX_DELAY_MS);
  }

  const dateMs = Date.parse(raw);
  if (Number.isFinite(dateMs)) {
    // A date in the past means "retry now", hence the clamp at 0.
    return Math.min(Math.max(dateMs - Date.now(), 0), MAX_DELAY_MS);
  }

  return HTTP_429_RETRY_DELAY_MS;
}
266
+
267
/**
 * Build the error thrown when a fetch is rejected with a blocking HTTP status.
 *
 * @param {number} status - HTTP status code; stored on the error as `httpStatus`.
 * @param {string} url - URL that was being fetched; included in the message.
 * @param {number|null} [retryAfterMs=null] - Suggested wait before retrying,
 *   stored on the error as `retryAfterMs`.
 * @returns {Error} Error carrying `httpStatus` and `retryAfterMs` properties.
 */
function makeHttpStatusError(status, url, retryAfterMs = null) {
  return Object.assign(
    new Error(`Access denied: HTTP ${status} when fetching ${url}`),
    { httpStatus: status, retryAfterMs },
  );
}
273
+
247
274
  function isAccessDenied($) {
248
275
  const title = ($("title").text() || "").toLowerCase();
249
276
  const bodyText = ($("body").text() || "").replace(/\s+/g, " ").trim().toLowerCase();
@@ -325,8 +352,10 @@ async function fetchHtml(url, template, blockMedia) {
325
352
  if (response) {
326
353
  const status = response.status();
327
354
  if ([401, 403, 429].includes(status)) {
328
- throw new Error(
329
- `Access denied: HTTP ${status} when fetching ${url}`,
355
+ throw makeHttpStatusError(
356
+ status,
357
+ url,
358
+ status === 429 ? parseRetryAfterMs(response.headers()["retry-after"]) : null,
330
359
  );
331
360
  }
332
361
  }
@@ -352,18 +381,25 @@ async function fetchHtml(url, template, blockMedia) {
352
381
 
353
382
  async function fetchHtmlWithRetry(url, template, blockMedia) {
354
383
  let lastError;
355
- for (let attempt = 0; attempt < 2; attempt++) {
384
+ for (let attempt = 0; attempt < FETCH_MAX_ATTEMPTS; attempt++) {
356
385
  try {
357
386
  return await fetchHtml(url, template, blockMedia);
358
387
  } catch (err) {
359
388
  lastError = err;
360
389
  if (
361
- attempt === 0 &&
390
+ attempt < FETCH_MAX_ATTEMPTS - 1 &&
391
+ err.httpStatus === 429
392
+ ) {
393
+ await sleep(err.retryAfterMs ?? HTTP_429_RETRY_DELAY_MS);
394
+ continue;
395
+ }
396
+ if (
397
+ attempt < FETCH_MAX_ATTEMPTS - 1 &&
362
398
  (err.message.includes("net::") ||
363
399
  err.message.includes("ERR_") ||
364
400
  err.message.includes("Navigation failed"))
365
401
  ) {
366
- // Network error — retry once
402
+ await sleep(500);
367
403
  continue;
368
404
  }
369
405
  throw err;
@@ -372,6 +408,100 @@ async function fetchHtmlWithRetry(url, template, blockMedia) {
372
408
  throw lastError;
373
409
  }
374
410
 
411
+ // === SOURCE MARKDOWN DETECTION ===========================================
412
+
413
/**
 * Heuristically decide whether a piece of text is Markdown rather than HTML.
 *
 * Text containing more than three opening HTML tags is rejected outright.
 * Otherwise a single hit on any Markdown marker (ATX heading, inline link,
 * fenced code block, bullet item, bold span, block quote) is enough.
 *
 * @param {string} text - Text to inspect.
 * @returns {boolean} `true` when the text looks like Markdown.
 */
function isMarkdownContent(text) {
  if (!text) return false;

  const openingTags = text.match(/<\w+[^>]*>/g);
  if (openingTags !== null && openingTags.length > 3) return false;

  const markdownMarkers = [
    /^#{1,6}\s+\S/m,
    /\[.+?\]\(.+?\)/,
    /```\w*\n/,
    /^\s*[-*+]\s+\S/m,
    /\*\*[^*]+\*\*/,
    /^>\s+\S/m,
  ];
  return markdownMarkers.some((marker) => marker.test(text));
}
430
+
431
/**
 * Tidy fetched Markdown source.
 *
 * Lines beginning with `@twoslash-cache:` are blanked, runs of three or more
 * newlines are collapsed to a single blank line, and surrounding whitespace
 * is trimmed.
 *
 * @param {string} content - Raw Markdown text.
 * @returns {string} Cleaned Markdown text.
 */
function stripSourceMarkdown(content) {
  const withoutCacheLines = content.replace(/^@twoslash-cache:.*$/gm, "");
  const collapsed = withoutCacheLines.replace(/\n{3,}/g, "\n\n");
  return collapsed.trim();
}
437
+
438
/**
 * Expand a template's `source_url` pattern into the URL of the page's raw
 * Markdown source.
 *
 * For the `"{url}.md"` pattern the `.md` suffix is attached to the path:
 * trailing slashes are dropped first, and any query string or fragment is
 * kept after the suffix. A URL with no path (host only) maps to
 * `/index.md` on the same host, so the suffix can never become part of the
 * host name. For every other pattern the first `{url}` placeholder is
 * replaced with the page URL verbatim.
 *
 * @param {string} sourceTemplate - Pattern containing a `{url}` placeholder.
 * @param {string} url - URL of the page being fetched.
 * @returns {string} URL to request for the Markdown source.
 */
function resolveSourceUrl(sourceTemplate, url) {
  if (sourceTemplate !== "{url}.md") {
    // A replacer function keeps "$&", "$1", etc. inside the URL literal;
    // a plain string replacement would interpret them.
    return sourceTemplate.replace("{url}", () => url);
  }

  // Separate the query/fragment so ".md" lands on the path, not after "?x=1".
  const suffixStart = url.search(/[?#]/);
  const base = suffixStart === -1 ? url : url.slice(0, suffixStart);
  const suffix = suffixStart === -1 ? "" : url.slice(suffixStart);
  const trimmed = base.replace(/\/+$/, "");

  // "https://host" + ".md" would point at a different host ("host.md").
  const isHostOnly = /^[a-z][a-z0-9+.-]*:\/\/[^/]*$/i.test(trimmed);
  if (isHostOnly) {
    return `${trimmed}/index.md${suffix}`;
  }

  return `${trimmed}.md${suffix}`;
}
444
+
445
/**
 * Best-effort fetch of a page's Markdown source through the shared browser.
 *
 * Opens a fresh browser context (with the template's cookies, if any),
 * navigates to `sourceUrl`, reads the body text, cleans it with
 * `stripSourceMarkdown`, and returns it only when `isMarkdownContent`
 * accepts it. Navigation errors, HTTP status >= 400, empty bodies, and any
 * other failure after the context is created all yield `null`, so the caller
 * can fall back to another fetch path. The page and context are always closed.
 *
 * @param {string} sourceUrl - URL expected to serve raw Markdown.
 * @param {object|null|undefined} template - Optional template; its `cookies`
 *   and `block_resources` fields are read when present.
 * @param {boolean} blockMedia - When truthy, requests whose resource type is
 *   in the blocked list are aborted.
 * @returns {Promise<string|null>} Cleaned Markdown text, or `null`.
 */
async function fetchSourceMarkdown(sourceUrl, template, blockMedia) {
  const browser = await browserManager.getBrowser();
  const context = await browser.newContext();

  try {
    if (template && template.cookies && template.cookies.length > 0) {
      await context.addCookies(template.cookies);
    }

    const page = await context.newPage();
    try {
      if (blockMedia) {
        const blockedTypes =
          template && template.block_resources
            ? template.block_resources
            : ["image", "media", "font"];
        if (blockedTypes.length > 0) {
          await page.route("**/*", (route) => {
            const type = route.request().resourceType();
            const handled = blockedTypes.includes(type)
              ? route.abort()
              : route.continue();
            // The route promise can reject if the page closes while the
            // request is still in flight; swallow that instead of leaving
            // an unhandled rejection behind.
            handled.catch(() => {});
          });
        }
      }

      let response;
      try {
        response = await page.goto(sourceUrl, {
          waitUntil: "domcontentloaded",
          timeout: 10000,
        });
      } catch (_) {
        // Navigation failure (timeout, network error, ...) means no source.
        return null;
      }

      if (response && response.status() >= 400) return null;

      let text;
      try {
        text = await page.evaluate(
          "() => document.body?.innerText || document.body?.textContent || ''",
        );
      } catch (_) {
        // Fall back to the serialized document if evaluation fails.
        text = await page.content();
      }

      if (!text || typeof text !== "string") return null;

      text = stripSourceMarkdown(text.trim());
      return isMarkdownContent(text) ? text : null;
    } finally {
      await page.close();
    }
  } catch (_) {
    // Deliberately best-effort: any failure means "no Markdown source".
    return null;
  } finally {
    await context.close();
  }
}
504
+
375
505
  // === HTML CLEANUP ========================================================
376
506
 
377
507
  const DEFAULT_REMOVE_SELECTORS = [
@@ -920,7 +1050,7 @@ function resolveSearchTemplate(engine, query, region, safeSearch) {
920
1050
 
921
1051
  // === MCP SERVER & TOOLS ==================================================
922
1052
 
923
- const server = new McpServer({ name: "searchfetch", version: "3.0.0" });
1053
// Advertised server version; keep in sync with the "version" field in package.json.
const server = new McpServer({ name: "searchfetch", version: "3.1.0" });
924
1054
 
925
1055
  // --- websearch tool ---
926
1056
 
@@ -1059,10 +1189,31 @@ server.registerTool(
1059
1189
  template = getTemplateByName(templateParam);
1060
1190
  }
1061
1191
 
1062
- // 2. Fetch
1192
+ // 2. Try source markdown if template specifies source_url
1193
+ let sourceMd = null;
1194
+ if (template && template.source_url) {
1195
+ const sourceUrl = resolveSourceUrl(template.source_url, url);
1196
+ sourceMd = await fetchSourceMarkdown(sourceUrl, template, block_media);
1197
+ }
1198
+
1199
+ if (sourceMd !== null) {
1200
+ const totalLength = sourceMd.length;
1201
+ const paginated = sourceMd.substring(start_index, start_index + max_length);
1202
+ let metadata =
1203
+ `\n\n---\n[webfetch: template="${template ? template.name : "auto"}" (source markdown), ` +
1204
+ `showing characters ${start_index} to ${start_index + paginated.length} of ${totalLength} total.`;
1205
+ if (start_index + max_length < totalLength) {
1206
+ metadata +=
1207
+ ` Use start_index=${start_index + max_length} to read more.`;
1208
+ }
1209
+ metadata += "]";
1210
+ return { content: [{ type: "text", text: paginated + metadata }] };
1211
+ }
1212
+
1213
+ // 3. Fetch
1063
1214
  const html = await fetchHtmlWithRetry(url, template, block_media);
1064
1215
 
1065
- // 3. Extract and compose
1216
+ // 4. Extract and compose
1066
1217
  const $ = cheerio.load(html);
1067
1218
 
1068
1219
  if (template) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "searchfetch",
3
- "version": "3.0.0",
3
+ "version": "3.1.0",
4
4
  "description": "Fault-tolerant MCP Server for Stealth Web Search and Fetching",
5
5
  "type": "module",
6
6
  "bin": {
@@ -0,0 +1,41 @@
1
+ {
2
+ "name": "docker-hub",
3
+ "description": "Docker Hub image page — official and community images with description and README",
4
+ "order": 10,
5
+ "url_patterns": [
6
+ "^https?://hub\\.docker\\.com/_/[^/]+/?$",
7
+ "^https?://hub\\.docker\\.com/r/[^/]+/[^/]+/?$"
8
+ ],
9
+ "remove": [
10
+ "script",
11
+ "style",
12
+ "svg",
13
+ "nav",
14
+ "footer",
15
+ "header",
16
+ ".styles_sidebar__",
17
+ ".styles_chatbot__",
18
+ "[data-testid='chatbot']"
19
+ ],
20
+ "sections": [
21
+ {
22
+ "name": "Image",
23
+ "selector": "h1",
24
+ "format": "text",
25
+ "required": false
26
+ },
27
+ {
28
+ "name": "Description",
29
+ "selector": "meta[name='description']",
30
+ "format": "attribute",
31
+ "attribute": "content",
32
+ "required": false
33
+ },
34
+ {
35
+ "name": "Content",
36
+ "selector": "#readme-more-content-wrapper, article, [data-testid='description'], main, .MuiContainer-root",
37
+ "format": "markdown",
38
+ "required": false
39
+ }
40
+ ]
41
+ }
@@ -1,29 +1,40 @@
1
1
  {
2
2
  "name": "docs-page",
3
- "description": "Documentation page (ReadTheDocs, Sphinx, etc.)",
4
- "order": 8,
3
+ "description": "Documentation page (ReadTheDocs, Sphinx, Mintlify, VitePress, Docusaurus, Nextra, etc.)",
4
+ "order": 9,
5
5
  "url_patterns": [
6
6
  "^https?://[^/]+\\.readthedocs\\.io/.*",
7
- "^https?://[^/]+\\.rtfd\\.io/.*"
7
+ "^https?://[^/]+\\.rtfd\\.io/.*",
8
+ "^https?://docs\\.mintlify\\.com/.*",
9
+ "^https?://[^/]+\\.mintlify\\.(?:com|app|dev)/.*",
10
+ "^https?://[^/]+/docs(?:/.*)?$",
11
+ "^https?://[^/]+/doc(?:/.*)?$",
12
+ "^https?://docs\\.[^/]+\\.[^/]+/.*"
8
13
  ],
14
+ "source_url": "{url}.md",
9
15
  "remove": [
10
16
  "script",
11
17
  "style",
12
18
  "svg",
13
19
  "nav",
14
20
  "footer",
15
- ".sphinxsidebar"
21
+ ".sphinxsidebar",
22
+ ".sidebar",
23
+ "aside",
24
+ ".nextra-toc",
25
+ ".table-of-contents",
26
+ ".pagination-nav"
16
27
  ],
17
28
  "sections": [
18
29
  {
19
30
  "name": "Title",
20
- "selector": "h1",
31
+ "selector": "h1, h2",
21
32
  "format": "text",
22
33
  "required": false
23
34
  },
24
35
  {
25
36
  "name": "Content",
26
- "selector": "[role='main'], .document, .rst-content, article",
37
+ "selector": "[role='main'], .document, .rst-content, article, .content, .markdown-body, .prose, main, .nextra-content, .doc-content",
27
38
  "format": "markdown",
28
39
  "required": false
29
40
  }
@@ -0,0 +1,47 @@
1
+ {
2
+ "name": "docs-rs",
3
+ "description": "docs.rs rustdoc documentation page — crate docs, struct/enum/fn items",
4
+ "order": 6,
5
+ "url_patterns": [
6
+ "^https?://docs\\.rs/[^/]+/[^/]+/[^/]+/?$",
7
+ "^https?://docs\\.rs/[^/]+/[^/]+/[^/]+/index\\.html$",
8
+ "^https?://docs\\.rs/[^/]+/[^/]+/[^/]+/.+\\.html$"
9
+ ],
10
+ "remove": [
11
+ "script",
12
+ "style",
13
+ "svg",
14
+ "nav.sidebar",
15
+ "nav.sub",
16
+ "footer",
17
+ ".search-form",
18
+ ".rustdoc-version-dialog",
19
+ ".mobile-topbar",
20
+ ".out-of-band",
21
+ "#source-sidebar",
22
+ "#theme-picker",
23
+ "#settings-menu",
24
+ ".since",
25
+ ".srclink"
26
+ ],
27
+ "sections": [
28
+ {
29
+ "name": "Crate",
30
+ "selector": "h1.crate-title, .crate-title h1, .in-band .crate-name",
31
+ "format": "text",
32
+ "required": false
33
+ },
34
+ {
35
+ "name": "Title",
36
+ "selector": "h1.fqn, .in-band h1, h1",
37
+ "format": "text",
38
+ "required": false
39
+ },
40
+ {
41
+ "name": "Content",
42
+ "selector": "#main-content, main, .main, .docblock",
43
+ "format": "markdown",
44
+ "required": false
45
+ }
46
+ ]
47
+ }
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "duckduckgo-search",
3
3
  "description": "DuckDuckGo HTML search results",
4
- "order": 7,
4
+ "order": 8,
5
5
  "url_patterns": [
6
6
  "^https?://html\\.duckduckgo\\.com/html/\\?"
7
7
  ],
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "google-search",
3
3
  "description": "Google web search results",
4
- "order": 6,
4
+ "order": 7,
5
5
  "url_patterns": [
6
6
  "^https?://(www\\.)?google\\.[^/]+/search\\?"
7
7
  ],