crawlforge-mcp-server 4.2.12 → 4.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/CLAUDE.md +19 -7
  2. package/README.md +11 -3
  3. package/package.json +3 -2
  4. package/server.js +195 -22
  5. package/src/cli/commands/init.js +107 -0
  6. package/src/cli/index.js +2 -0
  7. package/src/constants/config.js +5 -0
  8. package/src/core/ActionExecutor.js +13 -1
  9. package/src/core/AgentOrchestrator.js +300 -0
  10. package/src/core/AuthManager.js +21 -1
  11. package/src/core/ChangeTracker.js +8 -5
  12. package/src/core/LLMsTxtAnalyzer.js +71 -47
  13. package/src/core/LocalizationManager.js +7 -4
  14. package/src/core/ResearchOrchestrator.js +10 -6
  15. package/src/core/StealthBrowserManager.js +52 -13
  16. package/src/core/analysis/ContentAnalyzer.js +2 -2
  17. package/src/core/crawlers/BFSCrawler.js +23 -12
  18. package/src/core/processing/ContentProcessor.js +19 -3
  19. package/src/core/processing/PDFProcessor.js +72 -23
  20. package/src/tools/advanced/ScrapeWithActionsTool.js +63 -25
  21. package/src/tools/advanced/batchScrape/index.js +3 -1
  22. package/src/tools/advanced/batchScrape/reporter.js +5 -1
  23. package/src/tools/advanced/batchScrape/worker.js +6 -1
  24. package/src/tools/agent/agent.js +71 -0
  25. package/src/tools/basic/_fetch.js +78 -5
  26. package/src/tools/basic/extractLinks.js +1 -1
  27. package/src/tools/basic/extractMetadata.js +65 -1
  28. package/src/tools/basic/extractText.js +73 -5
  29. package/src/tools/basic/scrapeStructured.js +48 -10
  30. package/src/tools/crawl/crawlDeep.js +13 -5
  31. package/src/tools/crawl/mapSite.js +53 -52
  32. package/src/tools/extract/analyzeContent.js +11 -6
  33. package/src/tools/extract/extractContent.js +23 -5
  34. package/src/tools/extract/extractStructured.js +65 -16
  35. package/src/tools/extract/extractWithLlm.js +192 -11
  36. package/src/tools/extract/listOllamaModels.js +19 -8
  37. package/src/tools/extract/processDocument.js +10 -4
  38. package/src/tools/extract/summarizeContent.js +58 -1
  39. package/src/tools/llmstxt/generateLLMsTxt.js +124 -3
  40. package/src/tools/research/deepResearch.js +43 -4
  41. package/src/tools/scrape/unifiedScrape.js +314 -0
  42. package/src/tools/search/providers/searxng.js +2 -2
  43. package/src/tools/search/ranking/ResultDeduplicator.js +32 -9
  44. package/src/tools/search/ranking/ResultRanker.js +13 -4
  45. package/src/tools/search/searchWeb.js +5 -5
  46. package/src/tools/templates/TemplateRegistry.js +3 -2
  47. package/src/tools/tracking/trackChanges/differ.js +33 -1
  48. package/src/utils/htmlToMarkdown.js +5 -1
@@ -3,28 +3,43 @@
3
3
  * Applies an AbortController timeout and a default User-Agent.
4
4
  */
5
5
 
6
+ import { config } from '../../constants/config.js';
7
+ import { createRequire } from 'module';
8
+
9
+ // Derive User-Agent from package version so it reflects the actual release.
10
+ const _require = createRequire(import.meta.url);
11
+ const _pkg = _require('../../../package.json');
12
+ const CRAWLFORGE_UA = `CrawlForge/${_pkg.version} (+https://crawlforge.dev)`;
13
+
6
14
  /**
7
- * Fetch a URL with a configurable timeout.
15
+ * Fetch a URL with a configurable timeout and body-size cap.
16
+ *
17
+ * Content-Length is checked before the body is read; if absent or lying, the
18
+ * accumulated byte count is checked during streaming. Both checks use the
19
+ * configurable cap from config.fetch.maxBodySize (env MAX_FETCH_BODY_SIZE,
20
+ * default 25 MB).
21
+ *
8
22
  * @param {string} url
9
23
  * @param {{ timeout?: number, headers?: Record<string,string> }} [options]
10
- * @returns {Promise<Response>}
24
+ * @returns {Promise<Response & { _body: string }>}
11
25
  */
12
26
  export async function fetchWithTimeout(url, options = {}) {
13
27
  const { timeout = 10000, headers = {} } = options;
28
+ const maxBodySize = config.fetch.maxBodySize;
14
29
 
15
30
  const controller = new AbortController();
16
31
  const timeoutId = setTimeout(() => controller.abort(), timeout);
17
32
 
33
+ let response;
18
34
  try {
19
- const response = await fetch(url, {
35
+ response = await fetch(url, {
20
36
  signal: controller.signal,
21
37
  headers: {
22
- 'User-Agent': 'CrawlForge/1.0.0',
38
+ 'User-Agent': CRAWLFORGE_UA,
23
39
  ...headers
24
40
  }
25
41
  });
26
42
  clearTimeout(timeoutId);
27
- return response;
28
43
  } catch (error) {
29
44
  clearTimeout(timeoutId);
30
45
  if (error.name === 'AbortError') {
@@ -32,4 +47,62 @@ export async function fetchWithTimeout(url, options = {}) {
32
47
  }
33
48
  throw error;
34
49
  }
50
+
51
+ // --- Body-size cap ---
52
+
53
+ // Early rejection via Content-Length (servers may omit or lie — guard below
54
+ // handles that case). Optional-chained so non-standard responses (e.g. test
55
+ // mocks) without a Headers object don't throw.
56
+ const contentLengthHeader = response.headers?.get?.('content-length') ?? null;
57
+ if (contentLengthHeader !== null) {
58
+ const declared = parseInt(contentLengthHeader, 10);
59
+ if (!isNaN(declared) && declared > maxBodySize) {
60
+ throw new Error(
61
+ `Response body too large: Content-Length ${declared} exceeds limit of ${maxBodySize} bytes`
62
+ );
63
+ }
64
+ }
65
+
66
+ // Only the streaming byte-count guard requires a readable body. Responses
67
+ // without a ReadableStream body (already-buffered responses, test mocks)
68
+ // are returned unchanged so callers' native .text()/.json() still work.
69
+ if (!response.body || typeof response.body.getReader !== 'function') {
70
+ return response;
71
+ }
72
+
73
+ // Stream the body and abort if accumulated bytes exceed the cap.
74
+ const reader = response.body.getReader();
75
+ const chunks = [];
76
+ let totalBytes = 0;
77
+
78
+ while (true) {
79
+ const { done, value } = await reader.read();
80
+ if (done) break;
81
+ totalBytes += value.byteLength;
82
+ if (totalBytes > maxBodySize) {
83
+ reader.cancel();
84
+ throw new Error(
85
+ `Response body too large: exceeded limit of ${maxBodySize} bytes`
86
+ );
87
+ }
88
+ chunks.push(value);
89
+ }
90
+
91
+ // Reassemble and expose as a response-like object that callers can use.
92
+ const bodyText = new TextDecoder().decode(
93
+ chunks.reduce((acc, chunk) => {
94
+ const merged = new Uint8Array(acc.byteLength + chunk.byteLength);
95
+ merged.set(acc, 0);
96
+ merged.set(chunk, acc.byteLength);
97
+ return merged;
98
+ }, new Uint8Array(0))
99
+ );
100
+
101
+ // Attach the pre-read text so callers can call .text() on the result.
102
+ // We wrap it in a minimal compatible object.
103
+ return Object.assign(response, {
104
+ text: () => Promise.resolve(bodyText),
105
+ json: () => Promise.resolve(JSON.parse(bodyText)),
106
+ _body: bodyText
107
+ });
35
108
  }
@@ -41,7 +41,7 @@ export async function extractLinksHandler({ url, filter_external, base_url }) {
41
41
  isExternal = false;
42
42
  }
43
43
 
44
- if (filter_external && isExternal) return;
44
+ if (filter_external && !isExternal) return;
45
45
 
46
46
  links.push({ href: absoluteUrl, text, is_external: isExternal, original_href: href });
47
47
  } catch {
@@ -1,11 +1,64 @@
1
1
  /**
2
2
  * extract_metadata — Extract page metadata (title, description, OG tags, etc.).
3
3
  * Extracted from server.js inline handler.
4
+ * B1: Parse JSON-LD and microdata; stronger title fallback chain (og:title → <title> → h1).
4
5
  */
5
6
 
6
7
  import { load } from 'cheerio';
7
8
  import { fetchWithTimeout } from './_fetch.js';
8
9
 
10
+ /**
11
+ * Parse all JSON-LD blocks from the document.
12
+ * @param {import('cheerio').CheerioAPI} $
13
+ * @returns {Array}
14
+ */
15
+ function parseJsonLd($) {
16
+ const results = [];
17
+ $('script[type="application/ld+json"]').each((_, el) => {
18
+ try {
19
+ const raw = $(el).html();
20
+ if (raw) results.push(JSON.parse(raw));
21
+ } catch {
22
+ // Skip invalid blocks
23
+ }
24
+ });
25
+ return results;
26
+ }
27
+
28
+ /**
29
+ * Parse microdata items (elements with itemscope).
30
+ * @param {import('cheerio').CheerioAPI} $
31
+ * @returns {Array}
32
+ */
33
+ function parseMicrodata($) {
34
+ const results = [];
35
+ $('[itemscope]').each((_, el) => {
36
+ const $el = $(el);
37
+ const item = {
38
+ type: $el.attr('itemtype') || null,
39
+ properties: {}
40
+ };
41
+ $el.find('[itemprop]').each((_, prop) => {
42
+ const $prop = $(prop);
43
+ const name = $prop.attr('itemprop');
44
+ if (!name) return;
45
+ const tag = ($prop.get(0).tagName || '').toLowerCase();
46
+ let value;
47
+ if (tag === 'meta') value = $prop.attr('content');
48
+ else if (tag === 'a' || tag === 'link') value = $prop.attr('href');
49
+ else if (tag === 'img') value = $prop.attr('src');
50
+ else if (tag === 'time') value = $prop.attr('datetime') || $prop.text().trim();
51
+ else value = $prop.text().trim();
52
+ if (value) {
53
+ if (!item.properties[name]) item.properties[name] = [];
54
+ item.properties[name].push(value);
55
+ }
56
+ });
57
+ results.push(item);
58
+ });
59
+ return results;
60
+ }
61
+
9
62
  /**
10
63
  * @param {{ url: string }} params
11
64
  */
@@ -19,7 +72,13 @@ export async function extractMetadataHandler({ url }) {
19
72
  const html = await response.text();
20
73
  const $ = load(html);
21
74
 
22
- const title = $('title').text().trim() || $('h1').first().text().trim();
75
+ // Stronger title fallback: og:title → <title> → h1
76
+ const title =
77
+ $('meta[property="og:title"]').attr('content') ||
78
+ $('title').text().trim() ||
79
+ $('h1').first().text().trim() ||
80
+ '';
81
+
23
82
  const description =
24
83
  $('meta[name="description"]').attr('content') ||
25
84
  $('meta[property="og:description"]').attr('content') || '';
@@ -47,6 +106,9 @@ export async function extractMetadataHandler({ url }) {
47
106
  $('meta[charset]').attr('charset') ||
48
107
  $('meta[http-equiv="Content-Type"]').attr('content') || '';
49
108
 
109
+ const jsonLd = parseJsonLd($);
110
+ const microdata = parseMicrodata($);
111
+
50
112
  return {
51
113
  content: [{
52
114
  type: 'text',
@@ -61,6 +123,8 @@ export async function extractMetadataHandler({ url }) {
61
123
  charset,
62
124
  og_tags: ogTags,
63
125
  twitter_tags: twitterTags,
126
+ json_ld: jsonLd,
127
+ microdata,
64
128
  url: response.url
65
129
  }, null, 2)
66
130
  }]
@@ -2,12 +2,77 @@
2
2
  * extract_text — Extract clean text content from HTML.
3
3
  * Extracted from server.js inline handler.
4
4
  * D3.1: Added output_format:"markdown" option backed by Turndown.
5
+ * B1: Preserve block structure for text mode; use Readability + GFM for markdown mode.
5
6
  */
6
7
 
7
8
  import { load } from 'cheerio';
9
+ import { JSDOM } from 'jsdom';
10
+ import { Readability } from '@mozilla/readability';
8
11
  import { fetchWithTimeout } from './_fetch.js';
9
12
  import { htmlToMarkdown } from '../../utils/htmlToMarkdown.js';
10
13
 
14
+ // Block-level elements whose boundaries should become paragraph breaks
15
+ const BLOCK_ELEMENTS = new Set([
16
+ 'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
17
+ 'li', 'blockquote', 'pre', 'td', 'th', 'dt', 'dd',
18
+ 'article', 'section', 'figure', 'figcaption', 'aside',
19
+ 'header', 'footer', 'main', 'nav', 'form', 'fieldset',
20
+ 'table', 'tr', 'caption'
21
+ ]);
22
+
23
+ /**
24
+ * Extract plain text from a cheerio root preserving block-element paragraph breaks.
25
+ * @param {import('cheerio').CheerioAPI} $ - loaded cheerio instance
26
+ * @returns {string}
27
+ */
28
+ export function extractBlockText($) {
29
+ const parts = [];
30
+
31
+ function walk(node) {
32
+ if (node.type === 'text') {
33
+ const t = node.data.replace(/[ \t\r\n]+/g, ' ');
34
+ if (t.trim()) parts.push(t);
35
+ return;
36
+ }
37
+ if (node.type !== 'tag') return;
38
+ const tag = node.tagName ? node.tagName.toLowerCase() : '';
39
+ const isBlock = BLOCK_ELEMENTS.has(tag);
40
+ if (isBlock) parts.push('\n\n');
41
+ for (const child of (node.children || [])) {
42
+ walk(child);
43
+ }
44
+ if (isBlock) parts.push('\n\n');
45
+ }
46
+
47
+ const body = $('body').get(0);
48
+ if (body) {
49
+ for (const child of (body.children || [])) walk(child);
50
+ }
51
+
52
+ return parts.join('').replace(/\n{3,}/g, '\n\n').trim();
53
+ }
54
+
55
+ /**
56
+ * Convert raw HTML to GFM markdown using Readability + Turndown.
57
+ * Accepts the original HTML string and the final URL (needed for Readability).
58
+ * Returns the markdown string.
59
+ * @param {string} html - raw HTML
60
+ * @param {string} pageUrl - URL of the page (used by Readability)
61
+ * @returns {string}
62
+ */
63
+ export function readabilityToMarkdown(html, pageUrl) {
64
+ let articleHtml;
65
+ try {
66
+ const dom = new JSDOM(html, { url: pageUrl });
67
+ const reader = new Readability(dom.window.document);
68
+ const article = reader.parse();
69
+ articleHtml = article ? article.content : html;
70
+ } catch {
71
+ articleHtml = html;
72
+ }
73
+ return htmlToMarkdown(articleHtml);
74
+ }
75
+
11
76
  /**
12
77
  * @param {{ url: string, remove_scripts?: boolean, remove_styles?: boolean, output_format?: "text"|"markdown" }} params
13
78
  */
@@ -26,20 +91,23 @@ export async function extractTextHandler({ url, remove_scripts, remove_styles, o
26
91
 
27
92
  $('nav, header, footer, aside, .advertisement, .ad, .sidebar').remove();
28
93
 
29
- const text = $('body').text().replace(/\s+/g, ' ').trim();
30
-
31
94
  const result = {
32
- word_count: text.split(/\s+/).filter(w => w.length > 0).length,
33
- char_count: text.length,
34
95
  url: response.url
35
96
  };
36
97
 
37
98
  if (output_format === 'markdown') {
38
- result.markdown = htmlToMarkdown($.html('body'));
99
+ // Run Readability first to get main content, then convert to GFM markdown
100
+ result.markdown = readabilityToMarkdown(html, response.url);
39
101
  result.output_format = 'markdown';
102
+ const plainText = result.markdown.replace(/[#*`_\[\]]/g, '').replace(/\s+/g, ' ').trim();
103
+ result.word_count = plainText.split(/\s+/).filter(w => w.length > 0).length;
104
+ result.char_count = plainText.length;
40
105
  } else {
106
+ const text = extractBlockText($);
41
107
  result.text = text;
42
108
  result.output_format = 'text';
109
+ result.word_count = text.split(/\s+/).filter(w => w.length > 0).length;
110
+ result.char_count = text.length;
43
111
  }
44
112
 
45
113
  return {
@@ -1,15 +1,33 @@
1
1
  /**
2
2
  * scrape_structured — Extract structured data using CSS selectors.
3
3
  * Extracted from server.js inline handler.
4
+ * B1: Support attribute extraction (selector@attr), add max_results,
5
+ * fix elements_found to report real per-field DOM match counts.
4
6
  */
5
7
 
6
8
  import { load } from 'cheerio';
7
9
  import { fetchWithTimeout } from './_fetch.js';
8
10
 
9
11
  /**
10
- * @param {{ url: string, selectors: Record<string, string> }} params
12
+ * Parse a selector string that may include an attribute suffix: "css@attr"
13
+ * e.g. "a.link@href" -> { selector: "a.link", attribute: "href" }
14
+ * "img@src" -> { selector: "img", attribute: "src" }
15
+ * "h1" -> { selector: "h1", attribute: null }
16
+ * @param {string} raw
17
+ * @returns {{ selector: string, attribute: string|null }}
11
18
  */
12
- export async function scrapeStructuredHandler({ url, selectors }) {
19
+ function parseSelectorSpec(raw) {
20
+ const atIdx = raw.lastIndexOf('@');
21
+ if (atIdx > 0) {
22
+ return { selector: raw.slice(0, atIdx), attribute: raw.slice(atIdx + 1) };
23
+ }
24
+ return { selector: raw, attribute: null };
25
+ }
26
+
27
+ /**
28
+ * @param {{ url: string, selectors: Record<string, string>, max_results?: number }} params
29
+ */
30
+ export async function scrapeStructuredHandler({ url, selectors, max_results }) {
13
31
  try {
14
32
  const response = await fetchWithTimeout(url);
15
33
  if (!response.ok) {
@@ -19,22 +37,42 @@ export async function scrapeStructuredHandler({ url, selectors }) {
19
37
  const html = await response.text();
20
38
  const $ = load(html);
21
39
  const results = {};
40
+ const matchCounts = {};
22
41
 
23
- for (const [fieldName, selector] of Object.entries(selectors)) {
42
+ for (const [fieldName, rawSelector] of Object.entries(selectors)) {
24
43
  try {
25
- const elements = $(selector);
26
- if (elements.length === 0) {
44
+ const { selector, attribute } = parseSelectorSpec(rawSelector);
45
+ let elements = $(selector);
46
+ const domCount = elements.length;
47
+ matchCounts[fieldName] = domCount;
48
+
49
+ if (domCount === 0) {
27
50
  results[fieldName] = null;
28
- } else if (elements.length === 1) {
29
- results[fieldName] = elements.text().trim();
30
51
  } else {
31
- results[fieldName] = elements.map((_, el) => $(el).text().trim()).get();
52
+ // Apply max_results cap if specified
53
+ if (max_results != null && max_results > 0 && domCount > max_results) {
54
+ elements = elements.slice(0, max_results);
55
+ }
56
+
57
+ const extract = (el) => {
58
+ if (attribute) {
59
+ return $(el).attr(attribute) ?? null;
60
+ }
61
+ return $(el).text().trim();
62
+ };
63
+
64
+ if (elements.length === 1) {
65
+ results[fieldName] = extract(elements.get(0));
66
+ } else {
67
+ results[fieldName] = elements.map((_, el) => extract(el)).get();
68
+ }
32
69
  }
33
70
  } catch (selectorError) {
34
71
  results[fieldName] = {
35
- error: `Invalid selector: ${selector}`,
72
+ error: `Invalid selector: ${rawSelector}`,
36
73
  message: selectorError.message
37
74
  };
75
+ matchCounts[fieldName] = 0;
38
76
  }
39
77
  }
40
78
 
@@ -44,7 +82,7 @@ export async function scrapeStructuredHandler({ url, selectors }) {
44
82
  text: JSON.stringify({
45
83
  data: results,
46
84
  selectors_used: selectors,
47
- elements_found: Object.keys(results).length,
85
+ elements_found: matchCounts,
48
86
  url: response.url
49
87
  }, null, 2)
50
88
  }]
@@ -14,6 +14,7 @@ const CrawlDeepSchema = z.object({
14
14
  follow_external: z.boolean().optional().default(false),
15
15
  respect_robots: z.boolean().optional().default(true),
16
16
  extract_content: z.boolean().optional().default(true),
17
+ content_max_length: z.number().min(1).max(100000).optional().default(500),
17
18
  concurrency: z.number().min(1).max(20).optional().default(10),
18
19
  enable_link_analysis: z.boolean().optional().default(true),
19
20
  link_analysis_options: z.object({
@@ -217,7 +218,7 @@ export class CrawlDeepTool {
217
218
  errors: results.errors.length,
218
219
  duration_ms: duration,
219
220
  pages_per_second: results.urls.length / (duration / 1000),
220
- results: this.formatResults(results.results, validated.extract_content),
221
+ results: this.formatResults(results.results, validated.extract_content, validated.content_max_length),
221
222
  errors: results.errors,
222
223
  stats: results.stats,
223
224
  site_structure: this.analyzeSiteStructure(results.urls),
@@ -240,7 +241,7 @@ export class CrawlDeepTool {
240
241
  }
241
242
  }
242
243
 
243
- formatResults(results, includeContent) {
244
+ formatResults(results, includeContent, contentMaxLength = 500) {
244
245
  return results.map(result => {
245
246
  const formatted = {
246
247
  url: result.url,
@@ -250,12 +251,19 @@ export class CrawlDeepTool {
250
251
  content_length: result.contentLength,
251
252
  timestamp: result.timestamp
252
253
  };
253
-
254
+
254
255
  if (includeContent) {
255
- formatted.content = result.content ? result.content.substring(0, 500) + '...' : '';
256
+ const raw = result.content || '';
257
+ if (raw.length > contentMaxLength) {
258
+ formatted.content = raw.substring(0, contentMaxLength);
259
+ formatted.truncated = true;
260
+ } else {
261
+ formatted.content = raw;
262
+ formatted.truncated = false;
263
+ }
256
264
  formatted.metadata = result.metadata;
257
265
  }
258
-
266
+
259
267
  return formatted;
260
268
  });
261
269
  }
@@ -3,6 +3,15 @@ import { load } from 'cheerio';
3
3
  import { DomainFilter } from '../../utils/domainFilter.js';
4
4
  import { normalizeUrl, getBaseUrl } from '../../utils/urlNormalizer.js';
5
5
  import { CacheManager } from '../../core/cache/CacheManager.js';
6
+ import { SitemapParser } from '../../utils/sitemapParser.js';
7
+ import { ResultRanker } from '../search/ranking/ResultRanker.js';
8
+
9
+ // Lazy singleton — avoids creating a CacheManager timer per request
10
+ let _ranker = null;
11
+ function getRanker() {
12
+ if (!_ranker) _ranker = new ResultRanker({ cacheEnabled: false });
13
+ return _ranker;
14
+ }
6
15
 
7
16
  const MapSiteSchema = z.object({
8
17
  url: z.string().url(),
@@ -17,7 +26,8 @@ const MapSiteSchema = z.object({
17
26
  include_patterns: z.array(z.string()).optional().default([]),
18
27
  exclude_patterns: z.array(z.string()).optional().default([])
19
28
  }).optional(),
20
- import_filter_config: z.string().optional() // JSON string of exported config
29
+ import_filter_config: z.string().optional(), // JSON string of exported config
30
+ search: z.string().optional() // when set, rank URLs by relevance and emit ranked_urls
21
31
  });
22
32
 
23
33
  export class MapSiteTool {
@@ -33,6 +43,7 @@ export class MapSiteTool {
33
43
  this.timeout = timeout;
34
44
  // Per-session result cache: avoids redundant site maps for the same root URL
35
45
  this.cache = cacheEnabled ? new CacheManager({ ttl: cacheTTL }) : null;
46
+ this.sitemapParser = new SitemapParser({ userAgent, timeout, enableCaching: cacheEnabled, cacheTTL });
36
47
  }
37
48
 
38
49
  async execute(params) {
@@ -118,6 +129,25 @@ export class MapSiteTool {
118
129
  filter_stats: domainFilter ? domainFilter.getStats() : null
119
130
  };
120
131
 
132
+ // Optional: rank URLs by relevance to a search string
133
+ if (validated.search) {
134
+ try {
135
+ const rankerInput = urlArray.map(url => {
136
+ let title = url;
137
+ try {
138
+ const { pathname } = new URL(url);
139
+ title = decodeURIComponent(pathname).replace(/[-_/]/g, ' ').trim();
140
+ } catch { /* keep raw url */ }
141
+ return { link: url, title, snippet: '' };
142
+ });
143
+ const ranked = await getRanker().rankResults(rankerInput, validated.search);
144
+ result.ranked_urls = ranked.map(r => ({ url: r.link, score: r.finalScore ?? 0 }));
145
+ } catch {
146
+ // ranking is best-effort; don't fail the whole call
147
+ result.ranked_urls = urlArray.map(u => ({ url: u, score: 0 }));
148
+ }
149
+ }
150
+
121
151
  // Store in cache before returning
122
152
  if (this.cache) {
123
153
  const cacheKey = this.cache.generateKey('map_site', { url: validated.url, maxUrls: validated.max_urls });
@@ -131,61 +161,32 @@ export class MapSiteTool {
131
161
  }
132
162
 
133
163
  async fetchSitemapUrls(baseUrl, domainFilter = null) {
164
+ // Discover sitemaps via robots.txt and common paths, then parse with full
165
+ // SitemapParser support (sitemap-index recursion, gzip, CDATA/entities).
166
+ const discovered = await this.sitemapParser.discoverSitemaps(baseUrl, {
167
+ checkRobotsTxt: true,
168
+ checkCommonPaths: true,
169
+ checkSitemapIndex: false
170
+ });
171
+
134
172
  const urls = new Set();
135
- const sitemapUrls = [
136
- `${baseUrl}/sitemap.xml`,
137
- `${baseUrl}/sitemap_index.xml`,
138
- `${baseUrl}/sitemap-index.xml`,
139
- `${baseUrl}/sitemaps.xml`
140
- ];
141
-
142
- for (const sitemapUrl of sitemapUrls) {
173
+ for (const sitemapUrl of discovered) {
143
174
  try {
144
- const response = await this.fetchWithTimeout(sitemapUrl);
145
- if (response.ok) {
146
- const xml = await response.text();
147
- const extractedUrls = this.parseSitemap(xml);
148
-
149
- // Apply domain filter if provided
150
- extractedUrls.forEach(url => {
175
+ const parsed = await this.sitemapParser.parseSitemap(sitemapUrl, {
176
+ includeMetadata: false,
177
+ followIndexes: true
178
+ });
179
+ if (parsed.success) {
180
+ for (const entry of parsed.urls) {
181
+ const url = entry.loc || entry;
151
182
  if (!domainFilter || domainFilter.isAllowed(url).allowed) {
152
183
  urls.add(url);
153
184
  }
154
- });
155
-
156
- // If we found a sitemap, don't try others
157
- if (urls.size > 0) break;
185
+ }
158
186
  }
187
+ if (urls.size > 0) break;
159
188
  } catch {
160
- // Continue to next sitemap URL
161
- }
162
- }
163
-
164
- return Array.from(urls);
165
- }
166
-
167
- parseSitemap(xml) {
168
- const urls = new Set();
169
-
170
- // Extract URLs from sitemap
171
- const urlMatches = xml.match(/<loc>([^<]+)<\/loc>/g);
172
- if (urlMatches) {
173
- urlMatches.forEach(match => {
174
- const url = match.replace(/<\/?loc>/g, '').trim();
175
- if (url) urls.add(url);
176
- });
177
- }
178
-
179
- // Check for nested sitemaps (sitemap index)
180
- const sitemapMatches = xml.match(/<sitemap>[\s\S]*?<\/sitemap>/g);
181
- if (sitemapMatches) {
182
- for (const sitemapMatch of sitemapMatches) {
183
- const locMatch = sitemapMatch.match(/<loc>([^<]+)<\/loc>/);
184
- if (locMatch && locMatch[1]) {
185
- // We could recursively fetch nested sitemaps here
186
- // For now, just add the sitemap URL itself
187
- urls.add(locMatch[1]);
188
- }
189
+ // Continue to next discovered sitemap
189
190
  }
190
191
  }
191
192
 
@@ -362,7 +363,7 @@ export class MapSiteTool {
362
363
  max_depth: 0,
363
364
  average_depth: 0,
364
365
  url_lengths: {
365
- min: Infinity,
366
+ min: null,
366
367
  max: 0,
367
368
  average: 0
368
369
  }
@@ -374,7 +375,7 @@ export class MapSiteTool {
374
375
  for (const url of urls) {
375
376
  try {
376
377
  const urlObj = new URL(url);
377
-
378
+
378
379
  // Count secure URLs
379
380
  if (urlObj.protocol === 'https:') {
380
381
  stats.secure_urls++;
@@ -396,7 +397,7 @@ export class MapSiteTool {
396
397
  // Track URL lengths
397
398
  const length = url.length;
398
399
  totalLength += length;
399
- stats.url_lengths.min = Math.min(stats.url_lengths.min, length);
400
+ stats.url_lengths.min = stats.url_lengths.min === null ? length : Math.min(stats.url_lengths.min, length);
400
401
  stats.url_lengths.max = Math.max(stats.url_lengths.max, length);
401
402
 
402
403
  // Track file extensions
@@ -266,9 +266,9 @@ export class AnalyzeContentTool {
266
266
  };
267
267
 
268
268
  const keywordStr = keywords.join(' ').toLowerCase();
269
-
269
+
270
270
  for (const [category, categoryKeywords] of Object.entries(categories)) {
271
- const matches = categoryKeywords.filter(word => keywordStr.includes(word));
271
+ const matches = categoryKeywords.filter(word => new RegExp(`\\b${word}\\b`).test(keywordStr));
272
272
  if (matches.length > 0) {
273
273
  return category;
274
274
  }
@@ -394,13 +394,18 @@ export class AnalyzeContentTool {
394
394
  anticipation: ['excited', 'eager', 'looking forward', 'anticipating', 'expecting']
395
395
  };
396
396
 
397
- const words = text.toLowerCase().split(/\s+/);
397
+ const lowerText = text.toLowerCase();
398
+ const words = lowerText.split(/\s+/);
398
399
  const emotions = [];
399
400
 
400
401
  for (const [emotion, emotionKeywords] of Object.entries(emotionWords)) {
401
- const matches = words.filter(word => emotionKeywords.some(keyword => word.includes(keyword)));
402
- if (matches.length > 0) {
403
- const intensity = Math.min(1, matches.length / Math.max(words.length / 100, 1));
402
+ const matchCount = emotionKeywords.reduce((count, keyword) => {
403
+ const re = new RegExp(`\\b${keyword}\\b`, 'g');
404
+ const found = lowerText.match(re);
405
+ return count + (found ? found.length : 0);
406
+ }, 0);
407
+ if (matchCount > 0) {
408
+ const intensity = Math.min(1, matchCount / Math.max(words.length / 100, 1));
404
409
  emotions.push({
405
410
  emotion,
406
411
  intensity: Math.round(intensity * 100) / 100