npm - crawlforge-mcp-server - Versions diffs - 4.2.12 → 4.6.0 - Mend

crawlforge-mcp-server 4.2.12 → 4.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48)

package/CLAUDE.md +19 -7
package/README.md +11 -3
package/package.json +3 -2
package/server.js +195 -22
package/src/cli/commands/init.js +107 -0
package/src/cli/index.js +2 -0
package/src/constants/config.js +5 -0
package/src/core/ActionExecutor.js +13 -1
package/src/core/AgentOrchestrator.js +300 -0
package/src/core/AuthManager.js +21 -1
package/src/core/ChangeTracker.js +8 -5
package/src/core/LLMsTxtAnalyzer.js +71 -47
package/src/core/LocalizationManager.js +7 -4
package/src/core/ResearchOrchestrator.js +10 -6
package/src/core/StealthBrowserManager.js +52 -13
package/src/core/analysis/ContentAnalyzer.js +2 -2
package/src/core/crawlers/BFSCrawler.js +23 -12
package/src/core/processing/ContentProcessor.js +19 -3
package/src/core/processing/PDFProcessor.js +72 -23
package/src/tools/advanced/ScrapeWithActionsTool.js +63 -25
package/src/tools/advanced/batchScrape/index.js +3 -1
package/src/tools/advanced/batchScrape/reporter.js +5 -1
package/src/tools/advanced/batchScrape/worker.js +6 -1
package/src/tools/agent/agent.js +71 -0
package/src/tools/basic/_fetch.js +78 -5
package/src/tools/basic/extractLinks.js +1 -1
package/src/tools/basic/extractMetadata.js +65 -1
package/src/tools/basic/extractText.js +73 -5
package/src/tools/basic/scrapeStructured.js +48 -10
package/src/tools/crawl/crawlDeep.js +13 -5
package/src/tools/crawl/mapSite.js +53 -52
package/src/tools/extract/analyzeContent.js +11 -6
package/src/tools/extract/extractContent.js +23 -5
package/src/tools/extract/extractStructured.js +65 -16
package/src/tools/extract/extractWithLlm.js +192 -11
package/src/tools/extract/listOllamaModels.js +19 -8
package/src/tools/extract/processDocument.js +10 -4
package/src/tools/extract/summarizeContent.js +58 -1
package/src/tools/llmstxt/generateLLMsTxt.js +124 -3
package/src/tools/research/deepResearch.js +43 -4
package/src/tools/scrape/unifiedScrape.js +314 -0
package/src/tools/search/providers/searxng.js +2 -2
package/src/tools/search/ranking/ResultDeduplicator.js +32 -9
package/src/tools/search/ranking/ResultRanker.js +13 -4
package/src/tools/search/searchWeb.js +5 -5
package/src/tools/templates/TemplateRegistry.js +3 -2
package/src/tools/tracking/trackChanges/differ.js +33 -1
package/src/utils/htmlToMarkdown.js +5 -1

package/src/tools/basic/_fetch.js CHANGED Viewed

@@ -3,28 +3,43 @@
  * Applies an AbortController timeout and a default User-Agent.
  */
+import { config } from '../../constants/config.js';
+import { createRequire } from 'module';
+// Derive User-Agent from package version so it reflects the actual release.
+const _require = createRequire(import.meta.url);
+const _pkg = _require('../../../package.json');
+const CRAWLFORGE_UA = `CrawlForge/${_pkg.version} (+https://crawlforge.dev)`;
 /**
- * Fetch a URL with a configurable timeout.
+ * Fetch a URL with a configurable timeout and body-size cap.
+ *
+ * Content-Length is checked before the body is read; if absent or lying, the
+ * accumulated byte count is checked during streaming.  Both checks use the
+ * configurable cap from config.fetch.maxBodySize (env MAX_FETCH_BODY_SIZE,
+ * default 25 MB).
+ *
  * @param {string} url
  * @param {{ timeout?: number, headers?: Record<string,string> }} [options]
- * @returns {Promise<Response>}
+ * @returns {Promise<Response & { _body: string }>}
  */
 export async function fetchWithTimeout(url, options = {}) {
   const { timeout = 10000, headers = {} } = options;
+  const maxBodySize = config.fetch.maxBodySize;
   const controller = new AbortController();
   const timeoutId = setTimeout(() => controller.abort(), timeout);
+  let response;
   try {
-    const response = await fetch(url, {
+    response = await fetch(url, {
       signal: controller.signal,
       headers: {
-        'User-Agent': 'CrawlForge/1.0.0',
+        'User-Agent': CRAWLFORGE_UA,
         ...headers
       }
     });
     clearTimeout(timeoutId);
-    return response;
   } catch (error) {
     clearTimeout(timeoutId);
     if (error.name === 'AbortError') {
@@ -32,4 +47,62 @@ export async function fetchWithTimeout(url, options = {}) {
     }
     throw error;
   }
+  // --- Body-size cap ---
+  // Early rejection via Content-Length (servers may omit or lie — guard below
+  // handles that case). Optional-chained so non-standard responses (e.g. test
+  // mocks) without a Headers object don't throw.
+  const contentLengthHeader = response.headers?.get?.('content-length') ?? null;
+  if (contentLengthHeader !== null) {
+    const declared = parseInt(contentLengthHeader, 10);
+    if (!isNaN(declared) && declared > maxBodySize) {
+      throw new Error(
+        `Response body too large: Content-Length ${declared} exceeds limit of ${maxBodySize} bytes`
+      );
+    }
+  }
+  // Only the streaming byte-count guard requires a readable body. Responses
+  // without a ReadableStream body (already-buffered responses, test mocks)
+  // are returned unchanged so callers' native .text()/.json() still work.
+  if (!response.body || typeof response.body.getReader !== 'function') {
+    return response;
+  }
+  // Stream the body and abort if accumulated bytes exceed the cap.
+  const reader = response.body.getReader();
+  const chunks = [];
+  let totalBytes = 0;
+  while (true) {
+    const { done, value } = await reader.read();
+    if (done) break;
+    totalBytes += value.byteLength;
+    if (totalBytes > maxBodySize) {
+      reader.cancel();
+      throw new Error(
+        `Response body too large: exceeded limit of ${maxBodySize} bytes`
+      );
+    }
+    chunks.push(value);
+  }
+  // Reassemble and expose as a response-like object that callers can use.
+  const bodyText = new TextDecoder().decode(
+    chunks.reduce((acc, chunk) => {
+      const merged = new Uint8Array(acc.byteLength + chunk.byteLength);
+      merged.set(acc, 0);
+      merged.set(chunk, acc.byteLength);
+      return merged;
+    }, new Uint8Array(0))
+  );
+  // Attach the pre-read text so callers can call .text() on the result.
+  // We wrap it in a minimal compatible object.
+  return Object.assign(response, {
+    text: () => Promise.resolve(bodyText),
+    json: () => Promise.resolve(JSON.parse(bodyText)),
+    _body: bodyText
+  });
 }

package/src/tools/basic/extractLinks.js CHANGED Viewed

@@ -41,7 +41,7 @@ export async function extractLinksHandler({ url, filter_external, base_url }) {
           isExternal = false;
         }
-        if (filter_external && isExternal) return;
+        if (filter_external && !isExternal) return;
         links.push({ href: absoluteUrl, text, is_external: isExternal, original_href: href });
       } catch {

package/src/tools/basic/extractMetadata.js CHANGED Viewed

@@ -1,11 +1,64 @@
 /**
  * extract_metadata — Extract page metadata (title, description, OG tags, etc.).
  * Extracted from server.js inline handler.
+ * B1: Parse JSON-LD and microdata; stronger title fallback chain (og:title → <title> → h1).
  */
 import { load } from 'cheerio';
 import { fetchWithTimeout } from './_fetch.js';
+/**
+ * Parse all JSON-LD blocks from the document.
+ *
+ * Selects every `<script type="application/ld+json">` element, reads its
+ * inner content via cheerio's `.html()`, and pushes the `JSON.parse` result
+ * onto the output array. Blocks with empty content and blocks that fail to
+ * parse are skipped, so one malformed block never aborts the extraction.
+ * Each entry is exactly what `JSON.parse` returned (object, array, or
+ * primitive); nothing is flattened or merged.
+ *
+ * @param {import('cheerio').CheerioAPI} $ - loaded cheerio document
+ * @returns {Array} one parsed value per non-empty, valid JSON-LD block
+ */
+function parseJsonLd($) {
+  const results = [];
+  $('script[type="application/ld+json"]').each((_, el) => {
+    try {
+      const raw = $(el).html();
+      if (raw) results.push(JSON.parse(raw)); // null/empty script content is ignored
+    } catch {
+      // Skip invalid blocks (JSON.parse threw); keep processing the remaining scripts
+    }
+  });
+  return results;
+}
+/**
+ * Parse microdata items (elements with itemscope).
+ *
+ * Produces one `{ type, properties }` record per element carrying an
+ * `itemscope` attribute. `type` is the element's `itemtype` attribute or
+ * `null`. `properties` maps each `itemprop` attribute string to an array of
+ * the non-empty values found for it. An item is emitted even when no
+ * properties were collected.
+ *
+ * Value source per tag: `meta` -> `content`; `a`/`link` -> `href`;
+ * `img` -> `src`; `time` -> `datetime`, falling back to trimmed text;
+ * anything else -> trimmed text.
+ *
+ * NOTE(review): `find` matches all descendants, so itemprops inside a nested
+ * itemscope are recorded on the outer item as well as on the nested item's
+ * own entry — confirm this duplication is intended.
+ * NOTE(review): the raw `itemprop` attribute is used as the key, so a
+ * multi-token value such as "a b" becomes the single key "a b" — confirm.
+ *
+ * @param {import('cheerio').CheerioAPI} $ - loaded cheerio document
+ * @returns {Array} list of `{ type: string|null, properties: Record<string, string[]> }`
+ */
+function parseMicrodata($) {
+  const results = [];
+  $('[itemscope]').each((_, el) => {
+    const $el = $(el);
+    const item = {
+      type: $el.attr('itemtype') || null, // missing or empty itemtype becomes null
+      properties: {}
+    };
+    $el.find('[itemprop]').each((_, prop) => {
+      const $prop = $(prop);
+      const name = $prop.attr('itemprop');
+      if (!name) return; // itemprop present but empty: nothing to key on
+      const tag = ($prop.get(0).tagName || '').toLowerCase();
+      let value;
+      if (tag === 'meta') value = $prop.attr('content');
+      else if (tag === 'a' || tag === 'link') value = $prop.attr('href');
+      else if (tag === 'img') value = $prop.attr('src');
+      else if (tag === 'time') value = $prop.attr('datetime') || $prop.text().trim();
+      else value = $prop.text().trim();
+      if (value) { // undefined attributes and empty strings are dropped
+        if (!item.properties[name]) item.properties[name] = [];
+        item.properties[name].push(value); // repeated property names accumulate in order
+      }
+    });
+    results.push(item);
+  });
+  return results;
+}
 /**
  * @param {{ url: string }} params
  */
@@ -19,7 +72,13 @@ export async function extractMetadataHandler({ url }) {
     const html = await response.text();
     const $ = load(html);
-    const title = $('title').text().trim() || $('h1').first().text().trim();
+    // Stronger title fallback: og:title → <title> → h1
+    const title =
+      $('meta[property="og:title"]').attr('content') ||
+      $('title').text().trim() ||
+      $('h1').first().text().trim() ||
+      '';
     const description =
       $('meta[name="description"]').attr('content') ||
       $('meta[property="og:description"]').attr('content') || '';
@@ -47,6 +106,9 @@ export async function extractMetadataHandler({ url }) {
       $('meta[charset]').attr('charset') ||
       $('meta[http-equiv="Content-Type"]').attr('content') || '';
+    const jsonLd = parseJsonLd($);
+    const microdata = parseMicrodata($);
     return {
       content: [{
         type: 'text',
@@ -61,6 +123,8 @@ export async function extractMetadataHandler({ url }) {
           charset,
           og_tags: ogTags,
           twitter_tags: twitterTags,
+          json_ld: jsonLd,
+          microdata,
           url: response.url
         }, null, 2)
       }]

package/src/tools/basic/extractText.js CHANGED Viewed

@@ -2,12 +2,77 @@
  * extract_text — Extract clean text content from HTML.
  * Extracted from server.js inline handler.
  * D3.1: Added output_format:"markdown" option backed by Turndown.
+ * B1: Preserve block structure for text mode; use Readability + GFM for markdown mode.
  */
 import { load } from 'cheerio';
+import { JSDOM } from 'jsdom';
+import { Readability } from '@mozilla/readability';
 import { fetchWithTimeout } from './_fetch.js';
 import { htmlToMarkdown } from '../../utils/htmlToMarkdown.js';
+// Block-level elements whose boundaries should become paragraph breaks. Lookup is by lowercase tag name; extractBlockText emits "\n\n" before and after each element listed here.
+const BLOCK_ELEMENTS = new Set([
+  'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+  'li', 'blockquote', 'pre', 'td', 'th', 'dt', 'dd',
+  'article', 'section', 'figure', 'figcaption', 'aside',
+  'header', 'footer', 'main', 'nav', 'form', 'fieldset',
+  'table', 'tr', 'caption' // note: 'br', 'hr', 'ul', 'ol' are not listed, so they add no break of their own
+]);
+/**
+ * Extract plain text from a cheerio root preserving block-element paragraph breaks.
+ *
+ * Walks the children of `<body>` depth-first. Text nodes have runs of
+ * spaces, tabs and newlines collapsed to a single space and are kept only
+ * if something non-whitespace remains. Elements listed in BLOCK_ELEMENTS
+ * contribute a blank-line separator before and after their content. The
+ * joined output has runs of three or more newlines reduced to two and is
+ * trimmed. Returns an empty string when the document has no `<body>`.
+ *
+ * @param {import('cheerio').CheerioAPI} $ - loaded cheerio instance
+ * @returns {string} text with paragraphs separated by a single blank line
+ */
+export function extractBlockText($) {
+  const parts = [];
+  function walk(node) {
+    if (node.type === 'text') {
+      const t = node.data.replace(/[ \t\r\n]+/g, ' ');
+      if (t.trim()) parts.push(t); // pushed untrimmed: a leading/trailing single space survives so inline neighbours stay separated
+      return;
+    }
+    if (node.type !== 'tag') return; // non-element nodes (e.g. comments) are dropped with their subtree. NOTE(review): presumably also skips <script>/<style> if the parser gives them a distinct node type — confirm
+    const tag = node.tagName ? node.tagName.toLowerCase() : '';
+    const isBlock = BLOCK_ELEMENTS.has(tag);
+    if (isBlock) parts.push('\n\n');
+    for (const child of (node.children || [])) {
+      walk(child);
+    }
+    if (isBlock) parts.push('\n\n');
+  }
+  const body = $('body').get(0);
+  if (body) {
+    for (const child of (body.children || [])) walk(child);
+  }
+  return parts.join('').replace(/\n{3,}/g, '\n\n').trim(); // adjacent/nested block boundaries collapse to one blank line
+}
+/**
+ * Convert raw HTML to GFM markdown using Readability + Turndown.
+ * Accepts the original HTML string and the final URL (needed for Readability).
+ * Returns the markdown string.
+ *
+ * The HTML is parsed into a JSDOM document and handed to Readability. When
+ * `parse()` returns an article, its `content` HTML is converted; when it
+ * returns a falsy value, or when JSDOM/Readability throws, the full input
+ * HTML is converted instead, so this function itself only fails if
+ * `htmlToMarkdown` does.
+ *
+ * NOTE(review): this works on the HTML string exactly as passed in; cleanup
+ * a caller performed on a separate cheerio instance is not reflected unless
+ * the caller serializes that instance first — confirm against callers.
+ *
+ * @param {string} html - raw HTML
+ * @param {string} pageUrl - URL of the page (used by Readability)
+ * @returns {string} markdown produced by `htmlToMarkdown`
+ */
+export function readabilityToMarkdown(html, pageUrl) {
+  let articleHtml;
+  try {
+    const dom = new JSDOM(html, { url: pageUrl });
+    const reader = new Readability(dom.window.document);
+    const article = reader.parse();
+    articleHtml = article ? article.content : html; // no article extracted: fall back to the whole page
+  } catch {
+    articleHtml = html; // best-effort: any parse/extraction error also falls back to the whole page
+  }
+  return htmlToMarkdown(articleHtml);
+}
 /**
  * @param {{ url: string, remove_scripts?: boolean, remove_styles?: boolean, output_format?: "text"|"markdown" }} params
  */
@@ -26,20 +91,23 @@ export async function extractTextHandler({ url, remove_scripts, remove_styles, o
     $('nav, header, footer, aside, .advertisement, .ad, .sidebar').remove();
-    const text = $('body').text().replace(/\s+/g, ' ').trim();
     const result = {
-      word_count: text.split(/\s+/).filter(w => w.length > 0).length,
-      char_count: text.length,
       url: response.url
     };
     if (output_format === 'markdown') {
-      result.markdown = htmlToMarkdown($.html('body'));
+      // Run Readability first to get main content, then convert to GFM markdown
+      result.markdown = readabilityToMarkdown(html, response.url);
       result.output_format = 'markdown';
+      const plainText = result.markdown.replace(/[#*`_\[\]]/g, '').replace(/\s+/g, ' ').trim();
+      result.word_count = plainText.split(/\s+/).filter(w => w.length > 0).length;
+      result.char_count = plainText.length;
     } else {
+      const text = extractBlockText($);
       result.text = text;
       result.output_format = 'text';
+      result.word_count = text.split(/\s+/).filter(w => w.length > 0).length;
+      result.char_count = text.length;
     }
     return {

package/src/tools/basic/scrapeStructured.js CHANGED Viewed

@@ -1,15 +1,33 @@
 /**
  * scrape_structured — Extract structured data using CSS selectors.
  * Extracted from server.js inline handler.
+ * B1: Support attribute extraction (selector@attr), add max_results,
+ *     fix elements_found to report real per-field DOM match counts.
  */
 import { load } from 'cheerio';
 import { fetchWithTimeout } from './_fetch.js';
 /**
- * @param {{ url: string, selectors: Record<string, string> }} params
+ * Parse a selector string that may include an attribute suffix: "css@attr"
+ * e.g. "a.link@href" -> { selector: "a.link", attribute: "href" }
+ *      "img@src"      -> { selector: "img",    attribute: "src" }
+ *      "h1"           -> { selector: "h1",      attribute: null }
+ * @param {string} raw
+ * @returns {{ selector: string, attribute: string|null }}
  */
-export async function scrapeStructuredHandler({ url, selectors }) {
+function parseSelectorSpec(raw) {
+  const atIdx = raw.lastIndexOf('@'); // split on the LAST '@'. NOTE(review): an '@' inside an attribute-value selector (e.g. [href*="@"]) would also be treated as the separator — confirm acceptable
+  if (atIdx > 0) { // index 0 (string starts with '@') and -1 (no '@') both mean "no attribute suffix"
+    return { selector: raw.slice(0, atIdx), attribute: raw.slice(atIdx + 1) }; // a trailing '@' yields attribute '' (empty string, not null)
+  }
+  return { selector: raw, attribute: null };
+}
+/**
+ * @param {{ url: string, selectors: Record<string, string>, max_results?: number }} params
+ */
+export async function scrapeStructuredHandler({ url, selectors, max_results }) {
   try {
     const response = await fetchWithTimeout(url);
     if (!response.ok) {
@@ -19,22 +37,42 @@ export async function scrapeStructuredHandler({ url, selectors }) {
     const html = await response.text();
     const $ = load(html);
     const results = {};
+    const matchCounts = {};
-    for (const [fieldName, selector] of Object.entries(selectors)) {
+    for (const [fieldName, rawSelector] of Object.entries(selectors)) {
       try {
-        const elements = $(selector);
-        if (elements.length === 0) {
+        const { selector, attribute } = parseSelectorSpec(rawSelector);
+        let elements = $(selector);
+        const domCount = elements.length;
+        matchCounts[fieldName] = domCount;
+        if (domCount === 0) {
           results[fieldName] = null;
-        } else if (elements.length === 1) {
-          results[fieldName] = elements.text().trim();
         } else {
-          results[fieldName] = elements.map((_, el) => $(el).text().trim()).get();
+          // Apply max_results cap if specified
+          if (max_results != null && max_results > 0 && domCount > max_results) {
+            elements = elements.slice(0, max_results);
+          }
+          const extract = (el) => {
+            if (attribute) {
+              return $(el).attr(attribute) ?? null;
+            }
+            return $(el).text().trim();
+          };
+          if (elements.length === 1) {
+            results[fieldName] = extract(elements.get(0));
+          } else {
+            results[fieldName] = elements.map((_, el) => extract(el)).get();
+          }
         }
       } catch (selectorError) {
         results[fieldName] = {
-          error: `Invalid selector: ${selector}`,
+          error: `Invalid selector: ${rawSelector}`,
           message: selectorError.message
         };
+        matchCounts[fieldName] = 0;
       }
     }
@@ -44,7 +82,7 @@ export async function scrapeStructuredHandler({ url, selectors }) {
         text: JSON.stringify({
           data: results,
           selectors_used: selectors,
-          elements_found: Object.keys(results).length,
+          elements_found: matchCounts,
           url: response.url
         }, null, 2)
       }]

package/src/tools/crawl/crawlDeep.js CHANGED Viewed

@@ -14,6 +14,7 @@ const CrawlDeepSchema = z.object({
   follow_external: z.boolean().optional().default(false),
   respect_robots: z.boolean().optional().default(true),
   extract_content: z.boolean().optional().default(true),
+  content_max_length: z.number().min(1).max(100000).optional().default(500),
   concurrency: z.number().min(1).max(20).optional().default(10),
   enable_link_analysis: z.boolean().optional().default(true),
   link_analysis_options: z.object({
@@ -217,7 +218,7 @@ export class CrawlDeepTool {
         errors: results.errors.length,
         duration_ms: duration,
         pages_per_second: results.urls.length / (duration / 1000),
-        results: this.formatResults(results.results, validated.extract_content),
+        results: this.formatResults(results.results, validated.extract_content, validated.content_max_length),
         errors: results.errors,
         stats: results.stats,
         site_structure: this.analyzeSiteStructure(results.urls),
@@ -240,7 +241,7 @@ export class CrawlDeepTool {
     }
   }
-  formatResults(results, includeContent) {
+  formatResults(results, includeContent, contentMaxLength = 500) {
     return results.map(result => {
       const formatted = {
         url: result.url,
@@ -250,12 +251,19 @@ export class CrawlDeepTool {
         content_length: result.contentLength,
         timestamp: result.timestamp
       };
       if (includeContent) {
-        formatted.content = result.content ? result.content.substring(0, 500) + '...' : '';
+        const raw = result.content || '';
+        if (raw.length > contentMaxLength) {
+          formatted.content = raw.substring(0, contentMaxLength);
+          formatted.truncated = true;
+        } else {
+          formatted.content = raw;
+          formatted.truncated = false;
+        }
         formatted.metadata = result.metadata;
       }
       return formatted;
     });
   }

package/src/tools/crawl/mapSite.js CHANGED Viewed

@@ -3,6 +3,15 @@ import { load } from 'cheerio';
 import { DomainFilter } from '../../utils/domainFilter.js';
 import { normalizeUrl, getBaseUrl } from '../../utils/urlNormalizer.js';
 import { CacheManager } from '../../core/cache/CacheManager.js';
+import { SitemapParser } from '../../utils/sitemapParser.js';
+import { ResultRanker } from '../search/ranking/ResultRanker.js';
+// Lazy singleton — avoids creating a CacheManager timer per request. The instance lives in module scope and is built on first use, then shared by every later call in this module.
+let _ranker = null; // null until getRanker() is first called
+function getRanker() {
+  if (!_ranker) _ranker = new ResultRanker({ cacheEnabled: false }); // constructed once, with the ranker's cache turned off
+  return _ranker;
+}
 const MapSiteSchema = z.object({
   url: z.string().url(),
@@ -17,7 +26,8 @@ const MapSiteSchema = z.object({
     include_patterns: z.array(z.string()).optional().default([]),
     exclude_patterns: z.array(z.string()).optional().default([])
   }).optional(),
-  import_filter_config: z.string().optional() // JSON string of exported config
+  import_filter_config: z.string().optional(), // JSON string of exported config
+  search: z.string().optional() // when set, rank URLs by relevance and emit ranked_urls
 });
 export class MapSiteTool {
@@ -33,6 +43,7 @@ export class MapSiteTool {
     this.timeout = timeout;
     // Per-session result cache: avoids redundant site maps for the same root URL
     this.cache = cacheEnabled ? new CacheManager({ ttl: cacheTTL }) : null;
+    this.sitemapParser = new SitemapParser({ userAgent, timeout, enableCaching: cacheEnabled, cacheTTL });
   }
   async execute(params) {
@@ -118,6 +129,25 @@ export class MapSiteTool {
         filter_stats: domainFilter ? domainFilter.getStats() : null
       };
+      // Optional: rank URLs by relevance to a search string
+      if (validated.search) {
+        try {
+          const rankerInput = urlArray.map(url => {
+            let title = url;
+            try {
+              const { pathname } = new URL(url);
+              title = decodeURIComponent(pathname).replace(/[-_/]/g, ' ').trim();
+            } catch { /* keep raw url */ }
+            return { link: url, title, snippet: '' };
+          });
+          const ranked = await getRanker().rankResults(rankerInput, validated.search);
+          result.ranked_urls = ranked.map(r => ({ url: r.link, score: r.finalScore ?? 0 }));
+        } catch {
+          // ranking is best-effort; don't fail the whole call
+          result.ranked_urls = urlArray.map(u => ({ url: u, score: 0 }));
+        }
+      }
       // Store in cache before returning
       if (this.cache) {
         const cacheKey = this.cache.generateKey('map_site', { url: validated.url, maxUrls: validated.max_urls });
@@ -131,61 +161,32 @@ export class MapSiteTool {
   }
   async fetchSitemapUrls(baseUrl, domainFilter = null) {
+    // Discover sitemaps via robots.txt and common paths, then parse with full
+    // SitemapParser support (sitemap-index recursion, gzip, CDATA/entities).
+    const discovered = await this.sitemapParser.discoverSitemaps(baseUrl, {
+      checkRobotsTxt: true,
+      checkCommonPaths: true,
+      checkSitemapIndex: false
+    });
     const urls = new Set();
-    const sitemapUrls = [
-      `${baseUrl}/sitemap.xml`,
-      `${baseUrl}/sitemap_index.xml`,
-      `${baseUrl}/sitemap-index.xml`,
-      `${baseUrl}/sitemaps.xml`
-    ];
-    for (const sitemapUrl of sitemapUrls) {
+    for (const sitemapUrl of discovered) {
       try {
-        const response = await this.fetchWithTimeout(sitemapUrl);
-        if (response.ok) {
-          const xml = await response.text();
-          const extractedUrls = this.parseSitemap(xml);
-          // Apply domain filter if provided
-          extractedUrls.forEach(url => {
+        const parsed = await this.sitemapParser.parseSitemap(sitemapUrl, {
+          includeMetadata: false,
+          followIndexes: true
+        });
+        if (parsed.success) {
+          for (const entry of parsed.urls) {
+            const url = entry.loc || entry;
             if (!domainFilter || domainFilter.isAllowed(url).allowed) {
               urls.add(url);
             }
-          });
-          // If we found a sitemap, don't try others
-          if (urls.size > 0) break;
+          }
         }
+        if (urls.size > 0) break;
       } catch {
-        // Continue to next sitemap URL
-      }
-    }
-    return Array.from(urls);
-  }
-  parseSitemap(xml) {
-    const urls = new Set();
-    // Extract URLs from sitemap
-    const urlMatches = xml.match(/<loc>([^<]+)<\/loc>/g);
-    if (urlMatches) {
-      urlMatches.forEach(match => {
-        const url = match.replace(/<\/?loc>/g, '').trim();
-        if (url) urls.add(url);
-      });
-    }
-    // Check for nested sitemaps (sitemap index)
-    const sitemapMatches = xml.match(/<sitemap>[\s\S]*?<\/sitemap>/g);
-    if (sitemapMatches) {
-      for (const sitemapMatch of sitemapMatches) {
-        const locMatch = sitemapMatch.match(/<loc>([^<]+)<\/loc>/);
-        if (locMatch && locMatch[1]) {
-          // We could recursively fetch nested sitemaps here
-          // For now, just add the sitemap URL itself
-          urls.add(locMatch[1]);
-        }
+        // Continue to next discovered sitemap
       }
     }
@@ -362,7 +363,7 @@ export class MapSiteTool {
       max_depth: 0,
       average_depth: 0,
       url_lengths: {
-        min: Infinity,
+        min: null,
         max: 0,
         average: 0
       }
@@ -374,7 +375,7 @@ export class MapSiteTool {
     for (const url of urls) {
       try {
         const urlObj = new URL(url);
         // Count secure URLs
         if (urlObj.protocol === 'https:') {
           stats.secure_urls++;
@@ -396,7 +397,7 @@ export class MapSiteTool {
         // Track URL lengths
         const length = url.length;
         totalLength += length;
-        stats.url_lengths.min = Math.min(stats.url_lengths.min, length);
+        stats.url_lengths.min = stats.url_lengths.min === null ? length : Math.min(stats.url_lengths.min, length);
         stats.url_lengths.max = Math.max(stats.url_lengths.max, length);
         // Track file extensions

package/src/tools/extract/analyzeContent.js CHANGED Viewed

@@ -266,9 +266,9 @@ export class AnalyzeContentTool {
     };
     const keywordStr = keywords.join(' ').toLowerCase();
     for (const [category, categoryKeywords] of Object.entries(categories)) {
-      const matches = categoryKeywords.filter(word => keywordStr.includes(word));
+      const matches = categoryKeywords.filter(word => new RegExp(`\\b${word}\\b`).test(keywordStr));
       if (matches.length > 0) {
         return category;
       }
@@ -394,13 +394,18 @@ export class AnalyzeContentTool {
       anticipation: ['excited', 'eager', 'looking forward', 'anticipating', 'expecting']
     };
-    const words = text.toLowerCase().split(/\s+/);
+    const lowerText = text.toLowerCase();
+    const words = lowerText.split(/\s+/);
     const emotions = [];
     for (const [emotion, emotionKeywords] of Object.entries(emotionWords)) {
-      const matches = words.filter(word => emotionKeywords.some(keyword => word.includes(keyword)));
-      if (matches.length > 0) {
-        const intensity = Math.min(1, matches.length / Math.max(words.length / 100, 1));
+      const matchCount = emotionKeywords.reduce((count, keyword) => {
+        const re = new RegExp(`\\b${keyword}\\b`, 'g');
+        const found = lowerText.match(re);
+        return count + (found ? found.length : 0);
+      }, 0);
+      if (matchCount > 0) {
+        const intensity = Math.min(1, matchCount / Math.max(words.length / 100, 1));
         emotions.push({
           emotion,
           intensity: Math.round(intensity * 100) / 100