npm - crawlforge-mcp-server - Versions diffs - 3.5.1 → 4.2.2 - Mend

crawlforge-mcp-server 3.5.1 → 4.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

package/package.json +6 -4
package/server.js +138 -26
package/src/cli/commands/actions.js +36 -0
package/src/cli/commands/analyze.js +19 -0
package/src/cli/commands/batch.js +45 -0
package/src/cli/commands/crawl.js +30 -0
package/src/cli/commands/extract.js +45 -0
package/src/cli/commands/install-skills.js +46 -0
package/src/cli/commands/llmstxt.js +24 -0
package/src/cli/commands/localize.js +29 -0
package/src/cli/commands/map.js +26 -0
package/src/cli/commands/monitor.js +29 -0
package/src/cli/commands/research.js +26 -0
package/src/cli/commands/scrape.js +37 -0
package/src/cli/commands/search.js +28 -0
package/src/cli/commands/stealth.js +29 -0
package/src/cli/commands/template.js +26 -0
package/src/cli/commands/track.js +24 -0
package/src/cli/commands/uninstall-skills.js +35 -0
package/src/cli/formatter.js +57 -0
package/src/cli/index.js +94 -0
package/src/cli/lib/runTool.js +40 -0
package/src/core/ActionExecutor.js +8 -6
package/src/core/AuthManager.js +103 -3
package/src/core/ChangeTracker.js +34 -0
package/src/core/ElicitationHelper.js +112 -0
package/src/core/JobManager.js +36 -2
package/src/core/LocalizationManager.js +19 -5
package/src/core/PerformanceManager.js +53 -17
package/src/core/ResearchOrchestrator.js +40 -5
package/src/core/SamplingClient.js +191 -0
package/src/core/StealthBrowserManager.js +248 -2
package/src/core/WebhookDispatcher.js +18 -10
package/src/prompts/PromptRegistry.js +199 -0
package/src/resources/ResourceRegistry.js +273 -0
package/src/server/withAuth.js +25 -0
package/src/skills/crawlforge-cli.md +157 -0
package/src/skills/crawlforge-mcp.md +80 -0
package/src/skills/crawlforge-research.md +104 -0
package/src/skills/crawlforge-stealth.md +98 -0
package/src/skills/installer.js +141 -0
package/src/tools/advanced/batchScrape/index.js +30 -0
package/src/tools/advanced/batchScrape/schema.js +1 -1
package/src/tools/basic/extractText.js +19 -8
package/src/tools/crawl/crawlDeep.js +27 -0
package/src/tools/extract/extractContent.js +5 -17
package/src/tools/extract/extractStructured.js +8 -0
package/src/tools/extract/extractWithLlm.js +25 -5
package/src/tools/extract/processDocument.js +7 -1
package/src/tools/extract/summarizeContent.js +17 -0
package/src/tools/research/deepResearch.js +34 -0
package/src/tools/templates/ScrapeTemplateTool.js +68 -0
package/src/tools/templates/TemplateRegistry.js +311 -0
package/src/utils/Logger.js +15 -0
package/src/utils/htmlToMarkdown.js +54 -0
package/src/utils/secretMask.js +86 -0

package/src/tools/extract/extractWithLlm.js CHANGED Viewed

@@ -8,6 +8,15 @@
  */
 import { fetchAndParse } from './_fetchAndParse.js';
+// D1.3: SamplingClient for MCP sampling fallback (lazy — only imported if needed)
+let _SamplingClient = null;
+async function getSamplingClient() {
+  if (!_SamplingClient) {
+    const mod = await import('../../core/SamplingClient.js');
+    _SamplingClient = mod.SamplingClient;
+  }
+  return _SamplingClient;
+}
 // ── Constants ─────────────────────────────────────────────────────────────────
@@ -297,14 +306,25 @@ export class ExtractWithLlm {
     const userMessage = buildUserMessage(prompt, text, schema);
-    // Step 2: First LLM call
-    let rawText, usage;
+    // Step 2: First LLM call — with sampling fallback for 'auto' provider
+    // Fallback chain: Ollama → API key (handled by resolveProvider) → sampling → error
+    let rawText, usage, resolvedModel = model;
     try {
       ({ rawText, usage } = await callLLM({
         provider, apiKey, model, systemMessage, userMessage, maxTokens, schema
       }));
     } catch (llmErr) {
-      return { success: false, error: `LLM call failed: ${llmErr.message}` };
+      // D1.3: If provider is 'auto'/'ollama' and it failed, try sampling as final fallback
+      if (providerParam === 'auto' || providerParam === 'ollama') {
+        try {
+          ({ rawText, usage } = await callViaSampling({ systemMessage, userMessage, maxTokens }));
+          resolvedModel = 'sampling';
+        } catch (samplingErr) {
+          return { success: false, error: `LLM call failed: ${llmErr.message}. Sampling fallback also failed: ${samplingErr.message}` };
+        }
+      } else {
+        return { success: false, error: `LLM call failed: ${llmErr.message}` };
+      }
     }
     // Step 3: Parse JSON; retry once with stricter prompt if it fails
@@ -345,8 +365,8 @@ export class ExtractWithLlm {
     return {
       success: true,
       data: parsed,
-      provider,
-      model,
+      provider: resolvedModel === 'sampling' ? 'sampling' : provider,
+      model: resolvedModel || model,
       usage
     };
   }

package/src/tools/extract/processDocument.js CHANGED Viewed

@@ -8,6 +8,7 @@ import { PDFProcessor } from '../../core/processing/PDFProcessor.js';
 import { ContentProcessor } from '../../core/processing/ContentProcessor.js';
 import { BrowserProcessor } from '../../core/processing/BrowserProcessor.js';
 import { HTMLCleaner, ContentQualityAssessor } from '../../utils/contentUtils.js';
+import { htmlToMarkdown } from '../../utils/htmlToMarkdown.js'; // D3.1
 const ProcessDocumentSchema = z.object({
   source: z.string().min(1),
@@ -28,7 +29,7 @@ const ProcessDocumentSchema = z.object({
     // Processing options
     assessContentQuality: z.boolean().default(true),
     includeStatistics: z.boolean().default(true),
-    outputFormat: z.enum(['text', 'structured', 'full']).default('structured'),
+    outputFormat: z.enum(['text', 'structured', 'full', 'markdown']).default('structured'),
     // Content filtering
     minContentLength: z.number().min(0).default(50),
@@ -328,6 +329,11 @@ export class ProcessDocumentTool {
       result.content.html = html;
     }
+    // D3.1: Markdown output mode — convert extracted HTML to markdown via Turndown
+    if (options.outputFormat === 'markdown') {
+      result.content.markdown = htmlToMarkdown(extractedContent || html);
+    }
     // Step 4: Set metadata
     if (processingResult.metadata) {
       result.metadata = {

package/src/tools/extract/summarizeContent.js CHANGED Viewed

@@ -4,6 +4,15 @@
  */
 import { z } from 'zod';
+// D1.3: lazy SamplingClient for abstractive mode when no LLM keys are set
+let _SamplingClient = null;
+async function getSamplingClient() {
+  if (!_SamplingClient) {
+    const mod = await import('../../core/SamplingClient.js');
+    _SamplingClient = mod.SamplingClient;
+  }
+  return _SamplingClient;
+}
 import { ContentAnalyzer } from '../../core/analysis/ContentAnalyzer.js';
 import { splitSentences } from '../../core/analysis/sentenceUtils.js';
@@ -122,6 +131,14 @@ export class SummarizeContentTool {
       // Step 2: Set summary result
       result.summary = analysisResult.summary;
+      // D1.3: If abstractive mode requested, attempt sampling-based enhancement
+      if (options.summaryType === 'abstractive') {
+        const abstractive = await this._abstractiveSummaryViaSampling(text, analysisResult.summary, options.summaryLength);
+        if (abstractive) {
+          result.summary = abstractive;
+        }
+      }
       // Step 3: Extract key points if requested
       if (options.includeKeypoints) {
         result.keypoints = await this.extractKeyPoints(text, analysisResult.summary);

package/src/tools/research/deepResearch.js CHANGED Viewed

@@ -1,4 +1,6 @@
 import { z } from 'zod';
+// D1.4: Elicitation helper (injected from server.js or can be used standalone)
+import { ElicitationHelper } from '../../core/ElicitationHelper.js';
 import { ResearchOrchestrator } from '../../core/ResearchOrchestrator.js';
 import { Logger } from '../../utils/Logger.js';
@@ -93,6 +95,17 @@ export class DeepResearchTool {
       cacheTTL,
       ...orchestratorOptions
     };
+    // D1.4: Elicitation helper (set mcpServer via setMcpServer() after instantiation)
+    this._elicitation = new ElicitationHelper({});
+  }
+  /**
+   * D1.4: Set the MCP server instance for elicitation support.
+   * Call this from server.js after instantiating DeepResearchTool.
+   * Replaces `this._elicitation` with a new ElicitationHelper constructed
+   * from `{ mcpServer }`; the helper assigned previously is discarded.
+   * @param {object} mcpServer
+   */
+  setMcpServer(mcpServer) {
+    this._elicitation = new ElicitationHelper({ mcpServer });
+  }
   async execute(params) {
@@ -116,6 +129,27 @@ export class DeepResearchTool {
         };
       }
+      // D1.4: Elicitation — warn user if projected cost exceeds 50 credits
+      // deep_research costs approximately 1 credit per URL; maxUrls > 50 → confirm
+      if (validated.maxUrls > 50) {
+        const projectedCredits = validated.maxUrls;
+        const proceed = await this._elicitation.confirm(
+          `deep_research will scan up to ${validated.maxUrls} URLs, projecting ~${projectedCredits} credits.`,
+          {
+            topic: validated.topic,
+            projected_credits: projectedCredits,
+            max_urls: validated.maxUrls,
+          }
+        );
+        if (!proceed) {
+          return {
+            success: false,
+            error: 'Research cancelled by user before starting (elicitation declined).',
+            sessionId,
+          };
+        }
+      }
       // Configure research orchestrator based on research approach
       const orchestratorConfig = this.buildOrchestratorConfig(validated);
       const orchestrator = new ResearchOrchestrator(orchestratorConfig);

package/src/tools/templates/ScrapeTemplateTool.js ADDED Viewed

@@ -0,0 +1,68 @@
+/**
+ * ScrapeTemplateTool — wraps TemplateRegistry to expose the `scrape_template` MCP tool.
+ *
+ * Usage pattern (D3.3):
+ *   const tool = new ScrapeTemplateTool();
+ *   const result = await tool.execute({ template: "github-repo", url: "https://github.com/user/repo" });
+ */
+import { TemplateRegistry } from './TemplateRegistry.js';
+export class ScrapeTemplateTool {
+  constructor() {
+    this.registry = new TemplateRegistry();
+  }
+  /**
+   * Execute the scrape_template tool.
+   * @param {{ template: string, url: string, timeout?: number }} params
+   * @returns {Promise<object>}
+   */
+  async execute({ template, url, timeout = 15000 }) {
+    // list mode — return available templates without scraping
+    if (template === 'list' || !url) {
+      return {
+        templates: this.registry.list(),
+        count: this.registry.list().length
+      };
+    }
+    // Validate template exists before making network call
+    const tpl = this.registry.get(template);
+    if (!tpl) {
+      const available = this.registry.list().map(t => t.id).join(', ');
+      throw new Error(`Unknown template "${template}". Available templates: ${available}`);
+    }
+    // Fetch the page
+    const controller = new AbortController();
+    const timeoutId = setTimeout(() => controller.abort(), timeout);
+    let html;
+    try {
+      const response = await fetch(url, {
+        signal: controller.signal,
+        headers: {
+          'User-Agent': 'Mozilla/5.0 (compatible; CrawlForge-TemplateScraper/4.0)'
+        }
+      });
+      clearTimeout(timeoutId);
+      if (!response.ok) {
+        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
+      }
+      html = await response.text();
+    } catch (error) {
+      clearTimeout(timeoutId);
+      if (error.name === 'AbortError') {
+        throw new Error(`Request timeout after ${timeout}ms`);
+      }
+      throw error;
+    }
+    // Run the template extractor
+    const result = await this.registry.run(template, html, url);
+    return result;
+  }
+}
+export default ScrapeTemplateTool;

package/src/tools/templates/TemplateRegistry.js ADDED Viewed

@@ -0,0 +1,311 @@
+/**
+ * TemplateRegistry — pre-built scraping templates for popular sites (D3.3).
+ *
+ * Each template is a self-contained object with:
+ *   id            — unique slug used as the `template` parameter
+ *   name          — human-readable name
+ *   description   — when to use this template
+ *   targetPattern — regex describing the URLs this template is meant for
+ *   extract($)    — function that receives the Cheerio root of the page and
+ *                   returns a plain object of extracted fields
+ *
+ * Templates do NOT make network calls.  The ScrapeTemplateTool fetches the
+ * page and passes the raw HTML to TemplateRegistry#run(), which parses it
+ * with Cheerio and hands the result to the template's extract() method.
+ */
+import { load } from 'cheerio';
+// ── Helpers ──────────────────────────────────────────────────────────────────
+function text($, sel) {
+  return $(sel).first().text().trim() || null;
+}
+function attr($, sel, attribute) {
+  return $(sel).first().attr(attribute) || null;
+}
+function list($, sel) {
+  return $(sel).map((_, el) => $(el).text().trim()).get().filter(Boolean);
+}
+function listAttr($, sel, attribute) {
+  return $(sel).map((_, el) => $(el).attr(attribute)).get().filter(Boolean);
+}
+// ── Template definitions ─────────────────────────────────────────────────────
+const TEMPLATES = [
+  {
+    id: 'amazon-product',
+    name: 'Amazon Product',
+    description: 'Scrape an Amazon product page for title, price, rating, reviews, ASIN, and description.',
+    targetPattern: /amazon\.(com|co\.uk|de|fr|jp|ca|com\.au)/i,
+    extract($) {
+      return {
+        title: text($, '#productTitle'),
+        price: text($, '.a-price .a-offscreen') || text($, '#priceblock_ourprice') || text($, '#priceblock_dealprice'),
+        currency: attr($, 'meta[itemprop="priceCurrency"]', 'content'),
+        rating: text($, '#acrPopover .a-size-base'),
+        review_count: text($, '#acrCustomerReviewText'),
+        asin: text($, 'input#ASIN') || attr($, 'input[name="ASIN"]', 'value'),
+        brand: text($, '#bylineInfo'),
+        description: text($, '#productDescription p') || text($, '#feature-bullets'),
+        images: listAttr($, '#altImages img.a-thumbnail-image', 'src').slice(0, 8),
+        availability: text($, '#availability span'),
+        category_breadcrumb: list($, '#wayfinding-breadcrumbs_feature_div a')
+      };
+    }
+  },
+  { // Template for LinkedIn /in/ profile URLs; reads the fetched static markup only.
+    id: 'linkedin-profile',
+    name: 'LinkedIn Profile',
+    description: 'Scrape a LinkedIn public profile for name, headline, location, and about section.',
+    targetPattern: /linkedin\.com\/in\//i,
+    extract($) { // selector-backed fields are null when none of their selectors yields text
+      return {
+        name: text($, 'h1') || text($, '.top-card-layout__title'),
+        headline: text($, '.top-card-layout__headline') || text($, 'h2'),
+        location: text($, '.top-card-layout__first-subline') || text($, '.profile-info-subheader'),
+        about: text($, '.core-section-container__content p') || text($, '.summary'),
+        connections: text($, '.top-card__connections'),
+        current_company: text($, '.top-card-layout__card-inner-full-width .top-card-link'),
+        note: 'LinkedIn requires authentication for full profiles. This template works on public profile pages only.' // constant caveat, not scraped
+      };
+    }
+  },
+  { // Template for GitHub repository root pages.
+    id: 'github-repo',
+    name: 'GitHub Repository',
+    description: 'Scrape a GitHub repository page for stars, forks, description, language, topics, and README summary.',
+    targetPattern: /github\.com\/[^/]+\/[^/]+\/?$/i, // end-anchored: matches only URLs ending in owner/repo
+    extract($) { // scalar fields are null when no selector yields a value; `topics` is an array
+      return {
+        name: text($, 'strong[itemprop="name"] a') || text($, '.repository-content h1'),
+        description: attr($, 'meta[property="og:description"]', 'content') || text($, 'p.f4.my-3'),
+        stars: text($, '#repo-stars-counter-star') || text($, '[aria-label*="stargazers"]'),
+        forks: text($, '#repo-network-counter') || text($, '[aria-label*="forks"]'),
+        watchers: text($, '[aria-label*="watchers"]'),
+        language: text($, 'span[itemprop="programmingLanguage"]') || text($, '.d-inline-flex[class*="language"]'),
+        topics: list($, 'a.topic-tag'),
+        license: text($, 'a[href*="blob/"][href*="LICENSE"]') || text($, '.octicon-law ~ span'),
+        last_updated: attr($, 'relative-time', 'datetime'), // datetime of the first <relative-time> element on the page
+        homepage: attr($, 'a[href][rel="noopener noreferrer"]', 'href'), // first link with this exact rel value
+        open_issues: text($, '.Counter[aria-label*="issue"]')
+      };
+    }
+  },
+  {
+    id: 'youtube-video',
+    name: 'YouTube Video',
+    description: 'Scrape a YouTube video page for title, channel, views, likes, publish date, and description.',
+    targetPattern: /youtube\.com\/watch/i,
+    extract($) {
+      return {
+        title: attr($, 'meta[name="title"]', 'content') || attr($, 'meta[property="og:title"]', 'content'),
+        channel: attr($, 'link[itemprop="name"]', 'content') || text($, '#channel-name'),
+        channel_url: attr($, 'span[itemprop="author"] link[itemprop="url"]', 'href'),
+        views: attr($, 'meta[itemprop="interactionCount"]', 'content'),
+        published: attr($, 'meta[itemprop="uploadDate"]', 'content') || attr($, 'meta[itemprop="datePublished"]', 'content'),
+        description: attr($, 'meta[property="og:description"]', 'content'),
+        thumbnail: attr($, 'meta[property="og:image"]', 'content'),
+        duration: attr($, 'meta[itemprop="duration"]', 'content'),
+        video_id: new URL($('link[rel="canonical"]').attr('href') || 'https://youtube.com').searchParams.get('v')
+      };
+    }
+  },
+  { // Template for twitter.com / x.com status URLs; reads Open Graph and canonical tags only.
+    id: 'tweet',
+    name: 'Tweet / X Post',
+    description: 'Scrape a tweet/X post for text, author, timestamp, likes, and retweets from the Open Graph / structured data.',
+    targetPattern: /(twitter|x)\.com\/[^/]+\/status\//i,
+    extract($) { // scraped fields are null when the corresponding tag is absent
+      return {
+        text: attr($, 'meta[property="og:description"]', 'content'),
+        author: attr($, 'meta[property="og:title"]', 'content'),
+        url: attr($, 'meta[property="og:url"]', 'content') || attr($, 'link[rel="canonical"]', 'href'),
+        image: attr($, 'meta[property="og:image"]', 'content'),
+        note: 'X.com requires JavaScript rendering for full tweet data. Structured metadata is returned from static HTML.' // constant caveat, not scraped
+      };
+    }
+  },
+  {
+    id: 'reddit-thread',
+    name: 'Reddit Thread',
+    description: 'Scrape a Reddit thread for title, subreddit, score, comment count, author, and top-level comments.',
+    targetPattern: /reddit\.com\/r\/[^/]+\/comments\//i,
+    extract($) {
+      return {
+        title: attr($, 'meta[property="og:title"]', 'content') || text($, 'h1'),
+        subreddit: text($, 'a[href*="/r/"][class*="subreddit"]') || (($('title').text().match(/r\/([^•]+)/) || [])[1] || '').trim(),
+        score: text($, '[data-score]') || attr($, '[itemprop="upvoteCount"]', 'content'),
+        author: text($, 'a[href*="/user/"]'),
+        posted: attr($, 'time[datetime]', 'datetime'),
+        body: text($, 'div[data-click-id="text"] p') || attr($, 'meta[property="og:description"]', 'content'),
+        url: attr($, 'meta[property="og:url"]', 'content'),
+        flair: text($, '[class*="flair"]')
+      };
+    }
+  },
+  {
+    id: 'hacker-news-front-page',
+    name: 'Hacker News Front Page',
+    description: 'Scrape the Hacker News front page for a list of stories with title, URL, score, and comment count.',
+    targetPattern: /news\.ycombinator\.com(\/news)?$/i,
+    extract($) {
+      const stories = [];
+      $('tr.athing').each((_, el) => {
+        const $row = $(el);
+        const $score = $row.next('.spacer').find('.score');
+        const $subtext = $row.next('.spacer').find('.subtext');
+        const $titleLink = $row.find('.titleline > a');
+        stories.push({
+          id: $row.attr('id'),
+          title: $titleLink.text().trim(),
+          url: $titleLink.attr('href'),
+          site: $row.find('.sitebit a').text().trim() || null,
+          score: $score.text().replace(' points', '').trim() || null,
+          author: $subtext.find('.hnuser').text().trim() || null,
+          posted: $subtext.find('.age a').attr('href') || null,
+          comments: $subtext.find('a[href*="item"]').last().text().trim() || null
+        });
+      });
+      return { stories: stories.slice(0, 30), scraped_at: new Date().toISOString() };
+    }
+  },
+  { // Template for Product Hunt /posts/ pages; mostly Open Graph tags plus a few DOM selectors.
+    id: 'producthunt-launch',
+    name: 'Product Hunt Launch',
+    description: 'Scrape a Product Hunt product page for name, tagline, vote count, topics, and maker details.',
+    targetPattern: /producthunt\.com\/posts\//i,
+    extract($) { // scalar fields are null when nothing matches; `topics` is an array
+      return {
+        name: attr($, 'meta[property="og:title"]', 'content'),
+        tagline: attr($, 'meta[property="og:description"]', 'content'),
+        image: attr($, 'meta[property="og:image"]', 'content'),
+        url: attr($, 'meta[property="og:url"]', 'content'),
+        votes: text($, '[data-test="vote-button"] span') || text($, 'button[data-vote-button]'),
+        topics: list($, 'a[href*="/topics/"]'), // text of every link whose href contains /topics/
+        website: attr($, 'a[data-test="product-link"]', 'href') || attr($, 'a[href][rel="noopener"][target="_blank"]', 'href')
+      };
+    }
+  },
+  {
+    id: 'stackoverflow-question',
+    name: 'Stack Overflow Question',
+    description: 'Scrape a Stack Overflow question for title, body, votes, tags, answers, and accepted answer.',
+    targetPattern: /stackoverflow\.com\/questions\//i,
+    extract($) {
+      const answers = [];
+      $('.answer').each((_, el) => {
+        const $a = $(el);
+        answers.push({
+          votes: $a.find('[itemprop="upvoteCount"]').attr('content') || $a.find('.js-vote-count').text().trim(),
+          accepted: $a.hasClass('accepted-answer'),
+          body: $a.find('.s-prose').first().text().trim().slice(0, 500)
+        });
+      });
+      return {
+        title: text($, '#question-header h1'),
+        body: text($, '.question .s-prose'),
+        votes: text($, '.question .js-vote-count') || attr($, '.question [itemprop="upvoteCount"]', 'content'),
+        views: text($, '.js-view-count') || attr($, 'meta[name="twitter:data1"]', 'content'),
+        tags: list($, '.post-tag'),
+        author: text($, '.question .user-details a'),
+        asked: attr($, '.question time', 'datetime'),
+        answers: answers.slice(0, 5),
+        answered: $('div.accepted-answer').length > 0
+      };
+    }
+  },
+  {
+    id: 'npm-package',
+    name: 'npm Package',
+    description: 'Scrape an npm package page for name, version, description, weekly downloads, license, and dependencies.',
+    targetPattern: /npmjs\.com\/package\//i,
+    extract($) {
+      const scripts = [];
+      $('script[type="application/ld+json"]').each((_, el) => {
+        try { scripts.push(JSON.parse($(el).html())); } catch {}
+      });
+      const ld = scripts[0] || {};
+      return {
+        name: text($, 'h1') || ld.name,
+        version: text($, 'h3[data-testid="package-version-number"]') || text($, '[class*="version"]'),
+        description: attr($, 'meta[name="description"]', 'content') || text($, 'p[class*="description"]'),
+        license: text($, 'span[class*="license"]') || text($, '[data-cy="license"]') || ld.license,
+        weekly_downloads: text($, 'span[class*="weekly-downloads"]') || text($, '[data-cy="downloads"]'),
+        install_command: `npm install ${ld.name || text($, 'h1') || ''}`.trim(),
+        homepage: attr($, 'a[href][class*="homepage"]', 'href'),
+        repository: attr($, 'a[href*="github.com"]', 'href'),
+        maintainers: list($, 'a[href*="/~"]')
+      };
+    }
+  }
+];
+// ── Registry ─────────────────────────────────────────────────────────────────
+export class TemplateRegistry {
+  constructor() {
+    this._templates = new Map(TEMPLATES.map(t => [t.id, t]));
+  }
+  /**
+   * List all registered template IDs and names.
+   * @returns {{ id: string, name: string, description: string }[]}
+   */
+  list() {
+    return TEMPLATES.map(({ id, name, description, targetPattern }) => ({
+      id, name, description,
+      targetPattern: targetPattern.toString()
+    }));
+  }
+  /**
+   * Look up a template by ID.
+   * @param {string} id
+   * @returns {object|undefined}
+   */
+  get(id) {
+    return this._templates.get(id);
+  }
+  /**
+   * Run a template against raw HTML.
+   * @param {string} id     — template ID
+   * @param {string} html   — raw HTML of the target page
+   * @param {string} url    — original URL (for context)
+   * @returns {{ template: string, url: string, data: object, extractedAt: string }}
+   */
+  async run(id, html, url) {
+    const template = this.get(id);
+    if (!template) {
+      throw new Error(`Unknown template: "${id}". Available: ${TEMPLATES.map(t => t.id).join(', ')}`);
+    }
+    const $ = load(html);
+    const data = template.extract($);
+    return {
+      template: id,
+      template_name: template.name,
+      url,
+      data,
+      extractedAt: new Date().toISOString()
+    };
+  }
+}
+export default TemplateRegistry;

package/src/utils/Logger.js CHANGED Viewed

@@ -4,6 +4,7 @@
  */
 import winston from 'winston';
+import { maskSecrets } from './secretMask.js';
 import { fileURLToPath } from 'url';
 import { dirname, join } from 'path';
 import { existsSync, mkdirSync } from 'fs';
@@ -70,7 +71,21 @@ export class Logger {
    * @returns {winston.Format} Winston format
    */
   createFormat(enableJson) {
+    // D2.9: global secret masking format applied first
+    const secretMaskFormat = winston.format((info) => {
+      if (info.metadata) info.metadata = maskSecrets(info.metadata);
+      if (typeof info.message === 'string') {
+        // lightweight heuristic mask on the message string itself
+        info.message = info.message
+          .replace(/(Bearer\s+)\S+/gi, '$1[REDACTED]')
+          .replace(/(api[_-]?key[:=]\s*)\S+/gi, '$1[REDACTED]')
+          .replace(/(x-api-key[:=]\s*)\S+/gi, '$1[REDACTED]');
+      }
+      return info;
+    })();
     const formats = [
+      secretMaskFormat,
       winston.format.timestamp({ format: 'YYYY-MM-DD HH:mm:ss.SSS' }),
       winston.format.errors({ stack: true }),
       winston.format.metadata({ fillExcept: ['message', 'level', 'timestamp', 'service'] })

package/src/utils/htmlToMarkdown.js ADDED Viewed

@@ -0,0 +1,54 @@
+/**
+ * htmlToMarkdown -- thin wrapper around the Turndown HTML-to-Markdown library.
+ *
+ * Usage:
+ *   import { htmlToMarkdown } from '../../utils/htmlToMarkdown.js';
+ *   const md = htmlToMarkdown(rawHtml);
+ *
+ * Design notes:
+ * - Turndown is the most widely-used, battle-tested HTML->Markdown converter.
+ * - We configure it with sensible defaults for RAG workflows:
+ *     headingStyle: 'atx'       -> # H1 / ## H2 instead of underline style
+ *     codeBlockStyle: 'fenced'  -> triple-backtick fences
+ *     bulletListMarker: '-'
+ * - Tables fall back to prose (no GFM plugin loaded by default).
+ */
+import TurndownService from 'turndown';
+let _td = null;
+function getTurndown() {
+  if (_td === null) {
+    _td = new TurndownService({
+      headingStyle: 'atx',
+      codeBlockStyle: 'fenced',
+      bulletListMarker: '-',
+      emDelimiter: '_',
+      strongDelimiter: '**',
+      hr: '---',
+      linkStyle: 'inlined'
+    });
+    // Remove boilerplate elements before converting
+    _td.remove(['script', 'style', 'nav', 'footer', 'aside', 'noscript']);
+  }
+  return _td;
+}
+/**
+ * Convert an HTML string to Markdown.
+ * Returns an empty string if html is falsy.
+ *
+ * @param {string} html
+ * @returns {string}
+ */
+export function htmlToMarkdown(html) {
+  if (!html) return '';
+  try {
+    return getTurndown().turndown(html).trim();
+  } catch {
+    // Fallback: strip tags, return plain text
+    return html.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();
+  }
+}