npm - crawlforge-mcp-server - Versions diffs - 3.4.0 → 4.2.1 - Mend

crawlforge-mcp-server 3.4.0 → 4.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59)

package/README.md +28 -2
package/package.json +6 -4
package/server.js +166 -32
package/src/cli/commands/actions.js +36 -0
package/src/cli/commands/analyze.js +19 -0
package/src/cli/commands/batch.js +45 -0
package/src/cli/commands/crawl.js +30 -0
package/src/cli/commands/extract.js +45 -0
package/src/cli/commands/install-skills.js +46 -0
package/src/cli/commands/llmstxt.js +24 -0
package/src/cli/commands/localize.js +29 -0
package/src/cli/commands/map.js +26 -0
package/src/cli/commands/monitor.js +29 -0
package/src/cli/commands/research.js +26 -0
package/src/cli/commands/scrape.js +37 -0
package/src/cli/commands/search.js +28 -0
package/src/cli/commands/stealth.js +29 -0
package/src/cli/commands/template.js +26 -0
package/src/cli/commands/track.js +24 -0
package/src/cli/commands/uninstall-skills.js +35 -0
package/src/cli/formatter.js +57 -0
package/src/cli/index.js +94 -0
package/src/cli/lib/runTool.js +40 -0
package/src/core/ActionExecutor.js +8 -6
package/src/core/AuthManager.js +103 -3
package/src/core/ChangeTracker.js +34 -0
package/src/core/ElicitationHelper.js +112 -0
package/src/core/JobManager.js +36 -2
package/src/core/LocalizationManager.js +19 -5
package/src/core/PerformanceManager.js +53 -17
package/src/core/ResearchOrchestrator.js +40 -5
package/src/core/SamplingClient.js +191 -0
package/src/core/StealthBrowserManager.js +248 -2
package/src/core/WebhookDispatcher.js +18 -10
package/src/prompts/PromptRegistry.js +199 -0
package/src/resources/ResourceRegistry.js +273 -0
package/src/server/transports/streamableHttp.js +6 -6
package/src/server/withAuth.js +25 -0
package/src/skills/crawlforge-cli.md +157 -0
package/src/skills/crawlforge-mcp.md +80 -0
package/src/skills/crawlforge-research.md +104 -0
package/src/skills/crawlforge-stealth.md +98 -0
package/src/skills/installer.js +141 -0
package/src/tools/advanced/batchScrape/index.js +30 -0
package/src/tools/advanced/batchScrape/schema.js +1 -1
package/src/tools/basic/extractText.js +19 -8
package/src/tools/crawl/crawlDeep.js +27 -0
package/src/tools/extract/extractContent.js +5 -17
package/src/tools/extract/extractStructured.js +8 -0
package/src/tools/extract/extractWithLlm.js +35 -25
package/src/tools/extract/listOllamaModels.js +66 -0
package/src/tools/extract/processDocument.js +7 -1
package/src/tools/extract/summarizeContent.js +17 -0
package/src/tools/research/deepResearch.js +34 -0
package/src/tools/templates/ScrapeTemplateTool.js +68 -0
package/src/tools/templates/TemplateRegistry.js +311 -0
package/src/utils/Logger.js +15 -0
package/src/utils/htmlToMarkdown.js +54 -0
package/src/utils/secretMask.js +86 -0

package/src/tools/extract/extractWithLlm.js CHANGED Viewed

@@ -1,13 +1,22 @@
 /**
  * Extract With LLM MCP Tool
- * Natural-language extraction powered by OpenAI, Anthropic, or a local Ollama model.
- * Mirrors ScrapeGraphAI positioning: describe what you want, get structured JSON back.
+ * Natural-language extraction powered by a local Ollama model (default) or
+ * a cloud provider (OpenAI / Anthropic, explicit opt-in).
  *
- * Cloud providers require OPENAI_API_KEY or ANTHROPIC_API_KEY in environment.
- * Ollama requires no API key — just a running `ollama serve` on http://localhost:11434.
+ * Default: provider 'auto' → Ollama at http://localhost:11434, no API key required.
+ * Pass provider: "openai" | "anthropic" with the matching API key to use a cloud model.
  */
 import { fetchAndParse } from './_fetchAndParse.js';
+// D1.3: SamplingClient for MCP sampling fallback (lazy — only imported if needed)
+let _SamplingClient = null;
+async function getSamplingClient() {
+  if (!_SamplingClient) {
+    const mod = await import('../../core/SamplingClient.js');
+    _SamplingClient = mod.SamplingClient;
+  }
+  return _SamplingClient;
+}
 // ── Constants ─────────────────────────────────────────────────────────────────
@@ -36,34 +45,24 @@ function ollamaBaseUrl() {
  * @returns {{ provider: 'openai'|'anthropic'|'ollama', apiKey: string|null }}
  */
 function resolveProvider(provider) {
-  const anthropicKey = process.env.ANTHROPIC_API_KEY;
-  const openaiKey = process.env.OPENAI_API_KEY;
-  const ollamaOptIn = !!process.env.OLLAMA_BASE_URL;
-  if (provider === 'auto') {
-    if (anthropicKey) return { provider: 'anthropic', apiKey: anthropicKey };
-    if (openaiKey) return { provider: 'openai', apiKey: openaiKey };
-    if (ollamaOptIn) return { provider: 'ollama', apiKey: null };
-    throw new Error(
-      'extract_with_llm requires OPENAI_API_KEY, ANTHROPIC_API_KEY, or OLLAMA_BASE_URL in environment ' +
-      '(or pass provider: "ollama" explicitly to use a local Ollama server)'
-    );
+  if (provider === 'auto' || provider === 'ollama') {
+    // Local Ollama is the default. No API key required; OLLAMA_BASE_URL is
+    // an optional override (defaults to http://localhost:11434).
+    return { provider: 'ollama', apiKey: null };
   }
   if (provider === 'anthropic') {
+    const anthropicKey = process.env.ANTHROPIC_API_KEY;
     if (!anthropicKey) throw new Error('extract_with_llm: ANTHROPIC_API_KEY is not set');
     return { provider: 'anthropic', apiKey: anthropicKey };
   }
   if (provider === 'openai') {
+    const openaiKey = process.env.OPENAI_API_KEY;
     if (!openaiKey) throw new Error('extract_with_llm: OPENAI_API_KEY is not set');
     return { provider: 'openai', apiKey: openaiKey };
   }
-  if (provider === 'ollama') {
-    return { provider: 'ollama', apiKey: null };
-  }
   throw new Error(`extract_with_llm: unknown provider "${provider}"`);
 }
@@ -307,14 +306,25 @@ export class ExtractWithLlm {
     const userMessage = buildUserMessage(prompt, text, schema);
-    // Step 2: First LLM call
-    let rawText, usage;
+    // Step 2: First LLM call — with MCP sampling fallback for the 'auto' and 'ollama' providers
+    // Fallback chain: Ollama → sampling → error (cloud providers are explicit opt-in and get no fallback)
+    let rawText, usage, resolvedModel = model;
     try {
       ({ rawText, usage } = await callLLM({
         provider, apiKey, model, systemMessage, userMessage, maxTokens, schema
       }));
     } catch (llmErr) {
-      return { success: false, error: `LLM call failed: ${llmErr.message}` };
+      // D1.3: If provider is 'auto'/'ollama' and it failed, try sampling as final fallback
+      if (providerParam === 'auto' || providerParam === 'ollama') {
+        try {
+          ({ rawText, usage } = await callViaSampling({ systemMessage, userMessage, maxTokens }));
+          resolvedModel = 'sampling';
+        } catch (samplingErr) {
+          return { success: false, error: `LLM call failed: ${llmErr.message}. Sampling fallback also failed: ${samplingErr.message}` };
+        }
+      } else {
+        return { success: false, error: `LLM call failed: ${llmErr.message}` };
+      }
     }
     // Step 3: Parse JSON; retry once with stricter prompt if it fails
@@ -355,8 +365,8 @@ export class ExtractWithLlm {
     return {
       success: true,
       data: parsed,
-      provider,
-      model,
+      provider: resolvedModel === 'sampling' ? 'sampling' : provider,
+      model: resolvedModel || model,
       usage
     };
   }

package/src/tools/extract/listOllamaModels.js ADDED Viewed

@@ -0,0 +1,66 @@
+/**
+ * List Ollama Models MCP Tool
+ * Returns the models installed on the local Ollama server (GET /api/tags).
+ * Used to discover names that can be passed as the `model` parameter to extract_with_llm.
+ */
+function ollamaBaseUrl() {
+  return (process.env.OLLAMA_BASE_URL || 'http://localhost:11434').replace(/\/$/, '');
+}
+export class ListOllamaModelsTool {
+  async execute() {
+    const baseUrl = ollamaBaseUrl();
+    const url = `${baseUrl}/api/tags`;
+    let response;
+    try {
+      response = await fetch(url, { signal: AbortSignal.timeout(10_000) });
+    } catch (err) {
+      return {
+        success: false,
+        baseUrl,
+        error:
+          `Could not reach Ollama at ${url}: ${err.message}. ` +
+          `Install from https://ollama.com and run "ollama serve".`
+      };
+    }
+    if (!response.ok) {
+      return {
+        success: false,
+        baseUrl,
+        error: `Ollama responded ${response.status} at ${url}. Is "ollama serve" running?`
+      };
+    }
+    let data;
+    try {
+      data = await response.json();
+    } catch (err) {
+      return { success: false, baseUrl, error: `Invalid JSON from Ollama: ${err.message}` };
+    }
+    const models = (data.models || []).map((m) => ({
+      name: m.name,
+      size_bytes: m.size,
+      modified_at: m.modified_at,
+      family: m.details?.family,
+      parameter_size: m.details?.parameter_size,
+      quantization: m.details?.quantization_level
+    }));
+    return {
+      success: true,
+      baseUrl,
+      count: models.length,
+      models,
+      hint:
+        models.length === 0
+          ? 'No models installed. Run "ollama pull llama3.2" (or any model from https://ollama.com/library) in your terminal.'
+          : 'Pass any of these names as the `model` parameter to extract_with_llm.'
+    };
+  }
+}
+export default ListOllamaModelsTool;

package/src/tools/extract/processDocument.js CHANGED Viewed

@@ -8,6 +8,7 @@ import { PDFProcessor } from '../../core/processing/PDFProcessor.js';
 import { ContentProcessor } from '../../core/processing/ContentProcessor.js';
 import { BrowserProcessor } from '../../core/processing/BrowserProcessor.js';
 import { HTMLCleaner, ContentQualityAssessor } from '../../utils/contentUtils.js';
+import { htmlToMarkdown } from '../../utils/htmlToMarkdown.js'; // D3.1
 const ProcessDocumentSchema = z.object({
   source: z.string().min(1),
@@ -28,7 +29,7 @@ const ProcessDocumentSchema = z.object({
     // Processing options
     assessContentQuality: z.boolean().default(true),
     includeStatistics: z.boolean().default(true),
-    outputFormat: z.enum(['text', 'structured', 'full']).default('structured'),
+    outputFormat: z.enum(['text', 'structured', 'full', 'markdown']).default('structured'),
     // Content filtering
     minContentLength: z.number().min(0).default(50),
@@ -328,6 +329,11 @@ export class ProcessDocumentTool {
       result.content.html = html;
     }
+    // D3.1: Markdown output mode — convert extracted HTML to markdown via Turndown
+    if (options.outputFormat === 'markdown') {
+      result.content.markdown = htmlToMarkdown(extractedContent || html);
+    }
     // Step 4: Set metadata
     if (processingResult.metadata) {
       result.metadata = {

package/src/tools/extract/summarizeContent.js CHANGED Viewed

@@ -4,6 +4,15 @@
  */
 import { z } from 'zod';
+// D1.3: lazy SamplingClient for abstractive mode when no LLM keys are set
+let _SamplingClient = null;
+async function getSamplingClient() {
+  if (!_SamplingClient) {
+    const mod = await import('../../core/SamplingClient.js');
+    _SamplingClient = mod.SamplingClient;
+  }
+  return _SamplingClient;
+}
 import { ContentAnalyzer } from '../../core/analysis/ContentAnalyzer.js';
 import { splitSentences } from '../../core/analysis/sentenceUtils.js';
@@ -122,6 +131,14 @@ export class SummarizeContentTool {
       // Step 2: Set summary result
       result.summary = analysisResult.summary;
+      // D1.3: If abstractive mode requested, attempt sampling-based enhancement
+      if (options.summaryType === 'abstractive') {
+        const abstractive = await this._abstractiveSummaryViaSampling(text, analysisResult.summary, options.summaryLength);
+        if (abstractive) {
+          result.summary = abstractive;
+        }
+      }
       // Step 3: Extract key points if requested
       if (options.includeKeypoints) {
         result.keypoints = await this.extractKeyPoints(text, analysisResult.summary);

package/src/tools/research/deepResearch.js CHANGED Viewed

@@ -1,4 +1,6 @@
 import { z } from 'zod';
+// D1.4: Elicitation helper (injected from server.js or can be used standalone)
+import { ElicitationHelper } from '../../core/ElicitationHelper.js';
 import { ResearchOrchestrator } from '../../core/ResearchOrchestrator.js';
 import { Logger } from '../../utils/Logger.js';
@@ -93,6 +95,17 @@ export class DeepResearchTool {
       cacheTTL,
       ...orchestratorOptions
     };
+    // D1.4: Elicitation helper (set mcpServer via setMcpServer() after instantiation)
+    this._elicitation = new ElicitationHelper({});
+  }
+  /**
+   * D1.4: Set the MCP server instance for elicitation support.
+   * Call this from server.js after instantiating DeepResearchTool.
+   * Replaces the ElicitationHelper that the constructor created without a
+   * server (`new ElicitationHelper({})`) with one bound to `mcpServer`.
+   * @param {object} mcpServer - Server instance passed to ElicitationHelper as `mcpServer`.
+   * @returns {void}
+   */
+  setMcpServer(mcpServer) {
+    this._elicitation = new ElicitationHelper({ mcpServer });
   }
@@ -116,6 +129,27 @@ export class DeepResearchTool {
         };
       }
+      // D1.4: Elicitation — warn user if projected cost exceeds 50 credits
+      // deep_research costs approximately 1 credit per URL; maxUrls > 50 → confirm
+      if (validated.maxUrls > 50) {
+        const projectedCredits = validated.maxUrls;
+        const proceed = await this._elicitation.confirm(
+          `deep_research will scan up to ${validated.maxUrls} URLs, projecting ~${projectedCredits} credits.`,
+          {
+            topic: validated.topic,
+            projected_credits: projectedCredits,
+            max_urls: validated.maxUrls,
+          }
+        );
+        if (!proceed) {
+          return {
+            success: false,
+            error: 'Research cancelled by user before starting (elicitation declined).',
+            sessionId,
+          };
+        }
+      }
       // Configure research orchestrator based on research approach
       const orchestratorConfig = this.buildOrchestratorConfig(validated);
       const orchestrator = new ResearchOrchestrator(orchestratorConfig);

package/src/tools/templates/ScrapeTemplateTool.js ADDED Viewed

@@ -0,0 +1,68 @@
+/**
+ * ScrapeTemplateTool — wraps TemplateRegistry to expose the `scrape_template` MCP tool.
+ *
+ * Usage pattern (D3.3):
+ *   const tool = new ScrapeTemplateTool();
+ *   const result = await tool.execute({ template: "github-repo", url: "https://github.com/user/repo" });
+ */
+import { TemplateRegistry } from './TemplateRegistry.js';
+export class ScrapeTemplateTool {
+  constructor() {
+    this.registry = new TemplateRegistry();
+  }
+  /**
+   * Execute the scrape_template tool.
+   * @param {{ template: string, url: string, timeout?: number }} params
+   * @returns {Promise<object>}
+   */
+  async execute({ template, url, timeout = 15000 }) {
+    // list mode — return available templates without scraping
+    if (template === 'list' || !url) {
+      return {
+        templates: this.registry.list(),
+        count: this.registry.list().length
+      };
+    }
+    // Validate template exists before making network call
+    const tpl = this.registry.get(template);
+    if (!tpl) {
+      const available = this.registry.list().map(t => t.id).join(', ');
+      throw new Error(`Unknown template "${template}". Available templates: ${available}`);
+    }
+    // Fetch the page
+    const controller = new AbortController();
+    const timeoutId = setTimeout(() => controller.abort(), timeout);
+    let html;
+    try {
+      const response = await fetch(url, {
+        signal: controller.signal,
+        headers: {
+          'User-Agent': 'Mozilla/5.0 (compatible; CrawlForge-TemplateScraper/4.0)'
+        }
+      });
+      clearTimeout(timeoutId);
+      if (!response.ok) {
+        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
+      }
+      html = await response.text();
+    } catch (error) {
+      clearTimeout(timeoutId);
+      if (error.name === 'AbortError') {
+        throw new Error(`Request timeout after ${timeout}ms`);
+      }
+      throw error;
+    }
+    // Run the template extractor
+    const result = await this.registry.run(template, html, url);
+    return result;
+  }
+}
+export default ScrapeTemplateTool;

package/src/tools/templates/TemplateRegistry.js ADDED Viewed

@@ -0,0 +1,311 @@
+/**
+ * TemplateRegistry — pre-built scraping templates for popular sites (D3.3).
+ *
+ * Each template is a self-contained object with:
+ *   id            — unique slug used as the `template` parameter
+ *   name          — human-readable name
+ *   description   — when to use this template
+ *   targetPattern — regex matching URLs this template handles
+ *   extract       — function($) → Object; receives the Cheerio root for the page
+ *                   and returns the extracted fields
+ *
+ * Templates do NOT make network calls.  The ScrapeTemplateTool fetches the
+ * page and hands the raw HTML to TemplateRegistry.run(), which parses it with
+ * Cheerio and passes the result to the template's extract() method.
+ */
+import { load } from 'cheerio';
+// ── Helpers ──────────────────────────────────────────────────────────────────
+function text($, sel) {
+  return $(sel).first().text().trim() || null;
+}
+function attr($, sel, attribute) {
+  return $(sel).first().attr(attribute) || null;
+}
+function list($, sel) {
+  return $(sel).map((_, el) => $(el).text().trim()).get().filter(Boolean);
+}
+function listAttr($, sel, attribute) {
+  return $(sel).map((_, el) => $(el).attr(attribute)).get().filter(Boolean);
+}
+// ── Template definitions ─────────────────────────────────────────────────────
+const TEMPLATES = [
+  {
+    id: 'amazon-product',
+    name: 'Amazon Product',
+    description: 'Scrape an Amazon product page for title, price, rating, reviews, ASIN, and description.',
+    targetPattern: /amazon\.(com|co\.uk|de|fr|jp|ca|com\.au)/i,
+    extract($) {
+      return {
+        title: text($, '#productTitle'),
+        price: text($, '.a-price .a-offscreen') || text($, '#priceblock_ourprice') || text($, '#priceblock_dealprice'),
+        currency: attr($, 'meta[itemprop="priceCurrency"]', 'content'),
+        rating: text($, '#acrPopover .a-size-base'),
+        review_count: text($, '#acrCustomerReviewText'),
+        asin: text($, 'input#ASIN') || attr($, 'input[name="ASIN"]', 'value'),
+        brand: text($, '#bylineInfo'),
+        description: text($, '#productDescription p') || text($, '#feature-bullets'),
+        images: listAttr($, '#altImages img.a-thumbnail-image', 'src').slice(0, 8),
+        availability: text($, '#availability span'),
+        category_breadcrumb: list($, '#wayfinding-breadcrumbs_feature_div a')
+      };
+    }
+  },
+  {
+    id: 'linkedin-profile',
+    name: 'LinkedIn Profile',
+    description: 'Scrape a LinkedIn public profile for name, headline, location, and about section.',
+    targetPattern: /linkedin\.com\/in\//i,
+    extract($) {
+      return {
+        name: text($, 'h1') || text($, '.top-card-layout__title'),
+        headline: text($, '.top-card-layout__headline') || text($, 'h2'),
+        location: text($, '.top-card-layout__first-subline') || text($, '.profile-info-subheader'),
+        about: text($, '.core-section-container__content p') || text($, '.summary'),
+        connections: text($, '.top-card__connections'),
+        current_company: text($, '.top-card-layout__card-inner-full-width .top-card-link'),
+        note: 'LinkedIn requires authentication for full profiles. This template works on public profile pages only.'
+      };
+    }
+  },
+  {
+    id: 'github-repo',
+    name: 'GitHub Repository',
+    description: 'Scrape a GitHub repository page for stars, forks, description, language, topics, and README summary.',
+    targetPattern: /github\.com\/[^/]+\/[^/]+\/?$/i,
+    extract($) {
+      return {
+        name: text($, 'strong[itemprop="name"] a') || text($, '.repository-content h1'),
+        description: attr($, 'meta[property="og:description"]', 'content') || text($, 'p.f4.my-3'),
+        stars: text($, '#repo-stars-counter-star') || text($, '[aria-label*="stargazers"]'),
+        forks: text($, '#repo-network-counter') || text($, '[aria-label*="forks"]'),
+        watchers: text($, '[aria-label*="watchers"]'),
+        language: text($, 'span[itemprop="programmingLanguage"]') || text($, '.d-inline-flex[class*="language"]'),
+        topics: list($, 'a.topic-tag'),
+        license: text($, 'a[href*="blob/"][href*="LICENSE"]') || text($, '.octicon-law ~ span'),
+        last_updated: attr($, 'relative-time', 'datetime'),
+        homepage: attr($, 'a[href][rel="noopener noreferrer"]', 'href'),
+        open_issues: text($, '.Counter[aria-label*="issue"]')
+      };
+    }
+  },
+  {
+    id: 'youtube-video',
+    name: 'YouTube Video',
+    description: 'Scrape a YouTube video page for title, channel, views, likes, publish date, and description.',
+    targetPattern: /youtube\.com\/watch/i,
+    extract($) {
+      return {
+        title: attr($, 'meta[name="title"]', 'content') || attr($, 'meta[property="og:title"]', 'content'),
+        channel: attr($, 'link[itemprop="name"]', 'content') || text($, '#channel-name'),
+        channel_url: attr($, 'span[itemprop="author"] link[itemprop="url"]', 'href'),
+        views: attr($, 'meta[itemprop="interactionCount"]', 'content'),
+        published: attr($, 'meta[itemprop="uploadDate"]', 'content') || attr($, 'meta[itemprop="datePublished"]', 'content'),
+        description: attr($, 'meta[property="og:description"]', 'content'),
+        thumbnail: attr($, 'meta[property="og:image"]', 'content'),
+        duration: attr($, 'meta[itemprop="duration"]', 'content'),
+        video_id: new URL($('link[rel="canonical"]').attr('href') || 'https://youtube.com').searchParams.get('v')
+      };
+    }
+  },
+  {
+    id: 'tweet',
+    name: 'Tweet / X Post',
+    description: 'Scrape a tweet/X post for text, author, timestamp, likes, and retweets from the Open Graph / structured data.',
+    targetPattern: /(twitter|x)\.com\/[^/]+\/status\//i,
+    extract($) {
+      return {
+        text: attr($, 'meta[property="og:description"]', 'content'),
+        author: attr($, 'meta[property="og:title"]', 'content'),
+        url: attr($, 'meta[property="og:url"]', 'content') || attr($, 'link[rel="canonical"]', 'href'),
+        image: attr($, 'meta[property="og:image"]', 'content'),
+        note: 'X.com requires JavaScript rendering for full tweet data. Structured metadata is returned from static HTML.'
+      };
+    }
+  },
+  {
+    id: 'reddit-thread',
+    name: 'Reddit Thread',
+    description: 'Scrape a Reddit thread for title, subreddit, score, comment count, author, and top-level comments.',
+    targetPattern: /reddit\.com\/r\/[^/]+\/comments\//i,
+    extract($) {
+      return {
+        title: attr($, 'meta[property="og:title"]', 'content') || text($, 'h1'),
+        subreddit: text($, 'a[href*="/r/"][class*="subreddit"]') || (($('title').text().match(/r\/([^•]+)/) || [])[1] || '').trim(),
+        score: text($, '[data-score]') || attr($, '[itemprop="upvoteCount"]', 'content'),
+        author: text($, 'a[href*="/user/"]'),
+        posted: attr($, 'time[datetime]', 'datetime'),
+        body: text($, 'div[data-click-id="text"] p') || attr($, 'meta[property="og:description"]', 'content'),
+        url: attr($, 'meta[property="og:url"]', 'content'),
+        flair: text($, '[class*="flair"]')
+      };
+    }
+  },
+  {
+    id: 'hacker-news-front-page',
+    name: 'Hacker News Front Page',
+    description: 'Scrape the Hacker News front page for a list of stories with title, URL, score, and comment count.',
+    targetPattern: /news\.ycombinator\.com(\/news)?$/i,
+    extract($) {
+      const stories = [];
+      $('tr.athing').each((_, el) => {
+        const $row = $(el);
+        const $score = $row.next('.spacer').find('.score');
+        const $subtext = $row.next('.spacer').find('.subtext');
+        const $titleLink = $row.find('.titleline > a');
+        stories.push({
+          id: $row.attr('id'),
+          title: $titleLink.text().trim(),
+          url: $titleLink.attr('href'),
+          site: $row.find('.sitebit a').text().trim() || null,
+          score: $score.text().replace(' points', '').trim() || null,
+          author: $subtext.find('.hnuser').text().trim() || null,
+          posted: $subtext.find('.age a').attr('href') || null,
+          comments: $subtext.find('a[href*="item"]').last().text().trim() || null
+        });
+      });
+      return { stories: stories.slice(0, 30), scraped_at: new Date().toISOString() };
+    }
+  },
+  {
+    id: 'producthunt-launch',
+    name: 'Product Hunt Launch',
+    description: 'Scrape a Product Hunt product page for name, tagline, vote count, topics, and maker details.',
+    targetPattern: /producthunt\.com\/posts\//i,
+    extract($) {
+      return {
+        name: attr($, 'meta[property="og:title"]', 'content'),
+        tagline: attr($, 'meta[property="og:description"]', 'content'),
+        image: attr($, 'meta[property="og:image"]', 'content'),
+        url: attr($, 'meta[property="og:url"]', 'content'),
+        votes: text($, '[data-test="vote-button"] span') || text($, 'button[data-vote-button]'),
+        topics: list($, 'a[href*="/topics/"]'),
+        website: attr($, 'a[data-test="product-link"]', 'href') || attr($, 'a[href][rel="noopener"][target="_blank"]', 'href')
+      };
+    }
+  },
+  {
+    id: 'stackoverflow-question',
+    name: 'Stack Overflow Question',
+    description: 'Scrape a Stack Overflow question for title, body, votes, tags, answers, and accepted answer.',
+    targetPattern: /stackoverflow\.com\/questions\//i,
+    extract($) {
+      const answers = [];
+      $('.answer').each((_, el) => {
+        const $a = $(el);
+        answers.push({
+          votes: $a.find('[itemprop="upvoteCount"]').attr('content') || $a.find('.js-vote-count').text().trim(),
+          accepted: $a.hasClass('accepted-answer'),
+          body: $a.find('.s-prose').first().text().trim().slice(0, 500)
+        });
+      });
+      return {
+        title: text($, '#question-header h1'),
+        body: text($, '.question .s-prose'),
+        votes: text($, '.question .js-vote-count') || attr($, '.question [itemprop="upvoteCount"]', 'content'),
+        views: text($, '.js-view-count') || attr($, 'meta[name="twitter:data1"]', 'content'),
+        tags: list($, '.post-tag'),
+        author: text($, '.question .user-details a'),
+        asked: attr($, '.question time', 'datetime'),
+        answers: answers.slice(0, 5),
+        answered: $('div.accepted-answer').length > 0
+      };
+    }
+  },
+  {
+    id: 'npm-package',
+    name: 'npm Package',
+    description: 'Scrape an npm package page for name, version, description, weekly downloads, license, and dependencies.',
+    targetPattern: /npmjs\.com\/package\//i,
+    extract($) {
+      const scripts = [];
+      $('script[type="application/ld+json"]').each((_, el) => {
+        try { scripts.push(JSON.parse($(el).html())); } catch {}
+      });
+      const ld = scripts[0] || {};
+      return {
+        name: text($, 'h1') || ld.name,
+        version: text($, 'h3[data-testid="package-version-number"]') || text($, '[class*="version"]'),
+        description: attr($, 'meta[name="description"]', 'content') || text($, 'p[class*="description"]'),
+        license: text($, 'span[class*="license"]') || text($, '[data-cy="license"]') || ld.license,
+        weekly_downloads: text($, 'span[class*="weekly-downloads"]') || text($, '[data-cy="downloads"]'),
+        install_command: `npm install ${ld.name || text($, 'h1') || ''}`.trim(),
+        homepage: attr($, 'a[href][class*="homepage"]', 'href'),
+        repository: attr($, 'a[href*="github.com"]', 'href'),
+        maintainers: list($, 'a[href*="/~"]')
+      };
+    }
+  }
+];
+// ── Registry ─────────────────────────────────────────────────────────────────
+export class TemplateRegistry {
+  constructor() {
+    this._templates = new Map(TEMPLATES.map(t => [t.id, t]));
+  }
+  /**
+   * List all registered template IDs and names.
+   * @returns {{ id: string, name: string, description: string }[]}
+   */
+  list() {
+    return TEMPLATES.map(({ id, name, description, targetPattern }) => ({
+      id, name, description,
+      targetPattern: targetPattern.toString()
+    }));
+  }
+  /**
+   * Look up a template by ID.
+   * @param {string} id
+   * @returns {object|undefined}
+   */
+  get(id) {
+    return this._templates.get(id);
+  }
+  /**
+   * Run a template against raw HTML.
+   * @param {string} id     — template ID
+   * @param {string} html   — raw HTML of the target page
+   * @param {string} url    — original URL (for context)
+   * @returns {{ template: string, url: string, data: object, extractedAt: string }}
+   */
+  async run(id, html, url) {
+    const template = this.get(id);
+    if (!template) {
+      throw new Error(`Unknown template: "${id}". Available: ${TEMPLATES.map(t => t.id).join(', ')}`);
+    }
+    const $ = load(html);
+    const data = template.extract($);
+    return {
+      template: id,
+      template_name: template.name,
+      url,
+      data,
+      extractedAt: new Date().toISOString()
+    };
+  }
+}
+export default TemplateRegistry;