npm - crawlforge-mcp-server - Versions diffs - 3.5.1 → 4.2.2 - Mend

crawlforge-mcp-server 3.5.1 → 4.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

package/package.json +6 -4
package/server.js +138 -26
package/src/cli/commands/actions.js +36 -0
package/src/cli/commands/analyze.js +19 -0
package/src/cli/commands/batch.js +45 -0
package/src/cli/commands/crawl.js +30 -0
package/src/cli/commands/extract.js +45 -0
package/src/cli/commands/install-skills.js +46 -0
package/src/cli/commands/llmstxt.js +24 -0
package/src/cli/commands/localize.js +29 -0
package/src/cli/commands/map.js +26 -0
package/src/cli/commands/monitor.js +29 -0
package/src/cli/commands/research.js +26 -0
package/src/cli/commands/scrape.js +37 -0
package/src/cli/commands/search.js +28 -0
package/src/cli/commands/stealth.js +29 -0
package/src/cli/commands/template.js +26 -0
package/src/cli/commands/track.js +24 -0
package/src/cli/commands/uninstall-skills.js +35 -0
package/src/cli/formatter.js +57 -0
package/src/cli/index.js +94 -0
package/src/cli/lib/runTool.js +40 -0
package/src/core/ActionExecutor.js +8 -6
package/src/core/AuthManager.js +103 -3
package/src/core/ChangeTracker.js +34 -0
package/src/core/ElicitationHelper.js +112 -0
package/src/core/JobManager.js +36 -2
package/src/core/LocalizationManager.js +19 -5
package/src/core/PerformanceManager.js +53 -17
package/src/core/ResearchOrchestrator.js +40 -5
package/src/core/SamplingClient.js +191 -0
package/src/core/StealthBrowserManager.js +248 -2
package/src/core/WebhookDispatcher.js +18 -10
package/src/prompts/PromptRegistry.js +199 -0
package/src/resources/ResourceRegistry.js +273 -0
package/src/server/withAuth.js +25 -0
package/src/skills/crawlforge-cli.md +157 -0
package/src/skills/crawlforge-mcp.md +80 -0
package/src/skills/crawlforge-research.md +104 -0
package/src/skills/crawlforge-stealth.md +98 -0
package/src/skills/installer.js +141 -0
package/src/tools/advanced/batchScrape/index.js +30 -0
package/src/tools/advanced/batchScrape/schema.js +1 -1
package/src/tools/basic/extractText.js +19 -8
package/src/tools/crawl/crawlDeep.js +27 -0
package/src/tools/extract/extractContent.js +5 -17
package/src/tools/extract/extractStructured.js +8 -0
package/src/tools/extract/extractWithLlm.js +25 -5
package/src/tools/extract/processDocument.js +7 -1
package/src/tools/extract/summarizeContent.js +17 -0
package/src/tools/research/deepResearch.js +34 -0
package/src/tools/templates/ScrapeTemplateTool.js +68 -0
package/src/tools/templates/TemplateRegistry.js +311 -0
package/src/utils/Logger.js +15 -0
package/src/utils/htmlToMarkdown.js +54 -0
package/src/utils/secretMask.js +86 -0

package/src/prompts/PromptRegistry.js ADDED Viewed

@@ -0,0 +1,199 @@
+/**
+ * PromptRegistry — MCP Prompts for CrawlForge
+ * Pre-defined workflows as MCP prompts the client can list and invoke.
+ */
+export const PROMPTS = [
+  {
+    name: 'competitive-analysis',
+    description: 'Analyze competitor websites against your own to surface positioning, feature gaps, and SEO differences.',
+    arguments: [
+      { name: 'competitor_urls', description: 'Comma-separated list of competitor URLs to analyze', required: true },
+      { name: 'our_url', description: 'Your website URL for comparison', required: true },
+    ],
+  },
+  {
+    name: 'monitor-changes',
+    description: 'Set up continuous monitoring for content changes on a URL with webhook notifications.',
+    arguments: [
+      { name: 'url', description: 'URL to monitor for changes', required: true },
+      { name: 'interval', description: 'Check interval in seconds (default: 3600)', required: false },
+      { name: 'webhook', description: 'Webhook URL for change notifications', required: false },
+    ],
+  },
+  {
+    name: 'rag-ingest',
+    description: 'Scrape and convert one or more URLs into clean markdown suitable for RAG ingestion pipelines.',
+    arguments: [
+      { name: 'urls', description: 'Comma-separated list of URLs to ingest', required: true },
+      { name: 'output_format', description: 'Output format: markdown (default) or text', required: false },
+    ],
+  },
+  {
+    name: 'site-audit',
+    description: 'Comprehensive site audit: discovers all pages, extracts metadata, and generates an llms.txt summary.',
+    arguments: [
+      { name: 'url', description: 'Website URL to audit', required: true },
+    ],
+  },
+  {
+    name: 'research-deep-dive',
+    description: 'Conduct exhaustive multi-source research on a topic with synthesis, conflict detection, and citations.',
+    arguments: [
+      { name: 'topic', description: 'Research topic or question', required: true },
+      { name: 'depth', description: 'Research depth: shallow | medium | deep (default: medium)', required: false },
+    ],
+  },
+];
+/**
+ * Generate the prompt messages for a given prompt name and arguments.
+ * @param {string} name
+ * @param {Record<string, string>} args
+ * @returns {{ messages: Array<{ role: string, content: { type: string, text: string } }> }}
+ */
+export function getPromptMessages(name, args = {}) {
+  switch (name) {
+    case 'competitive-analysis': {
+      const competitors = args.competitor_urls || '';
+      const ourUrl = args.our_url || '';
+      return {
+        messages: [{
+          role: 'user',
+          content: {
+            type: 'text',
+            text: `Conduct a comprehensive competitive analysis.
+Our website: ${ourUrl}
+Competitors: ${competitors}
+Steps to follow:
+1. Use fetch_url or extract_content on each competitor URL and our URL.
+2. Use extract_metadata on all URLs to compare titles, descriptions, and keywords.
+3. Use analyze_content to surface content quality, topics, and tone differences.
+4. Use map_site on each domain to compare site structure and depth.
+5. Summarize: positioning gaps, feature differences, SEO opportunities, and recommended actions.
+Return a structured report with sections: Overview, Competitor Profiles, Gap Analysis, Recommendations.`,
+          },
+        }],
+      };
+    }
+    case 'monitor-changes': {
+      const url = args.url || '';
+      const interval = args.interval || '3600';
+      const webhook = args.webhook || '';
+      return {
+        messages: [{
+          role: 'user',
+          content: {
+            type: 'text',
+            text: `Set up change monitoring for: ${url}
+Configuration:
+- Check interval: ${interval} seconds
+- Webhook URL: ${webhook || '(none — report changes inline)'}
+Steps:
+1. Use track_changes with the URL to establish a baseline snapshot.
+2. Configure the check interval and webhook if provided.
+3. Report back the monitoring session ID and confirm setup.
+4. If no webhook is provided, describe how to retrieve changes later using track_changes.`,
+          },
+        }],
+      };
+    }
+    case 'rag-ingest': {
+      const urls = args.urls || '';
+      const outputFormat = args.output_format || 'markdown';
+      return {
+        messages: [{
+          role: 'user',
+          content: {
+            type: 'text',
+            text: `Ingest the following URLs for RAG (Retrieval-Augmented Generation):
+URLs: ${urls}
+Output format: ${outputFormat}
+Steps:
+1. Use batch_scrape with the URL list to fetch all pages in parallel.
+2. Use extract_content on each result to extract clean, readable content.
+3. Convert content to ${outputFormat} format — remove navigation, ads, and boilerplate.
+4. Return each document with: URL, title, word count, and clean ${outputFormat} body.
+5. Flag any URLs that failed to load.
+The output should be ready for chunking and embedding.`,
+          },
+        }],
+      };
+    }
+    case 'site-audit': {
+      const url = args.url || '';
+      return {
+        messages: [{
+          role: 'user',
+          content: {
+            type: 'text',
+            text: `Perform a comprehensive site audit for: ${url}
+Steps:
+1. Use map_site to discover all pages and site structure.
+2. Use extract_metadata on the homepage and top-level pages.
+3. Use generate_llms_txt to produce the site's AI-readable summary.
+4. Use analyze_content on the homepage to assess content quality and topics.
+5. Report:
+   - Total pages discovered
+   - Site structure overview
+   - Metadata completeness (missing titles, descriptions)
+   - Content quality assessment
+   - llms.txt summary
+   - Recommendations for improvement`,
+          },
+        }],
+      };
+    }
+    case 'research-deep-dive': {
+      const topic = args.topic || '';
+      const depth = args.depth || 'medium';
+      const depthConfig = {
+        shallow: { maxUrls: 20, maxDepth: 3 },
+        medium: { maxUrls: 50, maxDepth: 5 },
+        deep: { maxUrls: 150, maxDepth: 8 },
+      };
+      const cfg = depthConfig[depth] || depthConfig.medium;
+      return {
+        messages: [{
+          role: 'user',
+          content: {
+            type: 'text',
+            text: `Conduct a deep research investigation on the following topic:
+Topic: ${topic}
+Depth: ${depth} (max ${cfg.maxUrls} sources, depth ${cfg.maxDepth})
+Steps:
+1. Use deep_research with topic="${topic}", maxUrls=${cfg.maxUrls}, maxDepth=${cfg.maxDepth}.
+2. If deep_research returns raw evidence (no synthesis), synthesize it yourself:
+   - Group findings by sub-topic
+   - Identify agreements and conflicts between sources
+   - Rank sources by credibility
+3. Return a structured report with:
+   - Executive Summary
+   - Key Findings (with citations)
+   - Conflicting Information (if any)
+   - Source Quality Assessment
+   - Confidence Level and Gaps`,
+          },
+        }],
+      };
+    }
+    default:
+      throw new Error(`Unknown prompt: ${name}`);
+  }
+}

package/src/resources/ResourceRegistry.js ADDED Viewed

@@ -0,0 +1,273 @@
+/**
+ * ResourceRegistry — MCP Resources for CrawlForge
+ * URI scheme: crawlforge://<type>/<id>
+ * Exposes long-lived artifacts produced by tools as MCP Resources.
+ */
+import { createHash } from 'crypto';
+/**
+ * Supported resource types and their MIME types.
+ */
+const RESOURCE_MIME = {
+  research: 'application/json',
+  snapshot: 'text/html',
+  job: 'application/json',
+  crawl: 'application/json',
+  screenshot: 'image/png',
+};
+/**
+ * Parse a crawlforge:// URI into its components.
+ * @param {string} uri
+ * @returns {{ type: string, parts: string[] } | null}
+ */
+export function parseResourceUri(uri) {
+  if (!uri || !uri.startsWith('crawlforge://')) return null;
+  const rest = uri.slice('crawlforge://'.length);
+  const [type, ...parts] = rest.split('/');
+  if (!type || !RESOURCE_MIME[type]) return null;
+  return { type, parts };
+}
+/**
+ * Generate a URL hash for snapshot URIs.
+ * @param {string} url
+ * @returns {string}
+ */
+export function hashUrl(url) {
+  return createHash('sha256').update(url).digest('hex').slice(0, 16);
+}
+export class ResourceRegistry {
+  constructor({ researchOrchestrator, snapshotManager, jobManager, mapSiteTool, scrapeWithActionsTool } = {}) {
+    this.researchOrchestrator = researchOrchestrator || null;
+    this.snapshotManager = snapshotManager || null;
+    this.jobManager = jobManager || null;
+    this.mapSiteTool = mapSiteTool || null;
+    this.scrapeWithActionsTool = scrapeWithActionsTool || null;
+    // In-memory stores for lightweight resource tracking
+    /** @type {Map<string, { data: any, createdAt: number, ttl: number }>} */
+    this._crawlSitemaps = new Map(); // sessionId -> sitemap
+    /** @type {Map<string, { data: Buffer, createdAt: number, ttl: number }>} */
+    this._screenshots = new Map(); // actionId -> PNG buffer
+    // Default TTL: 1 hour
+    this.defaultTtl = 60 * 60 * 1000;
+  }
+  /**
+   * Store a crawl sitemap result for later retrieval.
+   * @param {string} sessionId
+   * @param {object} sitemapData
+   */
+  storeCrawlSitemap(sessionId, sitemapData) {
+    this._crawlSitemaps.set(sessionId, {
+      data: sitemapData,
+      createdAt: Date.now(),
+      ttl: this.defaultTtl,
+    });
+  }
+  /**
+   * Store a screenshot for later retrieval.
+   * @param {string} actionId
+   * @param {Buffer|string} screenshotData - PNG buffer or base64 string
+   */
+  storeScreenshot(actionId, screenshotData) {
+    const buf = Buffer.isBuffer(screenshotData)
+      ? screenshotData
+      : Buffer.from(screenshotData, 'base64');
+    this._screenshots.set(actionId, {
+      data: buf,
+      createdAt: Date.now(),
+      ttl: this.defaultTtl,
+    });
+  }
+  /**
+   * List all available resources.
+   * @returns {Array<{ uri: string, name: string, description: string, mimeType: string }>}
+   */
+  listResources() {
+    const resources = [];
+    const now = Date.now();
+    // Research sessions
+    if (this.researchOrchestrator?.activeSessions) {
+      for (const [sessionId] of this.researchOrchestrator.activeSessions) {
+        resources.push({
+          uri: `crawlforge://research/${sessionId}`,
+          name: `Research Session ${sessionId}`,
+          description: 'Completed deep_research report',
+          mimeType: RESOURCE_MIME.research,
+        });
+      }
+    }
+    // Snapshots — list recent ones from SnapshotManager if available
+    if (this.snapshotManager?.snapshots) {
+      for (const [id, snap] of this.snapshotManager.snapshots) {
+        const urlHash = hashUrl(snap.url || id);
+        const ts = snap.metadata?.timestamp || snap.createdAt || now;
+        resources.push({
+          uri: `crawlforge://snapshot/${urlHash}/${ts}`,
+          name: `Snapshot ${urlHash}`,
+          description: `Snapshot of ${snap.url || id}`,
+          mimeType: RESOURCE_MIME.snapshot,
+        });
+      }
+    }
+    // Jobs — completed/failed only
+    if (this.jobManager?.jobs) {
+      for (const [jobId, job] of this.jobManager.jobs) {
+        if (job.status === 'completed' || job.status === 'failed') {
+          resources.push({
+            uri: `crawlforge://job/${jobId}`,
+            name: `Job ${jobId}`,
+            description: `Batch scrape job (${job.status})`,
+            mimeType: RESOURCE_MIME.job,
+          });
+        }
+      }
+    }
+    // Crawl sitemaps
+    for (const [sessionId, entry] of this._crawlSitemaps) {
+      if (now - entry.createdAt < entry.ttl) {
+        resources.push({
+          uri: `crawlforge://crawl/${sessionId}/sitemap`,
+          name: `Crawl Sitemap ${sessionId}`,
+          description: 'map_site output for a crawl session',
+          mimeType: RESOURCE_MIME.crawl,
+        });
+      }
+    }
+    // Screenshots
+    for (const [actionId, entry] of this._screenshots) {
+      if (now - entry.createdAt < entry.ttl) {
+        resources.push({
+          uri: `crawlforge://screenshot/${actionId}`,
+          name: `Screenshot ${actionId}`,
+          description: 'Screenshot from scrape_with_actions',
+          mimeType: RESOURCE_MIME.screenshot,
+        });
+      }
+    }
+    return resources;
+  }
+  /**
+   * Read a specific resource by URI.
+   * @param {string} uri
+   * @returns {{ contents: Array<{ uri: string, mimeType: string, text?: string, blob?: string }> }}
+   */
+  async readResource(uri) {
+    const parsed = parseResourceUri(uri);
+    if (!parsed) {
+      throw new Error(`Unknown resource URI: ${uri}`);
+    }
+    const { type, parts } = parsed;
+    if (type === 'research') {
+      return this._readResearch(uri, parts[0]);
+    }
+    if (type === 'snapshot') {
+      return this._readSnapshot(uri, parts[0], parts[1]);
+    }
+    if (type === 'job') {
+      return this._readJob(uri, parts[0]);
+    }
+    if (type === 'crawl') {
+      // parts: [sessionId, 'sitemap']
+      return this._readCrawlSitemap(uri, parts[0]);
+    }
+    if (type === 'screenshot') {
+      return this._readScreenshot(uri, parts[0]);
+    }
+    throw new Error(`Resource type not implemented: ${type}`);
+  }
+  async _readResearch(uri, sessionId) {
+    const session = this.researchOrchestrator?.activeSessions?.get(sessionId);
+    if (!session) {
+      throw new Error(`Research session not found: ${sessionId}`);
+    }
+    return {
+      contents: [{
+        uri,
+        mimeType: RESOURCE_MIME.research,
+        text: JSON.stringify(session, null, 2),
+      }],
+    };
+  }
+  async _readSnapshot(uri, urlHash, timestamp) {
+    if (!this.snapshotManager?.snapshots) {
+      throw new Error('SnapshotManager not available');
+    }
+    // Find snapshot by matching urlHash and timestamp
+    for (const [id, snap] of this.snapshotManager.snapshots) {
+      const sh = hashUrl(snap.url || id);
+      const ts = String(snap.metadata?.timestamp || snap.createdAt || '');
+      if (sh === urlHash && ts === timestamp) {
+        return {
+          contents: [{
+            uri,
+            mimeType: RESOURCE_MIME.snapshot,
+            text: snap.content || JSON.stringify(snap, null, 2),
+          }],
+        };
+      }
+    }
+    throw new Error(`Snapshot not found: ${uri}`);
+  }
+  async _readJob(uri, jobId) {
+    const job = this.jobManager?.jobs?.get(jobId);
+    if (!job) {
+      throw new Error(`Job not found: ${jobId}`);
+    }
+    return {
+      contents: [{
+        uri,
+        mimeType: RESOURCE_MIME.job,
+        text: JSON.stringify(job, null, 2),
+      }],
+    };
+  }
+  async _readCrawlSitemap(uri, sessionId) {
+    const entry = this._crawlSitemaps.get(sessionId);
+    if (!entry || Date.now() - entry.createdAt >= entry.ttl) {
+      throw new Error(`Crawl sitemap not found or expired: ${sessionId}`);
+    }
+    return {
+      contents: [{
+        uri,
+        mimeType: RESOURCE_MIME.crawl,
+        text: JSON.stringify(entry.data, null, 2),
+      }],
+    };
+  }
+  async _readScreenshot(uri, actionId) {
+    const entry = this._screenshots.get(actionId);
+    if (!entry || Date.now() - entry.createdAt >= entry.ttl) {
+      throw new Error(`Screenshot not found or expired: ${actionId}`);
+    }
+    return {
+      contents: [{
+        uri,
+        mimeType: RESOURCE_MIME.screenshot,
+        blob: entry.data.toString('base64'),
+      }],
+    };
+  }
+}

package/src/server/withAuth.js CHANGED Viewed

@@ -60,6 +60,31 @@ export function makeWithAuth({ authManager, logger, metrics = null }) {
         const result = await handler(params);
         outcome = 'success';
+        // D3.5: Surface cost transparency in all tool responses
+        try {
+          const projection = authManager.projectCost(toolName, params);
+          const remainingCredits = creatorMode ? Infinity : (authManager.creditCache ? [...authManager.creditCache.values()][0] ?? null : null);
+          const costMeta = {
+            projected: creditCost,
+            actual: creditCost,
+            remaining_credits: remainingCredits,
+            projection_note: projection.note
+          };
+          // Inject _cost into the first text content item if it's JSON
+          if (result && Array.isArray(result.content) && result.content[0]?.type === 'text') {
+            try {
+              const parsed = JSON.parse(result.content[0].text);
+              parsed._cost = costMeta;
+              result.content[0].text = JSON.stringify(parsed, null, 2);
+            } catch {
+              // Not JSON — skip injection silently
+            }
+          }
+        } catch {
+          // Cost injection must never break the request path
+        }
         if (!creatorMode) {
           await authManager.reportUsage(toolName, creditCost, params, 200, Date.now() - startTime);
         }

package/src/skills/crawlforge-cli.md ADDED Viewed

@@ -0,0 +1,157 @@
+# CrawlForge CLI Usage Guide
+The `crawlforge` CLI exposes all 23 MCP tools as command-line subcommands.
+## Installation
+```bash
+npm install -g crawlforge-mcp-server
+# or run without installing:
+npx crawlforge-mcp-server <command>
+```
+## Global Flags
+All commands support these flags:
+- `--json` — output compact JSON
+- `--pretty` — output pretty-printed JSON
+- `--quiet` — suppress output (exit code only)
+- `--api-key <key>` — override CRAWLFORGE_API_KEY env var
+- `--timeout <ms>` — global request timeout (default: 30000)
+## Commands
+### scrape — Fetch a URL
+```bash
+crawlforge scrape https://example.com
+crawlforge scrape https://example.com --extract --format markdown
+crawlforge scrape https://example.com --pretty
+```
+### search — Search the web
+```bash
+crawlforge search "MCP server tutorial" --limit 10
+crawlforge search "nodejs scraping" --provider searxng --json
+```
+### crawl — Deep website crawl
+```bash
+crawlforge crawl https://docs.example.com --depth 3 --max-pages 200
+crawlforge crawl https://example.com --no-robots --concurrency 20
+```
+### map — Generate sitemap
+```bash
+crawlforge map https://example.com --pretty
+crawlforge map https://example.com --format xml > sitemap.xml
+```
+### extract — Structured data extraction
+```bash
+# Schema-based extraction
+crawlforge extract https://example.com/product --schema product-schema.json
+# LLM-guided extraction
+crawlforge extract https://example.com/article --prompt "extract title, author, date, summary"
+```
+### track — Track content changes
+```bash
+crawlforge track https://example.com --threshold 10
+crawlforge track https://example.com --selector ".main-content"
+```
+### analyze — Content analysis
+```bash
+crawlforge analyze https://example.com --depth full --pretty
+```
+### research — Deep research
+```bash
+crawlforge research "state of AI in 2025" --depth deep --max-urls 30
+crawlforge research "competitor pricing" --output-format detailed --json
+```
+### stealth — Anti-bot scraping
+```bash
+crawlforge stealth https://protected-site.com
+crawlforge stealth https://protected-site.com --engine camoufox --screenshot
+```
+### batch — Batch scrape from file
+```bash
+# Create a URLs file:
+cat > urls.txt << EOF
+https://example.com/page1
+https://example.com/page2
+https://example.com/page3
+EOF
+crawlforge batch urls.txt --format markdown --concurrency 10
+```
+### actions — Browser automation
+```bash
+# Create an actions script:
+cat > login.json << EOF
+[
+  { "type": "click", "selector": "#login-btn" },
+  { "type": "type", "selector": "#email", "text": "user@example.com" },
+  { "type": "wait", "duration": 1000 }
+]
+EOF
+crawlforge actions https://example.com --script login.json --screenshot
+```
+### localize — Geo-targeted fetch
+```bash
+crawlforge localize https://example.com --locale fr-FR --country FR
+crawlforge localize https://shop.example.com --locale en-GB --currency GBP
+```
+### llmstxt — Generate llms.txt
+```bash
+crawlforge llmstxt https://example.com
+crawlforge llmstxt https://example.com --include-full > llms.txt
+```
+### template — Pre-built site scrapers
+```bash
+crawlforge template github-repo https://github.com/owner/repo
+crawlforge template amazon-product https://amazon.com/dp/B0XXXXX
+crawlforge template npm-package https://npmjs.com/package/commander
+crawlforge template --list  # list all available templates
+```
+### monitor — Continuous change monitoring
+```bash
+crawlforge monitor https://example.com --interval 60 --webhook https://my-site.com/hook
+crawlforge monitor https://example.com --selector ".price" --threshold 1
+```
+### install-skills — Install AI assistant skills
+```bash
+crawlforge install-skills --target claude-code
+crawlforge install-skills --target cursor --force
+crawlforge install-skills --target all --dry-run
+```
+### uninstall-skills — Remove AI assistant skills
+```bash
+crawlforge uninstall-skills --target claude-code
+crawlforge uninstall-skills --target all
+```
+## Output Piping Examples
+```bash
+# Extract markdown and save to file
+crawlforge scrape https://example.com --extract --format markdown > page.md
+# Search and parse with jq
+crawlforge search "nodejs MCP" --json | jq '.results[].url'
+# Batch scrape and process results
+crawlforge batch urls.txt --json | jq '.results | length'
+```