crawlforge-mcp-server 3.4.0 → 4.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/README.md +28 -2
  2. package/package.json +6 -4
  3. package/server.js +166 -32
  4. package/src/cli/commands/actions.js +36 -0
  5. package/src/cli/commands/analyze.js +19 -0
  6. package/src/cli/commands/batch.js +45 -0
  7. package/src/cli/commands/crawl.js +30 -0
  8. package/src/cli/commands/extract.js +45 -0
  9. package/src/cli/commands/install-skills.js +46 -0
  10. package/src/cli/commands/llmstxt.js +24 -0
  11. package/src/cli/commands/localize.js +29 -0
  12. package/src/cli/commands/map.js +26 -0
  13. package/src/cli/commands/monitor.js +29 -0
  14. package/src/cli/commands/research.js +26 -0
  15. package/src/cli/commands/scrape.js +37 -0
  16. package/src/cli/commands/search.js +28 -0
  17. package/src/cli/commands/stealth.js +29 -0
  18. package/src/cli/commands/template.js +26 -0
  19. package/src/cli/commands/track.js +24 -0
  20. package/src/cli/commands/uninstall-skills.js +35 -0
  21. package/src/cli/formatter.js +57 -0
  22. package/src/cli/index.js +94 -0
  23. package/src/cli/lib/runTool.js +40 -0
  24. package/src/core/ActionExecutor.js +8 -6
  25. package/src/core/AuthManager.js +103 -3
  26. package/src/core/ChangeTracker.js +34 -0
  27. package/src/core/ElicitationHelper.js +112 -0
  28. package/src/core/JobManager.js +36 -2
  29. package/src/core/LocalizationManager.js +19 -5
  30. package/src/core/PerformanceManager.js +53 -17
  31. package/src/core/ResearchOrchestrator.js +40 -5
  32. package/src/core/SamplingClient.js +191 -0
  33. package/src/core/StealthBrowserManager.js +248 -2
  34. package/src/core/WebhookDispatcher.js +18 -10
  35. package/src/prompts/PromptRegistry.js +199 -0
  36. package/src/resources/ResourceRegistry.js +273 -0
  37. package/src/server/transports/streamableHttp.js +6 -6
  38. package/src/server/withAuth.js +25 -0
  39. package/src/skills/crawlforge-cli.md +157 -0
  40. package/src/skills/crawlforge-mcp.md +80 -0
  41. package/src/skills/crawlforge-research.md +104 -0
  42. package/src/skills/crawlforge-stealth.md +98 -0
  43. package/src/skills/installer.js +141 -0
  44. package/src/tools/advanced/batchScrape/index.js +30 -0
  45. package/src/tools/advanced/batchScrape/schema.js +1 -1
  46. package/src/tools/basic/extractText.js +19 -8
  47. package/src/tools/crawl/crawlDeep.js +27 -0
  48. package/src/tools/extract/extractContent.js +5 -17
  49. package/src/tools/extract/extractStructured.js +8 -0
  50. package/src/tools/extract/extractWithLlm.js +35 -25
  51. package/src/tools/extract/listOllamaModels.js +66 -0
  52. package/src/tools/extract/processDocument.js +7 -1
  53. package/src/tools/extract/summarizeContent.js +17 -0
  54. package/src/tools/research/deepResearch.js +34 -0
  55. package/src/tools/templates/ScrapeTemplateTool.js +68 -0
  56. package/src/tools/templates/TemplateRegistry.js +311 -0
  57. package/src/utils/Logger.js +15 -0
  58. package/src/utils/htmlToMarkdown.js +54 -0
  59. package/src/utils/secretMask.js +86 -0
@@ -1,13 +1,22 @@
1
1
  /**
2
2
  * Extract With LLM MCP Tool
3
- * Natural-language extraction powered by OpenAI, Anthropic, or a local Ollama model.
4
- * Mirrors ScrapeGraphAI positioning: describe what you want, get structured JSON back.
3
+ * Natural-language extraction powered by a local Ollama model (default) or
4
+ * a cloud provider (OpenAI / Anthropic, explicit opt-in).
5
5
  *
6
- * Cloud providers require OPENAI_API_KEY or ANTHROPIC_API_KEY in environment.
7
- * Ollama requires no API key just a running `ollama serve` on http://localhost:11434.
6
+ * Default: provider 'auto' → Ollama at http://localhost:11434, no API key required.
7
+ * Pass provider: "openai" | "anthropic" with the matching API key to use a cloud model.
8
8
  */
9
9
 
10
10
  import { fetchAndParse } from './_fetchAndParse.js';
11
// D1.3: SamplingClient for the MCP sampling fallback. The module is imported
// lazily, on the first call to getSamplingClient(), and cached afterwards.
let _SamplingClient = null;

/**
 * Return the `SamplingClient` export of ../../core/SamplingClient.js,
 * importing the module on first use and reusing the cached export after that.
 * @returns {Promise<*>}
 */
async function getSamplingClient() {
  if (_SamplingClient) {
    return _SamplingClient;
  }
  const { SamplingClient } = await import('../../core/SamplingClient.js');
  _SamplingClient = SamplingClient;
  return _SamplingClient;
}
11
20
 
12
21
  // ── Constants ─────────────────────────────────────────────────────────────────
13
22
 
@@ -36,34 +45,24 @@ function ollamaBaseUrl() {
36
45
  * @returns {{ provider: 'openai'|'anthropic'|'ollama', apiKey: string|null }}
37
46
  */
38
47
  function resolveProvider(provider) {
39
- const anthropicKey = process.env.ANTHROPIC_API_KEY;
40
- const openaiKey = process.env.OPENAI_API_KEY;
41
- const ollamaOptIn = !!process.env.OLLAMA_BASE_URL;
42
-
43
- if (provider === 'auto') {
44
- if (anthropicKey) return { provider: 'anthropic', apiKey: anthropicKey };
45
- if (openaiKey) return { provider: 'openai', apiKey: openaiKey };
46
- if (ollamaOptIn) return { provider: 'ollama', apiKey: null };
47
- throw new Error(
48
- 'extract_with_llm requires OPENAI_API_KEY, ANTHROPIC_API_KEY, or OLLAMA_BASE_URL in environment ' +
49
- '(or pass provider: "ollama" explicitly to use a local Ollama server)'
50
- );
48
+ if (provider === 'auto' || provider === 'ollama') {
49
+ // Local Ollama is the default. No API key required; OLLAMA_BASE_URL is
50
+ // an optional override (defaults to http://localhost:11434).
51
+ return { provider: 'ollama', apiKey: null };
51
52
  }
52
53
 
53
54
  if (provider === 'anthropic') {
55
+ const anthropicKey = process.env.ANTHROPIC_API_KEY;
54
56
  if (!anthropicKey) throw new Error('extract_with_llm: ANTHROPIC_API_KEY is not set');
55
57
  return { provider: 'anthropic', apiKey: anthropicKey };
56
58
  }
57
59
 
58
60
  if (provider === 'openai') {
61
+ const openaiKey = process.env.OPENAI_API_KEY;
59
62
  if (!openaiKey) throw new Error('extract_with_llm: OPENAI_API_KEY is not set');
60
63
  return { provider: 'openai', apiKey: openaiKey };
61
64
  }
62
65
 
63
- if (provider === 'ollama') {
64
- return { provider: 'ollama', apiKey: null };
65
- }
66
-
67
66
  throw new Error(`extract_with_llm: unknown provider "${provider}"`);
68
67
  }
69
68
 
@@ -307,14 +306,25 @@ export class ExtractWithLlm {
307
306
 
308
307
  const userMessage = buildUserMessage(prompt, text, schema);
309
308
 
310
- // Step 2: First LLM call
311
- let rawText, usage;
309
+ // Step 2: First LLM call — with MCP sampling fallback for the 'auto' and 'ollama' providers
310
+ // Fallback chain: Ollama → sampling → error. Cloud providers (openai / anthropic) are explicit opt-in and get no fallback.
311
+ let rawText, usage, resolvedModel = model;
312
312
  try {
313
313
  ({ rawText, usage } = await callLLM({
314
314
  provider, apiKey, model, systemMessage, userMessage, maxTokens, schema
315
315
  }));
316
316
  } catch (llmErr) {
317
- return { success: false, error: `LLM call failed: ${llmErr.message}` };
317
+ // D1.3: If provider is 'auto'/'ollama' and it failed, try sampling as final fallback
318
+ if (providerParam === 'auto' || providerParam === 'ollama') {
319
+ try {
320
+ ({ rawText, usage } = await callViaSampling({ systemMessage, userMessage, maxTokens }));
321
+ resolvedModel = 'sampling';
322
+ } catch (samplingErr) {
323
+ return { success: false, error: `LLM call failed: ${llmErr.message}. Sampling fallback also failed: ${samplingErr.message}` };
324
+ }
325
+ } else {
326
+ return { success: false, error: `LLM call failed: ${llmErr.message}` };
327
+ }
318
328
  }
319
329
 
320
330
  // Step 3: Parse JSON; retry once with stricter prompt if it fails
@@ -355,8 +365,8 @@ export class ExtractWithLlm {
355
365
  return {
356
366
  success: true,
357
367
  data: parsed,
358
- provider,
359
- model,
368
+ provider: resolvedModel === 'sampling' ? 'sampling' : provider,
369
+ model: resolvedModel || model,
360
370
  usage
361
371
  };
362
372
  }
@@ -0,0 +1,66 @@
1
+ /**
2
+ * List Ollama Models MCP Tool
3
+ * Returns the models installed on the local Ollama server (GET /api/tags).
4
+ * Used to discover names that can be passed as the `model` parameter to extract_with_llm.
5
+ */
6
+
7
/**
 * Resolve the base URL of the Ollama server.
 *
 * Reads OLLAMA_BASE_URL, falling back to http://localhost:11434 when the
 * variable is unset or blank. Surrounding whitespace and every trailing slash
 * are removed so that `${baseUrl}/api/tags` never contains a doubled slash.
 *
 * @returns {string} base URL without trailing slashes
 */
function ollamaBaseUrl() {
  const configured = (process.env.OLLAMA_BASE_URL || '').trim();
  return (configured || 'http://localhost:11434').replace(/\/+$/, '');
}

export class ListOllamaModelsTool {
  /**
   * Query `GET {baseUrl}/api/tags` and return the models the server reports.
   *
   * Never throws for transport, HTTP, or JSON problems: each of those is
   * reported as `{ success: false, baseUrl, error }` instead.
   *
   * @returns {Promise<object>} on success:
   *   `{ success: true, baseUrl, count, models, hint }`, where each model has
   *   `name`, `size_bytes`, `modified_at`, `family`, `parameter_size` and
   *   `quantization`.
   */
  async execute() {
    const baseUrl = ollamaBaseUrl();
    const url = `${baseUrl}/api/tags`;

    let response;
    try {
      // Give up after 10 seconds instead of hanging on an unreachable host.
      response = await fetch(url, { signal: AbortSignal.timeout(10_000) });
    } catch (err) {
      return {
        success: false,
        baseUrl,
        error:
          `Could not reach Ollama at ${url}: ${err.message}. ` +
          `Install from https://ollama.com and run "ollama serve".`
      };
    }

    if (!response.ok) {
      return {
        success: false,
        baseUrl,
        error: `Ollama responded ${response.status} at ${url}. Is "ollama serve" running?`
      };
    }

    let data;
    try {
      data = await response.json();
    } catch (err) {
      return { success: false, baseUrl, error: `Invalid JSON from Ollama: ${err.message}` };
    }

    // The body may legally be `null`, or `models` may be missing or not an
    // array; treat all of those as "no models" rather than throwing.
    const rawModels = Array.isArray(data?.models) ? data.models : [];
    const models = rawModels
      .filter((m) => m !== null && typeof m === 'object')
      .map((m) => ({
        name: m.name,
        size_bytes: m.size,
        modified_at: m.modified_at,
        family: m.details?.family,
        parameter_size: m.details?.parameter_size,
        quantization: m.details?.quantization_level
      }));

    return {
      success: true,
      baseUrl,
      count: models.length,
      models,
      hint:
        models.length === 0
          ? 'No models installed. Run "ollama pull llama3.2" (or any model from https://ollama.com/library) in your terminal.'
          : 'Pass any of these names as the `model` parameter to extract_with_llm.'
    };
  }
}
65
+
66
+ export default ListOllamaModelsTool;
@@ -8,6 +8,7 @@ import { PDFProcessor } from '../../core/processing/PDFProcessor.js';
8
8
  import { ContentProcessor } from '../../core/processing/ContentProcessor.js';
9
9
  import { BrowserProcessor } from '../../core/processing/BrowserProcessor.js';
10
10
  import { HTMLCleaner, ContentQualityAssessor } from '../../utils/contentUtils.js';
11
+ import { htmlToMarkdown } from '../../utils/htmlToMarkdown.js'; // D3.1
11
12
 
12
13
  const ProcessDocumentSchema = z.object({
13
14
  source: z.string().min(1),
@@ -28,7 +29,7 @@ const ProcessDocumentSchema = z.object({
28
29
  // Processing options
29
30
  assessContentQuality: z.boolean().default(true),
30
31
  includeStatistics: z.boolean().default(true),
31
- outputFormat: z.enum(['text', 'structured', 'full']).default('structured'),
32
+ outputFormat: z.enum(['text', 'structured', 'full', 'markdown']).default('structured'),
32
33
 
33
34
  // Content filtering
34
35
  minContentLength: z.number().min(0).default(50),
@@ -328,6 +329,11 @@ export class ProcessDocumentTool {
328
329
  result.content.html = html;
329
330
  }
330
331
 
332
+ // D3.1: Markdown output mode — convert extracted HTML to markdown via Turndown
333
+ if (options.outputFormat === 'markdown') {
334
+ result.content.markdown = htmlToMarkdown(extractedContent || html);
335
+ }
336
+
331
337
  // Step 4: Set metadata
332
338
  if (processingResult.metadata) {
333
339
  result.metadata = {
@@ -4,6 +4,15 @@
4
4
  */
5
5
 
6
6
  import { z } from 'zod';
7
// D1.3: lazy SamplingClient for abstractive mode when no LLM keys are set.
// Nothing is imported until getSamplingClient() is first awaited.
let _SamplingClient = null;

/**
 * Load (once) and return the `SamplingClient` export of
 * ../../core/SamplingClient.js.
 * @returns {Promise<*>}
 */
async function getSamplingClient() {
  if (_SamplingClient) {
    return _SamplingClient;
  }
  const samplingModule = await import('../../core/SamplingClient.js');
  _SamplingClient = samplingModule.SamplingClient;
  return _SamplingClient;
}
7
16
  import { ContentAnalyzer } from '../../core/analysis/ContentAnalyzer.js';
8
17
  import { splitSentences } from '../../core/analysis/sentenceUtils.js';
9
18
 
@@ -122,6 +131,14 @@ export class SummarizeContentTool {
122
131
  // Step 2: Set summary result
123
132
  result.summary = analysisResult.summary;
124
133
 
134
+ // D1.3: If abstractive mode requested, attempt sampling-based enhancement
135
+ if (options.summaryType === 'abstractive') {
136
+ const abstractive = await this._abstractiveSummaryViaSampling(text, analysisResult.summary, options.summaryLength);
137
+ if (abstractive) {
138
+ result.summary = abstractive;
139
+ }
140
+ }
141
+
125
142
  // Step 3: Extract key points if requested
126
143
  if (options.includeKeypoints) {
127
144
  result.keypoints = await this.extractKeyPoints(text, analysisResult.summary);
@@ -1,4 +1,6 @@
1
1
  import { z } from 'zod';
2
+ // D1.4: Elicitation helper (injected from server.js or can be used standalone)
3
+ import { ElicitationHelper } from '../../core/ElicitationHelper.js';
2
4
  import { ResearchOrchestrator } from '../../core/ResearchOrchestrator.js';
3
5
  import { Logger } from '../../utils/Logger.js';
4
6
 
@@ -93,6 +95,17 @@ export class DeepResearchTool {
93
95
  cacheTTL,
94
96
  ...orchestratorOptions
95
97
  };
98
+ // D1.4: Elicitation helper (set mcpServer via setMcpServer() after instantiation)
99
+ this._elicitation = new ElicitationHelper({});
100
+ }
101
+
102
+ /**
103
+ * D1.4: Set the MCP server instance for elicitation support.
104
+ * Call this from server.js after instantiating DeepResearchTool.
105
+ * @param {object} mcpServer
106
+ */
107
+ setMcpServer(mcpServer) {
108
+ this._elicitation = new ElicitationHelper({ mcpServer });
96
109
  }
97
110
 
98
111
  async execute(params) {
@@ -116,6 +129,27 @@ export class DeepResearchTool {
116
129
  };
117
130
  }
118
131
 
132
+ // D1.4: Elicitation — warn user if projected cost exceeds 50 credits
133
+ // deep_research costs approximately 1 credit per URL; maxUrls > 50 → confirm
134
+ if (validated.maxUrls > 50) {
135
+ const projectedCredits = validated.maxUrls;
136
+ const proceed = await this._elicitation.confirm(
137
+ `deep_research will scan up to ${validated.maxUrls} URLs, projecting ~${projectedCredits} credits.`,
138
+ {
139
+ topic: validated.topic,
140
+ projected_credits: projectedCredits,
141
+ max_urls: validated.maxUrls,
142
+ }
143
+ );
144
+ if (!proceed) {
145
+ return {
146
+ success: false,
147
+ error: 'Research cancelled by user before starting (elicitation declined).',
148
+ sessionId,
149
+ };
150
+ }
151
+ }
152
+
119
153
  // Configure research orchestrator based on research approach
120
154
  const orchestratorConfig = this.buildOrchestratorConfig(validated);
121
155
  const orchestrator = new ResearchOrchestrator(orchestratorConfig);
@@ -0,0 +1,68 @@
1
+ /**
2
+ * ScrapeTemplateTool — wraps TemplateRegistry to expose the `scrape_template` MCP tool.
3
+ *
4
+ * Usage pattern (D3.3):
5
+ * const tool = new ScrapeTemplateTool();
6
+ * const result = await tool.execute({ template: "github-repo", url: "https://github.com/user/repo" });
7
+ */
8
+
9
+ import { TemplateRegistry } from './TemplateRegistry.js';
10
+
11
export class ScrapeTemplateTool {
  constructor() {
    this.registry = new TemplateRegistry();
  }

  /**
   * Execute the scrape_template tool.
   *
   * Two modes:
   *  - list mode (`template === 'list'` or no `url`): returns
   *    `{ templates, count }` without any network access;
   *  - scrape mode: fetches `url` and runs the named template over the HTML.
   *
   * The `timeout` budget covers the whole download, headers AND body: the
   * abort timer is only cleared once `response.text()` has settled.
   *
   * @param {{ template: string, url: string, timeout?: number }} params
   *   `timeout` is in milliseconds (default 15000).
   * @returns {Promise<object>} list-mode summary, or the result of `registry.run()`.
   * @throws {Error} on an unknown template, a non-2xx response, a timeout
   *   ("Request timeout after Nms"), or any other fetch failure (rethrown as is).
   */
  async execute({ template, url, timeout = 15000 }) {
    // list mode — return available templates without scraping
    if (template === 'list' || !url) {
      const templates = this.registry.list();
      return { templates, count: templates.length };
    }

    // Validate template exists before making network call
    if (!this.registry.get(template)) {
      const available = this.registry.list().map((t) => t.id).join(', ');
      throw new Error(`Unknown template "${template}". Available templates: ${available}`);
    }

    // Fetch the page. The timer is released in `finally`, after the body has
    // been read, so a stalled body download is aborted too.
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), timeout);
    let html;
    try {
      const response = await fetch(url, {
        signal: controller.signal,
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; CrawlForge-TemplateScraper/4.0)'
        }
      });

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      }
      html = await response.text();
    } catch (error) {
      if (error.name === 'AbortError') {
        throw new Error(`Request timeout after ${timeout}ms`, { cause: error });
      }
      throw error;
    } finally {
      clearTimeout(timeoutId);
    }

    // Run the template extractor
    const result = await this.registry.run(template, html, url);
    return result;
  }
}
+ }
67
+
68
+ export default ScrapeTemplateTool;
@@ -0,0 +1,311 @@
1
+ /**
2
+ * TemplateRegistry — pre-built scraping templates for popular sites (D3.3).
3
+ *
4
+ * Each template is a self-contained object with:
5
+ * id — unique slug used as the `template` parameter
6
+ * name — human-readable name
7
+ * description — when to use this template
8
+ * targetPattern — regex matching URLs this template handles
9
+ * selectors — CSS selectors mapping field names to DOM locations
10
+ * postProcess — optional function(raw: Object) → Object for cleanup
11
+ *
12
+ * Templates do NOT make network calls. The ScrapeTemplateTool fetches the
13
+ * page and passes the parsed HTML to the template's extract() method.
14
+ */
15
+
16
+ import { load } from 'cheerio';
17
+
18
+ // ── Helpers ──────────────────────────────────────────────────────────────────
19
+
20
/**
 * Trimmed text of the first element matching `sel`.
 * @param {Function} $ - loaded cheerio instance
 * @param {string} sel - CSS selector
 * @returns {string|null} null when the trimmed text is empty
 */
function text($, sel) {
  const firstMatch = $(sel).first();
  const content = firstMatch.text().trim();
  return content || null;
}
23
+
24
/**
 * Value of `attribute` on the first element matching `sel`.
 * @param {Function} $ - loaded cheerio instance
 * @param {string} sel - CSS selector
 * @param {string} attribute - attribute name to read
 * @returns {string|null} null when the attribute is missing or empty
 */
function attr($, sel, attribute) {
  const firstMatch = $(sel).first();
  const value = firstMatch.attr(attribute);
  return value || null;
}
27
+
28
/**
 * Trimmed text of every element matching `sel`, with empty strings dropped.
 * @param {Function} $ - loaded cheerio instance
 * @param {string} sel - CSS selector
 * @returns {string[]}
 */
function list($, sel) {
  const texts = $(sel)
    .map((_, el) => $(el).text().trim())
    .get();
  return texts.filter((entry) => Boolean(entry));
}
31
+
32
/**
 * Value of `attribute` for every element matching `sel`, with missing or
 * empty values dropped.
 * @param {Function} $ - loaded cheerio instance
 * @param {string} sel - CSS selector
 * @param {string} attribute - attribute name to read
 * @returns {string[]}
 */
function listAttr($, sel, attribute) {
  const values = $(sel)
    .map((_, el) => $(el).attr(attribute))
    .get();
  return values.filter((entry) => Boolean(entry));
}
35
+
36
+ // ── Template definitions ─────────────────────────────────────────────────────
37
+
38
/**
 * Extract the `v` query parameter from a YouTube canonical URL.
 * Relative hrefs are resolved against https://www.youtube.com; a missing or
 * unparseable href yields null instead of throwing, so one bad <link> tag
 * cannot abort the whole extraction.
 * @param {string|undefined|null} href
 * @returns {string|null}
 */
function youtubeVideoId(href) {
  if (!href) {
    return null;
  }
  try {
    return new URL(href, 'https://www.youtube.com').searchParams.get('v');
  } catch {
    return null;
  }
}

/**
 * Built-in templates. Each entry provides `id`, `name`, `description`,
 * `targetPattern` (regex for the URLs it is meant for) and `extract($)`,
 * which reads fields from a loaded cheerio document and returns a plain object.
 * Scalar fields that cannot be found are null.
 */
const TEMPLATES = [
  {
    id: 'amazon-product',
    name: 'Amazon Product',
    description: 'Scrape an Amazon product page for title, price, rating, reviews, ASIN, and description.',
    targetPattern: /amazon\.(com|co\.uk|de|fr|jp|ca|com\.au)/i,
    extract($) {
      return {
        title: text($, '#productTitle'),
        price: text($, '.a-price .a-offscreen') || text($, '#priceblock_ourprice') || text($, '#priceblock_dealprice'),
        currency: attr($, 'meta[itemprop="priceCurrency"]', 'content'),
        rating: text($, '#acrPopover .a-size-base'),
        review_count: text($, '#acrCustomerReviewText'),
        // An <input> has no text content; the ASIN lives in its `value` attribute.
        asin: attr($, 'input#ASIN', 'value') || attr($, 'input[name="ASIN"]', 'value'),
        brand: text($, '#bylineInfo'),
        description: text($, '#productDescription p') || text($, '#feature-bullets'),
        images: listAttr($, '#altImages img.a-thumbnail-image', 'src').slice(0, 8),
        availability: text($, '#availability span'),
        category_breadcrumb: list($, '#wayfinding-breadcrumbs_feature_div a')
      };
    }
  },

  {
    id: 'linkedin-profile',
    name: 'LinkedIn Profile',
    description: 'Scrape a LinkedIn public profile for name, headline, location, and about section.',
    targetPattern: /linkedin\.com\/in\//i,
    extract($) {
      return {
        name: text($, 'h1') || text($, '.top-card-layout__title'),
        headline: text($, '.top-card-layout__headline') || text($, 'h2'),
        location: text($, '.top-card-layout__first-subline') || text($, '.profile-info-subheader'),
        about: text($, '.core-section-container__content p') || text($, '.summary'),
        connections: text($, '.top-card__connections'),
        current_company: text($, '.top-card-layout__card-inner-full-width .top-card-link'),
        note: 'LinkedIn requires authentication for full profiles. This template works on public profile pages only.'
      };
    }
  },

  {
    id: 'github-repo',
    name: 'GitHub Repository',
    description: 'Scrape a GitHub repository page for stars, forks, description, language, topics, and README summary.',
    targetPattern: /github\.com\/[^/]+\/[^/]+\/?$/i,
    extract($) {
      return {
        name: text($, 'strong[itemprop="name"] a') || text($, '.repository-content h1'),
        description: attr($, 'meta[property="og:description"]', 'content') || text($, 'p.f4.my-3'),
        stars: text($, '#repo-stars-counter-star') || text($, '[aria-label*="stargazers"]'),
        forks: text($, '#repo-network-counter') || text($, '[aria-label*="forks"]'),
        watchers: text($, '[aria-label*="watchers"]'),
        language: text($, 'span[itemprop="programmingLanguage"]') || text($, '.d-inline-flex[class*="language"]'),
        topics: list($, 'a.topic-tag'),
        license: text($, 'a[href*="blob/"][href*="LICENSE"]') || text($, '.octicon-law ~ span'),
        last_updated: attr($, 'relative-time', 'datetime'),
        homepage: attr($, 'a[href][rel="noopener noreferrer"]', 'href'),
        open_issues: text($, '.Counter[aria-label*="issue"]')
      };
    }
  },

  {
    id: 'youtube-video',
    name: 'YouTube Video',
    description: 'Scrape a YouTube video page for title, channel, views, likes, publish date, and description.',
    targetPattern: /youtube\.com\/watch/i,
    extract($) {
      return {
        title: attr($, 'meta[name="title"]', 'content') || attr($, 'meta[property="og:title"]', 'content'),
        channel: attr($, 'link[itemprop="name"]', 'content') || text($, '#channel-name'),
        channel_url: attr($, 'span[itemprop="author"] link[itemprop="url"]', 'href'),
        views: attr($, 'meta[itemprop="interactionCount"]', 'content'),
        published: attr($, 'meta[itemprop="uploadDate"]', 'content') || attr($, 'meta[itemprop="datePublished"]', 'content'),
        description: attr($, 'meta[property="og:description"]', 'content'),
        thumbnail: attr($, 'meta[property="og:image"]', 'content'),
        duration: attr($, 'meta[itemprop="duration"]', 'content'),
        // Tolerates a missing, relative, or malformed canonical link.
        video_id: youtubeVideoId($('link[rel="canonical"]').attr('href'))
      };
    }
  },

  {
    id: 'tweet',
    name: 'Tweet / X Post',
    description: 'Scrape a tweet/X post for text, author, timestamp, likes, and retweets from the Open Graph / structured data.',
    targetPattern: /(twitter|x)\.com\/[^/]+\/status\//i,
    extract($) {
      return {
        text: attr($, 'meta[property="og:description"]', 'content'),
        author: attr($, 'meta[property="og:title"]', 'content'),
        url: attr($, 'meta[property="og:url"]', 'content') || attr($, 'link[rel="canonical"]', 'href'),
        image: attr($, 'meta[property="og:image"]', 'content'),
        note: 'X.com requires JavaScript rendering for full tweet data. Structured metadata is returned from static HTML.'
      };
    }
  },

  {
    id: 'reddit-thread',
    name: 'Reddit Thread',
    description: 'Scrape a Reddit thread for title, subreddit, score, comment count, author, and top-level comments.',
    targetPattern: /reddit\.com\/r\/[^/]+\/comments\//i,
    extract($) {
      return {
        title: attr($, 'meta[property="og:title"]', 'content') || text($, 'h1'),
        // Fall back to the "r/<name>" fragment of <title>; null (not '') when neither source matches.
        subreddit:
          text($, 'a[href*="/r/"][class*="subreddit"]') ||
          (($('title').text().match(/r\/([^•]+)/) || [])[1] || '').trim() ||
          null,
        score: text($, '[data-score]') || attr($, '[itemprop="upvoteCount"]', 'content'),
        author: text($, 'a[href*="/user/"]'),
        posted: attr($, 'time[datetime]', 'datetime'),
        body: text($, 'div[data-click-id="text"] p') || attr($, 'meta[property="og:description"]', 'content'),
        url: attr($, 'meta[property="og:url"]', 'content'),
        flair: text($, '[class*="flair"]')
      };
    }
  },

  {
    id: 'hacker-news-front-page',
    name: 'Hacker News Front Page',
    description: 'Scrape the Hacker News front page for a list of stories with title, URL, score, and comment count.',
    targetPattern: /news\.ycombinator\.com(\/news)?$/i,
    extract($) {
      const stories = [];
      $('tr.athing').each((_, el) => {
        const $row = $(el);
        // The metadata row is the story row's immediate next sibling; the
        // `.spacer` row comes after it. `.next('.spacer')` only returns the
        // immediate sibling when it matches the selector, so it never found
        // the metadata.
        const $meta = $row.next();
        const $score = $meta.find('.score');
        const $subtext = $meta.find('.subtext');
        const $titleLink = $row.find('.titleline > a');
        stories.push({
          id: $row.attr('id'),
          title: $titleLink.text().trim(),
          url: $titleLink.attr('href'),
          site: $row.find('.sitebit a').text().trim() || null,
          // Handles both "1 point" and "N points".
          score: $score.text().replace(/\s*points?\s*$/, '').trim() || null,
          author: $subtext.find('.hnuser').text().trim() || null,
          // NOTE(review): `.age` is presumed to carry the timestamp in its
          // `title` attribute — confirm against live markup. The item link is
          // kept as a fallback.
          posted: $subtext.find('.age').attr('title') || $subtext.find('.age a').attr('href') || null,
          comments: $subtext.find('a[href*="item"]').last().text().trim() || null
        });
      });
      return { stories: stories.slice(0, 30), scraped_at: new Date().toISOString() };
    }
  },

  {
    id: 'producthunt-launch',
    name: 'Product Hunt Launch',
    description: 'Scrape a Product Hunt product page for name, tagline, vote count, topics, and maker details.',
    targetPattern: /producthunt\.com\/posts\//i,
    extract($) {
      return {
        name: attr($, 'meta[property="og:title"]', 'content'),
        tagline: attr($, 'meta[property="og:description"]', 'content'),
        image: attr($, 'meta[property="og:image"]', 'content'),
        url: attr($, 'meta[property="og:url"]', 'content'),
        votes: text($, '[data-test="vote-button"] span') || text($, 'button[data-vote-button]'),
        topics: list($, 'a[href*="/topics/"]'),
        website: attr($, 'a[data-test="product-link"]', 'href') || attr($, 'a[href][rel="noopener"][target="_blank"]', 'href')
      };
    }
  },

  {
    id: 'stackoverflow-question',
    name: 'Stack Overflow Question',
    description: 'Scrape a Stack Overflow question for title, body, votes, tags, answers, and accepted answer.',
    targetPattern: /stackoverflow\.com\/questions\//i,
    extract($) {
      const answers = [];
      $('.answer').each((_, el) => {
        const $a = $(el);
        answers.push({
          votes: $a.find('[itemprop="upvoteCount"]').attr('content') || $a.find('.js-vote-count').text().trim(),
          accepted: $a.hasClass('accepted-answer'),
          // Answer bodies are truncated to 500 characters.
          body: $a.find('.s-prose').first().text().trim().slice(0, 500)
        });
      });

      return {
        title: text($, '#question-header h1'),
        body: text($, '.question .s-prose'),
        votes: text($, '.question .js-vote-count') || attr($, '.question [itemprop="upvoteCount"]', 'content'),
        views: text($, '.js-view-count') || attr($, 'meta[name="twitter:data1"]', 'content'),
        tags: list($, '.post-tag'),
        author: text($, '.question .user-details a'),
        asked: attr($, '.question time', 'datetime'),
        answers: answers.slice(0, 5),
        answered: $('div.accepted-answer').length > 0
      };
    }
  },

  {
    id: 'npm-package',
    name: 'npm Package',
    description: 'Scrape an npm package page for name, version, description, weekly downloads, license, and dependencies.',
    targetPattern: /npmjs\.com\/package\//i,
    extract($) {
      const scripts = [];
      $('script[type="application/ld+json"]').each((_, el) => {
        // Best effort: a malformed JSON-LD block is skipped, not fatal.
        try { scripts.push(JSON.parse($(el).html())); } catch {}
      });
      const ld = scripts[0] || {};

      return {
        name: text($, 'h1') || ld.name,
        version: text($, 'h3[data-testid="package-version-number"]') || text($, '[class*="version"]'),
        description: attr($, 'meta[name="description"]', 'content') || text($, 'p[class*="description"]'),
        license: text($, 'span[class*="license"]') || text($, '[data-cy="license"]') || ld.license,
        weekly_downloads: text($, 'span[class*="weekly-downloads"]') || text($, '[data-cy="downloads"]'),
        install_command: `npm install ${ld.name || text($, 'h1') || ''}`.trim(),
        homepage: attr($, 'a[href][class*="homepage"]', 'href'),
        repository: attr($, 'a[href*="github.com"]', 'href'),
        maintainers: list($, 'a[href*="/~"]')
      };
    }
  }
];
257
+
258
+ // ── Registry ─────────────────────────────────────────────────────────────────
259
+
260
export class TemplateRegistry {
  constructor() {
    // Index the built-in templates by id for lookup in get().
    const entries = TEMPLATES.map((tpl) => [tpl.id, tpl]);
    this._templates = new Map(entries);
  }

  /**
   * Summarise every built-in template.
   * @returns {{ id: string, name: string, description: string, targetPattern: string }[]}
   *   `targetPattern` is the template's regex rendered with `toString()`.
   */
  list() {
    const summaries = [];
    for (const tpl of TEMPLATES) {
      summaries.push({
        id: tpl.id,
        name: tpl.name,
        description: tpl.description,
        targetPattern: tpl.targetPattern.toString()
      });
    }
    return summaries;
  }

  /**
   * Look up a template by ID.
   * @param {string} id
   * @returns {object|undefined} the template, or undefined when the id is not registered
   */
  get(id) {
    return this._templates.get(id);
  }

  /**
   * Parse `html` with cheerio and run the named template's extractor over it.
   * @param {string} id — template ID
   * @param {string} html — raw HTML of the target page
   * @param {string} url — original URL (echoed back in the result)
   * @returns {Promise<{ template: string, template_name: string, url: string, data: object, extractedAt: string }>}
   * @throws {Error} when `id` is not a registered template
   */
  async run(id, html, url) {
    const template = this.get(id);
    if (!template) {
      const known = TEMPLATES.map((tpl) => tpl.id).join(', ');
      throw new Error(`Unknown template: "${id}". Available: ${known}`);
    }

    const $ = load(html);

    return {
      template: id,
      template_name: template.name,
      url,
      data: template.extract($),
      extractedAt: new Date().toISOString()
    };
  }
}
310
+
311
+ export default TemplateRegistry;