crawlforge-mcp-server 3.4.0 → 4.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +28 -2
- package/package.json +6 -4
- package/server.js +166 -32
- package/src/cli/commands/actions.js +36 -0
- package/src/cli/commands/analyze.js +19 -0
- package/src/cli/commands/batch.js +45 -0
- package/src/cli/commands/crawl.js +30 -0
- package/src/cli/commands/extract.js +45 -0
- package/src/cli/commands/install-skills.js +46 -0
- package/src/cli/commands/llmstxt.js +24 -0
- package/src/cli/commands/localize.js +29 -0
- package/src/cli/commands/map.js +26 -0
- package/src/cli/commands/monitor.js +29 -0
- package/src/cli/commands/research.js +26 -0
- package/src/cli/commands/scrape.js +37 -0
- package/src/cli/commands/search.js +28 -0
- package/src/cli/commands/stealth.js +29 -0
- package/src/cli/commands/template.js +26 -0
- package/src/cli/commands/track.js +24 -0
- package/src/cli/commands/uninstall-skills.js +35 -0
- package/src/cli/formatter.js +57 -0
- package/src/cli/index.js +94 -0
- package/src/cli/lib/runTool.js +40 -0
- package/src/core/ActionExecutor.js +8 -6
- package/src/core/AuthManager.js +103 -3
- package/src/core/ChangeTracker.js +34 -0
- package/src/core/ElicitationHelper.js +112 -0
- package/src/core/JobManager.js +36 -2
- package/src/core/LocalizationManager.js +19 -5
- package/src/core/PerformanceManager.js +53 -17
- package/src/core/ResearchOrchestrator.js +40 -5
- package/src/core/SamplingClient.js +191 -0
- package/src/core/StealthBrowserManager.js +248 -2
- package/src/core/WebhookDispatcher.js +18 -10
- package/src/prompts/PromptRegistry.js +199 -0
- package/src/resources/ResourceRegistry.js +273 -0
- package/src/server/transports/streamableHttp.js +6 -6
- package/src/server/withAuth.js +25 -0
- package/src/skills/crawlforge-cli.md +157 -0
- package/src/skills/crawlforge-mcp.md +80 -0
- package/src/skills/crawlforge-research.md +104 -0
- package/src/skills/crawlforge-stealth.md +98 -0
- package/src/skills/installer.js +141 -0
- package/src/tools/advanced/batchScrape/index.js +30 -0
- package/src/tools/advanced/batchScrape/schema.js +1 -1
- package/src/tools/basic/extractText.js +19 -8
- package/src/tools/crawl/crawlDeep.js +27 -0
- package/src/tools/extract/extractContent.js +5 -17
- package/src/tools/extract/extractStructured.js +8 -0
- package/src/tools/extract/extractWithLlm.js +35 -25
- package/src/tools/extract/listOllamaModels.js +66 -0
- package/src/tools/extract/processDocument.js +7 -1
- package/src/tools/extract/summarizeContent.js +17 -0
- package/src/tools/research/deepResearch.js +34 -0
- package/src/tools/templates/ScrapeTemplateTool.js +68 -0
- package/src/tools/templates/TemplateRegistry.js +311 -0
- package/src/utils/Logger.js +15 -0
- package/src/utils/htmlToMarkdown.js +54 -0
- package/src/utils/secretMask.js +86 -0
|
@@ -1,13 +1,22 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Extract With LLM MCP Tool
|
|
3
|
-
* Natural-language extraction powered by
|
|
4
|
-
*
|
|
3
|
+
* Natural-language extraction powered by a local Ollama model (default) or
|
|
4
|
+
* a cloud provider (OpenAI / Anthropic, explicit opt-in).
|
|
5
5
|
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
6
|
+
* Default: provider 'auto' → Ollama at http://localhost:11434, no API key required.
|
|
7
|
+
* Pass provider: "openai" | "anthropic" with the matching API key to use a cloud model.
|
|
8
8
|
*/
|
|
9
9
|
|
|
10
10
|
import { fetchAndParse } from './_fetchAndParse.js';
|
|
11
|
+
// D1.3: SamplingClient for MCP sampling fallback (lazy — only imported if needed)
|
|
12
|
+
// Cached SamplingClient class; stays null until the first successful lookup.
let _SamplingClient = null;

/**
 * Resolve the SamplingClient class, loading its module on first use only.
 *
 * The dynamic import keeps '../../core/SamplingClient.js' out of the module
 * graph until a caller actually asks for it; later calls reuse the cached value.
 *
 * @returns {Promise<Function>} the `SamplingClient` export of the core module
 */
async function getSamplingClient() {
  if (_SamplingClient) {
    return _SamplingClient;
  }

  const { SamplingClient } = await import('../../core/SamplingClient.js');
  _SamplingClient = SamplingClient;
  return _SamplingClient;
}
|
|
11
20
|
|
|
12
21
|
// ── Constants ─────────────────────────────────────────────────────────────────
|
|
13
22
|
|
|
@@ -36,34 +45,24 @@ function ollamaBaseUrl() {
|
|
|
36
45
|
* @returns {{ provider: 'openai'|'anthropic'|'ollama', apiKey: string|null }}
|
|
37
46
|
*/
|
|
38
47
|
function resolveProvider(provider) {
  switch (provider) {
    // Local Ollama is the default: 'auto' and an explicit 'ollama' behave the
    // same and need no API key (OLLAMA_BASE_URL is only an optional override).
    case 'auto':
    case 'ollama':
      return { provider: 'ollama', apiKey: null };

    // Cloud providers are explicit opt-in and require their key in the environment.
    case 'anthropic': {
      const key = process.env.ANTHROPIC_API_KEY;
      if (key) {
        return { provider: 'anthropic', apiKey: key };
      }
      throw new Error('extract_with_llm: ANTHROPIC_API_KEY is not set');
    }

    case 'openai': {
      const key = process.env.OPENAI_API_KEY;
      if (key) {
        return { provider: 'openai', apiKey: key };
      }
      throw new Error('extract_with_llm: OPENAI_API_KEY is not set');
    }

    default:
      throw new Error(`extract_with_llm: unknown provider "${provider}"`);
  }
}
|
|
69
68
|
|
|
@@ -307,14 +306,25 @@ export class ExtractWithLlm {
|
|
|
307
306
|
|
|
308
307
|
const userMessage = buildUserMessage(prompt, text, schema);
|
|
309
308
|
|
|
310
|
-
// Step 2: First LLM call
|
|
311
|
-
|
|
309
|
+
// Step 2: First LLM call — with sampling fallback for 'auto' provider
|
|
310
|
+
// Fallback chain: Ollama (resolveProvider maps 'auto' to Ollama) → MCP sampling → error
|
|
311
|
+
let rawText, usage, resolvedModel = model;
|
|
312
312
|
try {
|
|
313
313
|
({ rawText, usage } = await callLLM({
|
|
314
314
|
provider, apiKey, model, systemMessage, userMessage, maxTokens, schema
|
|
315
315
|
}));
|
|
316
316
|
} catch (llmErr) {
|
|
317
|
-
|
|
317
|
+
// D1.3: If provider is 'auto'/'ollama' and it failed, try sampling as final fallback
|
|
318
|
+
if (providerParam === 'auto' || providerParam === 'ollama') {
|
|
319
|
+
try {
|
|
320
|
+
({ rawText, usage } = await callViaSampling({ systemMessage, userMessage, maxTokens }));
|
|
321
|
+
resolvedModel = 'sampling';
|
|
322
|
+
} catch (samplingErr) {
|
|
323
|
+
return { success: false, error: `LLM call failed: ${llmErr.message}. Sampling fallback also failed: ${samplingErr.message}` };
|
|
324
|
+
}
|
|
325
|
+
} else {
|
|
326
|
+
return { success: false, error: `LLM call failed: ${llmErr.message}` };
|
|
327
|
+
}
|
|
318
328
|
}
|
|
319
329
|
|
|
320
330
|
// Step 3: Parse JSON; retry once with stricter prompt if it fails
|
|
@@ -355,8 +365,8 @@ export class ExtractWithLlm {
|
|
|
355
365
|
return {
|
|
356
366
|
success: true,
|
|
357
367
|
data: parsed,
|
|
358
|
-
provider,
|
|
359
|
-
model,
|
|
368
|
+
provider: resolvedModel === 'sampling' ? 'sampling' : provider,
|
|
369
|
+
model: resolvedModel || model,
|
|
360
370
|
usage
|
|
361
371
|
};
|
|
362
372
|
}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* List Ollama Models MCP Tool
|
|
3
|
+
* Returns the models installed on the local Ollama server (GET /api/tags).
|
|
4
|
+
* Used to discover names that can be passed as the `model` parameter to extract_with_llm.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
/**
 * Base URL of the Ollama server, without any trailing slash.
 *
 * Reads OLLAMA_BASE_URL and falls back to http://localhost:11434 when the
 * variable is unset or empty. Every trailing "/" is removed (not just one) so
 * callers can append "/api/..." without producing a double slash.
 *
 * @returns {string} normalized base URL
 */
function ollamaBaseUrl() {
  const configured = process.env.OLLAMA_BASE_URL || 'http://localhost:11434';
  return configured.replace(/\/+$/, '');
}
|
|
10
|
+
|
|
11
|
+
export class ListOllamaModelsTool {
  /**
   * Query GET {baseUrl}/api/tags and summarize the models the server reports.
   *
   * An unreachable server, a non-2xx status, an unparsable body and a payload
   * of unexpected shape are all reported as `{ success: false, baseUrl, error }`
   * instead of being thrown.
   *
   * @returns {Promise<object>} on success `{ success: true, baseUrl, count, models, hint }`
   *   where each model is `{ name, size_bytes, modified_at, family, parameter_size, quantization }`
   */
  async execute() {
    const baseUrl = ollamaBaseUrl();
    const url = `${baseUrl}/api/tags`;

    let response;
    try {
      // 10 s cap so an unreachable host does not stall the tool call.
      response = await fetch(url, { signal: AbortSignal.timeout(10_000) });
    } catch (err) {
      return {
        success: false,
        baseUrl,
        error:
          `Could not reach Ollama at ${url}: ${err.message}. ` +
          `Install from https://ollama.com and run "ollama serve".`
      };
    }

    if (!response.ok) {
      return {
        success: false,
        baseUrl,
        error: `Ollama responded ${response.status} at ${url}. Is "ollama serve" running?`
      };
    }

    let data;
    try {
      data = await response.json();
    } catch (err) {
      return { success: false, baseUrl, error: `Invalid JSON from Ollama: ${err.message}` };
    }

    // Valid JSON is not necessarily the expected shape: a `null` body or a
    // non-array `models` used to surface as a TypeError from `.models`/`.map`.
    // A missing `models` key is still treated as "no models installed".
    const isObject = data !== null && typeof data === 'object';
    const rawModels = isObject ? (data.models ?? []) : null;
    if (!Array.isArray(rawModels)) {
      return {
        success: false,
        baseUrl,
        error: `Unexpected response shape from Ollama at ${url}: expected an object with a "models" array.`
      };
    }

    const models = rawModels
      .filter((m) => m !== null && typeof m === 'object') // skip malformed entries
      .map((m) => ({
        name: m.name,
        size_bytes: m.size,
        modified_at: m.modified_at,
        family: m.details?.family,
        parameter_size: m.details?.parameter_size,
        quantization: m.details?.quantization_level
      }));

    return {
      success: true,
      baseUrl,
      count: models.length,
      models,
      hint:
        models.length === 0
          ? 'No models installed. Run "ollama pull llama3.2" (or any model from https://ollama.com/library) in your terminal.'
          : 'Pass any of these names as the `model` parameter to extract_with_llm.'
    };
  }
}
|
|
65
|
+
|
|
66
|
+
export default ListOllamaModelsTool;
|
|
@@ -8,6 +8,7 @@ import { PDFProcessor } from '../../core/processing/PDFProcessor.js';
|
|
|
8
8
|
import { ContentProcessor } from '../../core/processing/ContentProcessor.js';
|
|
9
9
|
import { BrowserProcessor } from '../../core/processing/BrowserProcessor.js';
|
|
10
10
|
import { HTMLCleaner, ContentQualityAssessor } from '../../utils/contentUtils.js';
|
|
11
|
+
import { htmlToMarkdown } from '../../utils/htmlToMarkdown.js'; // D3.1
|
|
11
12
|
|
|
12
13
|
const ProcessDocumentSchema = z.object({
|
|
13
14
|
source: z.string().min(1),
|
|
@@ -28,7 +29,7 @@ const ProcessDocumentSchema = z.object({
|
|
|
28
29
|
// Processing options
|
|
29
30
|
assessContentQuality: z.boolean().default(true),
|
|
30
31
|
includeStatistics: z.boolean().default(true),
|
|
31
|
-
outputFormat: z.enum(['text', 'structured', 'full']).default('structured'),
|
|
32
|
+
outputFormat: z.enum(['text', 'structured', 'full', 'markdown']).default('structured'),
|
|
32
33
|
|
|
33
34
|
// Content filtering
|
|
34
35
|
minContentLength: z.number().min(0).default(50),
|
|
@@ -328,6 +329,11 @@ export class ProcessDocumentTool {
|
|
|
328
329
|
result.content.html = html;
|
|
329
330
|
}
|
|
330
331
|
|
|
332
|
+
// D3.1: Markdown output mode — convert extracted HTML to markdown via Turndown
|
|
333
|
+
if (options.outputFormat === 'markdown') {
|
|
334
|
+
result.content.markdown = htmlToMarkdown(extractedContent || html);
|
|
335
|
+
}
|
|
336
|
+
|
|
331
337
|
// Step 4: Set metadata
|
|
332
338
|
if (processingResult.metadata) {
|
|
333
339
|
result.metadata = {
|
|
@@ -4,6 +4,15 @@
|
|
|
4
4
|
*/
|
|
5
5
|
|
|
6
6
|
import { z } from 'zod';
|
|
7
|
+
// D1.3: lazy SamplingClient for abstractive mode when no LLM keys are set
|
|
8
|
+
// Holds the SamplingClient class once it has been imported; null until then.
let _SamplingClient = null;

/**
 * Return the SamplingClient class, importing '../../core/SamplingClient.js'
 * the first time it is requested and reusing the cached class afterwards.
 *
 * @returns {Promise<Function>} the `SamplingClient` export of the core module
 */
async function getSamplingClient() {
  if (_SamplingClient) {
    return _SamplingClient;
  }

  const samplingModule = await import('../../core/SamplingClient.js');
  _SamplingClient = samplingModule.SamplingClient;
  return _SamplingClient;
}
|
|
7
16
|
import { ContentAnalyzer } from '../../core/analysis/ContentAnalyzer.js';
|
|
8
17
|
import { splitSentences } from '../../core/analysis/sentenceUtils.js';
|
|
9
18
|
|
|
@@ -122,6 +131,14 @@ export class SummarizeContentTool {
|
|
|
122
131
|
// Step 2: Set summary result
|
|
123
132
|
result.summary = analysisResult.summary;
|
|
124
133
|
|
|
134
|
+
// D1.3: If abstractive mode requested, attempt sampling-based enhancement
|
|
135
|
+
if (options.summaryType === 'abstractive') {
|
|
136
|
+
const abstractive = await this._abstractiveSummaryViaSampling(text, analysisResult.summary, options.summaryLength);
|
|
137
|
+
if (abstractive) {
|
|
138
|
+
result.summary = abstractive;
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
|
|
125
142
|
// Step 3: Extract key points if requested
|
|
126
143
|
if (options.includeKeypoints) {
|
|
127
144
|
result.keypoints = await this.extractKeyPoints(text, analysisResult.summary);
|
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
import { z } from 'zod';
|
|
2
|
+
// D1.4: Elicitation helper (injected from server.js or can be used standalone)
|
|
3
|
+
import { ElicitationHelper } from '../../core/ElicitationHelper.js';
|
|
2
4
|
import { ResearchOrchestrator } from '../../core/ResearchOrchestrator.js';
|
|
3
5
|
import { Logger } from '../../utils/Logger.js';
|
|
4
6
|
|
|
@@ -93,6 +95,17 @@ export class DeepResearchTool {
|
|
|
93
95
|
cacheTTL,
|
|
94
96
|
...orchestratorOptions
|
|
95
97
|
};
|
|
98
|
+
// D1.4: Elicitation helper (set mcpServer via setMcpServer() after instantiation)
|
|
99
|
+
this._elicitation = new ElicitationHelper({});
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
/**
|
|
103
|
+
* D1.4: Set the MCP server instance for elicitation support.
|
|
104
|
+
* Call this from server.js after instantiating DeepResearchTool.
|
|
105
|
+
* @param {object} mcpServer
|
|
106
|
+
*/
|
|
107
|
+
setMcpServer(mcpServer) {
|
|
108
|
+
this._elicitation = new ElicitationHelper({ mcpServer });
|
|
96
109
|
}
|
|
97
110
|
|
|
98
111
|
async execute(params) {
|
|
@@ -116,6 +129,27 @@ export class DeepResearchTool {
|
|
|
116
129
|
};
|
|
117
130
|
}
|
|
118
131
|
|
|
132
|
+
// D1.4: Elicitation — warn user if projected cost exceeds 50 credits
|
|
133
|
+
// deep_research costs approximately 1 credit per URL; maxUrls > 50 → confirm
|
|
134
|
+
if (validated.maxUrls > 50) {
|
|
135
|
+
const projectedCredits = validated.maxUrls;
|
|
136
|
+
const proceed = await this._elicitation.confirm(
|
|
137
|
+
`deep_research will scan up to ${validated.maxUrls} URLs, projecting ~${projectedCredits} credits.`,
|
|
138
|
+
{
|
|
139
|
+
topic: validated.topic,
|
|
140
|
+
projected_credits: projectedCredits,
|
|
141
|
+
max_urls: validated.maxUrls,
|
|
142
|
+
}
|
|
143
|
+
);
|
|
144
|
+
if (!proceed) {
|
|
145
|
+
return {
|
|
146
|
+
success: false,
|
|
147
|
+
error: 'Research cancelled by user before starting (elicitation declined).',
|
|
148
|
+
sessionId,
|
|
149
|
+
};
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
|
|
119
153
|
// Configure research orchestrator based on research approach
|
|
120
154
|
const orchestratorConfig = this.buildOrchestratorConfig(validated);
|
|
121
155
|
const orchestrator = new ResearchOrchestrator(orchestratorConfig);
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ScrapeTemplateTool — wraps TemplateRegistry to expose the `scrape_template` MCP tool.
|
|
3
|
+
*
|
|
4
|
+
* Usage pattern (D3.3):
|
|
5
|
+
* const tool = new ScrapeTemplateTool();
|
|
6
|
+
* const result = await tool.execute({ template: "github-repo", url: "https://github.com/user/repo" });
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { TemplateRegistry } from './TemplateRegistry.js';
|
|
10
|
+
|
|
11
|
+
export class ScrapeTemplateTool {
  constructor() {
    this.registry = new TemplateRegistry();
  }

  /**
   * Execute the scrape_template tool.
   *
   * With `template: "list"`, or when no `url` is given (including a call with
   * no arguments), returns the available templates without any network access.
   * Otherwise validates the template id, downloads the page and runs the
   * template against the HTML.
   *
   * @param {{ template: string, url: string, timeout?: number }} [params]
   *   `timeout` is in milliseconds and covers the whole download, body included.
   * @returns {Promise<object>} `{ templates, count }` in list mode, otherwise
   *   whatever `TemplateRegistry.run()` resolves to
   * @throws {Error} unknown template, non-2xx HTTP status, timeout, or network failure
   */
  async execute({ template, url, timeout = 15000 } = {}) {
    // list mode — return available templates without scraping
    if (template === 'list' || !url) {
      const templates = this.registry.list();
      return { templates, count: templates.length };
    }

    // Validate template exists before making network call
    if (!this.registry.get(template)) {
      const available = this.registry.list().map((t) => t.id).join(', ');
      throw new Error(`Unknown template "${template}". Available templates: ${available}`);
    }

    const html = await this._fetchHtml(url, timeout);

    // Run the template extractor
    return this.registry.run(template, html, url);
  }

  /**
   * Download `url` and return its body as text, aborting after `timeout` ms.
   *
   * The abort timer stays armed until the body has been read, so a server that
   * sends headers and then stalls is still cut off; `finally` guarantees the
   * timer is cleared on every exit path.
   *
   * @param {string} url
   * @param {number} timeout - milliseconds
   * @returns {Promise<string>} response body
   * @throws {Error} `Request timeout after Nms`, `HTTP <status>: <text>`, or the fetch error
   */
  async _fetchHtml(url, timeout) {
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), timeout);
    try {
      const response = await fetch(url, {
        signal: controller.signal,
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; CrawlForge-TemplateScraper/4.0)'
        }
      });

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      }
      return await response.text();
    } catch (error) {
      if (error.name === 'AbortError') {
        throw new Error(`Request timeout after ${timeout}ms`);
      }
      throw error;
    } finally {
      clearTimeout(timeoutId);
    }
  }
}
|
|
67
|
+
|
|
68
|
+
export default ScrapeTemplateTool;
|
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TemplateRegistry — pre-built scraping templates for popular sites (D3.3).
|
|
3
|
+
*
|
|
4
|
+
* Each template is a self-contained object with:
|
|
5
|
+
* id — unique slug used as the `template` parameter
|
|
6
|
+
* name — human-readable name
|
|
7
|
+
* description — when to use this template
|
|
8
|
+
* targetPattern — regex matching URLs this template handles
|
|
9
|
+
* extract — function($) → Object; receives the cheerio-loaded page and
|
|
10
|
+
* reads each field via CSS selectors, returning the extracted data
|
|
11
|
+
*
|
|
12
|
+
* Templates do NOT make network calls. The ScrapeTemplateTool fetches the
|
|
13
|
+
* page and hands the raw HTML to TemplateRegistry.run(), which parses it with cheerio and calls the template's extract($) method.
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import { load } from 'cheerio';
|
|
17
|
+
|
|
18
|
+
// ── Helpers ──────────────────────────────────────────────────────────────────
|
|
19
|
+
|
|
20
|
+
/**
 * Trimmed text of the first element matching `sel`.
 *
 * @param {Function} $ - cheerio-style query function
 * @param {string} sel - CSS selector
 * @returns {string|null} the text, or null when nothing matches or the text is blank
 */
function text($, sel) {
  const firstMatch = $(sel).first();
  const content = firstMatch.text().trim();
  return content.length > 0 ? content : null;
}
|
|
23
|
+
|
|
24
|
+
/**
 * Value of `attribute` on the first element matching `sel`.
 *
 * @param {Function} $ - cheerio-style query function
 * @param {string} sel - CSS selector
 * @param {string} attribute - attribute name to read
 * @returns {string|null} the value, or null when it is missing or empty
 */
function attr($, sel, attribute) {
  const value = $(sel).first().attr(attribute);
  return value ? value : null;
}
|
|
27
|
+
|
|
28
|
+
/**
 * Trimmed text of every element matching `sel`, with blank entries dropped.
 *
 * @param {Function} $ - cheerio-style query function
 * @param {string} sel - CSS selector
 * @returns {string[]} non-empty texts in document order
 */
function list($, sel) {
  const texts = $(sel)
    .map((_index, node) => $(node).text().trim())
    .get();
  return texts.filter((entry) => Boolean(entry));
}
|
|
31
|
+
|
|
32
|
+
/**
 * `attribute` value of every element matching `sel`, with missing or empty
 * values dropped.
 *
 * @param {Function} $ - cheerio-style query function
 * @param {string} sel - CSS selector
 * @param {string} attribute - attribute name to read
 * @returns {string[]} attribute values in document order
 */
function listAttr($, sel, attribute) {
  const values = $(sel)
    .map((_index, node) => $(node).attr(attribute))
    .get();
  return values.filter((entry) => Boolean(entry));
}
|
|
35
|
+
|
|
36
|
+
// ── Template definitions ─────────────────────────────────────────────────────
|
|
37
|
+
|
|
38
|
+
/**
 * Extract the `v` query parameter from a YouTube canonical link.
 * Relative hrefs are resolved against youtube.com; a missing or malformed href
 * yields null instead of throwing, so one bad tag cannot abort the extraction.
 *
 * @param {string|undefined} canonicalHref
 * @returns {string|null}
 */
function youtubeVideoId(canonicalHref) {
  if (!canonicalHref) return null;
  try {
    return new URL(canonicalHref, 'https://www.youtube.com').searchParams.get('v');
  } catch {
    return null;
  }
}

/**
 * Built-in scraping templates. Each entry has an `id`, `name`, `description`,
 * a `targetPattern` regex describing the URLs it is meant for, and an
 * `extract($)` function that reads fields from a cheerio-loaded document.
 * Single-value fields are null when nothing matched.
 */
const TEMPLATES = [
  {
    id: 'amazon-product',
    name: 'Amazon Product',
    description: 'Scrape an Amazon product page for title, price, rating, reviews, ASIN, and description.',
    targetPattern: /amazon\.(com|co\.uk|de|fr|jp|ca|com\.au)/i,
    extract($) {
      return {
        title: text($, '#productTitle'),
        price: text($, '.a-price .a-offscreen') || text($, '#priceblock_ourprice') || text($, '#priceblock_dealprice'),
        currency: attr($, 'meta[itemprop="priceCurrency"]', 'content'),
        rating: text($, '#acrPopover .a-size-base'),
        review_count: text($, '#acrCustomerReviewText'),
        // The ASIN lives in the input's value attribute; an <input> has no text content.
        asin: attr($, 'input#ASIN', 'value') || attr($, 'input[name="ASIN"]', 'value'),
        brand: text($, '#bylineInfo'),
        description: text($, '#productDescription p') || text($, '#feature-bullets'),
        images: listAttr($, '#altImages img.a-thumbnail-image', 'src').slice(0, 8),
        availability: text($, '#availability span'),
        category_breadcrumb: list($, '#wayfinding-breadcrumbs_feature_div a')
      };
    }
  },

  {
    id: 'linkedin-profile',
    name: 'LinkedIn Profile',
    description: 'Scrape a LinkedIn public profile for name, headline, location, and about section.',
    targetPattern: /linkedin\.com\/in\//i,
    extract($) {
      return {
        name: text($, 'h1') || text($, '.top-card-layout__title'),
        headline: text($, '.top-card-layout__headline') || text($, 'h2'),
        location: text($, '.top-card-layout__first-subline') || text($, '.profile-info-subheader'),
        about: text($, '.core-section-container__content p') || text($, '.summary'),
        connections: text($, '.top-card__connections'),
        current_company: text($, '.top-card-layout__card-inner-full-width .top-card-link'),
        note: 'LinkedIn requires authentication for full profiles. This template works on public profile pages only.'
      };
    }
  },

  {
    id: 'github-repo',
    name: 'GitHub Repository',
    description: 'Scrape a GitHub repository page for stars, forks, description, language, topics, and README summary.',
    targetPattern: /github\.com\/[^/]+\/[^/]+\/?$/i,
    extract($) {
      return {
        name: text($, 'strong[itemprop="name"] a') || text($, '.repository-content h1'),
        description: attr($, 'meta[property="og:description"]', 'content') || text($, 'p.f4.my-3'),
        stars: text($, '#repo-stars-counter-star') || text($, '[aria-label*="stargazers"]'),
        forks: text($, '#repo-network-counter') || text($, '[aria-label*="forks"]'),
        watchers: text($, '[aria-label*="watchers"]'),
        language: text($, 'span[itemprop="programmingLanguage"]') || text($, '.d-inline-flex[class*="language"]'),
        topics: list($, 'a.topic-tag'),
        license: text($, 'a[href*="blob/"][href*="LICENSE"]') || text($, '.octicon-law ~ span'),
        last_updated: attr($, 'relative-time', 'datetime'),
        homepage: attr($, 'a[href][rel="noopener noreferrer"]', 'href'),
        open_issues: text($, '.Counter[aria-label*="issue"]')
      };
    }
  },

  {
    id: 'youtube-video',
    name: 'YouTube Video',
    description: 'Scrape a YouTube video page for title, channel, views, likes, publish date, and description.',
    targetPattern: /youtube\.com\/watch/i,
    extract($) {
      return {
        title: attr($, 'meta[name="title"]', 'content') || attr($, 'meta[property="og:title"]', 'content'),
        channel: attr($, 'link[itemprop="name"]', 'content') || text($, '#channel-name'),
        channel_url: attr($, 'span[itemprop="author"] link[itemprop="url"]', 'href'),
        views: attr($, 'meta[itemprop="interactionCount"]', 'content'),
        published: attr($, 'meta[itemprop="uploadDate"]', 'content') || attr($, 'meta[itemprop="datePublished"]', 'content'),
        description: attr($, 'meta[property="og:description"]', 'content'),
        thumbnail: attr($, 'meta[property="og:image"]', 'content'),
        duration: attr($, 'meta[itemprop="duration"]', 'content'),
        video_id: youtubeVideoId($('link[rel="canonical"]').attr('href'))
      };
    }
  },

  {
    id: 'tweet',
    name: 'Tweet / X Post',
    description: 'Scrape a tweet/X post for text, author, timestamp, likes, and retweets from the Open Graph / structured data.',
    targetPattern: /(twitter|x)\.com\/[^/]+\/status\//i,
    extract($) {
      return {
        text: attr($, 'meta[property="og:description"]', 'content'),
        author: attr($, 'meta[property="og:title"]', 'content'),
        url: attr($, 'meta[property="og:url"]', 'content') || attr($, 'link[rel="canonical"]', 'href'),
        image: attr($, 'meta[property="og:image"]', 'content'),
        note: 'X.com requires JavaScript rendering for full tweet data. Structured metadata is returned from static HTML.'
      };
    }
  },

  {
    id: 'reddit-thread',
    name: 'Reddit Thread',
    description: 'Scrape a Reddit thread for title, subreddit, score, comment count, author, and top-level comments.',
    targetPattern: /reddit\.com\/r\/[^/]+\/comments\//i,
    extract($) {
      // Fallback: pull "r/<name>" out of the page title when no subreddit link matches.
      const titleMatch = $('title').text().match(/r\/([^•]+)/);
      const subredditFromTitle = titleMatch ? titleMatch[1].trim() : '';
      return {
        title: attr($, 'meta[property="og:title"]', 'content') || text($, 'h1'),
        // null (not '') when unknown, matching every other field
        subreddit: text($, 'a[href*="/r/"][class*="subreddit"]') || subredditFromTitle || null,
        score: text($, '[data-score]') || attr($, '[itemprop="upvoteCount"]', 'content'),
        author: text($, 'a[href*="/user/"]'),
        posted: attr($, 'time[datetime]', 'datetime'),
        body: text($, 'div[data-click-id="text"] p') || attr($, 'meta[property="og:description"]', 'content'),
        url: attr($, 'meta[property="og:url"]', 'content'),
        flair: text($, '[class*="flair"]')
      };
    }
  },

  {
    id: 'hacker-news-front-page',
    name: 'Hacker News Front Page',
    description: 'Scrape the Hacker News front page for a list of stories with title, URL, score, and comment count.',
    targetPattern: /news\.ycombinator\.com(\/news)?$/i,
    extract($) {
      const stories = [];
      $('tr.athing').each((_, el) => {
        const $row = $(el);
        // The metadata lives in the row directly after the story row. That row
        // carries no "spacer" class (the spacer is the row after it), so the
        // sibling must be taken unfiltered — next('.spacer') matched nothing.
        const $subtext = $row.next().find('.subtext');
        const $score = $subtext.find('.score');
        const $titleLink = $row.find('.titleline > a');
        stories.push({
          id: $row.attr('id'),
          title: $titleLink.text().trim(),
          url: $titleLink.attr('href'),
          site: $row.find('.sitebit a').text().trim() || null,
          // strip the unit in both "1 point" and "N points"
          score: $score.text().replace(/\s*points?\s*$/, '').trim() || null,
          author: $subtext.find('.hnuser').text().trim() || null,
          posted: $subtext.find('.age a').attr('href') || null,
          comments: $subtext.find('a[href*="item"]').last().text().trim() || null
        });
      });
      return { stories: stories.slice(0, 30), scraped_at: new Date().toISOString() };
    }
  },

  {
    id: 'producthunt-launch',
    name: 'Product Hunt Launch',
    description: 'Scrape a Product Hunt product page for name, tagline, vote count, topics, and maker details.',
    targetPattern: /producthunt\.com\/posts\//i,
    extract($) {
      return {
        name: attr($, 'meta[property="og:title"]', 'content'),
        tagline: attr($, 'meta[property="og:description"]', 'content'),
        image: attr($, 'meta[property="og:image"]', 'content'),
        url: attr($, 'meta[property="og:url"]', 'content'),
        votes: text($, '[data-test="vote-button"] span') || text($, 'button[data-vote-button]'),
        topics: list($, 'a[href*="/topics/"]'),
        website: attr($, 'a[data-test="product-link"]', 'href') || attr($, 'a[href][rel="noopener"][target="_blank"]', 'href')
      };
    }
  },

  {
    id: 'stackoverflow-question',
    name: 'Stack Overflow Question',
    description: 'Scrape a Stack Overflow question for title, body, votes, tags, answers, and accepted answer.',
    targetPattern: /stackoverflow\.com\/questions\//i,
    extract($) {
      const answers = [];
      $('.answer').each((_, el) => {
        const $a = $(el);
        answers.push({
          votes: $a.find('[itemprop="upvoteCount"]').attr('content') || $a.find('.js-vote-count').text().trim(),
          accepted: $a.hasClass('accepted-answer'),
          // answer bodies are capped at 500 characters
          body: $a.find('.s-prose').first().text().trim().slice(0, 500)
        });
      });

      return {
        title: text($, '#question-header h1'),
        body: text($, '.question .s-prose'),
        votes: text($, '.question .js-vote-count') || attr($, '.question [itemprop="upvoteCount"]', 'content'),
        views: text($, '.js-view-count') || attr($, 'meta[name="twitter:data1"]', 'content'),
        tags: list($, '.post-tag'),
        author: text($, '.question .user-details a'),
        asked: attr($, '.question time', 'datetime'),
        answers: answers.slice(0, 5),
        answered: $('div.accepted-answer').length > 0
      };
    }
  },

  {
    id: 'npm-package',
    name: 'npm Package',
    description: 'Scrape an npm package page for name, version, description, weekly downloads, license, and dependencies.',
    targetPattern: /npmjs\.com\/package\//i,
    extract($) {
      const scripts = [];
      $('script[type="application/ld+json"]').each((_, el) => {
        // Best effort: a JSON-LD block that does not parse is simply skipped.
        try { scripts.push(JSON.parse($(el).html())); } catch {}
      });
      const ld = scripts[0] || {};

      return {
        name: text($, 'h1') || ld.name,
        version: text($, 'h3[data-testid="package-version-number"]') || text($, '[class*="version"]'),
        description: attr($, 'meta[name="description"]', 'content') || text($, 'p[class*="description"]'),
        license: text($, 'span[class*="license"]') || text($, '[data-cy="license"]') || ld.license,
        weekly_downloads: text($, 'span[class*="weekly-downloads"]') || text($, '[data-cy="downloads"]'),
        install_command: `npm install ${ld.name || text($, 'h1') || ''}`.trim(),
        homepage: attr($, 'a[href][class*="homepage"]', 'href'),
        repository: attr($, 'a[href*="github.com"]', 'href'),
        maintainers: list($, 'a[href*="/~"]')
      };
    }
  }
];
|
|
257
|
+
|
|
258
|
+
// ── Registry ─────────────────────────────────────────────────────────────────
|
|
259
|
+
|
|
260
|
+
export class TemplateRegistry {
  constructor() {
    // Index the built-in templates by id for direct lookup.
    const byId = new Map();
    for (const tpl of TEMPLATES) {
      byId.set(tpl.id, tpl);
    }
    this._templates = byId;
  }

  /**
   * Describe every built-in template.
   * @returns {{ id: string, name: string, description: string, targetPattern: string }[]}
   *   `targetPattern` is the regex rendered as a string
   */
  list() {
    return TEMPLATES.map((tpl) => ({
      id: tpl.id,
      name: tpl.name,
      description: tpl.description,
      targetPattern: tpl.targetPattern.toString()
    }));
  }

  /**
   * Fetch a template definition by its id.
   * @param {string} id
   * @returns {object|undefined} undefined when no template has that id
   */
  get(id) {
    return this._templates.get(id);
  }

  /**
   * Parse `html` with cheerio and apply the template's extract() to it.
   * @param {string} id — template ID
   * @param {string} html — raw HTML of the target page
   * @param {string} url — original URL, echoed back in the result
   * @returns {Promise<{ template: string, template_name: string, url: string, data: object, extractedAt: string }>}
   * @throws {Error} when `id` is not a registered template
   */
  async run(id, html, url) {
    const template = this.get(id);
    if (template) {
      const dom = load(html);
      return {
        template: id,
        template_name: template.name,
        url,
        data: template.extract(dom),
        extractedAt: new Date().toISOString()
      };
    }

    const known = TEMPLATES.map((t) => t.id).join(', ');
    throw new Error(`Unknown template: "${id}". Available: ${known}`);
  }
}
|
|
310
|
+
|
|
311
|
+
export default TemplateRegistry;
|