crawlforge-mcp-server 3.0.12 → 3.0.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +103 -324
- package/package.json +2 -1
- package/server.js +332 -169
- package/src/core/AuthManager.js +5 -2
- package/src/core/ChangeTracker.js +1 -1
- package/src/core/ResearchOrchestrator.js +43 -5
- package/src/core/SnapshotManager.js +2 -2
- package/src/core/analysis/ContentAnalyzer.js +73 -20
- package/src/core/analysis/sentenceUtils.js +73 -0
- package/src/core/creatorMode.js +47 -0
- package/src/core/llm/LLMManager.js +120 -0
- package/src/core/processing/BrowserProcessor.js +1 -1
- package/src/tools/extract/extractStructured.js +280 -0
- package/src/tools/extract/summarizeContent.js +3 -2
- package/src/tools/search/ranking/ResultDeduplicator.js +21 -21
- package/src/tools/search/searchWeb.js +1 -1
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Extract Structured Data MCP Tool
|
|
3
|
+
* LLM-powered structured extraction with JSON Schema validation
|
|
4
|
+
* Falls back to CSS selector extraction when no LLM provider is configured
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { z } from 'zod';
|
|
8
|
+
import { load } from 'cheerio';
|
|
9
|
+
import { LLMManager } from '../../core/llm/LLMManager.js';
|
|
10
|
+
|
|
11
|
+
// Input schema for the extract_structured tool.
// `schema` is a loose JSON Schema: a property map plus optional `type` and
// `required` list. `llmConfig` optionally overrides the provider configured at
// construction time, and `selectorHints` maps schema field names to CSS
// selectors used by the non-LLM fallback path.
const ExtractStructuredSchema = z.object({
  url: z.string().url(),
  schema: z.object({
    type: z.string().optional(),
    properties: z.record(z.any()),
    required: z.array(z.string()).optional(),
  }),
  prompt: z.string().optional(),
  llmConfig: z
    .object({
      provider: z.string().optional(),
      apiKey: z.string().optional(),
    })
    .optional(),
  fallbackToSelectors: z.boolean().optional().default(true),
  selectorHints: z.record(z.string()).optional(),
});
|
|
26
|
+
|
|
27
|
+
export class ExtractStructuredTool {
  /**
   * Structured-data extraction tool backed by an LLM, with CSS-selector and
   * keyword-based fallbacks for environments without a configured provider.
   *
   * @param {Object} [options]
   * @param {Object} [options.llmConfig] - Default provider config
   *   ({ provider, apiKey }); merged with the per-call llmConfig in execute().
   */
  constructor(options = {}) {
    this.llmManager = null;
    this.llmConfig = options.llmConfig || {};
    this.userAgent = 'Mozilla/5.0 (compatible; CrawlForge-MCP/3.0; ExtractStructured)';
  }

  /**
   * Lazily initialize LLMManager (avoids errors when no LLM keys are set).
   * A fresh manager is built on every call because the per-request llmConfig
   * may differ between invocations.
   *
   * @param {Object} [llmConfig] - Per-call overrides ({ provider, apiKey }).
   * @returns {LLMManager}
   */
  _ensureLLMManager(llmConfig = {}) {
    const config = { ...this.llmConfig, ...llmConfig };

    // Build provider options from llmConfig
    const providerOptions = {};
    if (config.provider === 'openai' && config.apiKey) {
      providerOptions.openai = { apiKey: config.apiKey };
    } else if (config.provider === 'anthropic' && config.apiKey) {
      providerOptions.anthropic = { apiKey: config.apiKey };
    }
    if (config.provider) {
      providerOptions.defaultProvider = config.provider;
    }
    this.llmManager = new LLMManager(providerOptions);
    return this.llmManager;
  }

  /**
   * Get tool definition for MCP server registration.
   * @returns {{name: string, description: string, inputSchema: Object}}
   */
  getDefinition() {
    return {
      name: 'extract_structured',
      description: 'Extract structured data from a webpage using LLM-powered analysis and a JSON Schema. Falls back to CSS selector extraction when no LLM provider is configured.',
      inputSchema: ExtractStructuredSchema
    };
  }

  /**
   * Execute structured extraction against a URL.
   *
   * Pipeline: fetch page -> strip non-content markup -> LLM extraction when a
   * provider is available -> CSS selector fallback -> keyword fallback.
   * Never throws; failures are reported inside the returned object.
   *
   * @param {Object} params - See ExtractStructuredSchema.
   * @returns {Promise<Object>} Result with data, extraction_method,
   *   confidence, schema_used, processingTime and validation info.
   */
  async execute(params) {
    const startTime = Date.now();

    try {
      const validated = ExtractStructuredSchema.parse(params);
      const { url, schema, prompt, llmConfig, fallbackToSelectors, selectorHints } = validated;

      // Step 1: Fetch URL (hard 15s cap so a hung server cannot stall the tool)
      const response = await fetch(url, {
        headers: {
          'User-Agent': this.userAgent,
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        },
        signal: AbortSignal.timeout(15000)
      });

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      }

      const html = await response.text();

      // Step 2: Parse HTML with Cheerio, strip scripts/styles
      const $ = load(html);
      $('script, style, noscript, iframe, svg').remove();
      const textContent = $('body').text().replace(/\s+/g, ' ').trim();

      // Step 3: Try LLM extraction first
      let extractionResult = null;
      let extractionMethod = 'llm';

      try {
        const llm = this._ensureLLMManager(llmConfig || {});
        if (llm.isAvailable()) {
          extractionResult = await llm.extractStructured(textContent, schema, {
            prompt: prompt || '',
            maxContentLength: 6000
          });
          extractionMethod = 'llm';
        }
      } catch (llmError) {
        // Best-effort: an LLM failure is non-fatal — fall through to the
        // CSS selector fallback below.
        extractionResult = null;
      }

      // Step 4: CSS selector fallback if LLM unavailable or failed
      if (!extractionResult && fallbackToSelectors !== false) {
        extractionResult = this._cssExtraction($, schema, selectorHints || {});
        extractionMethod = 'css_fallback';
      }

      // Step 5: If still no result, use keyword fallback from LLMManager
      if (!extractionResult) {
        const llm = this._ensureLLMManager(llmConfig || {});
        extractionResult = llm.fallbackStructuredExtraction(textContent, schema);
        extractionMethod = 'css_fallback';
      }

      // Step 6: Calculate confidence
      const confidence = this._calculateConfidence(extractionResult, extractionMethod);

      return {
        url,
        data: extractionResult.data || {},
        extraction_method: extractionMethod,
        confidence,
        schema_used: schema,
        processingTime: Date.now() - startTime,
        validation: {
          valid: extractionResult.valid || false,
          errors: extractionResult.validationErrors || []
        }
      };

    } catch (error) {
      // Uniform error envelope: callers always get the same result shape.
      return {
        url: params.url || 'unknown',
        data: {},
        extraction_method: 'none',
        confidence: 0,
        schema_used: params.schema || {},
        processingTime: Date.now() - startTime,
        error: `Structured extraction failed: ${error.message}`,
        validation: { valid: false, errors: [error.message] }
      };
    }
  }

  /**
   * Run a cheerio selector, returning the selection or null when the selector
   * is syntactically invalid. Schema keys and hints are caller-supplied, so a
   * key like "price (USD)" must not abort the whole extraction with a
   * selector parse error.
   */
  _safeQuery($, selector) {
    try {
      return $(selector);
    } catch {
      return null;
    }
  }

  /**
   * CSS selector-based extraction fallback.
   * Resolution order per field: explicit selectorHints entry, then meta tags
   * (name / property / og:), then common selectors derived from the field
   * name (itemprop, data-*, class, id).
   *
   * @returns {Object|null} Extraction result, or null when no field matched
   *   so the caller can try the keyword fallback instead.
   */
  _cssExtraction($, schema, selectorHints) {
    const properties = schema.properties || {};
    const extracted = {};
    let fieldsFound = 0;

    for (const [key, fieldSchema] of Object.entries(properties)) {
      // Use explicit selector hint if provided
      const hintSelector = selectorHints[key];
      if (hintSelector) {
        const el = this._safeQuery($, hintSelector);
        if (el && el.length > 0) {
          const rawValue = el.first().text().trim();
          if (rawValue) {
            extracted[key] = this._coerceValue(rawValue, fieldSchema);
            fieldsFound++;
            continue;
          }
        }
      }

      // Try common patterns: meta tags, headings, semantic elements
      const metaEl = this._safeQuery($, `meta[name="${key}"], meta[property="${key}"], meta[property="og:${key}"]`);
      const metaContent = metaEl ? metaEl.attr('content') : undefined;
      if (metaContent) {
        extracted[key] = this._coerceValue(metaContent, fieldSchema);
        fieldsFound++;
        continue;
      }

      // Try matching by common selectors based on field name
      const commonSelectors = [
        `[itemprop="${key}"]`,
        `[data-${key}]`,
        `.${key}`,
        `#${key}`
      ];

      for (const sel of commonSelectors) {
        const el = this._safeQuery($, sel);
        if (el && el.length > 0) {
          const rawValue = el.first().text().trim();
          if (rawValue) {
            extracted[key] = this._coerceValue(rawValue, fieldSchema);
            fieldsFound++;
            break;
          }
        }
      }
    }

    if (fieldsFound === 0) {
      return null; // No fields found via CSS, let keyword fallback handle it
    }

    // Validate required fields
    const errors = [];
    const required = schema.required || [];
    for (const field of required) {
      if (!(field in extracted)) {
        errors.push(`Missing required field: ${field}`);
      }
    }

    return {
      data: extracted,
      valid: errors.length === 0,
      validationErrors: errors.length > 0 ? errors : ['Used CSS selector fallback extraction']
    };
  }

  /**
   * Coerce a string value to the type declared by the field schema.
   * Handles number, integer (JSON Schema's standard integer type), boolean
   * and array; any other type passes the raw string through unchanged.
   */
  _coerceValue(rawValue, fieldSchema) {
    const type = fieldSchema.type;
    if (type === 'number' || type === 'integer') {
      // Strip currency symbols / thousands separators before parsing.
      const num = Number.parseFloat(rawValue.replace(/[^0-9.-]/g, ''));
      if (Number.isNaN(num)) return rawValue;
      return type === 'integer' ? Math.trunc(num) : num;
    }
    if (type === 'boolean') {
      // NOTE(review): substring match — strings like "10" or "not true" also
      // coerce to true; kept as-is for backward compatibility.
      return /true|yes|1/i.test(rawValue);
    }
    if (type === 'array') {
      // Try splitting by common delimiters
      return rawValue.split(/[,;|]/).map(s => s.trim()).filter(Boolean);
    }
    return rawValue;
  }

  /**
   * Calculate a confidence score based on extraction method and validation.
   * LLM results score higher (0.9 / 0.7) than fallback results (0.6 / 0.4);
   * each validation error subtracts 0.1, capped at a 0.3 penalty, and the
   * result is clamped at 0 and rounded to two decimals.
   */
  _calculateConfidence(result, method) {
    if (!result || !result.data) return 0;

    const dataKeys = Object.keys(result.data).length;
    if (dataKeys === 0) return 0;

    let base;
    if (method === 'llm') {
      base = result.valid ? 0.9 : 0.7;
    } else {
      base = result.valid ? 0.6 : 0.4;
    }

    // Penalize for validation errors
    const errorCount = (result.validationErrors || []).length;
    const penalty = Math.min(0.3, errorCount * 0.1);

    return Math.max(0, Math.round((base - penalty) * 100) / 100);
  }

  /**
   * Clean up resources by dropping the cached LLM manager reference.
   */
  async destroy() {
    this.llmManager = null;
  }
}

export default ExtractStructuredTool;
|
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
|
|
6
6
|
import { z } from 'zod';
|
|
7
7
|
import { ContentAnalyzer } from '../../core/analysis/ContentAnalyzer.js';
|
|
8
|
+
import { splitSentences } from '../../core/analysis/sentenceUtils.js';
|
|
8
9
|
|
|
9
10
|
const SummarizeContentSchema = z.object({
|
|
10
11
|
text: z.string().min(10),
|
|
@@ -182,7 +183,7 @@ export class SummarizeContentTool {
|
|
|
182
183
|
async extractKeyPoints(originalText, summary) {
|
|
183
184
|
try {
|
|
184
185
|
// Simple key point extraction based on important sentences
|
|
185
|
-
const sentences = originalText
|
|
186
|
+
const sentences = splitSentences(originalText);
|
|
186
187
|
|
|
187
188
|
// Score sentences based on various factors
|
|
188
189
|
const scoredSentences = sentences.map(sentence => {
|
|
@@ -248,7 +249,7 @@ export class SummarizeContentTool {
|
|
|
248
249
|
calculateTextStatistics(text) {
|
|
249
250
|
const characters = text.length;
|
|
250
251
|
const words = text.split(/\s+/).filter(w => w.length > 0);
|
|
251
|
-
const sentences = text
|
|
252
|
+
const sentences = splitSentences(text);
|
|
252
253
|
const paragraphs = text.split(/\n\s*\n/).filter(p => p.trim().length > 0);
|
|
253
254
|
|
|
254
255
|
// Estimate reading time (average 200 words per minute)
|
|
@@ -8,10 +8,10 @@ export class ResultDeduplicator {
|
|
|
8
8
|
this.options = {
|
|
9
9
|
// Similarity thresholds
|
|
10
10
|
thresholds: {
|
|
11
|
-
url: 0.
|
|
12
|
-
title: 0.
|
|
13
|
-
content: 0.
|
|
14
|
-
combined: 0.
|
|
11
|
+
url: 0.9, // URL similarity threshold
|
|
12
|
+
title: 0.85, // Title similarity threshold
|
|
13
|
+
content: 0.85, // Content similarity threshold
|
|
14
|
+
combined: 0.8 // Combined similarity threshold for final decision
|
|
15
15
|
},
|
|
16
16
|
|
|
17
17
|
// Deduplication strategies
|
|
@@ -38,7 +38,7 @@ export class ResultDeduplicator {
|
|
|
38
38
|
minLength: 10, // Minimum content length to compare
|
|
39
39
|
ngramSize: 3, // N-gram size for comparison
|
|
40
40
|
simhashBits: 64, // SimHash bit size
|
|
41
|
-
hammingThreshold:
|
|
41
|
+
hammingThreshold: 16 // Hamming distance threshold for SimHash
|
|
42
42
|
},
|
|
43
43
|
|
|
44
44
|
// Merge strategy
|
|
@@ -200,31 +200,31 @@ export class ResultDeduplicator {
|
|
|
200
200
|
*/
|
|
201
201
|
areDuplicates(result1, result2, options) {
|
|
202
202
|
const similarities = this.computeSimilarities(result1, result2, options);
|
|
203
|
-
|
|
204
|
-
//
|
|
205
|
-
if (similarities.url >=
|
|
203
|
+
|
|
204
|
+
// Near-identical URLs are always duplicates (e.g. http vs https of same page)
|
|
205
|
+
if (similarities.url >= 0.95) {
|
|
206
206
|
this.stats.urlDuplicates++;
|
|
207
207
|
return true;
|
|
208
208
|
}
|
|
209
|
-
|
|
210
|
-
//
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
this.stats.contentDuplicates++;
|
|
209
|
+
|
|
210
|
+
// Require at least 2 high-similarity signals to mark as duplicate
|
|
211
|
+
let matchCount = 0;
|
|
212
|
+
if (similarities.url >= options.thresholds.url) matchCount++;
|
|
213
|
+
if (similarities.title >= options.thresholds.title) matchCount++;
|
|
214
|
+
if (similarities.content >= options.thresholds.content) matchCount++;
|
|
215
|
+
|
|
216
|
+
if (matchCount >= 2) {
|
|
217
|
+
if (similarities.title >= options.thresholds.title) this.stats.titleDuplicates++;
|
|
218
|
+
if (similarities.content >= options.thresholds.content) this.stats.contentDuplicates++;
|
|
219
219
|
return true;
|
|
220
220
|
}
|
|
221
|
-
|
|
222
|
-
// Combined similarity score
|
|
221
|
+
|
|
222
|
+
// Combined similarity score (still requires high threshold)
|
|
223
223
|
const combinedScore = this.computeCombinedSimilarity(similarities);
|
|
224
224
|
if (combinedScore >= options.thresholds.combined) {
|
|
225
225
|
return true;
|
|
226
226
|
}
|
|
227
|
-
|
|
227
|
+
|
|
228
228
|
return false;
|
|
229
229
|
}
|
|
230
230
|
|
|
@@ -5,7 +5,7 @@ import { QueryExpander } from './queryExpander.js';
|
|
|
5
5
|
import { ResultRanker } from './ranking/ResultRanker.js';
|
|
6
6
|
import { ResultDeduplicator } from './ranking/ResultDeduplicator.js';
|
|
7
7
|
import LocalizationManager from '../../core/LocalizationManager.js';
|
|
8
|
-
import { isCreatorModeVerified } from '
|
|
8
|
+
import { isCreatorModeVerified } from '../../core/creatorMode.js';
|
|
9
9
|
|
|
10
10
|
const SearchWebSchema = z.object({
|
|
11
11
|
query: z.string().min(1),
|