crawlforge-mcp-server 3.0.12 → 3.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,280 @@
1
+ /**
2
+ * Extract Structured Data MCP Tool
3
+ * LLM-powered structured extraction with JSON Schema validation
4
+ * Falls back to CSS selector extraction when no LLM provider is configured
5
+ */
6
+
7
+ import { z } from 'zod';
8
+ import { load } from 'cheerio';
9
+ import { LLMManager } from '../../core/llm/LLMManager.js';
10
+
11
/**
 * JSON-Schema-like description of the object the caller wants back.
 * Only `properties` is mandatory; `type` and `required` are optional.
 */
const targetShapeSchema = z.object({
  type: z.string().optional(),
  properties: z.record(z.any()),
  required: z.array(z.string()).optional()
});

/**
 * Optional per-call LLM overrides (provider name and API key).
 */
const llmOverridesSchema = z.object({
  provider: z.string().optional(),
  apiKey: z.string().optional()
});

/**
 * Input contract for the `extract_structured` tool.
 *
 * - `url`                 page to fetch (must parse as a URL)
 * - `schema`              target shape, see `targetShapeSchema`
 * - `prompt`              optional extra instructions passed to the LLM
 * - `llmConfig`           optional provider/apiKey overrides
 * - `fallbackToSelectors` defaults to `true`
 * - `selectorHints`       optional map of field name -> CSS selector
 */
const ExtractStructuredSchema = z.object({
  url: z.string().url(),
  schema: targetShapeSchema,
  prompt: z.string().optional(),
  llmConfig: llmOverridesSchema.optional(),
  fallbackToSelectors: z.boolean().optional().default(true),
  selectorHints: z.record(z.string()).optional()
});
26
+
27
export class ExtractStructuredTool {
  /**
   * @param {Object} [options]
   * @param {Object} [options.llmConfig] - Default LLM settings (`provider`, `apiKey`);
   *   per-call `llmConfig` values are merged on top of these.
   */
  constructor(options = {}) {
    this.llmManager = null;
    this.llmConfig = options?.llmConfig || {};
    this.userAgent = 'Mozilla/5.0 (compatible; CrawlForge-MCP/3.0; ExtractStructured)';
  }

  /**
   * Build an LLMManager for the merged (instance + per-call) configuration.
   *
   * A new manager is created on every call and stored on `this.llmManager`;
   * it is constructed on demand rather than in the constructor so that the
   * tool can be instantiated when no LLM keys are set.
   *
   * @param {Object} [llmConfig] - Per-call overrides (`provider`, `apiKey`).
   * @returns {LLMManager} The freshly created manager.
   */
  _ensureLLMManager(llmConfig = {}) {
    const config = { ...this.llmConfig, ...llmConfig };

    // Only the two providers below receive an explicit API key; any other
    // provider name is forwarded solely as `defaultProvider`.
    const providerOptions = {};
    if (config.provider === 'openai' && config.apiKey) {
      providerOptions.openai = { apiKey: config.apiKey };
    } else if (config.provider === 'anthropic' && config.apiKey) {
      providerOptions.anthropic = { apiKey: config.apiKey };
    }
    if (config.provider) {
      providerOptions.defaultProvider = config.provider;
    }

    this.llmManager = new LLMManager(providerOptions);
    return this.llmManager;
  }

  /**
   * Get tool definition for MCP server
   * @returns {{name: string, description: string, inputSchema: Object}}
   */
  getDefinition() {
    return {
      name: 'extract_structured',
      description: 'Extract structured data from a webpage using LLM-powered analysis and a JSON Schema. Falls back to CSS selector extraction when no LLM provider is configured.',
      inputSchema: ExtractStructuredSchema
    };
  }

  /**
   * Execute structured extraction.
   *
   * Never rejects: every failure (validation, network, extraction) is
   * reported through the returned object with `extraction_method: 'none'`.
   *
   * @param {Object} params - Extraction parameters (see ExtractStructuredSchema)
   * @returns {Promise<Object>} Extraction result. When the LLM attempt threw,
   *   the message is exposed as `llm_error` alongside the fallback result.
   */
  async execute(params) {
    const startTime = Date.now();

    try {
      const validated = ExtractStructuredSchema.parse(params);
      const { url, schema, prompt, llmConfig, fallbackToSelectors, selectorHints } = validated;

      // Step 1: Fetch URL
      const response = await fetch(url, {
        headers: {
          'User-Agent': this.userAgent,
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        },
        signal: AbortSignal.timeout(15000)
      });

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      }

      const html = await response.text();

      // Step 2: Parse HTML with Cheerio, strip non-content elements
      const $ = load(html);
      $('script, style, noscript, iframe, svg').remove();
      const textContent = $('body').text().replace(/\s+/g, ' ').trim();

      // Step 3: Try LLM extraction first
      let extractionResult = null;
      let extractionMethod = 'llm';
      let llm = null;
      let llmFailure = null;

      try {
        llm = this._ensureLLMManager(llmConfig || {});
        if (llm.isAvailable()) {
          extractionResult = await llm.extractStructured(textContent, schema, {
            prompt: prompt || '',
            maxContentLength: 6000
          });
        }
      } catch (llmError) {
        // Best-effort: fall through to the non-LLM strategies, but keep the
        // reason so the caller can see why the LLM path was not used.
        extractionResult = null;
        llmFailure = llmError?.message ?? String(llmError);
      }

      // Step 4: CSS selector fallback if LLM unavailable or failed
      if (!extractionResult && fallbackToSelectors !== false) {
        extractionResult = this._cssExtraction($, schema, selectorHints || {});
        extractionMethod = 'css_fallback';
      }

      // Step 5: Keyword fallback from LLMManager. This runs whenever nothing
      // else produced a result (including when fallbackToSelectors is false)
      // and is reported under the same 'css_fallback' label.
      if (!extractionResult) {
        llm = llm ?? this._ensureLLMManager(llmConfig || {});
        extractionResult = llm.fallbackStructuredExtraction(textContent, schema);
        extractionMethod = 'css_fallback';
      }

      if (!extractionResult) {
        throw new Error('No extraction strategy produced a result');
      }

      // Step 6: Calculate confidence
      const confidence = this._calculateConfidence(extractionResult, extractionMethod);

      const result = {
        url,
        data: extractionResult.data || {},
        extraction_method: extractionMethod,
        confidence,
        schema_used: schema,
        processingTime: Date.now() - startTime,
        validation: {
          valid: extractionResult.valid || false,
          errors: extractionResult.validationErrors || []
        }
      };
      if (llmFailure) {
        result.llm_error = llmFailure;
      }
      return result;

    } catch (error) {
      // `params` itself may be null/undefined here (validation failure), so
      // it must not be dereferenced unguarded.
      const message = error?.message ?? String(error);
      return {
        url: params?.url || 'unknown',
        data: {},
        extraction_method: 'none',
        confidence: 0,
        schema_used: params?.schema || {},
        processingTime: Date.now() - startTime,
        error: `Structured extraction failed: ${message}`,
        validation: { valid: false, errors: [message] }
      };
    }
  }

  /**
   * Run a selector against the document, treating a selector that throws
   * (e.g. a malformed user hint) as "no match" instead of aborting.
   *
   * @param {Function} $ - Cheerio-style query function
   * @param {string} selector
   * @returns {Object|null} The selection, or null when the query threw.
   */
  _safeSelect($, selector) {
    try {
      return $(selector);
    } catch {
      return null;
    }
  }

  /**
   * Trimmed text of the first element matching `selector`, or '' when
   * nothing matches or the selector is invalid.
   */
  _firstText($, selector) {
    const selection = this._safeSelect($, selector);
    if (!selection || selection.length === 0) {
      return '';
    }
    return selection.first().text().trim();
  }

  /**
   * Escape a string for use as a CSS identifier (class, id, attribute name),
   * following the CSSOM "serialize an identifier" rules.
   */
  _escapeCssIdentifier(value) {
    const text = String(value);
    let escaped = '';
    for (let i = 0; i < text.length; i++) {
      const ch = text[i];
      const code = text.charCodeAt(i);
      const isDigit = code >= 0x30 && code <= 0x39;
      const digitNeedsEscape = isDigit && (i === 0 || (i === 1 && text[0] === '-'));

      if (code === 0) {
        escaped += '\uFFFD';
      } else if (code < 0x20 || code === 0x7f || digitNeedsEscape) {
        escaped += `\\${code.toString(16)} `;
      } else if (ch === '-' && text.length === 1) {
        escaped += '\\-';
      } else if (code >= 0x80 || /[A-Za-z0-9_-]/.test(ch)) {
        escaped += ch;
      } else {
        escaped += `\\${ch}`;
      }
    }
    return escaped;
  }

  /**
   * Locate the raw string for one schema field.
   * Order: explicit selector hint, then meta tags, then name-based selectors.
   *
   * @returns {string|null} Raw value, or null when nothing was found.
   */
  _findFieldValue($, key, selectorHints) {
    // Own-property check so keys like "toString" do not pick up inherited values.
    const hint = Object.hasOwn(selectorHints, key) ? selectorHints[key] : undefined;
    if (typeof hint === 'string' && hint) {
      const hinted = this._firstText($, hint);
      if (hinted) {
        return hinted;
      }
    }

    const quoted = String(key).replace(/["\\]/g, '\\$&');
    const metaContent = this._safeSelect(
      $,
      `meta[name="${quoted}"], meta[property="${quoted}"], meta[property="og:${quoted}"]`
    )?.attr('content');
    if (metaContent) {
      return metaContent;
    }

    const ident = this._escapeCssIdentifier(key);
    const commonSelectors = [
      `[itemprop="${quoted}"]`,
      `[data-${ident}]`,
      `.${ident}`,
      `#${ident}`
    ];
    for (const selector of commonSelectors) {
      const found = this._firstText($, selector);
      if (found) {
        return found;
      }
    }

    return null;
  }

  /**
   * CSS selector-based extraction fallback.
   * Uses selectorHints to map schema fields to CSS selectors, then falls back
   * to meta tags and selectors derived from the field name.
   *
   * @param {Function} $ - Cheerio-style query function for the page
   * @param {Object} schema - Target schema (`properties`, optional `required`)
   * @param {Object} selectorHints - Map of field name -> CSS selector
   * @returns {{data: Object, valid: boolean, validationErrors: string[]}|null}
   *   null when no field could be extracted.
   */
  _cssExtraction($, schema, selectorHints) {
    const properties = schema?.properties || {};
    const hints = selectorHints || {};
    const entries = [];

    for (const [key, fieldSchema] of Object.entries(properties)) {
      const rawValue = this._findFieldValue($, key, hints);
      if (rawValue !== null) {
        entries.push([key, this._coerceValue(rawValue, fieldSchema)]);
      }
    }

    if (entries.length === 0) {
      return null; // No fields found via CSS, let keyword fallback handle it
    }

    // fromEntries defines own data properties, so a field literally named
    // "__proto__" is stored as data instead of replacing the prototype.
    const extracted = Object.fromEntries(entries);

    // Validate required fields (own properties only)
    const errors = (schema?.required || [])
      .filter((field) => !Object.hasOwn(extracted, field))
      .map((field) => `Missing required field: ${field}`);

    return {
      data: extracted,
      valid: errors.length === 0,
      validationErrors: errors.length > 0 ? errors : ['Used CSS selector fallback extraction']
    };
  }

  /**
   * Coerce a string value to the type named by `fieldSchema.type`.
   *
   * - number:  first numeric token in the text (thousands commas removed);
   *            the original text is returned when no number is present
   * - boolean: true only when the whole trimmed value is "true", "yes" or "1"
   *            (case-insensitive)
   * - array:   split on `,`, `;` or `|`, trimmed, empty items dropped
   * - other / missing schema: value returned unchanged
   *
   * @param {string} rawValue
   * @param {Object} [fieldSchema]
   * @returns {string|number|boolean|string[]}
   */
  _coerceValue(rawValue, fieldSchema) {
    const type = fieldSchema?.type;
    const text = String(rawValue);

    if (type === 'number') {
      // Take the first number only; stripping every non-digit would fuse
      // separate numbers together ("4.5 out of 5" -> 4.55).
      const token = text.match(/-?(?:\d[\d,]*(?:\.\d+)?|\.\d+)/);
      if (!token) {
        return rawValue;
      }
      const num = Number.parseFloat(token[0].replaceAll(',', ''));
      return Number.isNaN(num) ? rawValue : num;
    }
    if (type === 'boolean') {
      // Anchored: a substring match would treat "10" or "eyes" as true.
      return /^(?:true|yes|1)$/i.test(text.trim());
    }
    if (type === 'array') {
      // Try splitting by common delimiters
      return text.split(/[,;|]/).map((item) => item.trim()).filter(Boolean);
    }
    return rawValue;
  }

  /**
   * Calculate confidence score based on extraction method and validation.
   *
   * Base score is 0.9/0.7 (valid/invalid) for 'llm' and 0.6/0.4 otherwise,
   * minus 0.1 per entry in `validationErrors` (capped at 0.3). Note that the
   * CSS path always reports at least one entry, so it is always penalized.
   *
   * @param {Object|null} result - Extraction result (`data`, `valid`, `validationErrors`)
   * @param {string} method - Extraction method label
   * @returns {number} Score in [0, 1], rounded to two decimals
   */
  _calculateConfidence(result, method) {
    if (!result || !result.data) return 0;

    const dataKeys = Object.keys(result.data).length;
    if (dataKeys === 0) return 0;

    let base;
    if (method === 'llm') {
      base = result.valid ? 0.9 : 0.7;
    } else {
      base = result.valid ? 0.6 : 0.4;
    }

    // Penalize for validation errors
    const errorCount = (result.validationErrors || []).length;
    const penalty = Math.min(0.3, errorCount * 0.1);

    return Math.max(0, Math.round((base - penalty) * 100) / 100);
  }

  /**
   * Clean up resources (drops the reference to the last LLMManager).
   */
  async destroy() {
    this.llmManager = null;
  }
}
279
+
280
+ export default ExtractStructuredTool;
@@ -5,6 +5,7 @@
5
5
 
6
6
  import { z } from 'zod';
7
7
  import { ContentAnalyzer } from '../../core/analysis/ContentAnalyzer.js';
8
+ import { splitSentences } from '../../core/analysis/sentenceUtils.js';
8
9
 
9
10
  const SummarizeContentSchema = z.object({
10
11
  text: z.string().min(10),
@@ -182,7 +183,7 @@ export class SummarizeContentTool {
182
183
  async extractKeyPoints(originalText, summary) {
183
184
  try {
184
185
  // Simple key point extraction based on important sentences
185
- const sentences = originalText.split(/[.!?]+/).filter(s => s.trim().length > 0);
186
+ const sentences = splitSentences(originalText);
186
187
 
187
188
  // Score sentences based on various factors
188
189
  const scoredSentences = sentences.map(sentence => {
@@ -248,7 +249,7 @@ export class SummarizeContentTool {
248
249
  calculateTextStatistics(text) {
249
250
  const characters = text.length;
250
251
  const words = text.split(/\s+/).filter(w => w.length > 0);
251
- const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 0);
252
+ const sentences = splitSentences(text);
252
253
  const paragraphs = text.split(/\n\s*\n/).filter(p => p.trim().length > 0);
253
254
 
254
255
  // Estimate reading time (average 200 words per minute)
@@ -8,10 +8,10 @@ export class ResultDeduplicator {
8
8
  this.options = {
9
9
  // Similarity thresholds
10
10
  thresholds: {
11
- url: 0.8, // URL similarity threshold
12
- title: 0.75, // Title similarity threshold
13
- content: 0.7, // Content similarity threshold
14
- combined: 0.6 // Combined similarity threshold for final decision
11
+ url: 0.9, // URL similarity threshold
12
+ title: 0.85, // Title similarity threshold
13
+ content: 0.85, // Content similarity threshold
14
+ combined: 0.8 // Combined similarity threshold for final decision
15
15
  },
16
16
 
17
17
  // Deduplication strategies
@@ -38,7 +38,7 @@ export class ResultDeduplicator {
38
38
  minLength: 10, // Minimum content length to compare
39
39
  ngramSize: 3, // N-gram size for comparison
40
40
  simhashBits: 64, // SimHash bit size
41
- hammingThreshold: 8 // Hamming distance threshold for SimHash
41
+ hammingThreshold: 16 // Hamming distance threshold for SimHash
42
42
  },
43
43
 
44
44
  // Merge strategy
@@ -200,31 +200,31 @@ export class ResultDeduplicator {
200
200
  */
201
201
  areDuplicates(result1, result2, options) {
202
202
  const similarities = this.computeSimilarities(result1, result2, options);
203
-
204
- // URL-based duplicate detection
205
- if (similarities.url >= options.thresholds.url) {
203
+
204
+ // Near-identical URLs are always duplicates (e.g. http vs https of same page)
205
+ if (similarities.url >= 0.95) {
206
206
  this.stats.urlDuplicates++;
207
207
  return true;
208
208
  }
209
-
210
- // Title-based duplicate detection
211
- if (similarities.title >= options.thresholds.title) {
212
- this.stats.titleDuplicates++;
213
- return true;
214
- }
215
-
216
- // Content-based duplicate detection
217
- if (similarities.content >= options.thresholds.content) {
218
- this.stats.contentDuplicates++;
209
+
210
+ // Require at least 2 high-similarity signals to mark as duplicate
211
+ let matchCount = 0;
212
+ if (similarities.url >= options.thresholds.url) matchCount++;
213
+ if (similarities.title >= options.thresholds.title) matchCount++;
214
+ if (similarities.content >= options.thresholds.content) matchCount++;
215
+
216
+ if (matchCount >= 2) {
217
+ if (similarities.title >= options.thresholds.title) this.stats.titleDuplicates++;
218
+ if (similarities.content >= options.thresholds.content) this.stats.contentDuplicates++;
219
219
  return true;
220
220
  }
221
-
222
- // Combined similarity score
221
+
222
+ // Combined similarity score (still requires high threshold)
223
223
  const combinedScore = this.computeCombinedSimilarity(similarities);
224
224
  if (combinedScore >= options.thresholds.combined) {
225
225
  return true;
226
226
  }
227
-
227
+
228
228
  return false;
229
229
  }
230
230
 
@@ -5,7 +5,7 @@ import { QueryExpander } from './queryExpander.js';
5
5
  import { ResultRanker } from './ranking/ResultRanker.js';
6
6
  import { ResultDeduplicator } from './ranking/ResultDeduplicator.js';
7
7
  import LocalizationManager from '../../core/LocalizationManager.js';
8
- import { isCreatorModeVerified } from '../../../server.js';
8
+ import { isCreatorModeVerified } from '../../core/creatorMode.js';
9
9
 
10
10
  const SearchWebSchema = z.object({
11
11
  query: z.string().min(1),