crawlforge-mcp-server 3.0.12 → 3.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +103 -324
- package/package.json +2 -1
- package/server.js +332 -169
- package/src/core/AuthManager.js +5 -2
- package/src/core/ChangeTracker.js +1 -1
- package/src/core/ResearchOrchestrator.js +43 -5
- package/src/core/analysis/ContentAnalyzer.js +70 -17
- package/src/core/analysis/sentenceUtils.js +73 -0
- package/src/core/creatorMode.js +47 -0
- package/src/core/llm/LLMManager.js +120 -0
- package/src/core/processing/BrowserProcessor.js +1 -1
- package/src/tools/extract/extractStructured.js +280 -0
- package/src/tools/extract/summarizeContent.js +3 -2
- package/src/tools/search/ranking/ResultDeduplicator.js +21 -21
- package/src/tools/search/searchWeb.js +1 -1
package/src/core/AuthManager.js
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
// Using native fetch (Node.js 18+)
|
|
7
7
|
import fs from 'fs/promises';
|
|
8
8
|
import path from 'path';
|
|
9
|
-
import { isCreatorModeVerified } from '
|
|
9
|
+
import { isCreatorModeVerified } from './creatorMode.js';
|
|
10
10
|
|
|
11
11
|
class AuthManager {
|
|
12
12
|
constructor() {
|
|
@@ -284,7 +284,10 @@ class AuthManager {
|
|
|
284
284
|
scrape_with_actions: 5,
|
|
285
285
|
generate_llms_txt: 3,
|
|
286
286
|
localization: 5,
|
|
287
|
-
track_changes: 3
|
|
287
|
+
track_changes: 3,
|
|
288
|
+
|
|
289
|
+
// Phase 1: LLM-Powered Structured Extraction
|
|
290
|
+
extract_structured: 4
|
|
288
291
|
};
|
|
289
292
|
|
|
290
293
|
return costs[tool] || 1;
|
|
@@ -1113,7 +1113,7 @@ export class ChangeTracker extends EventEmitter {
|
|
|
1113
1113
|
/**
|
|
1114
1114
|
* Detect changes against the latest snapshot
|
|
1115
1115
|
*/
|
|
1116
|
-
async
|
|
1116
|
+
async detectChangesFromSnapshot(url, currentContent) {
|
|
1117
1117
|
// Validate URL format
|
|
1118
1118
|
try {
|
|
1119
1119
|
new URL(url);
|
|
@@ -462,11 +462,49 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
462
462
|
this.researchState.visitedUrls.add(source.link);
|
|
463
463
|
this.metrics.urlsProcessed++;
|
|
464
464
|
|
|
465
|
-
// Extract detailed content
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
465
|
+
// Extract detailed content (with fallback to fetch_url + text extraction)
|
|
466
|
+
let contentData;
|
|
467
|
+
try {
|
|
468
|
+
contentData = await this.extractTool.execute({
|
|
469
|
+
url: source.link,
|
|
470
|
+
options: { includeMetadata: true, includeStructuredData: true }
|
|
471
|
+
});
|
|
472
|
+
} catch (extractError) {
|
|
473
|
+
this.logger.warn('Primary extraction failed, trying fallback', {
|
|
474
|
+
url: source.link,
|
|
475
|
+
error: extractError.message
|
|
476
|
+
});
|
|
477
|
+
// Fallback: use fetch + basic text extraction
|
|
478
|
+
try {
|
|
479
|
+
const fetchResponse = await fetch(source.link, {
|
|
480
|
+
headers: { 'User-Agent': 'CrawlForge-Research/1.0' },
|
|
481
|
+
signal: AbortSignal.timeout(10000)
|
|
482
|
+
});
|
|
483
|
+
if (fetchResponse.ok) {
|
|
484
|
+
const html = await fetchResponse.text();
|
|
485
|
+
// Strip HTML tags for basic text content
|
|
486
|
+
const textContent = html
|
|
487
|
+
.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
|
|
488
|
+
.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
|
|
489
|
+
.replace(/<[^>]+>/g, ' ')
|
|
490
|
+
.replace(/\s+/g, ' ')
|
|
491
|
+
.trim();
|
|
492
|
+
if (textContent.length > 50) {
|
|
493
|
+
contentData = {
|
|
494
|
+
content: textContent.slice(0, 5000),
|
|
495
|
+
metadata: { title: source.title || '' },
|
|
496
|
+
structuredData: {},
|
|
497
|
+
fallback: true
|
|
498
|
+
};
|
|
499
|
+
}
|
|
500
|
+
}
|
|
501
|
+
} catch (fallbackError) {
|
|
502
|
+
this.logger.warn('Fallback extraction also failed', {
|
|
503
|
+
url: source.link,
|
|
504
|
+
error: fallbackError.message
|
|
505
|
+
});
|
|
506
|
+
}
|
|
507
|
+
}
|
|
470
508
|
|
|
471
509
|
if (contentData && contentData.content) {
|
|
472
510
|
this.metrics.contentExtracted++;
|
|
@@ -7,6 +7,7 @@ import { SummarizerManager } from 'node-summarizer';
|
|
|
7
7
|
import { franc } from 'franc';
|
|
8
8
|
import nlp from 'compromise';
|
|
9
9
|
import { z } from 'zod';
|
|
10
|
+
import { splitSentences } from './sentenceUtils.js';
|
|
10
11
|
|
|
11
12
|
const ContentAnalyzerSchema = z.object({
|
|
12
13
|
text: z.string().min(1),
|
|
@@ -290,11 +291,29 @@ export class ContentAnalyzer {
|
|
|
290
291
|
});
|
|
291
292
|
|
|
292
293
|
if (detected === 'und') {
|
|
293
|
-
|
|
294
|
+
// Fallback: check if text is predominantly ASCII Latin characters (likely English)
|
|
295
|
+
const latinChars = (text.match(/[a-zA-Z]/g) || []).length;
|
|
296
|
+
const totalChars = text.replace(/\s/g, '').length;
|
|
297
|
+
if (totalChars > 0 && latinChars / totalChars > 0.7) {
|
|
298
|
+
// Check for common English words as a heuristic
|
|
299
|
+
const lower = text.toLowerCase();
|
|
300
|
+
const englishMarkers = ['the ', 'is ', 'are ', 'was ', 'and ', 'for ', 'that ', 'with ', 'this ', 'from '];
|
|
301
|
+
const matchCount = englishMarkers.filter(w => lower.includes(w)).length;
|
|
302
|
+
if (matchCount >= 2) {
|
|
303
|
+
return {
|
|
304
|
+
code: 'eng',
|
|
305
|
+
name: 'English',
|
|
306
|
+
confidence: 0.6,
|
|
307
|
+
alternative: [],
|
|
308
|
+
detectionMethod: 'heuristic'
|
|
309
|
+
};
|
|
310
|
+
}
|
|
311
|
+
}
|
|
312
|
+
return null; // Truly undetermined language
|
|
294
313
|
}
|
|
295
314
|
|
|
296
|
-
// Get confidence score
|
|
297
|
-
const confidence = Math.min(1, text.length /
|
|
315
|
+
// Get confidence score based on text length and detection certainty
|
|
316
|
+
const confidence = Math.min(1, 0.5 + (text.length / 500) * 0.5);
|
|
298
317
|
|
|
299
318
|
// Get alternative languages using franc.all
|
|
300
319
|
const alternatives = franc.all(text, {
|
|
@@ -329,7 +348,7 @@ export class ContentAnalyzer {
|
|
|
329
348
|
*/
|
|
330
349
|
async summarizeText(text, options = {}) {
|
|
331
350
|
try {
|
|
332
|
-
const sentences = text
|
|
351
|
+
const sentences = splitSentences(text);
|
|
333
352
|
|
|
334
353
|
if (sentences.length < 3) {
|
|
335
354
|
return {
|
|
@@ -364,7 +383,7 @@ export class ContentAnalyzer {
|
|
|
364
383
|
if (options.summaryType === 'extractive') {
|
|
365
384
|
// Use node-summarizer for extractive summarization
|
|
366
385
|
const summary = await this.summarizer.getSummaryByRanking(text, targetSentences);
|
|
367
|
-
summarySentences = summary
|
|
386
|
+
summarySentences = splitSentences(summary);
|
|
368
387
|
} else {
|
|
369
388
|
// Simple abstractive approach (for demonstration)
|
|
370
389
|
summarySentences = await this.createAbstractiveSummary(text, targetSentences);
|
|
@@ -385,7 +404,7 @@ export class ContentAnalyzer {
|
|
|
385
404
|
console.warn('Text summarization failed:', error.message);
|
|
386
405
|
|
|
387
406
|
// Fallback: return first few sentences
|
|
388
|
-
const sentences = text
|
|
407
|
+
const sentences = splitSentences(text);
|
|
389
408
|
const fallbackSentences = sentences.slice(0, 2);
|
|
390
409
|
|
|
391
410
|
return {
|
|
@@ -479,13 +498,45 @@ export class ContentAnalyzer {
|
|
|
479
498
|
try {
|
|
480
499
|
const doc = nlp(text);
|
|
481
500
|
|
|
501
|
+
const people = doc.people().out('array');
|
|
502
|
+
const places = doc.places().out('array');
|
|
503
|
+
const organizations = doc.organizations().out('array');
|
|
504
|
+
const dates = doc.dates().out('array');
|
|
505
|
+
const money = doc.money().out('array');
|
|
506
|
+
let other = doc.topics().out('array').slice(0, 10);
|
|
507
|
+
|
|
508
|
+
// Supplement with capitalized proper nouns that compromise may miss
|
|
509
|
+
// (technology names, product names, etc.)
|
|
510
|
+
const existingEntities = new Set([
|
|
511
|
+
...people, ...places, ...organizations, ...other
|
|
512
|
+
].map(e => e.toLowerCase()));
|
|
513
|
+
|
|
514
|
+
const properNouns = text.match(/\b[A-Z][a-zA-Z.]+(?:\s+[A-Z][a-zA-Z.]+)*/g) || [];
|
|
515
|
+
const supplemental = [...new Set(properNouns)]
|
|
516
|
+
.filter(n => !existingEntities.has(n.toLowerCase()) && n.length > 1)
|
|
517
|
+
.slice(0, 10);
|
|
518
|
+
|
|
519
|
+
if (supplemental.length > 0) {
|
|
520
|
+
other = [...other, ...supplemental].slice(0, 15);
|
|
521
|
+
}
|
|
522
|
+
|
|
523
|
+
const allEntities = [...people, ...places, ...organizations, ...dates, ...money, ...other];
|
|
524
|
+
const uniqueEntities = new Set(allEntities.map(e => e.toLowerCase()));
|
|
525
|
+
|
|
482
526
|
return {
|
|
483
|
-
people
|
|
484
|
-
places
|
|
485
|
-
organizations
|
|
486
|
-
dates
|
|
487
|
-
money
|
|
488
|
-
other
|
|
527
|
+
people,
|
|
528
|
+
places,
|
|
529
|
+
organizations,
|
|
530
|
+
dates,
|
|
531
|
+
money,
|
|
532
|
+
other,
|
|
533
|
+
summary: {
|
|
534
|
+
totalEntities: allEntities.length,
|
|
535
|
+
uniqueEntities: uniqueEntities.size,
|
|
536
|
+
entityDensity: text.split(/\s+/).length > 0
|
|
537
|
+
? uniqueEntities.size / text.split(/\s+/).length
|
|
538
|
+
: 0
|
|
539
|
+
}
|
|
489
540
|
};
|
|
490
541
|
|
|
491
542
|
} catch (error) {
|
|
@@ -496,7 +547,8 @@ export class ContentAnalyzer {
|
|
|
496
547
|
organizations: [],
|
|
497
548
|
dates: [],
|
|
498
549
|
money: [],
|
|
499
|
-
other: []
|
|
550
|
+
other: [],
|
|
551
|
+
summary: { totalEntities: 0, uniqueEntities: 0, entityDensity: 0 }
|
|
500
552
|
};
|
|
501
553
|
}
|
|
502
554
|
}
|
|
@@ -521,10 +573,11 @@ export class ContentAnalyzer {
|
|
|
521
573
|
const termTypes = {};
|
|
522
574
|
|
|
523
575
|
[...nouns, ...verbs, ...adjectives].forEach(term => {
|
|
524
|
-
|
|
576
|
+
// Strip leading/trailing punctuation but preserve internal periods (e.g. Node.js)
|
|
577
|
+
const cleaned = term.toLowerCase().trim().replace(/^[^a-z0-9]+|[^a-z0-9.]+$/gi, '').replace(/\.+$/, '');
|
|
525
578
|
if (cleaned.length > 2 && !this.isStopWord(cleaned)) {
|
|
526
579
|
termFreq[cleaned] = (termFreq[cleaned] || 0) + 1;
|
|
527
|
-
|
|
580
|
+
|
|
528
581
|
if (!termTypes[cleaned]) {
|
|
529
582
|
if (nouns.includes(term)) termTypes[cleaned] = 'noun';
|
|
530
583
|
else if (verbs.includes(term)) termTypes[cleaned] = 'verb';
|
|
@@ -561,7 +614,7 @@ export class ContentAnalyzer {
|
|
|
561
614
|
*/
|
|
562
615
|
async calculateReadability(text) {
|
|
563
616
|
try {
|
|
564
|
-
const sentences = text
|
|
617
|
+
const sentences = splitSentences(text);
|
|
565
618
|
const words = text.split(/\s+/).filter(w => w.length > 0);
|
|
566
619
|
const characters = text.length;
|
|
567
620
|
const charactersNoSpaces = text.replace(/\s/g, '').length;
|
|
@@ -669,7 +722,7 @@ export class ContentAnalyzer {
|
|
|
669
722
|
const characters = text.length;
|
|
670
723
|
const charactersNoSpaces = text.replace(/\s/g, '').length;
|
|
671
724
|
const words = text.split(/\s+/).filter(w => w.length > 0);
|
|
672
|
-
const sentences = text
|
|
725
|
+
const sentences = splitSentences(text);
|
|
673
726
|
const paragraphs = text.split(/\n\s*\n/).filter(p => p.trim().length > 0);
|
|
674
727
|
|
|
675
728
|
// Estimate reading time (average 200 words per minute)
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
/**
 * Sentence splitting utility that handles abbreviations, decimal numbers,
 * domain names, and other common patterns that contain periods.
 */

// Common abbreviations that should not trigger sentence splits.
// Entries are lowercase with all non-letters removed, matching how the
// candidate word is normalized before lookup.
const ABBREVIATIONS = new Set([
  'mr', 'mrs', 'ms', 'dr', 'prof', 'sr', 'jr', 'st', 'ave', 'blvd',
  'vs', 'etc', 'inc', 'ltd', 'corp', 'dept', 'univ', 'assn',
  'approx', 'appt', 'apt', 'est', 'min', 'max',
  'govt', 'lib', 'misc', 'natl', 'intl',
  'jan', 'feb', 'mar', 'apr', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
  'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun',
  'fig', 'eq', 'ref', 'vol', 'no', 'pp', 'ed', 'rev',
  'e', 'i', // for e.g. and i.e.
]);

/**
 * Decide whether the period that ends `chunk` belongs to the last word
 * (abbreviation, dotted term, initial) rather than closing a sentence.
 * @param {string} chunk - Accumulated text that ends with a period
 * @returns {boolean} - true when the period should NOT end the sentence
 */
function isNonTerminalPeriod(chunk) {
  // Drop only the final period so earlier dots in the word stay inspectable.
  const beforePeriod = chunk.replace(/\.\s*$/, '');
  const lastWord = beforePeriod.split(/\s+/).pop() || '';
  const lastWordLower = lastWord.toLowerCase().replace(/[^a-z]/g, '');

  if (ABBREVIATIONS.has(lastWordLower)) return true;
  // Words with internal periods: e.g, U.S, Node.js, 3.14, v2.0
  if (/\w\.\w/.test(lastWord)) return true;
  // Single capital letter, i.e. an initial such as the "A" in "A. Smith".
  // The trailing period has already been stripped, so match the bare letter.
  return /^[A-Z]$/.test(lastWord);
}

/**
 * Split text into sentences, handling abbreviations and technical terms.
 *
 * Candidate boundaries are runs of whitespace that follow `.`, `!` or `?`.
 * A `!` or `?` always ends a sentence; a `.` ends one unless the word before
 * it looks like an abbreviation, a dotted term/number, or an initial.
 *
 * @param {string} text - Text to split
 * @returns {string[]} - Array of trimmed sentence strings; `[]` for empty or
 *   non-string input
 */
export function splitSentences(text) {
  if (!text || typeof text !== 'string') return [];

  const sentences = [];
  let current = '';

  // Split by potential sentence boundaries: whitespace preceded by . ! ?
  const tokens = text.split(/(?<=[.!?])\s+/);

  for (const token of tokens) {
    const combined = current ? `${current} ${token}` : token;
    const terminator = combined.match(/([.!?])\s*$/);

    // No terminator yet, or a period that is part of the last word: keep accumulating.
    if (!terminator || (terminator[1] === '.' && isNonTerminalPeriod(combined))) {
      current = combined;
      continue;
    }

    // Real sentence boundary
    const trimmed = combined.trim();
    if (trimmed.length > 0) {
      sentences.push(trimmed);
    }
    current = '';
  }

  // Flush whatever is left after the last boundary
  if (current.trim().length > 0) {
    sentences.push(current.trim());
  }

  return sentences.length > 0 ? sentences : [text.trim()];
}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Creator Mode Authentication
|
|
3
|
+
* Extracted from server.js to allow tool classes to be imported independently
|
|
4
|
+
* without triggering the full MCP server startup sequence.
|
|
5
|
+
*
|
|
6
|
+
* SECURITY: The creator secret hash is safe to commit — one-way SHA-256.
|
|
7
|
+
* The actual secret is never stored. Only the package maintainer has it.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import crypto from 'crypto';
|
|
11
|
+
import dotenv from 'dotenv';
|
|
12
|
+
|
|
13
|
+
// Read .env first so a creator secret defined there is visible below
dotenv.config({ path: '.env', quiet: true });

// SECURITY: drop any externally supplied creator-mode flag so it cannot act as a bypass
delete process.env.CRAWLFORGE_CREATOR_MODE;

const CREATOR_SECRET_HASH = 'cfef62e5068d48e7dd6a39c9e16f0be2615510c6b68274fc8abe3156feb5050b';

/**
 * SHA-256 the candidate secret and compare it with the stored hash using a
 * constant-time comparison.
 * @param {string} secret - Candidate secret read from the environment
 * @returns {boolean} - true when the digests are identical
 */
const matchesCreatorHash = (secret) => {
  const candidateDigest = crypto.createHash('sha256').update(secret).digest();
  const expectedDigest = Buffer.from(CREATOR_SECRET_HASH, 'hex');
  return crypto.timingSafeEqual(candidateDigest, expectedDigest);
};

// Module-scoped result of the one-time check; only this module can write it
let creatorModeVerified = false;

const suppliedSecret = process.env.CRAWLFORGE_CREATOR_SECRET;
if (suppliedSecret) {
  creatorModeVerified = matchesCreatorHash(suppliedSecret);

  if (creatorModeVerified) {
    console.log('Creator Mode Enabled - Unlimited Access');
  } else {
    console.warn('Invalid creator secret provided');
  }

  // Remove the secret from the environment once it has been checked
  delete process.env.CRAWLFORGE_CREATOR_SECRET;
}

/**
 * Report the outcome of the creator-secret check performed when this module
 * was loaded.
 * @returns {boolean} - true only if CRAWLFORGE_CREATOR_SECRET hashed to the
 *   expected value at load time
 */
export function isCreatorModeVerified() {
  return creatorModeVerified;
}
|
|
@@ -319,6 +319,126 @@ Synthesize these findings into a comprehensive analysis:`;
|
|
|
319
319
|
}
|
|
320
320
|
}
|
|
321
321
|
|
|
322
|
+
/**
|
|
323
|
+
* Extract structured data from content using LLM and a JSON Schema
|
|
324
|
+
* Follows the same pattern as analyzeRelevance()
|
|
325
|
+
*/
|
|
326
|
+
async extractStructured(content, schema, options = {}) {
|
|
327
|
+
const { maxContentLength = 6000, prompt: userPrompt = '', maxTokens = 1000 } = options;
|
|
328
|
+
|
|
329
|
+
const truncatedContent = content.length > maxContentLength
|
|
330
|
+
? content.substring(0, maxContentLength) + '...'
|
|
331
|
+
: content;
|
|
332
|
+
|
|
333
|
+
// Scale maxTokens with schema complexity
|
|
334
|
+
const schemaFields = Object.keys(schema.properties || {}).length;
|
|
335
|
+
const scaledTokens = Math.min(2000, Math.max(maxTokens, schemaFields * 100 + 500));
|
|
336
|
+
|
|
337
|
+
const systemPrompt = `You are a structured data extraction expert. Extract data from the provided content and return ONLY valid JSON that conforms to the given JSON Schema. Do not include any explanation or markdown — only the raw JSON object.`;
|
|
338
|
+
|
|
339
|
+
const schemaStr = JSON.stringify(schema, null, 2);
|
|
340
|
+
const guidance = userPrompt ? `\n\nExtraction guidance: ${userPrompt}` : '';
|
|
341
|
+
|
|
342
|
+
const extractionPrompt = `JSON Schema to extract:
|
|
343
|
+
${schemaStr}${guidance}
|
|
344
|
+
|
|
345
|
+
Content to extract from:
|
|
346
|
+
${truncatedContent}
|
|
347
|
+
|
|
348
|
+
Extract the data and return valid JSON:`;
|
|
349
|
+
|
|
350
|
+
try {
|
|
351
|
+
const response = await this.generateCompletion(extractionPrompt, {
|
|
352
|
+
systemPrompt,
|
|
353
|
+
maxTokens: scaledTokens,
|
|
354
|
+
temperature: 0.1
|
|
355
|
+
});
|
|
356
|
+
|
|
357
|
+
// Strip markdown code fences if present
|
|
358
|
+
const cleaned = response.replace(/^```(?:json)?\n?/, '').replace(/\n?```$/, '').trim();
|
|
359
|
+
const parsed = JSON.parse(cleaned);
|
|
360
|
+
|
|
361
|
+
// Lightweight validation
|
|
362
|
+
const validation = this.validateAgainstSchema(parsed, schema);
|
|
363
|
+
return {
|
|
364
|
+
data: parsed,
|
|
365
|
+
valid: validation.valid,
|
|
366
|
+
validationErrors: validation.errors
|
|
367
|
+
};
|
|
368
|
+
} catch (error) {
|
|
369
|
+
this.logger.warn('LLM structured extraction failed, using fallback', { error: error.message });
|
|
370
|
+
return this.fallbackStructuredExtraction(content, schema);
|
|
371
|
+
}
|
|
372
|
+
}
|
|
373
|
+
|
|
374
|
+
/**
|
|
375
|
+
* Validate a parsed object against a simple JSON Schema
|
|
376
|
+
*/
|
|
377
|
+
validateAgainstSchema(data, schema) {
|
|
378
|
+
const errors = [];
|
|
379
|
+
const properties = schema.properties || {};
|
|
380
|
+
const required = schema.required || [];
|
|
381
|
+
|
|
382
|
+
for (const field of required) {
|
|
383
|
+
if (!(field in data)) {
|
|
384
|
+
errors.push(`Missing required field: ${field}`);
|
|
385
|
+
}
|
|
386
|
+
}
|
|
387
|
+
|
|
388
|
+
for (const [key, fieldSchema] of Object.entries(properties)) {
|
|
389
|
+
if (key in data) {
|
|
390
|
+
const value = data[key];
|
|
391
|
+
const expectedType = fieldSchema.type;
|
|
392
|
+
if (expectedType) {
|
|
393
|
+
const actualType = Array.isArray(value) ? 'array' : typeof value;
|
|
394
|
+
if (actualType !== expectedType) {
|
|
395
|
+
errors.push(`Field "${key}": expected ${expectedType}, got ${actualType}`);
|
|
396
|
+
}
|
|
397
|
+
}
|
|
398
|
+
if (fieldSchema.enum && !fieldSchema.enum.includes(value)) {
|
|
399
|
+
errors.push(`Field "${key}": value "${value}" not in enum ${JSON.stringify(fieldSchema.enum)}`);
|
|
400
|
+
}
|
|
401
|
+
}
|
|
402
|
+
}
|
|
403
|
+
|
|
404
|
+
return { valid: errors.length === 0, errors };
|
|
405
|
+
}
|
|
406
|
+
|
|
407
|
+
/**
|
|
408
|
+
* Fallback structured extraction without LLM — keyword/regex matching for primitives
|
|
409
|
+
*/
|
|
410
|
+
fallbackStructuredExtraction(content, schema) {
|
|
411
|
+
const extracted = {};
|
|
412
|
+
const properties = schema.properties || {};
|
|
413
|
+
|
|
414
|
+
for (const [key, fieldSchema] of Object.entries(properties)) {
|
|
415
|
+
const keyPattern = new RegExp(key.replace(/_/g, '[\\s_-]'), 'i');
|
|
416
|
+
const lineMatch = content.split('\n').find(line => keyPattern.test(line));
|
|
417
|
+
|
|
418
|
+
if (lineMatch) {
|
|
419
|
+
const valueMatch = lineMatch.match(/:\s*(.+)$/);
|
|
420
|
+
const rawValue = valueMatch ? valueMatch[1].trim() : null;
|
|
421
|
+
|
|
422
|
+
if (rawValue) {
|
|
423
|
+
if (fieldSchema.type === 'number') {
|
|
424
|
+
const num = parseFloat(rawValue.replace(/[^0-9.-]/g, ''));
|
|
425
|
+
if (!isNaN(num)) extracted[key] = num;
|
|
426
|
+
} else if (fieldSchema.type === 'boolean') {
|
|
427
|
+
extracted[key] = /true|yes|1/i.test(rawValue);
|
|
428
|
+
} else {
|
|
429
|
+
extracted[key] = rawValue;
|
|
430
|
+
}
|
|
431
|
+
}
|
|
432
|
+
}
|
|
433
|
+
}
|
|
434
|
+
|
|
435
|
+
return {
|
|
436
|
+
data: extracted,
|
|
437
|
+
valid: false,
|
|
438
|
+
validationErrors: ['Used fallback extraction — no LLM provider available']
|
|
439
|
+
};
|
|
440
|
+
}
|
|
441
|
+
|
|
322
442
|
/**
|
|
323
443
|
* Fallback query expansion without LLM
|
|
324
444
|
*/
|
|
@@ -668,7 +668,7 @@ export class BrowserProcessor {
|
|
|
668
668
|
|
|
669
669
|
// Check for dynamic content indicators
|
|
670
670
|
const dynamicIndicators = document.querySelectorAll(
|
|
671
|
-
'[data-bind], [v-if], [v-for], [ng-if], [ng-repeat], [
|
|
671
|
+
'[data-bind], [v-if], [v-for], [ng-if], [ng-repeat], [ngFor], [ngIf]'
|
|
672
672
|
);
|
|
673
673
|
analysis.hasDynamicContent = dynamicIndicators.length > 0 || analysis.detectedFrameworks.length > 0;
|
|
674
674
|
|