crawlforge-mcp-server 3.0.12 → 3.0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@
6
6
  // Using native fetch (Node.js 18+)
7
7
  import fs from 'fs/promises';
8
8
  import path from 'path';
9
- import { isCreatorModeVerified } from '../../server.js';
9
+ import { isCreatorModeVerified } from './creatorMode.js';
10
10
 
11
11
  class AuthManager {
12
12
  constructor() {
@@ -284,7 +284,10 @@ class AuthManager {
284
284
  scrape_with_actions: 5,
285
285
  generate_llms_txt: 3,
286
286
  localization: 5,
287
- track_changes: 3
287
+ track_changes: 3,
288
+
289
+ // Phase 1: LLM-Powered Structured Extraction
290
+ extract_structured: 4
288
291
  };
289
292
 
290
293
  return costs[tool] || 1;
@@ -1113,7 +1113,7 @@ export class ChangeTracker extends EventEmitter {
1113
1113
  /**
1114
1114
  * Detect changes against the latest snapshot
1115
1115
  */
1116
- async detectChanges(url, currentContent) {
1116
+ async detectChangesFromSnapshot(url, currentContent) {
1117
1117
  // Validate URL format
1118
1118
  try {
1119
1119
  new URL(url);
@@ -462,11 +462,49 @@ export class ResearchOrchestrator extends EventEmitter {
462
462
  this.researchState.visitedUrls.add(source.link);
463
463
  this.metrics.urlsProcessed++;
464
464
 
465
- // Extract detailed content
466
- const contentData = await this.extractTool.execute({
467
- url: source.link,
468
- options: { includeMetadata: true, includeStructuredData: true }
469
- });
465
+ // Extract detailed content (with fallback to fetch_url + text extraction)
466
+ let contentData;
467
+ try {
468
+ contentData = await this.extractTool.execute({
469
+ url: source.link,
470
+ options: { includeMetadata: true, includeStructuredData: true }
471
+ });
472
+ } catch (extractError) {
473
+ this.logger.warn('Primary extraction failed, trying fallback', {
474
+ url: source.link,
475
+ error: extractError.message
476
+ });
477
+ // Fallback: use fetch + basic text extraction
478
+ try {
479
+ const fetchResponse = await fetch(source.link, {
480
+ headers: { 'User-Agent': 'CrawlForge-Research/1.0' },
481
+ signal: AbortSignal.timeout(10000)
482
+ });
483
+ if (fetchResponse.ok) {
484
+ const html = await fetchResponse.text();
485
+ // Strip HTML tags for basic text content
486
+ const textContent = html
487
+ .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
488
+ .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
489
+ .replace(/<[^>]+>/g, ' ')
490
+ .replace(/\s+/g, ' ')
491
+ .trim();
492
+ if (textContent.length > 50) {
493
+ contentData = {
494
+ content: textContent.slice(0, 5000),
495
+ metadata: { title: source.title || '' },
496
+ structuredData: {},
497
+ fallback: true
498
+ };
499
+ }
500
+ }
501
+ } catch (fallbackError) {
502
+ this.logger.warn('Fallback extraction also failed', {
503
+ url: source.link,
504
+ error: fallbackError.message
505
+ });
506
+ }
507
+ }
470
508
 
471
509
  if (contentData && contentData.content) {
472
510
  this.metrics.contentExtracted++;
@@ -7,6 +7,7 @@ import { SummarizerManager } from 'node-summarizer';
7
7
  import { franc } from 'franc';
8
8
  import nlp from 'compromise';
9
9
  import { z } from 'zod';
10
+ import { splitSentences } from './sentenceUtils.js';
10
11
 
11
12
  const ContentAnalyzerSchema = z.object({
12
13
  text: z.string().min(1),
@@ -290,11 +291,29 @@ export class ContentAnalyzer {
290
291
  });
291
292
 
292
293
  if (detected === 'und') {
293
- return null; // Undetermined language
294
+ // Fallback: check if text is predominantly ASCII Latin characters (likely English)
295
+ const latinChars = (text.match(/[a-zA-Z]/g) || []).length;
296
+ const totalChars = text.replace(/\s/g, '').length;
297
+ if (totalChars > 0 && latinChars / totalChars > 0.7) {
298
+ // Check for common English words as a heuristic
299
+ const lower = text.toLowerCase();
300
+ const englishMarkers = ['the ', 'is ', 'are ', 'was ', 'and ', 'for ', 'that ', 'with ', 'this ', 'from '];
301
+ const matchCount = englishMarkers.filter(w => lower.includes(w)).length;
302
+ if (matchCount >= 2) {
303
+ return {
304
+ code: 'eng',
305
+ name: 'English',
306
+ confidence: 0.6,
307
+ alternative: [],
308
+ detectionMethod: 'heuristic'
309
+ };
310
+ }
311
+ }
312
+ return null; // Truly undetermined language
294
313
  }
295
314
 
296
- // Get confidence score (simplified approach)
297
- const confidence = Math.min(1, text.length / 100 * 0.01 + 0.5);
315
+ // Get confidence score based on text length and detection certainty
316
+ const confidence = Math.min(1, 0.5 + (text.length / 500) * 0.5);
298
317
 
299
318
  // Get alternative languages using franc.all
300
319
  const alternatives = franc.all(text, {
@@ -329,7 +348,7 @@ export class ContentAnalyzer {
329
348
  */
330
349
  async summarizeText(text, options = {}) {
331
350
  try {
332
- const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 0);
351
+ const sentences = splitSentences(text);
333
352
 
334
353
  if (sentences.length < 3) {
335
354
  return {
@@ -364,7 +383,7 @@ export class ContentAnalyzer {
364
383
  if (options.summaryType === 'extractive') {
365
384
  // Use node-summarizer for extractive summarization
366
385
  const summary = await this.summarizer.getSummaryByRanking(text, targetSentences);
367
- summarySentences = summary.split(/[.!?]+/).filter(s => s.trim().length > 0);
386
+ summarySentences = splitSentences(summary);
368
387
  } else {
369
388
  // Simple abstractive approach (for demonstration)
370
389
  summarySentences = await this.createAbstractiveSummary(text, targetSentences);
@@ -385,7 +404,7 @@ export class ContentAnalyzer {
385
404
  console.warn('Text summarization failed:', error.message);
386
405
 
387
406
  // Fallback: return first few sentences
388
- const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 0);
407
+ const sentences = splitSentences(text);
389
408
  const fallbackSentences = sentences.slice(0, 2);
390
409
 
391
410
  return {
@@ -479,13 +498,45 @@ export class ContentAnalyzer {
479
498
  try {
480
499
  const doc = nlp(text);
481
500
 
501
+ const people = doc.people().out('array');
502
+ const places = doc.places().out('array');
503
+ const organizations = doc.organizations().out('array');
504
+ const dates = doc.dates().out('array');
505
+ const money = doc.money().out('array');
506
+ let other = doc.topics().out('array').slice(0, 10);
507
+
508
+ // Supplement with capitalized proper nouns that compromise may miss
509
+ // (technology names, product names, etc.)
510
+ const existingEntities = new Set([
511
+ ...people, ...places, ...organizations, ...other
512
+ ].map(e => e.toLowerCase()));
513
+
514
+ const properNouns = text.match(/\b[A-Z][a-zA-Z.]+(?:\s+[A-Z][a-zA-Z.]+)*/g) || [];
515
+ const supplemental = [...new Set(properNouns)]
516
+ .filter(n => !existingEntities.has(n.toLowerCase()) && n.length > 1)
517
+ .slice(0, 10);
518
+
519
+ if (supplemental.length > 0) {
520
+ other = [...other, ...supplemental].slice(0, 15);
521
+ }
522
+
523
+ const allEntities = [...people, ...places, ...organizations, ...dates, ...money, ...other];
524
+ const uniqueEntities = new Set(allEntities.map(e => e.toLowerCase()));
525
+
482
526
  return {
483
- people: doc.people().out('array'),
484
- places: doc.places().out('array'),
485
- organizations: doc.organizations().out('array'),
486
- dates: doc.dates().out('array'),
487
- money: doc.money().out('array'),
488
- other: doc.topics().out('array').slice(0, 10) // Limit other entities
527
+ people,
528
+ places,
529
+ organizations,
530
+ dates,
531
+ money,
532
+ other,
533
+ summary: {
534
+ totalEntities: allEntities.length,
535
+ uniqueEntities: uniqueEntities.size,
536
+ entityDensity: text.split(/\s+/).length > 0
537
+ ? uniqueEntities.size / text.split(/\s+/).length
538
+ : 0
539
+ }
489
540
  };
490
541
 
491
542
  } catch (error) {
@@ -496,7 +547,8 @@ export class ContentAnalyzer {
496
547
  organizations: [],
497
548
  dates: [],
498
549
  money: [],
499
- other: []
550
+ other: [],
551
+ summary: { totalEntities: 0, uniqueEntities: 0, entityDensity: 0 }
500
552
  };
501
553
  }
502
554
  }
@@ -521,10 +573,11 @@ export class ContentAnalyzer {
521
573
  const termTypes = {};
522
574
 
523
575
  [...nouns, ...verbs, ...adjectives].forEach(term => {
524
- const cleaned = term.toLowerCase().trim();
576
+ // Strip leading/trailing punctuation but preserve internal periods (e.g. Node.js)
577
+ const cleaned = term.toLowerCase().trim().replace(/^[^a-z0-9]+|[^a-z0-9.]+$/gi, '').replace(/\.+$/, '');
525
578
  if (cleaned.length > 2 && !this.isStopWord(cleaned)) {
526
579
  termFreq[cleaned] = (termFreq[cleaned] || 0) + 1;
527
-
580
+
528
581
  if (!termTypes[cleaned]) {
529
582
  if (nouns.includes(term)) termTypes[cleaned] = 'noun';
530
583
  else if (verbs.includes(term)) termTypes[cleaned] = 'verb';
@@ -561,7 +614,7 @@ export class ContentAnalyzer {
561
614
  */
562
615
  async calculateReadability(text) {
563
616
  try {
564
- const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 0);
617
+ const sentences = splitSentences(text);
565
618
  const words = text.split(/\s+/).filter(w => w.length > 0);
566
619
  const characters = text.length;
567
620
  const charactersNoSpaces = text.replace(/\s/g, '').length;
@@ -669,7 +722,7 @@ export class ContentAnalyzer {
669
722
  const characters = text.length;
670
723
  const charactersNoSpaces = text.replace(/\s/g, '').length;
671
724
  const words = text.split(/\s+/).filter(w => w.length > 0);
672
- const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 0);
725
+ const sentences = splitSentences(text);
673
726
  const paragraphs = text.split(/\n\s*\n/).filter(p => p.trim().length > 0);
674
727
 
675
728
  // Estimate reading time (average 200 words per minute)
@@ -0,0 +1,73 @@
/**
 * Sentence splitting utility that handles abbreviations, decimal numbers,
 * domain names, and other common patterns that contain periods.
 */

// Common abbreviations that should not trigger sentence splits.
// Entries are lowercase with all non-letters removed, matching the
// normalisation applied to the word in front of a period.
const ABBREVIATIONS = new Set([
  'mr', 'mrs', 'ms', 'dr', 'prof', 'sr', 'jr', 'st', 'ave', 'blvd',
  'vs', 'etc', 'inc', 'ltd', 'corp', 'dept', 'univ', 'assn',
  'approx', 'appt', 'apt', 'est', 'min', 'max',
  'govt', 'lib', 'misc', 'natl', 'intl',
  'jan', 'feb', 'mar', 'apr', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
  'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun',
  'fig', 'eq', 'ref', 'vol', 'no', 'pp', 'ed', 'rev',
  'e', 'i', // for e.g. and i.e.
]);

/**
 * Decide whether the period that ends `chunk` is part of the preceding word
 * (abbreviation, dotted term, initial) rather than the end of a sentence.
 * @param {string} chunk - Text that ends with a period (optionally followed by whitespace)
 * @returns {boolean} - True when the period should NOT end the sentence
 */
function endsWithNonTerminalPeriod(chunk) {
  const beforePeriod = chunk.replace(/\.\s*$/, '');
  const lastWord = beforePeriod.split(/\s+/).pop() || '';
  const lastWordLetters = lastWord.toLowerCase().replace(/[^a-z]/g, '');

  // Known abbreviation such as "Dr." or "etc."
  if (ABBREVIATIONS.has(lastWordLetters)) return true;

  // Words with internal periods: e.g., i.e., U.S., Node.js, 3.14, v2.0
  if (/\w\.\w/.test(lastWord)) return true;

  // Single capital letter (initials like "A. Smith"). The trailing period has
  // already been stripped from lastWord, so the pattern must not expect it.
  return /^[A-Z]$/.test(lastWord);
}

/**
 * Split text into sentences, handling abbreviations and technical terms.
 *
 * Candidate boundaries are runs of whitespace that follow `.`, `!` or `?`.
 * `!` and `?` always end a sentence; a `.` ends a sentence unless the word in
 * front of it looks like an abbreviation, a dotted term, or an initial.
 * Chunks that are joined back together are joined with a single space.
 * Whitespace-only input yields `['']`.
 *
 * @param {string} text - Text to split
 * @returns {string[]} - Array of sentence strings (empty array for empty or non-string input)
 */
export function splitSentences(text) {
  if (!text || typeof text !== 'string') return [];

  const sentences = [];
  let current = '';

  const tokens = text.split(/(?<=[.!?])\s+/);

  for (const token of tokens) {
    const combined = current ? `${current} ${token}` : token;
    const terminator = /([.!?])\s*$/.exec(combined)?.[1];

    // No terminator yet, or a period that belongs to the word: keep accumulating.
    // The word-level exemptions only make sense for periods, never for ! or ?.
    if (terminator === undefined || (terminator === '.' && endsWithNonTerminalPeriod(combined))) {
      current = combined;
      continue;
    }

    // Real sentence boundary
    const trimmed = combined.trim();
    if (trimmed.length > 0) {
      sentences.push(trimmed);
    }
    current = '';
  }

  // Don't forget the last chunk
  if (current.trim().length > 0) {
    sentences.push(current.trim());
  }

  return sentences.length > 0 ? sentences : [text.trim()];
}
@@ -0,0 +1,47 @@
1
+ /**
2
+ * Creator Mode Authentication
3
+ * Extracted from server.js to allow tool classes to be imported independently
4
+ * without triggering the full MCP server startup sequence.
5
+ *
6
+ * SECURITY: The creator secret hash is safe to commit — one-way SHA-256.
7
+ * The actual secret is never stored. Only the package maintainer has it.
8
+ */
9
+
10
+ import crypto from 'crypto';
11
+ import dotenv from 'dotenv';
12
+
// Read .env before anything else so a creator secret defined there is visible below
dotenv.config({ path: '.env', quiet: true });

// SECURITY: Drop any externally-set creator mode env var so it cannot be used as a bypass
delete process.env.CRAWLFORGE_CREATOR_MODE;

// SHA-256 digest (hex) of the creator secret; the secret itself is not stored here
const CREATOR_SECRET_HASH = 'cfef62e5068d48e7dd6a39c9e16f0be2615510c6b68274fc8abe3156feb5050b';

// Module-scoped result of the one-time check below; only readable through the exported getter
let creatorModeVerified = false;

const suppliedSecret = process.env.CRAWLFORGE_CREATOR_SECRET;

if (suppliedSecret) {
  const expectedDigest = Buffer.from(CREATOR_SECRET_HASH, 'hex');
  const suppliedDigest = crypto.createHash('sha256').update(suppliedSecret).digest();

  // Both buffers are SHA-256 digests of equal length, as timingSafeEqual requires
  creatorModeVerified = crypto.timingSafeEqual(suppliedDigest, expectedDigest);

  if (creatorModeVerified) {
    console.log('Creator Mode Enabled - Unlimited Access');
  } else {
    console.warn('Invalid creator secret provided');
  }

  // Remove the secret from the environment once it has been checked
  delete process.env.CRAWLFORGE_CREATOR_SECRET;
}

/**
 * Report the outcome of the creator-secret check performed when this module loaded.
 * @returns {boolean} True only if CRAWLFORGE_CREATOR_SECRET hashed to the expected digest.
 */
export function isCreatorModeVerified() {
  return creatorModeVerified;
}
@@ -319,6 +319,126 @@ Synthesize these findings into a comprehensive analysis:`;
319
319
  }
320
320
  }
321
321
 
322
+ /**
323
+ * Extract structured data from content using LLM and a JSON Schema
324
+ * Follows the same pattern as analyzeRelevance()
325
+ */
326
+ async extractStructured(content, schema, options = {}) {
327
+ const { maxContentLength = 6000, prompt: userPrompt = '', maxTokens = 1000 } = options;
328
+
329
+ const truncatedContent = content.length > maxContentLength
330
+ ? content.substring(0, maxContentLength) + '...'
331
+ : content;
332
+
333
+ // Scale maxTokens with schema complexity
334
+ const schemaFields = Object.keys(schema.properties || {}).length;
335
+ const scaledTokens = Math.min(2000, Math.max(maxTokens, schemaFields * 100 + 500));
336
+
337
+ const systemPrompt = `You are a structured data extraction expert. Extract data from the provided content and return ONLY valid JSON that conforms to the given JSON Schema. Do not include any explanation or markdown — only the raw JSON object.`;
338
+
339
+ const schemaStr = JSON.stringify(schema, null, 2);
340
+ const guidance = userPrompt ? `\n\nExtraction guidance: ${userPrompt}` : '';
341
+
342
+ const extractionPrompt = `JSON Schema to extract:
343
+ ${schemaStr}${guidance}
344
+
345
+ Content to extract from:
346
+ ${truncatedContent}
347
+
348
+ Extract the data and return valid JSON:`;
349
+
350
+ try {
351
+ const response = await this.generateCompletion(extractionPrompt, {
352
+ systemPrompt,
353
+ maxTokens: scaledTokens,
354
+ temperature: 0.1
355
+ });
356
+
357
+ // Strip markdown code fences if present
358
+ const cleaned = response.replace(/^```(?:json)?\n?/, '').replace(/\n?```$/, '').trim();
359
+ const parsed = JSON.parse(cleaned);
360
+
361
+ // Lightweight validation
362
+ const validation = this.validateAgainstSchema(parsed, schema);
363
+ return {
364
+ data: parsed,
365
+ valid: validation.valid,
366
+ validationErrors: validation.errors
367
+ };
368
+ } catch (error) {
369
+ this.logger.warn('LLM structured extraction failed, using fallback', { error: error.message });
370
+ return this.fallbackStructuredExtraction(content, schema);
371
+ }
372
+ }
373
+
374
+ /**
375
+ * Validate a parsed object against a simple JSON Schema
376
+ */
377
+ validateAgainstSchema(data, schema) {
378
+ const errors = [];
379
+ const properties = schema.properties || {};
380
+ const required = schema.required || [];
381
+
382
+ for (const field of required) {
383
+ if (!(field in data)) {
384
+ errors.push(`Missing required field: ${field}`);
385
+ }
386
+ }
387
+
388
+ for (const [key, fieldSchema] of Object.entries(properties)) {
389
+ if (key in data) {
390
+ const value = data[key];
391
+ const expectedType = fieldSchema.type;
392
+ if (expectedType) {
393
+ const actualType = Array.isArray(value) ? 'array' : typeof value;
394
+ if (actualType !== expectedType) {
395
+ errors.push(`Field "${key}": expected ${expectedType}, got ${actualType}`);
396
+ }
397
+ }
398
+ if (fieldSchema.enum && !fieldSchema.enum.includes(value)) {
399
+ errors.push(`Field "${key}": value "${value}" not in enum ${JSON.stringify(fieldSchema.enum)}`);
400
+ }
401
+ }
402
+ }
403
+
404
+ return { valid: errors.length === 0, errors };
405
+ }
406
+
407
+ /**
408
+ * Fallback structured extraction without LLM — keyword/regex matching for primitives
409
+ */
410
+ fallbackStructuredExtraction(content, schema) {
411
+ const extracted = {};
412
+ const properties = schema.properties || {};
413
+
414
+ for (const [key, fieldSchema] of Object.entries(properties)) {
415
+ const keyPattern = new RegExp(key.replace(/_/g, '[\\s_-]'), 'i');
416
+ const lineMatch = content.split('\n').find(line => keyPattern.test(line));
417
+
418
+ if (lineMatch) {
419
+ const valueMatch = lineMatch.match(/:\s*(.+)$/);
420
+ const rawValue = valueMatch ? valueMatch[1].trim() : null;
421
+
422
+ if (rawValue) {
423
+ if (fieldSchema.type === 'number') {
424
+ const num = parseFloat(rawValue.replace(/[^0-9.-]/g, ''));
425
+ if (!isNaN(num)) extracted[key] = num;
426
+ } else if (fieldSchema.type === 'boolean') {
427
+ extracted[key] = /true|yes|1/i.test(rawValue);
428
+ } else {
429
+ extracted[key] = rawValue;
430
+ }
431
+ }
432
+ }
433
+ }
434
+
435
+ return {
436
+ data: extracted,
437
+ valid: false,
438
+ validationErrors: ['Used fallback extraction — no LLM provider available']
439
+ };
440
+ }
441
+
322
442
  /**
323
443
  * Fallback query expansion without LLM
324
444
  */
@@ -668,7 +668,7 @@ export class BrowserProcessor {
668
668
 
669
669
  // Check for dynamic content indicators
670
670
  const dynamicIndicators = document.querySelectorAll(
671
- '[data-bind], [v-if], [v-for], [ng-if], [ng-repeat], [*ngFor], [*ngIf]'
671
+ '[data-bind], [v-if], [v-for], [ng-if], [ng-repeat], [ngFor], [ngIf]'
672
672
  );
673
673
  analysis.hasDynamicContent = dynamicIndicators.length > 0 || analysis.detectedFrameworks.length > 0;
674
674