crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,749 @@
1
+ /**
2
+ * ContentAnalyzer - Content analysis with summarization, language detection, and topic identification
3
+ * Uses multiple NLP libraries for comprehensive content analysis
4
+ */
5
+
6
import { SummarizerManager } from 'node-summarizer';
import { franc, francAll } from 'franc';
import nlp from 'compromise';
import { z } from 'zod';
10
+
11
// Per-request switches and limits; every field falls back to its default when omitted.
const AnalysisOptionsSchema = z.object({
  summarize: z.boolean().default(true),
  detectLanguage: z.boolean().default(true),
  extractTopics: z.boolean().default(true),
  extractEntities: z.boolean().default(true),
  extractKeywords: z.boolean().default(true),
  summaryLength: z.enum(['short', 'medium', 'long']).default('medium'),
  summaryType: z.enum(['extractive', 'abstractive']).default('extractive'),
  minConfidence: z.number().min(0).max(1).default(0.1),
  maxTopics: z.number().min(1).max(20).default(10),
  maxKeywords: z.number().min(1).max(50).default(15),
  includeReadabilityMetrics: z.boolean().default(true),
  includeSentiment: z.boolean().default(true)
});

// Input contract for ContentAnalyzer.analyzeContent: non-empty text plus optional options.
const ContentAnalyzerSchema = z.object({
  text: z.string().min(1),
  options: AnalysisOptionsSchema.optional().default({})
});
28
+
29
// One language guess: code, display name and a confidence value.
const LanguageGuessSchema = z.object({
  code: z.string(),
  name: z.string(),
  confidence: z.number()
});

// Primary language guess together with its runner-up guesses.
const LanguageResultSchema = LanguageGuessSchema.extend({
  alternative: z.array(LanguageGuessSchema)
});

const SummaryResultSchema = z.object({
  type: z.string(),
  length: z.string(),
  sentences: z.array(z.string()),
  text: z.string(),
  compressionRatio: z.number()
});

const TopicSchema = z.object({
  topic: z.string(),
  confidence: z.number(),
  keywords: z.array(z.string())
});

const EntitiesSchema = z.object({
  people: z.array(z.string()),
  places: z.array(z.string()),
  organizations: z.array(z.string()),
  dates: z.array(z.string()),
  money: z.array(z.string()),
  other: z.array(z.string())
});

const KeywordSchema = z.object({
  keyword: z.string(),
  frequency: z.number(),
  relevance: z.number(),
  type: z.string()
});

const ReadabilitySchema = z.object({
  score: z.number(),
  level: z.string(),
  metrics: z.object({
    sentences: z.number(),
    words: z.number(),
    characters: z.number(),
    avgWordsPerSentence: z.number(),
    avgCharsPerWord: z.number(),
    complexWords: z.number(),
    syllables: z.number()
  })
});

const SentimentSchema = z.object({
  polarity: z.number(),
  subjectivity: z.number(),
  label: z.string(),
  confidence: z.number()
});

const StatisticsSchema = z.object({
  characters: z.number(),
  charactersNoSpaces: z.number(),
  words: z.number(),
  sentences: z.number(),
  paragraphs: z.number(),
  readingTime: z.number()
});

// Shape of an analysis result: each analysis section is optional,
// while statistics and the timing fields are always required.
const AnalysisResult = z.object({
  text: z.string(),
  language: LanguageResultSchema.optional(),
  summary: SummaryResultSchema.optional(),
  topics: z.array(TopicSchema).optional(),
  entities: EntitiesSchema.optional(),
  keywords: z.array(KeywordSchema).optional(),
  readability: ReadabilitySchema.optional(),
  sentiment: SentimentSchema.optional(),
  statistics: StatisticsSchema,
  analyzedAt: z.string(),
  processingTime: z.number()
});
97
+
98
+ // Language code to name mapping
99
// Maps language codes to display names. The keys double as the allow-list
// handed to the language detector, so a language whose code is missing here
// can never be reported.
const LANGUAGE_NAMES = {
  'eng': 'English',
  'spa': 'Spanish',
  'fra': 'French',
  'deu': 'German',
  'ita': 'Italian',
  'por': 'Portuguese',
  'rus': 'Russian',
  'jpn': 'Japanese',
  'kor': 'Korean',
  'chi': 'Chinese',
  'ara': 'Arabic',
  'hin': 'Hindi',
  'nld': 'Dutch',
  'swe': 'Swedish',
  'nor': 'Norwegian',
  'dan': 'Danish',
  'fin': 'Finnish',
  'pol': 'Polish',
  'ces': 'Czech',
  'hun': 'Hungarian',
  'tur': 'Turkish',
  'gre': 'Greek',
  'heb': 'Hebrew',
  'tha': 'Thai',
  'vie': 'Vietnamese',
  'ind': 'Indonesian',
  'msa': 'Malay',
  'tgl': 'Tagalog',
  'ukr': 'Ukrainian',
  'bul': 'Bulgarian',
  'hrv': 'Croatian',
  'slv': 'Slovenian',
  'ron': 'Romanian',
  'lit': 'Lithuanian',
  'lav': 'Latvian',
  'est': 'Estonian',
  'slk': 'Slovak',
  'cat': 'Catalan',
  'eus': 'Basque',
  'glg': 'Galician',
  'gle': 'Irish',
  'cym': 'Welsh',
  'isl': 'Icelandic',
  'mlt': 'Maltese',
  'sqi': 'Albanian',
  'mkd': 'Macedonian',
  'srp': 'Serbian',
  'bos': 'Bosnian',
  'mon': 'Mongolian',
  'uzb': 'Uzbek',
  'kaz': 'Kazakh',
  'aze': 'Azerbaijani',
  'geo': 'Georgian',
  'arm': 'Armenian',
  'fas': 'Persian',
  'urd': 'Urdu',
  'ben': 'Bengali',
  'tam': 'Tamil',
  'tel': 'Telugu',
  'kan': 'Kannada',
  'mal': 'Malayalam',
  'guj': 'Gujarati',
  'pan': 'Punjabi',
  'ori': 'Odia',
  'mar': 'Marathi',
  'nep': 'Nepali',
  'sin': 'Sinhala',
  'mya': 'Burmese',
  'khm': 'Khmer',
  'lao': 'Lao',
  'amh': 'Amharic',
  'som': 'Somali',
  'swa': 'Swahili',
  'hau': 'Hausa',
  'yor': 'Yoruba',
  'ibo': 'Igbo',
  'afr': 'Afrikaans',

  // ISO 639-3 individual-language codes. Several keys above are ISO 639-2/B
  // spellings ('chi', 'gre', 'geo', 'arm') or macrolanguage codes ('ara',
  // 'fas', 'swa', 'nor', ...); they are kept for backward compatibility, but a
  // detector that reports ISO 639-3 individual languages needs the codes below.
  // NOTE(review): confirm each against the installed franc build's
  // supported-language list.
  'cmn': 'Chinese',
  'arb': 'Arabic',
  'ell': 'Greek',
  'kat': 'Georgian',
  'hye': 'Armenian',
  'pes': 'Persian',
  'swh': 'Swahili',
  'nob': 'Norwegian Bokmål',
  'nno': 'Norwegian Nynorsk',
  'zlm': 'Malay',
  'uzn': 'Uzbek',
  'azj': 'Azerbaijani',
  'npi': 'Nepali',
  'ekk': 'Estonian',
  'lvs': 'Latvian',
  'als': 'Albanian',
  'khk': 'Mongolian',
  'ory': 'Odia'
};
178
+
179
// Option values applied whenever a caller omits them, including when the
// individual analysis methods are called directly rather than via analyzeContent.
const DEFAULT_ANALYSIS_OPTIONS = Object.freeze({
  summarize: true,
  detectLanguage: true,
  extractTopics: true,
  extractEntities: true,
  extractKeywords: true,
  summaryLength: 'medium',
  summaryType: 'extractive',
  minConfidence: 0.1,
  maxTopics: 10,
  maxKeywords: 15,
  includeReadabilityMetrics: true,
  includeSentiment: true
});

// Built once at module load instead of on every call.
const STOP_WORDS = new Set([
  'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
  'from', 'as', 'is', 'was', 'are', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
  'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can',
  'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me',
  'him', 'her', 'us', 'them', 'my', 'your', 'his', 'its', 'our', 'their'
]);

const POSITIVE_WORDS = new Set([
  'good', 'great', 'excellent', 'amazing', 'wonderful', 'fantastic', 'awesome', 'perfect',
  'love', 'like', 'happy', 'pleased', 'satisfied'
]);

const NEGATIVE_WORDS = new Set([
  'bad', 'terrible', 'awful', 'horrible', 'hate', 'dislike', 'angry', 'sad', 'disappointed',
  'frustrated', 'annoyed', 'upset'
]);

/** Round to two decimal places. */
const round2 = (value) => Math.round(value * 100) / 100;

/** Split text on runs of `.`, `!` or `?` and return the trimmed, non-empty pieces. */
function splitSentences(text) {
  return text
    .split(/[.!?]+/)
    .map((part) => part.trim())
    .filter((part) => part.length > 0);
}

/** Add one to the count stored under `key` in a Map-based frequency table. */
function increment(counts, key) {
  counts.set(key, (counts.get(key) ?? 0) + 1);
}

export class ContentAnalyzer {
  constructor() {
    // Kept so existing code that reads `analyzer.summarizer` keeps working;
    // summarizeText builds its own per-request SummarizerManager.
    this.summarizer = new SummarizerManager();
    this.defaultOptions = { ...DEFAULT_ANALYSIS_OPTIONS };
  }

  /**
   * Merge caller options over the instance defaults and the built-in defaults.
   * @param {Object} [options] - Partial analysis options
   * @returns {Object} - Complete analysis options
   */
  resolveOptions(options = {}) {
    return { ...DEFAULT_ANALYSIS_OPTIONS, ...this.defaultOptions, ...options };
  }

  /**
   * Analyze text content with multiple NLP techniques
   *
   * Never rejects: when validation or an analysis step throws, the returned
   * object carries an `error` message and zeroed statistics instead.
   *
   * @param {Object} params - Analysis parameters
   * @param {string} params.text - Text to analyze
   * @param {Object} params.options - Analysis options
   * @returns {Promise<Object>} - Analysis results
   */
  async analyzeContent(params) {
    const startTime = Date.now();

    try {
      const { text, options } = ContentAnalyzerSchema.parse(params);
      const analysisOptions = this.resolveOptions(options);

      const result = {
        text: text.substring(0, 1000), // Store truncated text for reference
        analyzedAt: new Date().toISOString(),
        processingTime: 0
      };

      result.statistics = this.calculateStatistics(text);

      if (analysisOptions.detectLanguage) {
        result.language = await this.detectLanguage(text, analysisOptions);
      }

      if (analysisOptions.summarize) {
        result.summary = await this.summarizeText(text, analysisOptions);
      }

      if (analysisOptions.extractTopics) {
        result.topics = await this.extractTopics(text, analysisOptions);
      }

      if (analysisOptions.extractEntities) {
        result.entities = await this.extractEntities(text, analysisOptions);
      }

      if (analysisOptions.extractKeywords) {
        result.keywords = await this.extractKeywords(text, analysisOptions);
      }

      if (analysisOptions.includeReadabilityMetrics) {
        result.readability = await this.calculateReadability(text);
      }

      if (analysisOptions.includeSentiment) {
        result.sentiment = await this.analyzeSentiment(text);
      }

      result.processingTime = Date.now() - startTime;
      return result;

    } catch (error) {
      // `params` may be null/undefined and `params.text` may not be a string
      // (that is often why validation failed), so read it defensively; the
      // error handler itself must not throw.
      const rawText = params?.text;
      const preview = typeof rawText === 'string' && rawText.length > 0
        ? rawText.substring(0, 100)
        : 'unknown';

      return {
        text: preview,
        analyzedAt: new Date().toISOString(),
        processingTime: Date.now() - startTime,
        error: `Content analysis failed: ${error.message}`,
        statistics: {
          characters: 0,
          charactersNoSpaces: 0,
          words: 0,
          sentences: 0,
          paragraphs: 0,
          readingTime: 0
        }
      };
    }
  }

  /**
   * Detect language using franc library
   *
   * Uses a single `francAll` call: the first ranked entry is the detected
   * language and the next three are reported as alternatives.
   *
   * @param {string} text - Text to analyze
   * @param {Object} options - Detection options (currently unused)
   * @returns {Promise<Object|null>} - Language detection result, or null when
   *   the language is undetermined or detection fails
   */
  async detectLanguage(text, options = {}) {
    try {
      // `only` restricts detection to the languages we can name.
      const ranked = francAll(text, {
        minLength: 10,
        only: Object.keys(LANGUAGE_NAMES)
      });

      const [detected] = ranked[0] ?? ['und'];
      if (detected === 'und') {
        return null; // Undetermined language
      }

      // Heuristic only: grows slowly with text length, starting at 0.5.
      // It is not derived from the detector's scores.
      const confidence = Math.min(1, text.length / 100 * 0.01 + 0.5);

      // francAll scores are "1 = best match", so the score itself is the
      // confidence of each runner-up.
      const alternatives = ranked
        .slice(1, 4) // Top 3 alternatives
        .map(([code, score]) => ({
          code,
          name: LANGUAGE_NAMES[code] || code,
          confidence: round2(score)
        }));

      return {
        code: detected,
        name: LANGUAGE_NAMES[detected] || detected,
        confidence: round2(confidence),
        alternative: alternatives
      };

    } catch (error) {
      console.warn('Language detection failed:', error.message);
      return null;
    }
  }

  /**
   * Summarize text content
   *
   * Texts with fewer than three sentences are returned unchanged. If the
   * summarizer fails, the first two sentences are returned with type 'fallback'.
   *
   * @param {string} text - Text to summarize
   * @param {Object} options - Summarization options (summaryType, summaryLength)
   * @returns {Promise<Object>} - Summarization result
   */
  async summarizeText(text, options = {}) {
    const { summaryType, summaryLength } = this.resolveOptions(options);
    const source = typeof text === 'string' ? text : String(text ?? '');

    try {
      const sentences = splitSentences(source);

      if (sentences.length < 3) {
        return {
          type: summaryType,
          length: summaryLength,
          sentences,
          text: source.trim(),
          compressionRatio: 1.0
        };
      }

      // Determine number of sentences based on length preference
      let targetSentences;
      switch (summaryLength) {
        case 'short':
          targetSentences = Math.max(1, Math.ceil(sentences.length * 0.1));
          break;
        case 'long':
          targetSentences = Math.max(3, Math.ceil(sentences.length * 0.5));
          break;
        case 'medium':
        default:
          targetSentences = Math.max(2, Math.ceil(sentences.length * 0.3));
      }
      targetSentences = Math.min(targetSentences, sentences.length);

      let picked;
      if (summaryType === 'extractive') {
        // node-summarizer takes the text and sentence count in the constructor
        // and resolves getSummaryByRank() to an object with a `summary` string.
        const manager = new SummarizerManager(source, targetSentences);
        const ranked = await manager.getSummaryByRank();
        const rankedText = typeof ranked === 'string' ? ranked : ranked?.summary;
        if (typeof rankedText !== 'string' || rankedText.trim().length === 0) {
          throw new Error('summarizer returned no summary');
        }
        picked = splitSentences(rankedText);
      } else {
        // Simple abstractive approach (for demonstration)
        picked = await this.createAbstractiveSummary(source, targetSentences);
      }

      // Sentences may still carry their own terminal punctuation; strip it so
      // joining with '. ' does not produce "..".
      const summarySentences = picked
        .map((sentence) => sentence.trim().replace(/[.!?]+$/, '').trim())
        .filter((sentence) => sentence.length > 0);

      const summaryText = `${summarySentences.join('. ')}.`;

      return {
        type: summaryType,
        length: summaryLength,
        sentences: summarySentences,
        text: summaryText,
        compressionRatio: round2(summaryText.length / source.length)
      };

    } catch (error) {
      console.warn('Text summarization failed:', error.message);

      // Fallback: return first few sentences
      const fallbackSentences = splitSentences(source).slice(0, 2);
      const fallbackText = `${fallbackSentences.join('. ')}.`;

      return {
        type: 'fallback',
        length: 'short',
        sentences: fallbackSentences,
        text: fallbackText,
        compressionRatio: round2(fallbackText.length / Math.max(source.length, 1))
      };
    }
  }

  /**
   * Create abstractive summary (simplified approach)
   *
   * Despite the name this selects existing sentences: each sentence is scored
   * by its noun, verb and adjective counts and the highest-scoring ones are
   * returned in score order (not document order).
   *
   * @param {string} text - Text to summarize
   * @param {number} targetSentences - Target number of sentences
   * @returns {Promise<Array>} - Array of summary sentences
   */
  async createAbstractiveSummary(text, targetSentences) {
    const sentences = nlp(text).sentences().out('array');

    const scored = sentences.map((sentence) => {
      const parsed = nlp(sentence);
      const score = parsed.nouns().length
        + parsed.verbs().length * 0.8
        + parsed.adjectives().length * 0.5;
      return { sentence, score };
    });

    return scored
      .sort((a, b) => b.score - a.score)
      .slice(0, targetSentences)
      .map((item) => item.sentence.trim());
  }

  /**
   * Extract topics from text
   *
   * Candidate topics are noun phrases and adjective+noun phrases; confidence
   * is the phrase's share of all candidate phrases.
   *
   * @param {string} text - Text to analyze
   * @param {Object} options - Extraction options (minConfidence, maxTopics)
   * @returns {Promise<Array>} - Array of topics, empty on failure
   */
  async extractTopics(text, options = {}) {
    const { minConfidence, maxTopics } = this.resolveOptions(options);

    try {
      const doc = nlp(text);
      const phrases = [
        ...doc.nouns().out('array'),
        ...doc.match('#Adjective+ #Noun+').out('array')
      ];

      // A Map (not a plain object) so phrases such as "constructor" cannot
      // collide with Object.prototype members.
      const counts = new Map();
      for (const phrase of phrases) {
        const cleaned = phrase.toLowerCase().trim();
        if (cleaned.length > 2) {
          increment(counts, cleaned);
        }
      }

      const denominator = Math.max(phrases.length, 1);

      return [...counts]
        .map(([topic, frequency]) => ({
          topic,
          confidence: Math.min(1, frequency / denominator),
          keywords: topic.split(' ').filter((word) => word.length > 2)
        }))
        .filter((candidate) => candidate.confidence >= minConfidence)
        .sort((a, b) => b.confidence - a.confidence)
        .slice(0, maxTopics);

    } catch (error) {
      console.warn('Topic extraction failed:', error.message);
      return [];
    }
  }

  /**
   * Extract named entities from text
   * @param {string} text - Text to analyze
   * @param {Object} options - Extraction options (currently unused)
   * @returns {Promise<Object>} - Named entities by category
   */
  async extractEntities(text, options = {}) {
    try {
      const doc = nlp(text);

      // Guard each selector so that one unavailable selector leaves its own
      // category empty instead of discarding every category.
      // NOTE(review): some selectors (e.g. dates) may come from optional
      // compromise plugins depending on the installed version - confirm.
      const collect = (selector) => (
        typeof doc[selector] === 'function' ? doc[selector]().out('array') : []
      );

      return {
        people: collect('people'),
        places: collect('places'),
        organizations: collect('organizations'),
        dates: collect('dates'),
        money: collect('money'),
        other: collect('topics').slice(0, 10) // Limit other entities
      };

    } catch (error) {
      console.warn('Entity extraction failed:', error.message);
      return {
        people: [],
        places: [],
        organizations: [],
        dates: [],
        money: [],
        other: []
      };
    }
  }

  /**
   * Extract keywords from text
   *
   * Counts nouns, verbs and adjectives (minus stop words and terms of two
   * characters or fewer); relevance is the term's share of all counted terms.
   *
   * @param {string} text - Text to analyze
   * @param {Object} options - Extraction options (maxKeywords)
   * @returns {Promise<Array>} - Array of keywords with metadata, empty on failure
   */
  async extractKeywords(text, options = {}) {
    const { maxKeywords } = this.resolveOptions(options);

    try {
      const doc = nlp(text);
      const groups = [
        ['noun', doc.nouns().out('array')],
        ['verb', doc.verbs().out('array')],
        ['adjective', doc.adjectives().out('array')]
      ];

      const counts = new Map();
      const types = new Map();
      let totalTerms = 0;

      // Walking the groups in noun -> verb -> adjective order lets the first
      // sighting of a term fix its type without rescanning the arrays.
      for (const [type, terms] of groups) {
        for (const term of terms) {
          const cleaned = term.toLowerCase().trim();
          if (cleaned.length <= 2 || this.isStopWord(cleaned)) {
            continue;
          }
          increment(counts, cleaned);
          totalTerms += 1;
          if (!types.has(cleaned)) {
            types.set(cleaned, type);
          }
        }
      }

      return [...counts]
        .map(([keyword, frequency]) => ({
          keyword,
          frequency,
          relevance: frequency / totalTerms,
          type: types.get(keyword) ?? 'unknown'
        }))
        .sort((a, b) => b.relevance - a.relevance)
        .slice(0, maxKeywords);

    } catch (error) {
      console.warn('Keyword extraction failed:', error.message);
      return [];
    }
  }

  /**
   * Calculate readability metrics
   *
   * The score is the Flesch Reading Ease value clamped to 0-100; the level is
   * derived from the unclamped value.
   *
   * @param {string} text - Text to analyze
   * @returns {Promise<Object|null>} - Readability metrics, or null on failure
   */
  async calculateReadability(text) {
    try {
      const sentenceCount = splitSentences(text).length;
      const words = text.split(/\s+/).filter((word) => word.length > 0);
      const characters = text.length;
      const charactersNoSpaces = text.replace(/\s/g, '').length;

      let totalSyllables = 0;
      let complexWords = 0;
      for (const word of words) {
        const syllables = this.countSyllables(word);
        totalSyllables += syllables;
        if (syllables >= 3) {
          complexWords += 1;
        }
      }

      // Math.max(..., 1) keeps the averages finite for empty input.
      const wordCount = Math.max(words.length, 1);
      const avgWordsPerSentence = words.length / Math.max(sentenceCount, 1);
      const avgCharsPerWord = charactersNoSpaces / wordCount;
      const avgSyllablesPerWord = totalSyllables / wordCount;

      // Flesch Reading Ease Score
      const fleschScore = 206.835 - (1.015 * avgWordsPerSentence) - (84.6 * avgSyllablesPerWord);

      return {
        score: round2(Math.max(0, Math.min(100, fleschScore))),
        level: this.getReadabilityLevel(fleschScore),
        metrics: {
          sentences: sentenceCount,
          words: words.length,
          characters,
          avgWordsPerSentence: round2(avgWordsPerSentence),
          avgCharsPerWord: round2(avgCharsPerWord),
          complexWords,
          syllables: totalSyllables
        }
      };

    } catch (error) {
      console.warn('Readability calculation failed:', error.message);
      return null;
    }
  }

  /**
   * Analyze sentiment of text
   *
   * Lexicon-based: counts hits against small positive/negative word lists.
   * This is basic - for production use a dedicated sentiment library.
   *
   * @param {string} text - Text to analyze
   * @returns {Promise<Object|null>} - Sentiment analysis result, or null on failure
   */
  async analyzeSentiment(text) {
    try {
      // Strip punctuation at the term edges defensively so "good." or
      // "(great" still match the lexicon if the tokenizer leaves it attached.
      const words = nlp(text)
        .terms()
        .out('array')
        .map((term) => term.toLowerCase().replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu, ''));

      let positiveCount = 0;
      let negativeCount = 0;
      for (const word of words) {
        if (POSITIVE_WORDS.has(word)) positiveCount += 1;
        if (NEGATIVE_WORDS.has(word)) negativeCount += 1;
      }

      const totalSentimentWords = positiveCount + negativeCount;
      if (totalSentimentWords === 0) {
        return {
          polarity: 0,
          subjectivity: 0,
          label: 'neutral',
          confidence: 0.5
        };
      }

      const wordCount = Math.max(words.length, 1);
      const polarity = (positiveCount - negativeCount) / wordCount;
      const subjectivity = totalSentimentWords / wordCount;

      let label = 'neutral';
      if (polarity > 0.1) label = 'positive';
      else if (polarity < -0.1) label = 'negative';

      // Confidence saturates once ten sentiment-bearing words were seen.
      const confidence = Math.min(1, totalSentimentWords / 10);

      return {
        polarity: round2(polarity),
        subjectivity: round2(subjectivity),
        label,
        confidence: round2(confidence)
      };

    } catch (error) {
      console.warn('Sentiment analysis failed:', error.message);
      return null;
    }
  }

  /**
   * Calculate basic text statistics
   * @param {string} text - Text to analyze
   * @returns {Object} - Text statistics
   */
  calculateStatistics(text) {
    const words = text.split(/\s+/).filter((word) => word.length > 0);
    const paragraphs = text.split(/\n\s*\n/).filter((block) => block.trim().length > 0);

    return {
      characters: text.length,
      charactersNoSpaces: text.replace(/\s/g, '').length,
      words: words.length,
      sentences: splitSentences(text).length,
      paragraphs: paragraphs.length,
      // Estimate reading time in minutes (average 200 words per minute)
      readingTime: Math.ceil(words.length / 200)
    };
  }

  /**
   * Count syllables in a word
   *
   * Heuristic: counts groups of consecutive vowels (a, e, i, o, u, y) and
   * subtracts one for a trailing 'e'. Words of three characters or fewer, and
   * empty input, count as one syllable.
   *
   * @param {string} word - Word to count syllables for
   * @returns {number} - Syllable count (always at least 1)
   */
  countSyllables(word) {
    if (!word || word.length <= 3) return 1;

    const lower = word.toLowerCase();
    const vowels = 'aeiouy';
    let count = 0;
    let prevIsVowel = false;

    for (const char of lower) {
      const isVowel = vowels.includes(char);
      if (isVowel && !prevIsVowel) {
        count += 1;
      }
      prevIsVowel = isVowel;
    }

    // Adjust for silent 'e'
    if (lower.endsWith('e')) {
      count -= 1;
    }

    return Math.max(1, count);
  }

  /**
   * Get readability level from score
   * @param {number} score - Readability score
   * @returns {string} - Readability level
   */
  getReadabilityLevel(score) {
    if (score >= 90) return 'Very Easy';
    if (score >= 80) return 'Easy';
    if (score >= 70) return 'Fairly Easy';
    if (score >= 60) return 'Standard';
    if (score >= 50) return 'Fairly Difficult';
    if (score >= 30) return 'Difficult';
    return 'Very Difficult';
  }

  /**
   * Check if word is a stop word
   * @param {string} word - Word to check (case-insensitive)
   * @returns {boolean} - True if stop word
   */
  isStopWord(word) {
    return STOP_WORDS.has(word.toLowerCase());
  }
}
748
+
749
+ export default ContentAnalyzer;