crawlforge-mcp-server 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +315 -0
- package/LICENSE +21 -0
- package/README.md +181 -0
- package/package.json +115 -0
- package/server.js +1963 -0
- package/setup.js +112 -0
- package/src/constants/config.js +615 -0
- package/src/core/ActionExecutor.js +1104 -0
- package/src/core/AlertNotificationSystem.js +601 -0
- package/src/core/AuthManager.js +315 -0
- package/src/core/ChangeTracker.js +2306 -0
- package/src/core/JobManager.js +687 -0
- package/src/core/LLMsTxtAnalyzer.js +753 -0
- package/src/core/LocalizationManager.js +1615 -0
- package/src/core/PerformanceManager.js +828 -0
- package/src/core/ResearchOrchestrator.js +1327 -0
- package/src/core/SnapshotManager.js +1037 -0
- package/src/core/StealthBrowserManager.js +1795 -0
- package/src/core/WebhookDispatcher.js +745 -0
- package/src/core/analysis/ContentAnalyzer.js +749 -0
- package/src/core/analysis/LinkAnalyzer.js +972 -0
- package/src/core/cache/CacheManager.js +821 -0
- package/src/core/connections/ConnectionPool.js +553 -0
- package/src/core/crawlers/BFSCrawler.js +845 -0
- package/src/core/integrations/PerformanceIntegration.js +377 -0
- package/src/core/llm/AnthropicProvider.js +135 -0
- package/src/core/llm/LLMManager.js +415 -0
- package/src/core/llm/LLMProvider.js +97 -0
- package/src/core/llm/OpenAIProvider.js +127 -0
- package/src/core/processing/BrowserProcessor.js +986 -0
- package/src/core/processing/ContentProcessor.js +505 -0
- package/src/core/processing/PDFProcessor.js +448 -0
- package/src/core/processing/StreamProcessor.js +673 -0
- package/src/core/queue/QueueManager.js +98 -0
- package/src/core/workers/WorkerPool.js +585 -0
- package/src/core/workers/worker.js +743 -0
- package/src/monitoring/healthCheck.js +600 -0
- package/src/monitoring/metrics.js +761 -0
- package/src/optimization/wave3-optimizations.js +932 -0
- package/src/security/security-patches.js +120 -0
- package/src/security/security-tests.js +355 -0
- package/src/security/wave3-security.js +652 -0
- package/src/tools/advanced/BatchScrapeTool.js +1089 -0
- package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
- package/src/tools/crawl/crawlDeep.js +449 -0
- package/src/tools/crawl/mapSite.js +400 -0
- package/src/tools/extract/analyzeContent.js +624 -0
- package/src/tools/extract/extractContent.js +329 -0
- package/src/tools/extract/processDocument.js +503 -0
- package/src/tools/extract/summarizeContent.js +376 -0
- package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
- package/src/tools/research/deepResearch.js +706 -0
- package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
- package/src/tools/search/adapters/googleSearch.js +236 -0
- package/src/tools/search/adapters/searchProviderFactory.js +96 -0
- package/src/tools/search/queryExpander.js +543 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
- package/src/tools/search/ranking/ResultRanker.js +497 -0
- package/src/tools/search/searchWeb.js +482 -0
- package/src/tools/tracking/trackChanges.js +1355 -0
- package/src/utils/CircuitBreaker.js +515 -0
- package/src/utils/ErrorHandlingConfig.js +342 -0
- package/src/utils/HumanBehaviorSimulator.js +569 -0
- package/src/utils/Logger.js +568 -0
- package/src/utils/MemoryMonitor.js +173 -0
- package/src/utils/RetryManager.js +386 -0
- package/src/utils/contentUtils.js +588 -0
- package/src/utils/domainFilter.js +612 -0
- package/src/utils/inputValidation.js +766 -0
- package/src/utils/rateLimiter.js +196 -0
- package/src/utils/robotsChecker.js +91 -0
- package/src/utils/securityMiddleware.js +416 -0
- package/src/utils/sitemapParser.js +678 -0
- package/src/utils/ssrfProtection.js +640 -0
- package/src/utils/urlNormalizer.js +168 -0
|
@@ -0,0 +1,749 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ContentAnalyzer - Content analysis with summarization, language detection, and topic identification
|
|
3
|
+
* Uses multiple NLP libraries for comprehensive content analysis
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { SummarizerManager } from 'node-summarizer';
import { franc, francAll } from 'franc';
import nlp from 'compromise';
import { z } from 'zod';
|
|
10
|
+
|
|
11
|
+
// Input schema for ContentAnalyzer.analyzeContent().
// `options` is optional and defaults to {}; zod then applies every per-field
// default below, so callers may pass just `{ text }`.
const ContentAnalyzerSchema = z.object({
  text: z.string().min(1), // text to analyze; must be non-empty
  options: z.object({
    summarize: z.boolean().default(true),       // produce the `summary` section
    detectLanguage: z.boolean().default(true),  // run franc language detection
    extractTopics: z.boolean().default(true),   // noun-phrase topic mining
    extractEntities: z.boolean().default(true), // people/places/orgs/dates/money
    extractKeywords: z.boolean().default(true), // frequency-ranked keywords
    // Fraction of source sentences kept: short ~10%, medium ~30%, long ~50%.
    summaryLength: z.enum(['short', 'medium', 'long']).default('medium'),
    summaryType: z.enum(['extractive', 'abstractive']).default('extractive'),
    minConfidence: z.number().min(0).max(1).default(0.1), // topic confidence cutoff
    maxTopics: z.number().min(1).max(20).default(10),
    maxKeywords: z.number().min(1).max(50).default(15),
    includeReadabilityMetrics: z.boolean().default(true), // Flesch reading-ease block
    includeSentiment: z.boolean().default(true)           // lexicon-based sentiment block
  }).optional().default({})
});
|
|
28
|
+
|
|
29
|
+
// Shape of a successful analyzeContent() result. Sections toggled off via
// options are simply absent (hence `.optional()` on each).
// NOTE(review): this schema is declared but never referenced in this module —
// presumably intended for output validation or documentation; confirm before
// removing or wiring it up.
const AnalysisResult = z.object({
  text: z.string(), // truncated copy of the analyzed text (first 1000 chars)
  // Primary detected language plus up to three runner-up candidates.
  language: z.object({
    code: z.string(),
    name: z.string(),
    confidence: z.number(),
    alternative: z.array(z.object({
      code: z.string(),
      name: z.string(),
      confidence: z.number()
    }))
  }).optional(),
  // Extractive/abstractive summary; compressionRatio = summary len / source len.
  summary: z.object({
    type: z.string(),
    length: z.string(),
    sentences: z.array(z.string()),
    text: z.string(),
    compressionRatio: z.number()
  }).optional(),
  topics: z.array(z.object({
    topic: z.string(),
    confidence: z.number(),
    keywords: z.array(z.string())
  })).optional(),
  // Named entities grouped by category, as surface strings.
  entities: z.object({
    people: z.array(z.string()),
    places: z.array(z.string()),
    organizations: z.array(z.string()),
    dates: z.array(z.string()),
    money: z.array(z.string()),
    other: z.array(z.string())
  }).optional(),
  keywords: z.array(z.object({
    keyword: z.string(),
    frequency: z.number(),
    relevance: z.number(),
    type: z.string()
  })).optional(),
  // Flesch Reading Ease score (0-100) plus the raw counting metrics behind it.
  readability: z.object({
    score: z.number(),
    level: z.string(),
    metrics: z.object({
      sentences: z.number(),
      words: z.number(),
      characters: z.number(),
      avgWordsPerSentence: z.number(),
      avgCharsPerWord: z.number(),
      complexWords: z.number(),
      syllables: z.number()
    })
  }).optional(),
  sentiment: z.object({
    polarity: z.number(),
    subjectivity: z.number(),
    label: z.string(),
    confidence: z.number()
  }).optional(),
  // Always present, even on the error path (zeroed there).
  statistics: z.object({
    characters: z.number(),
    charactersNoSpaces: z.number(),
    words: z.number(),
    sentences: z.number(),
    paragraphs: z.number(),
    readingTime: z.number()
  }),
  analyzedAt: z.string(),   // ISO-8601 timestamp
  processingTime: z.number() // milliseconds
});
|
|
97
|
+
|
|
98
|
+
// Language code to name mapping.
// franc returns ISO 639-3 codes; some original entries used ISO 639-2/B codes
// that franc never emits ('chi', 'gre', 'geo', 'arm') or a macro code where
// franc reports a specific one ('fas' vs 'pes'). The 639-3 codes are added
// below; the legacy keys are kept for backward compatibility with any caller
// that looks them up directly.
const LANGUAGE_NAMES = {
  'eng': 'English',
  'spa': 'Spanish',
  'fra': 'French',
  'deu': 'German',
  'ita': 'Italian',
  'por': 'Portuguese',
  'rus': 'Russian',
  'jpn': 'Japanese',
  'kor': 'Korean',
  'chi': 'Chinese',   // legacy ISO 639-2/B
  'cmn': 'Chinese',   // ISO 639-3 (Mandarin) — what franc actually returns
  'ara': 'Arabic',
  'hin': 'Hindi',
  'nld': 'Dutch',
  'swe': 'Swedish',
  'nor': 'Norwegian',
  'dan': 'Danish',
  'fin': 'Finnish',
  'pol': 'Polish',
  'ces': 'Czech',
  'hun': 'Hungarian',
  'tur': 'Turkish',
  'gre': 'Greek',     // legacy ISO 639-2/B
  'ell': 'Greek',     // ISO 639-3
  'heb': 'Hebrew',
  'tha': 'Thai',
  'vie': 'Vietnamese',
  'ind': 'Indonesian',
  'msa': 'Malay',
  'tgl': 'Tagalog',
  'ukr': 'Ukrainian',
  'bul': 'Bulgarian',
  'hrv': 'Croatian',
  'slv': 'Slovenian',
  'ron': 'Romanian',
  'lit': 'Lithuanian',
  'lav': 'Latvian',
  'est': 'Estonian',
  'slk': 'Slovak',
  'cat': 'Catalan',
  'eus': 'Basque',
  'glg': 'Galician',
  'gle': 'Irish',
  'cym': 'Welsh',
  'isl': 'Icelandic',
  'mlt': 'Maltese',
  'sqi': 'Albanian',
  'mkd': 'Macedonian',
  'srp': 'Serbian',
  'bos': 'Bosnian',
  'mon': 'Mongolian',
  'uzb': 'Uzbek',
  'kaz': 'Kazakh',
  'aze': 'Azerbaijani',
  'geo': 'Georgian',  // legacy ISO 639-2/B
  'kat': 'Georgian',  // ISO 639-3
  'arm': 'Armenian',  // legacy ISO 639-2/B
  'hye': 'Armenian',  // ISO 639-3
  'fas': 'Persian',   // macro code
  'pes': 'Persian',   // ISO 639-3 (Iranian Persian) — what franc returns
  'urd': 'Urdu',
  'ben': 'Bengali',
  'tam': 'Tamil',
  'tel': 'Telugu',
  'kan': 'Kannada',
  'mal': 'Malayalam',
  'guj': 'Gujarati',
  'pan': 'Punjabi',
  'ori': 'Odia',
  'mar': 'Marathi',
  'nep': 'Nepali',
  'sin': 'Sinhala',
  'mya': 'Burmese',
  'khm': 'Khmer',
  'lao': 'Lao',
  'amh': 'Amharic',
  'som': 'Somali',
  'swa': 'Swahili',
  'hau': 'Hausa',
  'yor': 'Yoruba',
  'ibo': 'Igbo',
  'afr': 'Afrikaans'
};
|
|
178
|
+
|
|
179
|
+
export class ContentAnalyzer {
  /**
   * Creates an analyzer with every analysis pass enabled by default.
   * Heavy lifting is delegated to `franc` (language detection), `compromise`
   * (POS tagging / entities) and `node-summarizer` (extractive summaries).
   */
  constructor() {
    // NOTE(review): node-summarizer's documented constructor signature is
    // `new SummarizerManager(text, sentenceCount)`. This argument-less shared
    // instance is kept for backward compatibility; summarizeText() falls back
    // to a per-call manager built with the documented signature.
    this.summarizer = new SummarizerManager();
    this.defaultOptions = {
      summarize: true,
      detectLanguage: true,
      extractTopics: true,
      extractEntities: true,
      extractKeywords: true,
      summaryLength: 'medium',
      summaryType: 'extractive',
      minConfidence: 0.1,
      maxTopics: 10,
      maxKeywords: 15,
      includeReadabilityMetrics: true,
      includeSentiment: true
    };
  }

  /**
   * Analyze text content with multiple NLP techniques.
   * Never rejects: validation or analysis failures resolve to a result
   * object carrying an `error` field and zeroed statistics.
   *
   * @param {Object} params - Analysis parameters
   * @param {string} params.text - Text to analyze (non-empty)
   * @param {Object} [params.options] - Per-pass toggles and tuning (see ContentAnalyzerSchema)
   * @returns {Promise<Object>} - Analysis results
   */
  async analyzeContent(params) {
    const startTime = Date.now();

    try {
      const validated = ContentAnalyzerSchema.parse(params);
      const { text, options } = validated;
      const analysisOptions = { ...this.defaultOptions, ...options };

      const result = {
        text: text.substring(0, 1000), // truncated copy, for reference only
        analyzedAt: new Date().toISOString(),
        processingTime: 0
      };

      // Basic statistics are always computed; every other pass is opt-in.
      result.statistics = this.calculateStatistics(text);

      if (analysisOptions.detectLanguage) {
        result.language = await this.detectLanguage(text, analysisOptions);
      }
      if (analysisOptions.summarize) {
        result.summary = await this.summarizeText(text, analysisOptions);
      }
      if (analysisOptions.extractTopics) {
        result.topics = await this.extractTopics(text, analysisOptions);
      }
      if (analysisOptions.extractEntities) {
        result.entities = await this.extractEntities(text, analysisOptions);
      }
      if (analysisOptions.extractKeywords) {
        result.keywords = await this.extractKeywords(text, analysisOptions);
      }
      if (analysisOptions.includeReadabilityMetrics) {
        result.readability = await this.calculateReadability(text);
      }
      if (analysisOptions.includeSentiment) {
        result.sentiment = await this.analyzeSentiment(text);
      }

      result.processingTime = Date.now() - startTime;
      return result;

    } catch (error) {
      // Keep the promise resolved so callers get a structured error payload.
      return {
        text: params.text?.substring(0, 100) || 'unknown',
        analyzedAt: new Date().toISOString(),
        processingTime: Date.now() - startTime,
        error: `Content analysis failed: ${error.message}`,
        statistics: {
          characters: 0,
          charactersNoSpaces: 0,
          words: 0,
          sentences: 0,
          paragraphs: 0,
          readingTime: 0
        }
      };
    }
  }

  /**
   * Detect language using the franc library.
   * @param {string} text - Text to analyze
   * @param {Object} options - Detection options (currently unused)
   * @returns {Promise<Object|null>} - Language detection result, or null when
   *   the language is undetermined or detection throws
   */
  async detectLanguage(text, options = {}) {
    try {
      const codes = Object.keys(LANGUAGE_NAMES);
      // franc >= v5 renamed `whitelist` to `only`; both are passed so the
      // restriction also works if an older franc is installed.
      const detectOptions = { minLength: 10, only: codes, whitelist: codes };

      const detected = franc(text, detectOptions);
      if (detected === 'und') {
        return null; // undetermined language
      }

      // Heuristic confidence: longer texts detect more reliably.
      // NOTE(review): this is a rough proxy, not a statistical confidence.
      const confidence = Math.min(1, text.length / 100 * 0.01 + 0.5);

      // With the ESM named-export API, ranked candidates come from the
      // separate `francAll` export (the original `franc.all` does not exist
      // there and always threw into this catch block).
      const alternatives = francAll(text, detectOptions)
        .slice(1, 4) // top 3 runners-up after the primary match
        .map(([code, score]) => ({
          code,
          name: LANGUAGE_NAMES[code] || code,
          // NOTE(review): francAll weights are normalized with 1 = best match;
          // `1 - score` is used as a distance-style confidence — confirm this
          // matches the intended semantics.
          confidence: Math.round((1 - score) * 100) / 100
        }));

      return {
        code: detected,
        name: LANGUAGE_NAMES[detected] || detected,
        confidence: Math.round(confidence * 100) / 100,
        alternative: alternatives
      };

    } catch (error) {
      console.warn('Language detection failed:', error.message);
      return null;
    }
  }

  /**
   * Summarize text content.
   * @param {string} text - Text to summarize
   * @param {Object} options - { summaryType, summaryLength }
   * @returns {Promise<Object>} - Summarization result; on failure a
   *   `type: 'fallback'` summary of the first two sentences is returned
   */
  async summarizeText(text, options = {}) {
    try {
      const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 0);

      // Too short to compress meaningfully: echo the input back.
      if (sentences.length < 3) {
        return {
          type: options.summaryType,
          length: options.summaryLength,
          sentences: sentences,
          text: text.trim(),
          compressionRatio: 1.0
        };
      }

      // Target sentence count as a fraction of the source.
      let targetSentences;
      switch (options.summaryLength) {
        case 'short':
          targetSentences = Math.max(1, Math.ceil(sentences.length * 0.1));
          break;
        case 'long':
          targetSentences = Math.max(3, Math.ceil(sentences.length * 0.5));
          break;
        case 'medium':
        default:
          targetSentences = Math.max(2, Math.ceil(sentences.length * 0.3));
      }
      targetSentences = Math.min(targetSentences, sentences.length);

      let summarySentences;

      if (options.summaryType === 'extractive') {
        let rawSummary;
        if (typeof this.summarizer?.getSummaryByRanking === 'function') {
          // Original call path, kept in case the installed summarizer
          // exposes this method.
          rawSummary = await this.summarizer.getSummaryByRanking(text, targetSentences);
        } else {
          // node-summarizer's documented API: the constructor takes the text
          // and sentence count, and getSummaryByRank() resolves to { summary }.
          const manager = new SummarizerManager(text, targetSentences);
          const ranked = await manager.getSummaryByRank();
          rawSummary = typeof ranked === 'string' ? ranked : ranked?.summary;
        }
        summarySentences = String(rawSummary ?? '')
          .split(/[.!?]+/)
          .filter(s => s.trim().length > 0);
      } else {
        // Simple abstractive approach (for demonstration).
        summarySentences = await this.createAbstractiveSummary(text, targetSentences);
      }

      const summaryText = summarySentences.join('. ').trim() + '.';
      const compressionRatio = summaryText.length / text.length;

      return {
        type: options.summaryType,
        length: options.summaryLength,
        sentences: summarySentences,
        text: summaryText,
        compressionRatio: Math.round(compressionRatio * 100) / 100
      };

    } catch (error) {
      console.warn('Text summarization failed:', error.message);

      // Fallback: return the first two sentences verbatim.
      const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 0);
      const fallbackSentences = sentences.slice(0, 2);

      return {
        type: 'fallback',
        length: 'short',
        sentences: fallbackSentences,
        text: fallbackSentences.join('. ').trim() + '.',
        compressionRatio: fallbackSentences.join('. ').length / text.length
      };
    }
  }

  /**
   * Create abstractive summary (simplified approach).
   * In production a transformer model or API would be used instead.
   * @param {string} text - Text to summarize
   * @param {number} targetSentences - Target number of sentences
   * @returns {Promise<Array>} - Array of summary sentences
   */
  async createAbstractiveSummary(text, targetSentences) {
    const doc = nlp(text);
    const sentences = doc.sentences().out('array');

    // Score each sentence by POS density: nouns weigh most, then verbs,
    // then adjectives (simplified importance heuristic).
    const scoredSentences = sentences.map(sentence => {
      const sentenceDoc = nlp(sentence);
      const score = sentenceDoc.nouns().length
        + sentenceDoc.verbs().length * 0.8
        + sentenceDoc.adjectives().length * 0.5;
      return { sentence, score };
    });

    // Highest-scoring sentences win.
    return scoredSentences
      .sort((a, b) => b.score - a.score)
      .slice(0, targetSentences)
      .map(item => item.sentence.trim());
  }

  /**
   * Extract topics (frequent noun / adjective-noun phrases) from text.
   * @param {string} text - Text to analyze
   * @param {Object} options - { minConfidence, maxTopics }
   * @returns {Promise<Array>} - Array of topics, empty on failure
   */
  async extractTopics(text, options = {}) {
    // Defaults make direct calls safe (analyzeContent always supplies these).
    const { minConfidence = 0.1, maxTopics = 10 } = options;
    try {
      const doc = nlp(text);

      // Noun phrases and adjective+noun phrases are the topic candidates.
      const nounPhrases = doc.nouns().out('array');
      const adjNounPhrases = doc.match('#Adjective+ #Noun+').out('array');
      const allPhrases = [...nounPhrases, ...adjNounPhrases];

      // Frequency count over normalized phrases.
      const phraseCount = {};
      for (const phrase of allPhrases) {
        const cleaned = phrase.toLowerCase().trim();
        if (cleaned.length > 2) {
          phraseCount[cleaned] = (phraseCount[cleaned] || 0) + 1;
        }
      }

      // Confidence = share of all candidate phrases; rank and trim.
      return Object.entries(phraseCount)
        .map(([topic, frequency]) => ({
          topic,
          confidence: Math.min(1, frequency / Math.max(allPhrases.length, 1)),
          keywords: topic.split(' ').filter(w => w.length > 2)
        }))
        .filter(topic => topic.confidence >= minConfidence)
        .sort((a, b) => b.confidence - a.confidence)
        .slice(0, maxTopics);

    } catch (error) {
      console.warn('Topic extraction failed:', error.message);
      return [];
    }
  }

  /**
   * Extract named entities from text via compromise's built-in views.
   * NOTE(review): `.money()` availability depends on the compromise
   * version/plugins installed — failures fall through to the empty result.
   * @param {string} text - Text to analyze
   * @param {Object} options - Extraction options (currently unused)
   * @returns {Promise<Object>} - Named entities by category (empty arrays on failure)
   */
  async extractEntities(text, options = {}) {
    try {
      const doc = nlp(text);

      return {
        people: doc.people().out('array'),
        places: doc.places().out('array'),
        organizations: doc.organizations().out('array'),
        dates: doc.dates().out('array'),
        money: doc.money().out('array'),
        other: doc.topics().out('array').slice(0, 10) // cap miscellaneous entities
      };

    } catch (error) {
      console.warn('Entity extraction failed:', error.message);
      return {
        people: [],
        places: [],
        organizations: [],
        dates: [],
        money: [],
        other: []
      };
    }
  }

  /**
   * Extract keywords (nouns, verbs, adjectives) ranked by relative frequency.
   * @param {string} text - Text to analyze
   * @param {Object} options - { maxKeywords }
   * @returns {Promise<Array>} - Array of keyword objects, empty on failure
   */
  async extractKeywords(text, options = {}) {
    const { maxKeywords = 15 } = options;
    try {
      const doc = nlp(text);

      const nouns = doc.nouns().out('array');
      const verbs = doc.verbs().out('array');
      const adjectives = doc.adjectives().out('array');

      // Sets give O(1) membership checks instead of Array.includes inside the
      // loop (the original was O(n^2) over the term list).
      const nounSet = new Set(nouns);
      const verbSet = new Set(verbs);
      const adjectiveSet = new Set(adjectives);

      const termFreq = {};
      const termTypes = {};

      for (const term of [...nouns, ...verbs, ...adjectives]) {
        const cleaned = term.toLowerCase().trim();
        if (cleaned.length <= 2 || this.isStopWord(cleaned)) continue;

        termFreq[cleaned] = (termFreq[cleaned] || 0) + 1;

        // First classification wins, with noun > verb > adjective priority.
        if (!termTypes[cleaned]) {
          if (nounSet.has(term)) termTypes[cleaned] = 'noun';
          else if (verbSet.has(term)) termTypes[cleaned] = 'verb';
          else if (adjectiveSet.has(term)) termTypes[cleaned] = 'adjective';
        }
      }

      const totalTerms = Object.values(termFreq).reduce((sum, freq) => sum + freq, 0);

      // Relevance = share of all counted terms; rank and trim.
      return Object.entries(termFreq)
        .map(([keyword, frequency]) => ({
          keyword,
          frequency,
          relevance: frequency / totalTerms,
          type: termTypes[keyword] || 'unknown'
        }))
        .sort((a, b) => b.relevance - a.relevance)
        .slice(0, maxKeywords);

    } catch (error) {
      console.warn('Keyword extraction failed:', error.message);
      return [];
    }
  }

  /**
   * Calculate Flesch Reading Ease readability metrics.
   * @param {string} text - Text to analyze
   * @returns {Promise<Object|null>} - Readability metrics, or null on failure
   */
  async calculateReadability(text) {
    try {
      const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 0);
      const words = text.split(/\s+/).filter(w => w.length > 0);
      const characters = text.length;
      const charactersNoSpaces = text.replace(/\s/g, '').length;

      // Words with 3+ syllables count as "complex" (Gunning-fog convention).
      let totalSyllables = 0;
      let complexWords = 0;
      for (const word of words) {
        const syllables = this.countSyllables(word);
        totalSyllables += syllables;
        if (syllables >= 3) complexWords++;
      }

      const avgWordsPerSentence = words.length / Math.max(sentences.length, 1);
      const avgCharsPerWord = charactersNoSpaces / Math.max(words.length, 1);
      const avgSyllablesPerWord = totalSyllables / Math.max(words.length, 1);

      // Flesch Reading Ease: 206.835 - 1.015(words/sentence) - 84.6(syllables/word)
      const fleschScore = 206.835 - (1.015 * avgWordsPerSentence) - (84.6 * avgSyllablesPerWord);

      return {
        // Reported score is clamped to [0, 100]; the level is derived from the
        // raw (unclamped) score, matching the original behavior.
        score: Math.round(Math.max(0, Math.min(100, fleschScore)) * 100) / 100,
        level: this.getReadabilityLevel(fleschScore),
        metrics: {
          sentences: sentences.length,
          words: words.length,
          characters,
          avgWordsPerSentence: Math.round(avgWordsPerSentence * 100) / 100,
          avgCharsPerWord: Math.round(avgCharsPerWord * 100) / 100,
          complexWords,
          syllables: totalSyllables
        }
      };

    } catch (error) {
      console.warn('Readability calculation failed:', error.message);
      return null;
    }
  }

  /**
   * Analyze sentiment of text with a small positive/negative lexicon.
   * This is intentionally basic — use a dedicated sentiment library for
   * production-quality results.
   * @param {string} text - Text to analyze
   * @returns {Promise<Object|null>} - Sentiment result, or null on failure
   */
  async analyzeSentiment(text) {
    try {
      const doc = nlp(text);

      const positiveWords = new Set(['good', 'great', 'excellent', 'amazing', 'wonderful', 'fantastic', 'awesome', 'perfect', 'love', 'like', 'happy', 'pleased', 'satisfied']);
      const negativeWords = new Set(['bad', 'terrible', 'awful', 'horrible', 'hate', 'dislike', 'angry', 'sad', 'disappointed', 'frustrated', 'annoyed', 'upset']);

      const words = doc.terms().out('array').map(w => w.toLowerCase());

      let positiveCount = 0;
      let negativeCount = 0;
      for (const word of words) {
        if (positiveWords.has(word)) positiveCount++;
        if (negativeWords.has(word)) negativeCount++;
      }

      const totalSentimentWords = positiveCount + negativeCount;

      // No lexicon hits: report neutral with middling confidence.
      if (totalSentimentWords === 0) {
        return {
          polarity: 0,
          subjectivity: 0,
          label: 'neutral',
          confidence: 0.5
        };
      }

      // Polarity in [-1, 1] by net sentiment share; subjectivity by lexicon density.
      const polarity = (positiveCount - negativeCount) / Math.max(words.length, 1);
      const subjectivity = totalSentimentWords / Math.max(words.length, 1);

      let label = 'neutral';
      if (polarity > 0.1) label = 'positive';
      else if (polarity < -0.1) label = 'negative';

      // Confidence grows with lexicon hits, saturating at 10 hits.
      const confidence = Math.min(1, totalSentimentWords / 10);

      return {
        polarity: Math.round(polarity * 100) / 100,
        subjectivity: Math.round(subjectivity * 100) / 100,
        label,
        confidence: Math.round(confidence * 100) / 100
      };

    } catch (error) {
      console.warn('Sentiment analysis failed:', error.message);
      return null;
    }
  }

  /**
   * Calculate basic text statistics.
   * @param {string} text - Text to analyze
   * @returns {Object} - Text statistics (counts plus reading time in minutes)
   */
  calculateStatistics(text) {
    const characters = text.length;
    const charactersNoSpaces = text.replace(/\s/g, '').length;
    const words = text.split(/\s+/).filter(w => w.length > 0);
    const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 0);
    const paragraphs = text.split(/\n\s*\n/).filter(p => p.trim().length > 0);

    // Reading time in whole minutes at ~200 words per minute.
    const readingTime = Math.ceil(words.length / 200);

    return {
      characters,
      charactersNoSpaces,
      words: words.length,
      sentences: sentences.length,
      paragraphs: paragraphs.length,
      readingTime
    };
  }

  /**
   * Count syllables in a word by counting vowel groups, with a silent-'e'
   * adjustment. Words of 3 characters or fewer count as one syllable.
   * @param {string} word - Word to count syllables for
   * @returns {number} - Syllable count (always >= 1)
   */
  countSyllables(word) {
    if (!word || word.length <= 3) return 1;

    const vowels = 'aeiouy';
    let count = 0;
    let prevIsVowel = false;

    // Each maximal vowel run counts as one syllable.
    for (let i = 0; i < word.length; i++) {
      const isVowel = vowels.includes(word[i].toLowerCase());
      if (isVowel && !prevIsVowel) {
        count++;
      }
      prevIsVowel = isVowel;
    }

    // Trailing 'e' is usually silent; the Math.max below prevents zero.
    if (word.toLowerCase().endsWith('e')) {
      count--;
    }

    return Math.max(1, count);
  }

  /**
   * Map a Flesch Reading Ease score to a descriptive level.
   * @param {number} score - Readability score
   * @returns {string} - Readability level
   */
  getReadabilityLevel(score) {
    if (score >= 90) return 'Very Easy';
    if (score >= 80) return 'Easy';
    if (score >= 70) return 'Fairly Easy';
    if (score >= 60) return 'Standard';
    if (score >= 50) return 'Fairly Difficult';
    if (score >= 30) return 'Difficult';
    return 'Very Difficult';
  }

  /**
   * Check whether a word is an English stop word (case-insensitive).
   * @param {string} word - Word to check
   * @returns {boolean} - True if stop word
   */
  isStopWord(word) {
    // Set lookup is O(1) versus the original O(n) Array.includes.
    const stopWords = new Set([
      'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
      'from', 'as', 'is', 'was', 'are', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
      'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can',
      'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me',
      'him', 'her', 'us', 'them', 'my', 'your', 'his', 'its', 'our', 'their'
    ]);

    return stopWords.has(word.toLowerCase());
  }
}
|
|
748
|
+
|
|
749
|
+
// Default export alongside the named export, so both import styles work.
export default ContentAnalyzer;
|