crawlforge-mcp-server 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +315 -0
- package/LICENSE +21 -0
- package/README.md +181 -0
- package/package.json +115 -0
- package/server.js +1963 -0
- package/setup.js +112 -0
- package/src/constants/config.js +615 -0
- package/src/core/ActionExecutor.js +1104 -0
- package/src/core/AlertNotificationSystem.js +601 -0
- package/src/core/AuthManager.js +315 -0
- package/src/core/ChangeTracker.js +2306 -0
- package/src/core/JobManager.js +687 -0
- package/src/core/LLMsTxtAnalyzer.js +753 -0
- package/src/core/LocalizationManager.js +1615 -0
- package/src/core/PerformanceManager.js +828 -0
- package/src/core/ResearchOrchestrator.js +1327 -0
- package/src/core/SnapshotManager.js +1037 -0
- package/src/core/StealthBrowserManager.js +1795 -0
- package/src/core/WebhookDispatcher.js +745 -0
- package/src/core/analysis/ContentAnalyzer.js +749 -0
- package/src/core/analysis/LinkAnalyzer.js +972 -0
- package/src/core/cache/CacheManager.js +821 -0
- package/src/core/connections/ConnectionPool.js +553 -0
- package/src/core/crawlers/BFSCrawler.js +845 -0
- package/src/core/integrations/PerformanceIntegration.js +377 -0
- package/src/core/llm/AnthropicProvider.js +135 -0
- package/src/core/llm/LLMManager.js +415 -0
- package/src/core/llm/LLMProvider.js +97 -0
- package/src/core/llm/OpenAIProvider.js +127 -0
- package/src/core/processing/BrowserProcessor.js +986 -0
- package/src/core/processing/ContentProcessor.js +505 -0
- package/src/core/processing/PDFProcessor.js +448 -0
- package/src/core/processing/StreamProcessor.js +673 -0
- package/src/core/queue/QueueManager.js +98 -0
- package/src/core/workers/WorkerPool.js +585 -0
- package/src/core/workers/worker.js +743 -0
- package/src/monitoring/healthCheck.js +600 -0
- package/src/monitoring/metrics.js +761 -0
- package/src/optimization/wave3-optimizations.js +932 -0
- package/src/security/security-patches.js +120 -0
- package/src/security/security-tests.js +355 -0
- package/src/security/wave3-security.js +652 -0
- package/src/tools/advanced/BatchScrapeTool.js +1089 -0
- package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
- package/src/tools/crawl/crawlDeep.js +449 -0
- package/src/tools/crawl/mapSite.js +400 -0
- package/src/tools/extract/analyzeContent.js +624 -0
- package/src/tools/extract/extractContent.js +329 -0
- package/src/tools/extract/processDocument.js +503 -0
- package/src/tools/extract/summarizeContent.js +376 -0
- package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
- package/src/tools/research/deepResearch.js +706 -0
- package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
- package/src/tools/search/adapters/googleSearch.js +236 -0
- package/src/tools/search/adapters/searchProviderFactory.js +96 -0
- package/src/tools/search/queryExpander.js +543 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
- package/src/tools/search/ranking/ResultRanker.js +497 -0
- package/src/tools/search/searchWeb.js +482 -0
- package/src/tools/tracking/trackChanges.js +1355 -0
- package/src/utils/CircuitBreaker.js +515 -0
- package/src/utils/ErrorHandlingConfig.js +342 -0
- package/src/utils/HumanBehaviorSimulator.js +569 -0
- package/src/utils/Logger.js +568 -0
- package/src/utils/MemoryMonitor.js +173 -0
- package/src/utils/RetryManager.js +386 -0
- package/src/utils/contentUtils.js +588 -0
- package/src/utils/domainFilter.js +612 -0
- package/src/utils/inputValidation.js +766 -0
- package/src/utils/rateLimiter.js +196 -0
- package/src/utils/robotsChecker.js +91 -0
- package/src/utils/securityMiddleware.js +416 -0
- package/src/utils/sitemapParser.js +678 -0
- package/src/utils/ssrfProtection.js +640 -0
- package/src/utils/urlNormalizer.js +168 -0
|
@@ -0,0 +1,743 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Worker thread script for CPU-intensive tasks
|
|
3
|
+
* Handles HTML parsing, content analysis, and other computationally expensive operations
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { parentPort } from 'worker_threads';
|
|
7
|
+
import * as cheerio from 'cheerio';
|
|
8
|
+
import { JSDOM } from 'jsdom';
|
|
9
|
+
import { Readability } from '@mozilla/readability';
|
|
10
|
+
import compromise from 'compromise';
|
|
11
|
+
import { franc } from 'franc';
|
|
12
|
+
|
|
13
|
+
// Task handlers: maps the `type` field of an incoming message to the function
// that performs that task.
const taskHandlers = {
  parseHtml: handleParseHtml,
  extractContent: handleExtractContent,
  analyzeText: handleAnalyzeText,
  processStructuredData: handleProcessStructuredData,
  calculateSimilarity: handleCalculateSimilarity,
  validateUrls: handleValidateUrls,
  normalizeData: handleNormalizeData,
  computeHash: handleComputeHash
};

/**
 * Listen for messages from the main thread.
 *
 * Expected message shape: `{ taskId, type, data }`. Every task that carries a
 * taskId is answered with exactly one message: `{ taskId, result }` on success
 * or `{ taskId, error, stack }` on failure.
 */
parentPort.on('message', async (message) => {
  // A non-object payload cannot carry a taskId, so there is nothing to reply
  // to. Bail out here instead of letting the destructuring below throw inside
  // this async listener (which would surface as an unhandled rejection).
  if (message === null || typeof message !== 'object') {
    console.error('Worker received malformed message:', message);
    return;
  }

  // Handle ready signal and ignore it
  if (message.type === 'ready') {
    return;
  }

  const { taskId, type, data } = message;

  // Only null/undefined count as "missing" so that a numeric id of 0 is still
  // answered rather than silently dropped.
  if (taskId === undefined || taskId === null) {
    console.error('Worker received message without taskId:', message);
    return;
  }

  if (!type) {
    parentPort.postMessage({
      taskId,
      error: 'No task type specified',
      stack: new Error('No task type specified').stack
    });
    return;
  }

  try {
    // Own-property lookup: inherited names such as "constructor" or
    // "toString" must never be treated as task handlers.
    const handler = Object.prototype.hasOwnProperty.call(taskHandlers, type)
      ? taskHandlers[type]
      : undefined;
    if (!handler) {
      throw new Error(`Unknown task type: ${type}`);
    }

    const result = await handler(data);

    parentPort.postMessage({
      taskId,
      result
    });
  } catch (error) {
    // A handler may throw a non-Error value; still report something readable.
    parentPort.postMessage({
      taskId,
      error: error?.message ?? String(error),
      stack: error?.stack
    });
  }
});
|
|
69
|
+
|
|
70
|
+
/**
 * Load an HTML string with cheerio and collect the requested parts of it.
 *
 * @param {Object} data - Task data
 * @param {string} data.html - Markup to parse
 * @param {Object} [data.options] - Feature switches, all defaulting to true:
 *   extractText, extractLinks, extractImages, extractMeta, removeScripts
 * @returns {Promise<Object>} Object holding, depending on the switches,
 *   `text` and `title`, `links`, `images` and `meta`
 */
async function handleParseHtml(data) {
  const { html, options = {} } = data;
  const {
    extractText: wantText = true,
    extractLinks: wantLinks = true,
    extractImages: wantImages = true,
    extractMeta: wantMeta = true,
    removeScripts: stripScripts = true
  } = options;

  const $ = cheerio.load(html);
  const parsed = {};

  // Scripts and styles are dropped first so they do not leak into the text.
  if (stripScripts) {
    $('script, style').remove();
  }

  if (wantText) {
    parsed.text = $('body').text().trim();
    parsed.title = $('title').text().trim();
  }

  if (wantLinks) {
    parsed.links = $('a[href]')
      .toArray()
      .map((anchor) => {
        const $anchor = $(anchor);
        return {
          href: $anchor.attr('href'),
          text: $anchor.text().trim(),
          title: $anchor.attr('title') || null
        };
      });
  }

  if (wantImages) {
    parsed.images = $('img[src]')
      .toArray()
      .map((image) => {
        const $image = $(image);
        return {
          src: $image.attr('src'),
          alt: $image.attr('alt') || null,
          title: $image.attr('title') || null,
          width: $image.attr('width') || null,
          height: $image.attr('height') || null
        };
      });
  }

  if (wantMeta) {
    const meta = {};
    for (const tag of $('meta').toArray()) {
      const $tag = $(tag);
      // Keyed by `name`, falling back to `property` (e.g. Open Graph tags).
      const key = $tag.attr('name') || $tag.attr('property');
      const value = $tag.attr('content');
      if (key && value) {
        meta[key] = value;
      }
    }
    parsed.meta = meta;
  }

  return parsed;
}
|
|
142
|
+
|
|
143
|
+
/**
 * Extract main content using Mozilla Readability, plus JSON-LD structured data.
 *
 * @param {Object} data - Task data
 * @param {string} data.html - Markup to analyse
 * @param {string} [data.url] - Document URL handed to JSDOM
 * @param {Object} [data.options] - Feature switches, all defaulting to true:
 *   removeBoilerplate, extractStructuredData, calculateReadability
 * @returns {Promise<Object>} Object with optional `article`, `readability`
 *   and `structuredData` members
 */
async function handleExtractContent(data) {
  const { html, url, options = {} } = data;
  const {
    removeBoilerplate = true,
    // Renamed on destructuring: a local called `extractStructuredData` would
    // shadow the helper function of the same name and make the call below
    // fail with "extractStructuredData is not a function".
    extractStructuredData: shouldExtractStructuredData = true,
    calculateReadability = true
  } = options;

  const result = {};

  // Use Readability for main content extraction
  if (removeBoilerplate) {
    // The DOM is only needed on this path, so it is built here rather than
    // unconditionally.
    const dom = new JSDOM(html, { url });
    try {
      const reader = new Readability(dom.window.document, {
        debug: false,
        maxElemsToDivide: 300,
        charThreshold: 500
      });

      const article = reader.parse();
      if (article) {
        result.article = {
          title: article.title,
          content: article.content,
          textContent: article.textContent,
          length: article.length,
          excerpt: article.excerpt,
          byline: article.byline,
          dir: article.dir,
          siteName: article.siteName,
          lang: article.lang
        };

        // Calculate readability if requested
        if (calculateReadability && article.textContent) {
          result.readability = calculateReadabilityScore(article.textContent);
        }
      }
    } finally {
      // Release the window's resources; this worker handles many documents.
      dom.window.close();
    }
  }

  // Extract structured data
  if (shouldExtractStructuredData) {
    result.structuredData = extractStructuredData(html);
  }

  return result;
}
|
|
198
|
+
|
|
199
|
+
/**
 * Analyze text content for various metrics.
 *
 * @param {Object} data - Task data
 * @param {string} data.text - Non-empty text to analyse
 * @param {Object} [data.options] - Feature switches, all defaulting to true:
 *   detectLanguage, extractEntities, analyzeSentiment, extractKeywords,
 *   calculateMetrics
 * @returns {Promise<Object>} Object with optional `language`, `entities`,
 *   `sentiment`, `keywords` and `metrics` members
 * @throws {Error} When `text` is missing, empty or not a string
 */
async function handleAnalyzeText(data) {
  const { text, options = {} } = data;
  const {
    detectLanguage = true,
    extractEntities = true,
    // Renamed on destructuring: locals called `analyzeSentiment` and
    // `extractKeywords` would shadow the helper functions of the same names,
    // turning the calls below into "... is not a function" errors.
    analyzeSentiment: shouldAnalyzeSentiment = true,
    extractKeywords: shouldExtractKeywords = true,
    calculateMetrics = true
  } = options;

  const result = {};

  if (!text || typeof text !== 'string') {
    throw new Error('Invalid text input for analysis');
  }

  // Detect language
  if (detectLanguage) {
    try {
      result.language = franc(text);
    } catch (error) {
      // Language detection is best-effort; fall back to a marker value.
      result.language = 'unknown';
    }
  }

  // The compromise document is only needed for entities and keywords, so it is
  // skipped when both are disabled.
  const doc = extractEntities || shouldExtractKeywords ? compromise(text) : null;

  // Extract entities
  if (extractEntities) {
    result.entities = {
      people: doc.people().out('array'),
      places: doc.places().out('array'),
      organizations: doc.organizations().out('array'),
      topics: doc.topics().out('array')
    };
  }

  // Basic sentiment analysis
  if (shouldAnalyzeSentiment) {
    result.sentiment = analyzeSentiment(text);
  }

  // Extract keywords
  if (shouldExtractKeywords) {
    result.keywords = extractKeywords(doc);
  }

  // Calculate text metrics
  if (calculateMetrics) {
    result.metrics = calculateTextMetrics(text);
  }

  return result;
}
|
|
259
|
+
|
|
260
|
+
/**
 * Collect JSON-LD, microdata and RDFa-style markup from an HTML string.
 *
 * @param {Object} data - Task data
 * @param {string} data.html - Markup to scan
 * @param {Object} [data.options] - Feature switches, all defaulting to true:
 *   extractJsonLd, extractMicrodata, extractSchemaOrg. A `validateSchema`
 *   option may be supplied but is not consulted by this function.
 * @returns {Promise<Object>} `{ jsonLd, microdata, schemaOrg }`, each an array
 */
async function handleProcessStructuredData(data) {
  const { html, options = {} } = data;
  const {
    extractJsonLd: wantJsonLd = true,
    extractMicrodata: wantMicrodata = true,
    extractSchemaOrg: wantSchemaOrg = true
  } = options;

  const $ = cheerio.load(html);
  const jsonLd = [];
  const microdata = [];
  const schemaOrg = [];

  if (wantJsonLd) {
    for (const script of $('script[type="application/ld+json"]').toArray()) {
      try {
        const raw = $(script).html();
        if (raw) {
          jsonLd.push(JSON.parse(raw));
        }
      } catch (error) {
        // Skip invalid JSON-LD
      }
    }
  }

  if (wantMicrodata) {
    for (const scope of $('[itemscope]').toArray()) {
      const entry = extractMicrodataItem($, scope);
      if (entry) {
        microdata.push(entry);
      }
    }
  }

  if (wantSchemaOrg) {
    for (const node of $('[typeof], [property], [vocab]').toArray()) {
      const entry = extractSchemaOrgItem($, node);
      if (entry) {
        schemaOrg.push(entry);
      }
    }
  }

  return { jsonLd, microdata, schemaOrg };
}
|
|
318
|
+
|
|
319
|
+
/**
 * Compare two texts with the requested similarity algorithm.
 *
 * @param {Object} data - Task data
 * @param {string} data.text1 - First text (must be non-empty)
 * @param {string} data.text2 - Second text (must be non-empty)
 * @param {string} [data.algorithm='jaccard'] - 'jaccard', 'cosine' or
 *   'levenshtein'; any other value computes all three
 * @returns {Promise<Object>} Object holding `jaccardSimilarity`,
 *   `cosineSimilarity` and/or `levenshteinDistance`
 * @throws {Error} When either text is missing or empty
 */
async function handleCalculateSimilarity(data) {
  const { text1, text2, algorithm = 'jaccard' } = data;

  if (!(text1 && text2)) {
    throw new Error('Two text inputs required for similarity calculation');
  }

  if (algorithm === 'jaccard') {
    return { jaccardSimilarity: calculateJaccardSimilarity(text1, text2) };
  }

  if (algorithm === 'cosine') {
    return { cosineSimilarity: calculateCosineSimilarity(text1, text2) };
  }

  if (algorithm === 'levenshtein') {
    return { levenshteinDistance: calculateLevenshteinDistance(text1, text2) };
  }

  // Unrecognised algorithm name: report every metric.
  const metrics = {};
  metrics.jaccardSimilarity = calculateJaccardSimilarity(text1, text2);
  metrics.cosineSimilarity = calculateCosineSimilarity(text1, text2);
  metrics.levenshteinDistance = calculateLevenshteinDistance(text1, text2);
  return metrics;
}
|
|
352
|
+
|
|
353
|
+
/**
 * Sort a list of URLs into parseable and unparseable ones.
 *
 * @param {Object} data - Task data
 * @param {Array} data.urls - Candidate URLs
 * @param {Object} [data.options] - `normalizeUrls` (default true) and
 *   `extractDomains` (default true). A `checkReachability` option may be
 *   supplied but is not consulted by this function.
 * @returns {Promise<Object>} `{ valid, invalid, normalized, domains }` where
 *   `invalid` holds `{ url, error }` entries and `domains` is a de-duplicated
 *   array of hostnames
 * @throws {Error} When `urls` is not an array
 */
async function handleValidateUrls(data) {
  const { urls, options = {} } = data;
  const {
    normalizeUrls: wantNormalized = true,
    extractDomains: wantDomains = true
  } = options;

  if (!Array.isArray(urls)) {
    throw new Error('URLs must be provided as an array');
  }

  const accepted = [];
  const rejected = [];
  const canonical = [];
  const hostnames = new Set();

  for (const candidate of urls) {
    let parsed;
    try {
      parsed = new URL(candidate);
    } catch (error) {
      rejected.push({ url: candidate, error: error.message });
      continue;
    }

    accepted.push(candidate);

    if (wantNormalized) {
      canonical.push(normalizeUrl(candidate));
    }

    if (wantDomains) {
      hostnames.add(parsed.hostname);
    }
  }

  return {
    valid: accepted,
    invalid: rejected,
    normalized: canonical,
    domains: Array.from(hostnames)
  };
}
|
|
398
|
+
|
|
399
|
+
/**
 * Normalize a data structure and optionally validate it against a schema.
 *
 * @param {Object} data - Task data
 * @param {*} data.input - Value to normalize
 * @param {Object} [data.schema] - Optional schema; when given, the normalized
 *   value is checked with validateAgainstSchema
 * @param {Object} [data.options] - `removeNulls` (default true), `trimStrings`
 *   (default true), `lowercaseKeys` (default false)
 * @returns {Promise<*>} The normalized value. With a schema, an `isValid` flag
 *   is attached to it when it is an object or array; a normalized value that
 *   cannot carry properties (string, number, undefined, ...) is returned as
 *   `{ data, isValid }` instead.
 */
async function handleNormalizeData(data) {
  const { input, schema, options = {} } = data;
  const { removeNulls = true, trimStrings = true, lowercaseKeys = false } = options;

  const normalized = normalizeObject(input, { removeNulls, trimStrings, lowercaseKeys });

  if (!schema) {
    return normalized;
  }

  // Validate the normalized value itself. normalizeObject returns the value
  // directly (there is no `.data` wrapper), so checking `normalized.data`
  // would always validate `undefined`.
  const isValid = validateAgainstSchema(normalized, schema);

  if (normalized !== null && typeof normalized === 'object') {
    normalized.isValid = isValid;
    return normalized;
  }

  // Primitives and undefined cannot hold an `isValid` property (assigning one
  // throws in strict mode), so wrap them.
  return { data: normalized, isValid };
}
|
|
417
|
+
|
|
418
|
+
/**
 * Hash the input with each requested algorithm.
 *
 * @param {Object} data - Task data
 * @param {*} data.input - Value to hash; anything that is not a string is
 *   serialized with JSON.stringify first
 * @param {string[]} [data.algorithms=['md5','sha1','sha256']] - Algorithm
 *   names understood by Node's `crypto.createHash`
 * @returns {Promise<Object>} Map of algorithm name to hex digest, or to
 *   `{ error }` when hashing with that algorithm failed
 */
async function handleComputeHash(data) {
  const { input, algorithms = ['md5', 'sha1', 'sha256'] } = data;

  const cryptoModule = await import('crypto');
  const payload = typeof input === 'string' ? input : JSON.stringify(input);

  const digests = {};

  for (const name of algorithms) {
    let outcome;
    try {
      outcome = cryptoModule.createHash(name).update(payload).digest('hex');
    } catch (error) {
      // One unsupported algorithm must not prevent the others from running.
      outcome = { error: error.message };
    }
    digests[name] = outcome;
  }

  return digests;
}
|
|
443
|
+
|
|
444
|
+
// Helper functions
|
|
445
|
+
|
|
446
|
+
/**
 * Estimate the number of syllables in an English word by counting vowel groups.
 * Tokens without any a-z letters (numbers, symbols) count as one syllable.
 *
 * @param {string} word - Single whitespace-delimited token
 * @returns {number} Estimated syllable count, at least 1
 */
function countSyllables(word) {
  const letters = word.toLowerCase().replace(/[^a-z]/g, '');
  if (letters.length === 0) {
    return 1;
  }

  // A trailing "e" is usually silent ("made"), except in consonant + "le"
  // endings ("table"), where it carries a syllable.
  const hasSilentE = letters.endsWith('e') && !/[^aeiouy]le$/.test(letters);
  const stem = hasSilentE ? letters.slice(0, -1) : letters;

  const vowelGroups = stem.match(/[aeiouy]+/g);
  return Math.max(1, vowelGroups ? vowelGroups.length : 0);
}

/**
 * Compute basic counts and the Flesch Reading Ease score of a text.
 *
 * Flesch Reading Ease is
 *   206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words).
 * The last term is based on syllables per word, not characters per word;
 * using characters pushes every real text far below zero. Syllables are
 * estimated heuristically, so the score is an approximation tuned for English.
 *
 * @param {string} text - Text to score
 * @returns {Object|null} Counts, averages, `readabilityScore` and
 *   `readabilityLevel`; null for non-string or empty input, or when no
 *   sentence or word can be found
 */
function calculateReadabilityScore(text) {
  if (!text || typeof text !== 'string') {
    return null;
  }

  const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 0);
  const words = text.split(/\s+/).filter(w => w.length > 0);
  const characters = text.length;
  const charactersNoSpaces = text.replace(/\s/g, '').length;

  if (sentences.length === 0 || words.length === 0) {
    return null;
  }

  const syllables = words.reduce((total, word) => total + countSyllables(word), 0);

  const avgWordsPerSentence = words.length / sentences.length;
  const avgCharsPerWord = charactersNoSpaces / words.length;
  const avgSyllablesPerWord = syllables / words.length;

  // Flesch Reading Ease score
  const readabilityScore = 206.835 - (1.015 * avgWordsPerSentence) - (84.6 * avgSyllablesPerWord);

  return {
    sentences: sentences.length,
    words: words.length,
    characters,
    charactersNoSpaces,
    syllables,
    avgWordsPerSentence: Math.round(avgWordsPerSentence * 100) / 100,
    avgCharsPerWord: Math.round(avgCharsPerWord * 100) / 100,
    avgSyllablesPerWord: Math.round(avgSyllablesPerWord * 100) / 100,
    readabilityScore: Math.round(readabilityScore * 100) / 100,
    readabilityLevel: getReadabilityLevel(readabilityScore)
  };
}

/**
 * Translate a Flesch Reading Ease score into a descriptive label.
 *
 * @param {number} score - Reading ease score (higher means easier)
 * @returns {string} One of 'Very Easy', 'Easy', 'Fairly Easy', 'Standard',
 *   'Fairly Difficult', 'Difficult' or 'Very Difficult'
 */
function getReadabilityLevel(score) {
  if (score >= 90) return 'Very Easy';
  if (score >= 80) return 'Easy';
  if (score >= 70) return 'Fairly Easy';
  if (score >= 60) return 'Standard';
  if (score >= 50) return 'Fairly Difficult';
  if (score >= 30) return 'Difficult';
  return 'Very Difficult';
}
|
|
487
|
+
|
|
488
|
+
/**
 * Pull every parseable JSON-LD script out of an HTML string.
 *
 * @param {string} html - Markup to scan
 * @returns {Object} `{ jsonLd, microdata, schemaOrg }`; only `jsonLd` is
 *   populated here, the other two arrays are always returned empty
 */
function extractStructuredData(html) {
  const $ = cheerio.load(html);
  const jsonLd = [];

  for (const script of $('script[type="application/ld+json"]').toArray()) {
    try {
      const raw = $(script).html();
      if (raw) {
        jsonLd.push(JSON.parse(raw));
      }
    } catch (error) {
      // Skip invalid JSON-LD
    }
  }

  return {
    jsonLd,
    microdata: [],
    schemaOrg: []
  };
}
|
|
511
|
+
|
|
512
|
+
/**
 * Build a microdata item from an element and its `[itemprop]` descendants.
 *
 * @param {Function} $ - cheerio instance the element belongs to
 * @param {Object} element - Element to read
 * @returns {Object|null} `{ type, properties }` where `properties` maps each
 *   itemprop name to the array of trimmed text values found for it; null when
 *   the element has no `itemtype` attribute
 */
function extractMicrodataItem($, element) {
  const $element = $(element);
  const itemType = $element.attr('itemtype');

  if (!itemType) return null;

  // Property names come from untrusted markup. Collecting them in a Map keeps
  // names such as "constructor" or "__proto__" from colliding with members
  // inherited from Object.prototype (which previously made `.push` throw).
  const valuesByProperty = new Map();

  $element.find('[itemprop]').each((_, propElement) => {
    const $prop = $(propElement);
    const propName = $prop.attr('itemprop');
    const propValue = $prop.text().trim();

    if (!propName || !propValue) {
      return;
    }

    const values = valuesByProperty.get(propName);
    if (values) {
      values.push(propValue);
    } else {
      valuesByProperty.set(propName, [propValue]);
    }
  });

  return {
    type: itemType,
    // Object.fromEntries defines own data properties, so every collected name
    // ends up as a regular key of the result.
    properties: Object.fromEntries(valuesByProperty)
  };
}
|
|
538
|
+
|
|
539
|
+
/**
 * Describe an element annotated with `typeof` and/or `property` attributes.
 *
 * @param {Function} $ - cheerio instance the element belongs to
 * @param {Object} element - Element to read
 * @returns {Object|null} Object with the `typeof`, `property` and `content`
 *   (trimmed text) members that are present and non-empty; null when the
 *   element carries neither `typeof` nor `property`
 */
function extractSchemaOrgItem($, element) {
  const $node = $(element);
  const typeAttr = $node.attr('typeof');
  const propertyAttr = $node.attr('property');

  if (!(typeAttr || propertyAttr)) {
    return null;
  }

  const text = $node.text().trim();

  return {
    ...(typeAttr ? { typeof: typeAttr } : {}),
    ...(propertyAttr ? { property: propertyAttr } : {}),
    ...(text ? { content: text } : {})
  };
}
|
|
557
|
+
|
|
558
|
+
/**
 * Simple lexicon-based sentiment analysis: counts occurrences of a small set
 * of positive and negative English words.
 *
 * @param {string} text - Text to score
 * @returns {Object} `{ sentiment, score, positiveWords, negativeWords }` where
 *   `score` is (positive - negative) / (positive + negative) rounded to two
 *   decimals (0 when no lexicon word occurs) and `sentiment` is 'positive'
 *   above 0.1, 'negative' below -0.1 and 'neutral' otherwise
 */
function analyzeSentiment(text) {
  const positiveWords = new Set(['good', 'great', 'excellent', 'amazing', 'wonderful', 'fantastic', 'awesome', 'love', 'best', 'perfect']);
  const negativeWords = new Set(['bad', 'terrible', 'awful', 'horrible', 'hate', 'worst', 'disappointing', 'poor', 'negative', 'sad']);

  // Tokenize on runs of letters rather than on whitespace, so punctuation
  // attached to a word ("great!", "bad,") does not hide a lexicon match.
  const words = text.toLowerCase().match(/[a-z]+/g) ?? [];

  let positiveCount = 0;
  let negativeCount = 0;

  for (const word of words) {
    if (positiveWords.has(word)) positiveCount++;
    if (negativeWords.has(word)) negativeCount++;
  }

  const total = positiveCount + negativeCount;
  let sentiment = 'neutral';
  let score = 0;

  if (total > 0) {
    score = (positiveCount - negativeCount) / total;
    if (score > 0.1) sentiment = 'positive';
    else if (score < -0.1) sentiment = 'negative';
  }

  return {
    sentiment,
    score: Math.round(score * 100) / 100,
    positiveWords: positiveCount,
    negativeWords: negativeCount
  };
}
|
|
589
|
+
|
|
590
|
+
/**
 * Frequency-based keyword extraction over the nouns, adjectives and verbs of
 * a compromise document.
 *
 * @param {Object} doc - Document exposing `nouns()`, `adjectives()` and
 *   `verbs()`, each returning a view with `out('array')`
 * @returns {Array<{word: string, frequency: number}>} Up to ten lower-cased
 *   terms, most frequent first; ties keep first-seen order
 */
function extractKeywords(doc) {
  const terms = [
    ...doc.nouns().out('array'),
    ...doc.adjectives().out('array'),
    ...doc.verbs().out('array')
  ];

  // A Map is used instead of a plain object: terms taken from page text can
  // be "constructor", "toString", etc., which on a plain object would read an
  // inherited member and corrupt the count.
  const counts = new Map();
  for (const term of terms) {
    const normalized = term.toLowerCase();
    counts.set(normalized, (counts.get(normalized) ?? 0) + 1);
  }

  // Sort by frequency and return top keywords
  return [...counts]
    .sort(([, a], [, b]) => b - a)
    .slice(0, 10)
    .map(([word, frequency]) => ({ word, frequency }));
}
|
|
612
|
+
|
|
613
|
+
/**
 * Count characters, words, sentences and paragraphs in a text.
 *
 * Sentences are delimited by runs of `.`, `!` or `?`, words by whitespace and
 * paragraphs by blank lines. The averages divide by 1 when the divisor count
 * is zero.
 *
 * @param {string} text - Text to measure
 * @returns {Object} `{ characters, charactersNoSpaces, words, sentences,
 *   paragraphs, averageWordsPerSentence, averageSentencesPerParagraph }`
 */
function calculateTextMetrics(text) {
  const isNonBlank = (chunk) => chunk.trim().length > 0;

  const sentenceCount = text.split(/[.!?]+/).filter(isNonBlank).length;
  const wordCount = text.split(/\s+/).filter((token) => token.length > 0).length;
  const paragraphCount = text.split(/\n\s*\n/).filter(isNonBlank).length;

  const sentenceDivisor = sentenceCount === 0 ? 1 : sentenceCount;
  const paragraphDivisor = paragraphCount === 0 ? 1 : paragraphCount;

  return {
    characters: text.length,
    charactersNoSpaces: text.replace(/\s/g, '').length,
    words: wordCount,
    sentences: sentenceCount,
    paragraphs: paragraphCount,
    averageWordsPerSentence: wordCount / sentenceDivisor,
    averageSentencesPerParagraph: sentenceCount / paragraphDivisor
  };
}
|
|
628
|
+
|
|
629
|
+
/**
 * Jaccard similarity of the two texts' lower-cased, whitespace-split token
 * sets: |intersection| / |union|.
 *
 * @param {string} text1 - First text
 * @param {string} text2 - Second text
 * @returns {number} Value between 0 and 1
 */
function calculateJaccardSimilarity(text1, text2) {
  const tokens1 = new Set(text1.toLowerCase().split(/\s+/));
  const tokens2 = new Set(text2.toLowerCase().split(/\s+/));

  let shared = 0;
  for (const token of tokens1) {
    if (tokens2.has(token)) {
      shared += 1;
    }
  }

  // |A ∪ B| = |A| + |B| - |A ∩ B|
  const combined = tokens1.size + tokens2.size - shared;

  return shared / combined;
}
|
|
638
|
+
|
|
639
|
+
/**
 * Cosine similarity of the two texts' term-frequency vectors, using
 * lower-cased, whitespace-split tokens.
 *
 * Term counts are gathered in one pass per text. Re-scanning a whole text for
 * every vocabulary word would make the cost grow with vocabulary size times
 * text length, which is prohibitive for page-sized inputs.
 *
 * @param {string} text1 - First text
 * @param {string} text2 - Second text
 * @returns {number} Value between 0 and 1
 */
function calculateCosineSimilarity(text1, text2) {
  const countTerms = (text) => {
    const counts = new Map();
    for (const word of text.toLowerCase().split(/\s+/)) {
      counts.set(word, (counts.get(word) ?? 0) + 1);
    }
    return counts;
  };

  const magnitudeOf = (counts) => {
    let sumOfSquares = 0;
    for (const count of counts.values()) {
      sumOfSquares += count * count;
    }
    return Math.sqrt(sumOfSquares);
  };

  const counts1 = countTerms(text1);
  const counts2 = countTerms(text2);

  // Only terms present in both texts contribute to the dot product.
  let dotProduct = 0;
  for (const [word, count] of counts1) {
    dotProduct += count * (counts2.get(word) ?? 0);
  }

  return dotProduct / (magnitudeOf(counts1) * magnitudeOf(counts2));
}
|
|
654
|
+
|
|
655
|
+
/**
 * Levenshtein edit distance between two strings (unit cost for insertion,
 * deletion and substitution), compared by UTF-16 code unit.
 *
 * Only two rows of the dynamic-programming table are kept at a time, so memory
 * grows with the length of `text2` instead of with len1 × len2.
 *
 * @param {string} text1 - First text
 * @param {string} text2 - Second text
 * @returns {number} Minimum number of single-character edits
 */
function calculateLevenshteinDistance(text1, text2) {
  const len1 = text1.length;
  const len2 = text2.length;

  // Row for the empty prefix of text1: turning "" into text2[0..j) takes j insertions.
  let previous = Array.from({ length: len2 + 1 }, (_, j) => j);
  let current = new Array(len2 + 1);

  for (let i = 1; i <= len1; i++) {
    // Turning text1[0..i) into "" takes i deletions.
    current[0] = i;

    for (let j = 1; j <= len2; j++) {
      const cost = text1[i - 1] === text2[j - 1] ? 0 : 1;
      current[j] = Math.min(
        previous[j] + 1, // deletion
        current[j - 1] + 1, // insertion
        previous[j - 1] + cost // substitution
      );
    }

    // Reuse the old row as scratch space for the next iteration.
    [previous, current] = [current, previous];
  }

  return previous[len2];
}
|
|
681
|
+
|
|
682
|
+
/**
 * Produce a canonical string form of a URL: a single trailing slash is removed
 * from the path (an empty path becomes "/"), the hostname is lower-cased and
 * the default port for http/https is cleared.
 *
 * @param {string} url - URL to normalize
 * @returns {string} Normalized URL, or the input unchanged when it cannot be
 *   parsed
 */
function normalizeUrl(url) {
  try {
    const parsed = new URL(url);

    const trimmedPath = parsed.pathname.replace(/\/$/, '');
    parsed.pathname = trimmedPath === '' ? '/' : trimmedPath;

    parsed.hostname = parsed.hostname.toLowerCase();

    const usesDefaultPort = parsed.protocol === 'http:'
      ? parsed.port === '80'
      : parsed.protocol === 'https:' && parsed.port === '443';
    if (usesDefaultPort) {
      parsed.port = '';
    }

    return parsed.href;
  } catch {
    return url;
  }
}
|
|
697
|
+
|
|
698
|
+
/**
 * Recursively clean a value.
 *
 * - null/undefined become undefined when `removeNulls` is set, otherwise they
 *   are returned as-is
 * - arrays are normalized element by element; elements that normalize to
 *   undefined are dropped
 * - objects are rebuilt from their own enumerable entries, optionally with
 *   lower-cased keys; entries that normalize to undefined are dropped
 * - strings are trimmed when `trimStrings` is set
 * - every other value is returned unchanged
 *
 * @param {*} obj - Value to normalize
 * @param {Object} options - `{ removeNulls, trimStrings, lowercaseKeys }`
 * @returns {*} Normalized copy of the value
 */
function normalizeObject(obj, options) {
  const { removeNulls, trimStrings, lowercaseKeys } = options;

  if (obj === null || obj === undefined) {
    if (removeNulls) {
      return undefined;
    }
    return obj;
  }

  if (Array.isArray(obj)) {
    const cleanedItems = [];
    for (const entry of obj) {
      const cleaned = normalizeObject(entry, options);
      if (cleaned !== undefined) {
        cleanedItems.push(cleaned);
      }
    }
    return cleanedItems;
  }

  switch (typeof obj) {
    case 'object': {
      const rebuilt = {};
      for (const key of Object.keys(obj)) {
        const cleaned = normalizeObject(obj[key], options);
        if (cleaned === undefined) {
          continue;
        }
        rebuilt[lowercaseKeys ? key.toLowerCase() : key] = cleaned;
      }
      return rebuilt;
    }
    case 'string':
      return trimStrings ? obj.trim() : obj;
    default:
      return obj;
  }
}
|
|
729
|
+
|
|
730
|
+
/**
 * Minimal schema check: when the schema is an object with a truthy `type`,
 * the data's `typeof` must equal that type; any other schema accepts
 * everything. This is a placeholder, not a full schema validator.
 *
 * @param {*} data - Value to check
 * @param {*} schema - Schema description, e.g. `{ type: 'string' }`
 * @returns {boolean} Whether the data passes; false if inspecting the schema
 *   throws (for instance when the schema is null)
 */
function validateAgainstSchema(data, schema) {
  try {
    const declaresType = typeof schema === 'object' && schema.type;
    return declaresType ? typeof data === schema.type : true;
  } catch (error) {
    return false;
  }
}
|
|
741
|
+
|
|
742
|
+
// Signal that worker is ready
|
|
743
|
+
// Posted as the final top-level statement of the module, i.e. only after the
// 'message' listener has been registered on parentPort.
// NOTE(review): presumably the pool on the main thread waits for this message
// before dispatching tasks — confirm against WorkerPool.
parentPort.postMessage({ type: 'ready' });
|