crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,743 @@
1
+ /**
2
+ * Worker thread script for CPU-intensive tasks
3
+ * Handles HTML parsing, content analysis, and other computationally expensive operations
4
+ */
5
+
6
+ import { parentPort } from 'worker_threads';
7
+ import * as cheerio from 'cheerio';
8
+ import { JSDOM } from 'jsdom';
9
+ import { Readability } from '@mozilla/readability';
10
+ import compromise from 'compromise';
11
+ import { franc } from 'franc';
12
+
13
// Task handlers
// Maps the `type` field of an incoming task message to the function that services it.
const taskHandlers = {
  parseHtml: handleParseHtml,
  extractContent: handleExtractContent,
  analyzeText: handleAnalyzeText,
  processStructuredData: handleProcessStructuredData,
  calculateSimilarity: handleCalculateSimilarity,
  validateUrls: handleValidateUrls,
  normalizeData: handleNormalizeData,
  computeHash: handleComputeHash
};

// Listen for messages from main thread.
// Every task message is answered with exactly one reply carrying the same taskId:
// `{ taskId, result }` on success or `{ taskId, error, stack }` on failure.
parentPort.on('message', async (message) => {
  // A null/primitive message cannot carry a taskId, so there is nothing to reply to.
  // Rejecting it here also keeps the destructuring below from throwing inside this
  // async listener, which would surface as an unhandled rejection.
  if (message === null || typeof message !== 'object') {
    console.error('Worker received message without taskId:', message);
    return;
  }

  // Handle ready signal and ignore it
  if (message.type === 'ready') {
    return;
  }

  const { taskId, type, data } = message;

  // Validate required fields
  if (!taskId) {
    console.error('Worker received message without taskId:', message);
    return;
  }

  if (!type) {
    parentPort.postMessage({
      taskId,
      error: 'No task type specified',
      stack: new Error('No task type specified').stack
    });
    return;
  }

  try {
    // Own-property lookup only: a bare `taskHandlers[type]` would also resolve
    // inherited members such as "toString" or "constructor" and run them as tasks.
    const handler = Object.prototype.hasOwnProperty.call(taskHandlers, type)
      ? taskHandlers[type]
      : undefined;
    if (!handler) {
      throw new Error(`Unknown task type: ${type}`);
    }

    const result = await handler(data);

    parentPort.postMessage({
      taskId,
      result
    });
  } catch (error) {
    // Handlers normally throw Error instances, but guard against other thrown
    // values so the reply always contains a usable message.
    const isError = error instanceof Error;
    parentPort.postMessage({
      taskId,
      error: isError ? error.message : String(error),
      stack: isError ? error.stack : undefined
    });
  }
});
69
+
70
/**
 * Parse an HTML string with cheerio and collect the parts the caller asked for.
 *
 * Each switch in `data.options` defaults to true: `extractText` (adds `text` and
 * `title`), `extractLinks` (adds `links`), `extractImages` (adds `images`),
 * `extractMeta` (adds `meta`) and `removeScripts`.
 *
 * @param {Object} data - Task data
 * @param {string} data.html - Markup to parse
 * @param {Object} [data.options] - Feature switches described above
 * @returns {Promise<Object>} Object holding only the sections that were enabled
 */
async function handleParseHtml(data) {
  const { html, options = {} } = data;
  const {
    extractText = true,
    extractLinks = true,
    extractImages = true,
    extractMeta = true,
    removeScripts = true
  } = options;

  const $ = cheerio.load(html);
  const parsed = {};

  // Strip <script>/<style> elements before anything else is read from the document.
  if (removeScripts) {
    $('script, style').remove();
  }

  if (extractText) {
    parsed.text = $('body').text().trim();
    parsed.title = $('title').text().trim();
  }

  if (extractLinks) {
    parsed.links = $('a[href]').toArray().map((node) => {
      const anchor = $(node);
      return {
        href: anchor.attr('href'),
        text: anchor.text().trim(),
        title: anchor.attr('title') || null
      };
    });
  }

  if (extractImages) {
    parsed.images = $('img[src]').toArray().map((node) => {
      const image = $(node);
      return {
        src: image.attr('src'),
        alt: image.attr('alt') || null,
        title: image.attr('title') || null,
        width: image.attr('width') || null,
        height: image.attr('height') || null
      };
    });
  }

  if (extractMeta) {
    const meta = {};
    for (const node of $('meta').toArray()) {
      const tag = $(node);
      // A tag is keyed by `name`, falling back to `property`.
      const key = tag.attr('name') || tag.attr('property');
      const value = tag.attr('content');
      if (key && value) {
        meta[key] = value;
      }
    }
    parsed.meta = meta;
  }

  return parsed;
}
142
+
143
/**
 * Extract main content using Mozilla Readability
 *
 * Options (all default to true):
 *  - `removeBoilerplate`: run Readability and expose its output as `article`
 *  - `calculateReadability`: add `readability` for the article text (only when an
 *    article with text content was produced)
 *  - `extractStructuredData`: add `structuredData` from the raw HTML
 *
 * @param {Object} data - Task data
 * @param {string} data.html - Markup to process
 * @param {string} data.url - Passed to JSDOM as the document URL
 * @param {Object} [data.options] - Feature switches described above
 * @returns {Promise<Object>} - Extracted content
 */
async function handleExtractContent(data) {
  const { html, url, options = {} } = data;
  // The option is renamed while destructuring: a local called `extractStructuredData`
  // would shadow the helper function of the same name, and calling it would throw
  // "extractStructuredData is not a function".
  const {
    removeBoilerplate = true,
    extractStructuredData: shouldExtractStructuredData = true,
    calculateReadability = true
  } = options;

  const result = {};

  // Create JSDOM instance
  const dom = new JSDOM(html, { url });
  const document = dom.window.document;

  // Use Readability for main content extraction
  if (removeBoilerplate) {
    const reader = new Readability(document, {
      debug: false,
      maxElemsToDivide: 300,
      charThreshold: 500
    });

    const article = reader.parse();
    if (article) {
      result.article = {
        title: article.title,
        content: article.content,
        textContent: article.textContent,
        length: article.length,
        excerpt: article.excerpt,
        byline: article.byline,
        dir: article.dir,
        siteName: article.siteName,
        lang: article.lang
      };

      // Calculate readability if requested
      if (calculateReadability && article.textContent) {
        result.readability = calculateReadabilityScore(article.textContent);
      }
    }
  }

  // Extract structured data
  if (shouldExtractStructuredData) {
    result.structuredData = extractStructuredData(html);
  }

  return result;
}
198
+
199
/**
 * Analyze text content for various metrics
 *
 * Options (all default to true): `detectLanguage` (adds `language`),
 * `extractEntities` (adds `entities`), `analyzeSentiment` (adds `sentiment`),
 * `extractKeywords` (adds `keywords`) and `calculateMetrics` (adds `metrics`).
 *
 * @param {Object} data - Task data
 * @param {string} data.text - Non-empty text to analyze
 * @param {Object} [data.options] - Feature switches described above
 * @returns {Promise<Object>} - Text analysis results
 * @throws {Error} When `data.text` is empty or not a string
 */
async function handleAnalyzeText(data) {
  const { text, options = {} } = data;
  // `analyzeSentiment` and `extractKeywords` are renamed while destructuring:
  // locals with those names would shadow the helper functions of the same name,
  // and calling them would throw "... is not a function".
  const {
    detectLanguage = true,
    extractEntities = true,
    analyzeSentiment: shouldAnalyzeSentiment = true,
    extractKeywords: shouldExtractKeywords = true,
    calculateMetrics = true
  } = options;

  const result = {};

  if (!text || typeof text !== 'string') {
    throw new Error('Invalid text input for analysis');
  }

  // Detect language; a detector failure is reported as 'unknown' instead of failing the task
  if (detectLanguage) {
    try {
      result.language = franc(text);
    } catch (error) {
      result.language = 'unknown';
    }
  }

  // Use compromise for NLP analysis
  const doc = compromise(text);

  // Extract entities
  if (extractEntities) {
    result.entities = {
      people: doc.people().out('array'),
      places: doc.places().out('array'),
      organizations: doc.organizations().out('array'),
      topics: doc.topics().out('array')
    };
  }

  // Basic sentiment analysis
  if (shouldAnalyzeSentiment) {
    result.sentiment = analyzeSentiment(text);
  }

  // Extract keywords
  if (shouldExtractKeywords) {
    result.keywords = extractKeywords(doc);
  }

  // Calculate text metrics
  if (calculateMetrics) {
    result.metrics = calculateTextMetrics(text);
  }

  return result;
}
259
+
260
/**
 * Collect structured data embedded in an HTML document.
 *
 * Options `extractJsonLd`, `extractMicrodata` and `extractSchemaOrg` all default
 * to true. A `validateSchema` option may be supplied, but nothing in this
 * function reads it.
 *
 * @param {Object} data - Task data
 * @param {string} data.html - Markup to inspect
 * @param {Object} [data.options] - Feature switches described above
 * @returns {Promise<Object>} `{ jsonLd, microdata, schemaOrg }`, each an array
 */
async function handleProcessStructuredData(data) {
  const { html, options = {} } = data;
  const {
    extractJsonLd = true,
    extractMicrodata = true,
    extractSchemaOrg = true
  } = options;

  const $ = cheerio.load(html);
  const jsonLd = [];
  const microdata = [];
  const schemaOrg = [];

  if (extractJsonLd) {
    $('script[type="application/ld+json"]').each((_, scriptNode) => {
      try {
        const raw = $(scriptNode).html();
        if (raw) {
          jsonLd.push(JSON.parse(raw));
        }
      } catch (error) {
        // Skip invalid JSON-LD
      }
    });
  }

  if (extractMicrodata) {
    $('[itemscope]').each((_, scopeNode) => {
      const entry = extractMicrodataItem($, scopeNode);
      if (entry) {
        microdata.push(entry);
      }
    });
  }

  if (extractSchemaOrg) {
    $('[typeof], [property], [vocab]').each((_, markedNode) => {
      const entry = extractSchemaOrgItem($, markedNode);
      if (entry) {
        schemaOrg.push(entry);
      }
    });
  }

  return { jsonLd, microdata, schemaOrg };
}
318
+
319
/**
 * Compare two texts with the requested similarity algorithm.
 *
 * `algorithm` may be 'jaccard' (the default), 'cosine' or 'levenshtein'; any
 * other value computes all three metrics.
 *
 * @param {Object} data - Task data
 * @param {string} data.text1 - First text
 * @param {string} data.text2 - Second text
 * @param {string} [data.algorithm='jaccard'] - Metric selector
 * @returns {Promise<Object>} Object with `jaccardSimilarity`, `cosineSimilarity`
 *   and/or `levenshteinDistance`, depending on the selector
 * @throws {Error} When either text is missing or empty
 */
async function handleCalculateSimilarity(data) {
  const { text1, text2, algorithm = 'jaccard' } = data;

  if (!(text1 && text2)) {
    throw new Error('Two text inputs required for similarity calculation');
  }

  // An unrecognised selector means "compute every metric".
  const computeAll = !['jaccard', 'cosine', 'levenshtein'].includes(algorithm);
  const metrics = {};

  if (computeAll || algorithm === 'jaccard') {
    metrics.jaccardSimilarity = calculateJaccardSimilarity(text1, text2);
  }
  if (computeAll || algorithm === 'cosine') {
    metrics.cosineSimilarity = calculateCosineSimilarity(text1, text2);
  }
  if (computeAll || algorithm === 'levenshtein') {
    metrics.levenshteinDistance = calculateLevenshteinDistance(text1, text2);
  }

  return metrics;
}
352
+
353
/**
 * Sort a list of URL strings into parseable and unparseable ones.
 *
 * Options: `normalizeUrls` (default true) fills `normalized` via normalizeUrl();
 * `extractDomains` (default true) collects the distinct hostnames. A
 * `checkReachability` option may be supplied, but nothing in this function reads
 * it and no network request is made here.
 *
 * @param {Object} data - Task data
 * @param {string[]} data.urls - Candidate URLs
 * @param {Object} [data.options] - Feature switches described above
 * @returns {Promise<Object>} `{ valid, invalid, normalized, domains }`; `invalid`
 *   entries have the shape `{ url, error }`
 * @throws {Error} When `data.urls` is not an array
 */
async function handleValidateUrls(data) {
  const { urls, options = {} } = data;
  const { normalizeUrls = true, extractDomains = true } = options;

  if (!Array.isArray(urls)) {
    throw new Error('URLs must be provided as an array');
  }

  const valid = [];
  const invalid = [];
  const normalized = [];
  const hostnames = new Set();

  urls.forEach((candidate) => {
    try {
      // The URL constructor is the validity check: it throws on unparseable input.
      const parsed = new URL(candidate);
      valid.push(candidate);

      if (normalizeUrls) {
        normalized.push(normalizeUrl(candidate));
      }

      if (extractDomains) {
        hostnames.add(parsed.hostname);
      }
    } catch (error) {
      invalid.push({ url: candidate, error: error.message });
    }
  });

  return {
    valid,
    invalid,
    normalized,
    domains: Array.from(hostnames)
  };
}
398
+
399
/**
 * Normalize data structures
 *
 * Runs `input` through normalizeObject() and returns the normalized value. When
 * a `schema` is supplied, the normalized object additionally receives an
 * `isValid` flag from validateAgainstSchema().
 *
 * @param {Object} data - Task data
 * @param {*} data.input - Value to normalize
 * @param {Object} [data.schema] - Optional schema to validate against
 * @param {Object} [data.options] - `removeNulls` (default true), `trimStrings`
 *   (default true), `lowercaseKeys` (default false)
 * @returns {Promise<*>} - Normalized data
 * @throws {TypeError} When a schema is supplied but the normalized value is not
 *   an object, so there is nowhere to attach `isValid`
 */
async function handleNormalizeData(data) {
  const { input, schema, options = {} } = data;
  const { removeNulls = true, trimStrings = true, lowercaseKeys = false } = options;

  const normalized = normalizeObject(input, { removeNulls, trimStrings, lowercaseKeys });

  // Validate against schema if provided
  if (schema) {
    // A primitive or missing value cannot carry the flag; fail with an explicit
    // message instead of an incidental property-access error.
    if (normalized === null || typeof normalized !== 'object') {
      throw new TypeError('Schema validation requires the normalized input to be an object or array');
    }
    // normalizeObject() returns the normalized value itself (there is no `.data`
    // wrapper), so that value is what has to be validated.
    // NOTE(review): confirm no caller relied on validating an input-level `data` key.
    normalized.isValid = validateAgainstSchema(normalized, schema);
  }

  return normalized;
}
417
+
418
/**
 * Hash the input once per requested algorithm.
 *
 * Non-string input is serialized with JSON.stringify before hashing. An
 * algorithm that fails (for example an unsupported name) does not abort the
 * task; its entry becomes `{ error: message }` instead of a hex digest.
 *
 * @param {Object} data - Task data
 * @param {*} data.input - Value to hash
 * @param {string[]} [data.algorithms=['md5', 'sha1', 'sha256']] - Algorithm names
 *   handed to crypto.createHash
 * @returns {Promise<Object>} Map of algorithm name to hex digest or error object
 */
async function handleComputeHash(data) {
  const { input, algorithms = ['md5', 'sha1', 'sha256'] } = data;

  const crypto = await import('crypto');
  const payload = typeof input === 'string' ? input : JSON.stringify(input);

  const digests = {};
  for (const name of algorithms) {
    try {
      digests[name] = crypto.createHash(name).update(payload).digest('hex');
    } catch (error) {
      digests[name] = { error: error.message };
    }
  }

  return digests;
}
443
+
444
+ // Helper functions
445
+
446
/**
 * Estimate the number of syllables in an English word.
 *
 * Heuristic: strip non-letters, drop common silent endings, then count groups
 * of vowels. Every token counts as at least one syllable.
 *
 * @param {string} word - A single whitespace-delimited token
 * @returns {number} Estimated syllable count (>= 1)
 */
function estimateSyllables(word) {
  const letters = word.toLowerCase().replace(/[^a-z]/g, '');
  if (letters.length <= 3) {
    return 1;
  }

  const core = letters
    .replace(/(?:[^laeiouy]es|ed|[^laeiouy]e)$/, '')
    .replace(/^y/, '');
  const vowelGroups = core.match(/[aeiouy]{1,2}/g);
  return vowelGroups ? vowelGroups.length : 1;
}

/**
 * Compute basic counts and a Flesch Reading Ease score for a text.
 *
 * The score is 206.835 - 1.015 * (words per sentence) - 84.6 * (syllables per
 * word). Syllables are estimated heuristically, so the score is approximate and
 * only meaningful for English text.
 *
 * @param {string} text - Text to score
 * @returns {Object|null} Counts, averages (rounded to 2 decimals), the score and
 *   its level label; null when the text is empty, not a string, or has no
 *   sentences/words
 */
function calculateReadabilityScore(text) {
  if (!text || typeof text !== 'string') {
    return null;
  }

  const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 0);
  const words = text.split(/\s+/).filter(w => w.length > 0);
  const characters = text.length;
  const charactersNoSpaces = text.replace(/\s/g, '').length;

  if (sentences.length === 0 || words.length === 0) {
    return null;
  }

  const avgWordsPerSentence = words.length / sentences.length;
  const avgCharsPerWord = charactersNoSpaces / words.length;
  const syllables = words.reduce((total, word) => total + estimateSyllables(word), 0);
  const avgSyllablesPerWord = syllables / words.length;

  // Flesch Reading Ease score. The 84.6 coefficient applies to syllables per
  // word; using characters per word here pushes ordinary prose far below zero.
  const readabilityScore = 206.835 - (1.015 * avgWordsPerSentence) - (84.6 * avgSyllablesPerWord);

  return {
    sentences: sentences.length,
    words: words.length,
    characters,
    charactersNoSpaces,
    avgWordsPerSentence: Math.round(avgWordsPerSentence * 100) / 100,
    avgCharsPerWord: Math.round(avgCharsPerWord * 100) / 100,
    avgSyllablesPerWord: Math.round(avgSyllablesPerWord * 100) / 100,
    readabilityScore: Math.round(readabilityScore * 100) / 100,
    readabilityLevel: getReadabilityLevel(readabilityScore)
  };
}

/**
 * Translate a Flesch Reading Ease score into a descriptive label.
 *
 * @param {number} score - Reading ease score
 * @returns {string} 'Very Easy' (>= 90) down to 'Very Difficult' (< 30)
 */
function getReadabilityLevel(score) {
  if (score >= 90) return 'Very Easy';
  if (score >= 80) return 'Easy';
  if (score >= 70) return 'Fairly Easy';
  if (score >= 60) return 'Standard';
  if (score >= 50) return 'Fairly Difficult';
  if (score >= 30) return 'Difficult';
  return 'Very Difficult';
}
487
+
488
/**
 * Pull JSON-LD blocks out of an HTML document.
 *
 * Only `jsonLd` is populated here; `microdata` and `schemaOrg` are always
 * returned as empty arrays. Script blocks whose content is not valid JSON are
 * skipped.
 *
 * @param {string} html - Markup to inspect
 * @returns {Object} `{ jsonLd, microdata, schemaOrg }`
 */
function extractStructuredData(html) {
  const $ = cheerio.load(html);
  const jsonLd = [];

  $('script[type="application/ld+json"]').each((_, scriptNode) => {
    try {
      const raw = $(scriptNode).html();
      if (raw) {
        jsonLd.push(JSON.parse(raw));
      }
    } catch (error) {
      // Skip invalid JSON-LD
    }
  });

  return {
    jsonLd,
    microdata: [],
    schemaOrg: []
  };
}
511
+
512
/**
 * Build a microdata record for one `[itemscope]` element.
 *
 * Every descendant carrying `itemprop` contributes its trimmed text; values for
 * the same property name are collected in document order. Properties with an
 * empty name or empty text are ignored.
 *
 * @param {Function} $ - Loaded cheerio instance for the document
 * @param {Object} element - The itemscope element
 * @returns {Object|null} `{ type, properties }`, or null when the element has no
 *   `itemtype`
 */
function extractMicrodataItem($, element) {
  const scope = $(element);
  const type = scope.attr('itemtype');

  if (!type) {
    return null;
  }

  const properties = {};

  scope.find('[itemprop]').each((_, propNode) => {
    const holder = $(propNode);
    const name = holder.attr('itemprop');
    const value = holder.text().trim();

    if (!name || !value) {
      return;
    }

    if (!properties[name]) {
      properties[name] = [];
    }
    properties[name].push(value);
  });

  return { type, properties };
}
538
+
539
/**
 * Describe one element that carries `typeof` and/or `property` markup.
 *
 * @param {Function} $ - Loaded cheerio instance for the document
 * @param {Object} element - Element to describe
 * @returns {Object|null} Object with whichever of `typeof`, `property` and
 *   `content` (trimmed element text) are non-empty; null when the element has
 *   neither a `typeof` nor a `property` attribute
 */
function extractSchemaOrgItem($, element) {
  const node = $(element);
  const typeOf = node.attr('typeof');
  const property = node.attr('property');

  if (!(typeOf || property)) {
    return null;
  }

  const described = {};
  if (typeOf) {
    described.typeof = typeOf;
  }
  if (property) {
    described.property = property;
  }

  const text = node.text().trim();
  if (text) {
    described.content = text;
  }

  return described;
}
557
+
558
/**
 * Simple sentiment analysis based on positive/negative word counts.
 *
 * The text is lower-cased and split into runs of letters, so words followed by
 * punctuation ("good.", "great!") are still recognised. The score is
 * (positive - negative) / (positive + negative), or 0 when no listed word
 * occurs; scores above 0.1 are 'positive', below -0.1 'negative'.
 *
 * @param {string} text - Text to analyze
 * @returns {Object} `{ sentiment, score, positiveWords, negativeWords }` with
 *   `score` rounded to 2 decimals
 */
function analyzeSentiment(text) {
  const positiveWords = new Set(['good', 'great', 'excellent', 'amazing', 'wonderful', 'fantastic', 'awesome', 'love', 'best', 'perfect']);
  const negativeWords = new Set(['bad', 'terrible', 'awful', 'horrible', 'hate', 'worst', 'disappointing', 'poor', 'negative', 'sad']);

  // Tokenise on letter runs rather than whitespace so attached punctuation
  // does not prevent a match against the word lists.
  const words = text.toLowerCase().match(/[a-z]+/g) || [];
  let positiveCount = 0;
  let negativeCount = 0;

  for (const word of words) {
    if (positiveWords.has(word)) positiveCount++;
    if (negativeWords.has(word)) negativeCount++;
  }

  const total = positiveCount + negativeCount;
  let sentiment = 'neutral';
  let score = 0;

  if (total > 0) {
    score = (positiveCount - negativeCount) / total;
    if (score > 0.1) sentiment = 'positive';
    else if (score < -0.1) sentiment = 'negative';
  }

  return {
    sentiment,
    score: Math.round(score * 100) / 100,
    positiveWords: positiveCount,
    negativeWords: negativeCount
  };
}
589
+
590
/**
 * Simple frequency-based keyword extraction.
 *
 * Collects the nouns, adjectives and verbs reported by the document, counts
 * them case-insensitively and returns the most frequent ones. Terms with equal
 * counts keep their first-seen order.
 *
 * @param {Object} doc - Parsed document exposing `nouns()`, `adjectives()` and
 *   `verbs()`, each returning an object with `out('array')`
 * @param {number} [limit=10] - Maximum number of keywords to return
 * @returns {Array<{word: string, frequency: number}>} Keywords, most frequent first
 */
function extractKeywords(doc, limit = 10) {
  const nouns = doc.nouns().out('array');
  const adjectives = doc.adjectives().out('array');
  const verbs = doc.verbs().out('array');

  // A Map is used instead of a plain object so that words which are also
  // Object.prototype members ("constructor", "toString", ...) are counted as
  // ordinary terms instead of colliding with the inherited property.
  const frequency = new Map();
  for (const word of [...nouns, ...adjectives, ...verbs]) {
    const normalized = word.toLowerCase();
    frequency.set(normalized, (frequency.get(normalized) || 0) + 1);
  }

  // Sort by frequency and return top keywords
  return [...frequency.entries()]
    .sort(([, a], [, b]) => b - a)
    .slice(0, limit)
    .map(([word, freq]) => ({ word, frequency: freq }));
}
612
+
613
/**
 * Count characters, words, sentences and paragraphs in a text.
 *
 * Sentences are split on runs of `.`, `!` or `?`, words on whitespace, and
 * paragraphs on blank lines. The two averages divide by 1 when the divisor
 * count is 0, so they are always finite.
 *
 * @param {string} text - Text to measure
 * @returns {Object} `{ characters, charactersNoSpaces, words, sentences,
 *   paragraphs, averageWordsPerSentence, averageSentencesPerParagraph }`
 */
function calculateTextMetrics(text) {
  const hasContent = (piece) => piece.trim().length > 0;

  const sentenceCount = text.split(/[.!?]+/).filter(hasContent).length;
  const wordCount = text.split(/\s+/).filter((token) => token.length > 0).length;
  const paragraphCount = text.split(/\n\s*\n/).filter(hasContent).length;

  return {
    characters: text.length,
    charactersNoSpaces: text.replace(/\s/g, '').length,
    words: wordCount,
    sentences: sentenceCount,
    paragraphs: paragraphCount,
    averageWordsPerSentence: wordCount / (sentenceCount || 1),
    averageSentencesPerParagraph: sentenceCount / (paragraphCount || 1)
  };
}
628
+
629
/**
 * Jaccard similarity of the two texts' token sets.
 *
 * Both texts are lower-cased and split on whitespace; the result is the number
 * of shared distinct tokens divided by the number of distinct tokens overall.
 *
 * @param {string} text1 - First text
 * @param {string} text2 - Second text
 * @returns {number} Value between 0 and 1
 */
function calculateJaccardSimilarity(text1, text2) {
  const tokenSet = (text) => new Set(text.toLowerCase().split(/\s+/));
  const first = tokenSet(text1);
  const second = tokenSet(text2);

  let shared = 0;
  for (const token of first) {
    if (second.has(token)) {
      shared += 1;
    }
  }

  const combined = new Set([...first, ...second]);
  return shared / combined.size;
}
638
+
639
/**
 * Cosine similarity of the two texts' term-frequency vectors.
 *
 * Both texts are lower-cased and split on whitespace. Term counts are gathered
 * in a single pass per text (instead of re-scanning each token list once per
 * vocabulary word), which keeps the cost linear in the input size.
 *
 * @param {string} text1 - First text
 * @param {string} text2 - Second text
 * @returns {number} Dot product of the count vectors divided by the product of
 *   their magnitudes
 */
function calculateCosineSimilarity(text1, text2) {
  const countTerms = (text) => {
    const counts = new Map();
    for (const term of text.toLowerCase().split(/\s+/)) {
      counts.set(term, (counts.get(term) || 0) + 1);
    }
    return counts;
  };

  const magnitudeOf = (counts) => {
    let sumOfSquares = 0;
    for (const count of counts.values()) {
      sumOfSquares += count * count;
    }
    return Math.sqrt(sumOfSquares);
  };

  const counts1 = countTerms(text1);
  const counts2 = countTerms(text2);

  // Only terms present in both texts contribute to the dot product.
  let dotProduct = 0;
  for (const [term, count] of counts1) {
    const other = counts2.get(term);
    if (other !== undefined) {
      dotProduct += count * other;
    }
  }

  return dotProduct / (magnitudeOf(counts1) * magnitudeOf(counts2));
}
654
+
655
/**
 * Levenshtein (edit) distance between two strings.
 *
 * Counts the minimum number of single-character insertions, deletions and
 * substitutions needed to turn one string into the other. Only two rows of the
 * dynamic-programming table are kept, so memory use grows with the shorter
 * string rather than with the product of both lengths.
 *
 * @param {string} text1 - First string
 * @param {string} text2 - Second string
 * @returns {number} Edit distance (0 when the strings are identical)
 */
function calculateLevenshteinDistance(text1, text2) {
  // The distance is symmetric, so put the shorter string on the row axis to
  // keep the two row buffers as small as possible.
  const [shorter, longer] = text1.length <= text2.length ? [text1, text2] : [text2, text1];
  const width = shorter.length;

  let previous = Array.from({ length: width + 1 }, (_, j) => j);
  let current = new Array(width + 1);

  for (let i = 1; i <= longer.length; i++) {
    current[0] = i;
    for (let j = 1; j <= width; j++) {
      const cost = longer[i - 1] === shorter[j - 1] ? 0 : 1;
      current[j] = Math.min(
        previous[j] + 1, // deletion
        current[j - 1] + 1, // insertion
        previous[j - 1] + cost // substitution
      );
    }
    // Reuse the old row as the next scratch row.
    [previous, current] = [current, previous];
  }

  return previous[width];
}
681
+
682
/**
 * Produce a canonical string form of a URL.
 *
 * Strips one trailing slash from the path (an empty path becomes '/'),
 * lower-cases the hostname and clears an explicit default port (80 for http,
 * 443 for https). Input that cannot be parsed as a URL is returned unchanged.
 *
 * @param {string} url - URL to normalize
 * @returns {string} Normalized URL, or the original input when parsing fails
 */
function normalizeUrl(url) {
  try {
    const parsed = new URL(url);

    const path = parsed.pathname;
    const withoutTrailingSlash = path.endsWith('/') ? path.slice(0, -1) : path;
    parsed.pathname = withoutTrailingSlash || '/';

    parsed.hostname = parsed.hostname.toLowerCase();

    const usesDefaultPort =
      (parsed.protocol === 'http:' && parsed.port === '80') ||
      (parsed.protocol === 'https:' && parsed.port === '443');
    if (usesDefaultPort) {
      parsed.port = '';
    }

    return parsed.toString();
  } catch (error) {
    return url;
  }
}
697
+
698
/**
 * Recursively clean up a value.
 *
 * - `null`/`undefined` become `undefined` when `removeNulls` is set, otherwise
 *   they are returned as-is.
 * - Arrays are normalized element by element; elements that normalize to
 *   `undefined` are dropped.
 * - Objects are rebuilt from their own enumerable entries; keys are lower-cased
 *   when `lowercaseKeys` is set and entries whose value normalizes to
 *   `undefined` are dropped.
 * - Strings are trimmed when `trimStrings` is set.
 * - Any other value is returned unchanged.
 *
 * @param {*} obj - Value to normalize
 * @param {Object} options - `{ removeNulls, trimStrings, lowercaseKeys }`
 * @returns {*} Normalized copy of the value
 */
function normalizeObject(obj, options) {
  const { removeNulls, trimStrings, lowercaseKeys } = options;

  if (obj === null || obj === undefined) {
    return removeNulls ? undefined : obj;
  }

  if (typeof obj === 'string') {
    return trimStrings ? obj.trim() : obj;
  }

  if (Array.isArray(obj)) {
    const cleaned = [];
    for (const element of obj) {
      const value = normalizeObject(element, options);
      if (value !== undefined) {
        cleaned.push(value);
      }
    }
    return cleaned;
  }

  if (typeof obj !== 'object') {
    return obj;
  }

  const cleaned = {};
  Object.keys(obj).forEach((key) => {
    const value = normalizeObject(obj[key], options);
    if (value !== undefined) {
      const targetKey = lowercaseKeys ? key.toLowerCase() : key;
      cleaned[targetKey] = value;
    }
  });
  return cleaned;
}
729
+
730
/**
 * Minimal schema check: compares `typeof data` with `schema.type`.
 *
 * This is not a full schema validator. A schema that is not an object, or that
 * has no truthy `type`, accepts everything. If reading the schema throws (for
 * example when `schema` is null), the data is reported as invalid.
 *
 * @param {*} data - Value to check
 * @param {*} schema - Expected to look like `{ type: 'string' | 'object' | ... }`
 * @returns {boolean} Whether the value passes the check
 */
function validateAgainstSchema(data, schema) {
  try {
    const declaresType = typeof schema === 'object' && schema.type;
    return declaresType ? typeof data === schema.type : true;
  } catch (error) {
    return false;
  }
}
741
+
742
// Signal that worker is ready: posted once, after the message listener above
// has been registered.
const readySignal = { type: 'ready' };
parentPort.postMessage(readySignal);