crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,588 @@
1
+ /**
2
+ * Content Processing Utilities
3
+ * Supporting functions for content extraction, cleaning, and quality assessment
4
+ */
5
+
6
+ import * as cheerio from 'cheerio';
7
+ import { z } from 'zod';
8
+
9
/**
 * HTML cleaning utilities.
 *
 * Static helpers built on cheerio: one strips noise (scripts, styles,
 * comments, page chrome, unwanted attributes, empty elements) from an HTML
 * document, the other converts HTML into formatted plain text.
 */
export class HTMLCleaner {
  /**
   * Clean HTML content by removing unwanted elements and attributes.
   *
   * @param {string} html - HTML content to clean
   * @param {Object} [options] - Cleaning options (merged over the defaults)
   * @param {boolean} [options.removeScripts=true] - Drop `<script>` and `<noscript>`
   * @param {boolean} [options.removeStyles=true] - Drop `<style>` and stylesheet `<link>`s
   * @param {boolean} [options.removeComments=true] - Drop HTML comments
   * @param {boolean} [options.removeEmpty=true] - Drop elements with no text and no media
   * @param {string[]} [options.allowedTags] - Accepted for API compatibility; not enforced
   * @param {string[]|null} [options.allowedAttributes] - Attributes to keep; falsy keeps all
   * @returns {string} - Cleaned HTML as serialized by `$.html()`
   */
  static cleanHTML(html, options = {}) {
    const defaultOptions = {
      removeScripts: true,
      removeStyles: true,
      removeComments: true,
      removeEmpty: true,
      // NOTE: allowedTags is part of the public options shape but nothing in
      // this method filters by tag name. Enforcing the default list would
      // also strip html/body/table, so it is intentionally left inert.
      allowedTags: ['p', 'div', 'span', 'a', 'img', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'li', 'strong', 'em', 'b', 'i', 'blockquote', 'code', 'pre'],
      allowedAttributes: ['href', 'src', 'alt', 'title', 'class', 'id']
    };

    const cleaningOptions = { ...defaultOptions, ...options };
    const $ = cheerio.load(html);

    // Remove scripts and styles
    if (cleaningOptions.removeScripts) {
      $('script, noscript').remove();
    }
    if (cleaningOptions.removeStyles) {
      $('style, link[rel="stylesheet"]').remove();
    }

    // Remove comments
    if (cleaningOptions.removeComments) {
      $('*').contents().filter((_, node) => node.type === 'comment').remove();
    }

    // Remove page chrome and common ad/overlay containers
    $('nav, header, footer, aside, .advertisement, .ads, .social-share, .popup, .modal').remove();

    // Strip every attribute that is not explicitly allowed
    if (cleaningOptions.allowedAttributes) {
      $('*').each((_, element) => {
        const $element = $(element);
        const attributes = element.attribs || {};

        Object.keys(attributes).forEach(attr => {
          if (!cleaningOptions.allowedAttributes.includes(attr)) {
            $element.removeAttr(attr);
          }
        });
      });
    }

    // Remove empty elements
    if (cleaningOptions.removeEmpty) {
      const mediaSelector = 'img, video, audio, iframe';

      $('*').filter((_, element) => {
        const $element = $(element);

        // closest() matches the element itself or an ancestor. Without it a
        // bare <img> (no text, no descendants) is treated as "empty" and
        // removed, as is a <source> nested inside a <video>.
        const isOrInsideMedia = $element.closest(mediaSelector).length > 0;
        const containsMedia = $element.find(mediaSelector).length > 0;

        return $element.text().trim() === '' && !isOrInsideMedia && !containsMedia;
      }).remove();
    }

    return $.html();
  }

  /**
   * Extract text content with preserved formatting.
   *
   * The body is walked once, depth first, so each piece of text is emitted
   * exactly once regardless of how deeply it is nested. Paragraph breaks,
   * line breaks, headings and list bullets are rendered as newlines that
   * survive the final whitespace normalization.
   *
   * @param {string} html - HTML content
   * @param {Object} [options] - Extraction options (merged over the defaults)
   * @param {boolean} [options.preserveLineBreaks=true] - Render `<br>` as a newline
   * @param {boolean} [options.preserveParagraphs=true] - Separate `<p>`/`<div>` with blank lines
   * @param {boolean} [options.includeLinks=false] - Append ` (href)` after link text
   * @param {boolean} [options.includeImageAlt=true] - Render images as `[Image: alt]`
   * @returns {string} - Extracted text
   */
  static extractTextWithFormatting(html, options = {}) {
    const defaultOptions = {
      preserveLineBreaks: true,
      preserveParagraphs: true,
      includeLinks: false,
      includeImageAlt: true
    };

    const extractOptions = { ...defaultOptions, ...options };
    const $ = cheerio.load(html);

    // Remove unwanted elements
    $('script, style, nav, header, footer, aside').remove();

    const body = $('body').get(0);
    if (!body) {
      return '';
    }

    // Inline tags keep the spacing already present in the surrounding text
    // nodes; any other unknown tag is padded so adjacent cells/sections do
    // not run together.
    const inlineTags = new Set(['span', 'strong', 'em', 'b', 'i', 'u', 'small', 'code', 'mark', 'abbr', 'cite', 'q', 'sub', 'sup']);

    const renderNode = (node) => {
      if (node.type === 'text') {
        // Source-formatting newlines inside text nodes are not meaningful.
        return (node.data || '').replace(/\s+/g, ' ');
      }
      if (node.type !== 'tag') {
        return '';
      }

      const tagName = node.name.toLowerCase();
      const inner = (node.children || []).map(child => renderNode(child)).join('');
      const trimmed = inner.trim();

      switch (tagName) {
        case 'p':
        case 'div':
          if (trimmed === '') {
            return '';
          }
          return extractOptions.preserveParagraphs ? `\n\n${trimmed}\n\n` : ` ${trimmed} `;
        case 'br':
          return extractOptions.preserveLineBreaks ? '\n' : ' ';
        case 'h1':
        case 'h2':
        case 'h3':
        case 'h4':
        case 'h5':
        case 'h6':
          return trimmed === '' ? '' : `\n\n${trimmed.toUpperCase()}\n`;
        case 'a': {
          if (!extractOptions.includeLinks) {
            return inner;
          }
          const href = $(node).attr('href');
          return `${inner}${href ? ` (${href})` : ''}`;
        }
        case 'img': {
          const alt = extractOptions.includeImageAlt ? $(node).attr('alt') : null;
          return alt ? ` [Image: ${alt}] ` : '';
        }
        case 'ul':
        case 'ol':
          // Terminate the list so following text starts on its own line.
          return trimmed === '' ? '' : `${inner}\n`;
        case 'li':
          return trimmed === '' ? '' : `\n• ${trimmed}`;
        default:
          return inlineTags.has(tagName) ? inner : ` ${inner} `;
      }
    };

    const text = (body.children || []).map(child => renderNode(child)).join('');

    // Collapse horizontal whitespace only. Collapsing /\s+/ here would erase
    // the newlines produced above and defeat the preserve* options.
    return text
      .replace(/[^\S\n]+/g, ' ')
      .replace(/ *\n */g, '\n')
      .replace(/\n{3,}/g, '\n\n')
      .trim();
  }
}
153
+
154
/**
 * Content quality assessment utilities.
 *
 * Pure, static heuristics that score a piece of text on length, sentence
 * structure, boilerplate phrases and Flesch reading ease.
 */
export class ContentQualityAssessor {
  /**
   * Assess content quality based on various metrics.
   *
   * The score starts at 100 and is reduced by fixed penalties; content is
   * flagged invalid when the score drops below 50. The returned score is
   * clamped to the range 0-100.
   *
   * @param {string} content - Text content to assess
   * @param {Object} [options] - Assessment options (merged over the defaults)
   * @param {number} [options.minLength=100] - Below this many characters: -30
   * @param {number} [options.maxLength=50000] - Above this many characters: -10
   * @param {number} [options.minWords=20] - Below this many words: -25
   * @param {boolean} [options.assessReadability=true] - Include readability metrics
   * @param {boolean} [options.checkForBoilerplate=true] - Include boilerplate detection
   * @returns {{isValid: boolean, score: number, reasons: string[], metrics: Object}}
   *   Quality assessment results; `metrics` is an empty object for invalid input
   */
  static assessContentQuality(content, options = {}) {
    const defaultOptions = {
      minLength: 100,
      maxLength: 50000,
      minWords: 20,
      assessReadability: true,
      checkForBoilerplate: true
    };

    const assessmentOptions = { ...defaultOptions, ...options };

    if (!content || typeof content !== 'string') {
      return {
        isValid: false,
        score: 0,
        reasons: ['Invalid or empty content'],
        // Same shape as the success path so callers can read
        // `result.metrics.*` without a guard.
        metrics: {}
      };
    }

    const assessment = {
      isValid: true,
      score: 100,
      reasons: [],
      metrics: {}
    };

    // Basic metrics
    const words = content.split(/\s+/).filter(w => w.length > 0);
    const sentences = content.split(/[.!?]+/).filter(s => s.trim().length > 0);
    const paragraphs = content.split(/\n\s*\n/).filter(p => p.trim().length > 0);

    assessment.metrics = {
      length: content.length,
      words: words.length,
      sentences: sentences.length,
      paragraphs: paragraphs.length,
      avgWordsPerSentence: words.length / Math.max(sentences.length, 1),
      avgSentencesPerParagraph: sentences.length / Math.max(paragraphs.length, 1)
    };

    // Length assessment
    if (content.length < assessmentOptions.minLength) {
      assessment.score -= 30;
      assessment.reasons.push(`Content too short (${content.length} chars)`);
    }
    if (content.length > assessmentOptions.maxLength) {
      assessment.score -= 10;
      assessment.reasons.push(`Content very long (${content.length} chars)`);
    }

    // Word count assessment
    if (words.length < assessmentOptions.minWords) {
      assessment.score -= 25;
      assessment.reasons.push(`Too few words (${words.length})`);
    }

    // Sentence structure assessment
    if (assessment.metrics.avgWordsPerSentence < 5) {
      assessment.score -= 15;
      assessment.reasons.push('Very short sentences detected');
    }
    if (assessment.metrics.avgWordsPerSentence > 30) {
      assessment.score -= 10;
      assessment.reasons.push('Very long sentences detected');
    }

    // Boilerplate detection
    if (assessmentOptions.checkForBoilerplate) {
      const boilerplateScore = this.detectBoilerplate(content);
      if (boilerplateScore > 0.3) {
        assessment.score -= Math.round(boilerplateScore * 50);
        assessment.reasons.push('Potential boilerplate content detected');
      }
      assessment.metrics.boilerplateScore = boilerplateScore;
    }

    // Readability assessment
    if (assessmentOptions.assessReadability) {
      const readability = this.calculateSimpleReadability(content);
      assessment.metrics.readability = readability;

      // Scores outside 30-100 are penalized on both sides.
      if (readability.score < 30 || readability.score > 100) {
        assessment.score -= 10;
        assessment.reasons.push('Poor readability score');
      }
    }

    // Final validation
    if (assessment.score < 50) {
      assessment.isValid = false;
    }

    assessment.score = Math.max(0, Math.min(100, assessment.score));

    return assessment;
  }

  /**
   * Detect boilerplate content patterns.
   *
   * Combines how often known boilerplate phrases occur per word with how
   * much of the text they cover.
   *
   * @param {string} content - Content to analyze
   * @returns {number} - Boilerplate score (0-1); 0 for empty or non-string input
   */
  static detectBoilerplate(content) {
    if (typeof content !== 'string' || content.length === 0) {
      return 0;
    }

    const boilerplatePatterns = [
      /cookie/gi,
      /privacy policy/gi,
      /terms of service/gi,
      /subscribe to/gi,
      /newsletter/gi,
      /follow us/gi,
      /share this/gi,
      /related articles/gi,
      /read more/gi,
      /advertisement/gi,
      /sponsored/gi,
      /copyright/gi,
      /all rights reserved/gi
    ];

    let matches = 0;
    let totalLength = 0;

    boilerplatePatterns.forEach(pattern => {
      const patternMatches = content.match(pattern);
      if (patternMatches) {
        matches += patternMatches.length;
        totalLength += patternMatches.join('').length;
      }
    });

    // Drop the empty tokens split() yields for leading/trailing whitespace so
    // the word count matches the one used by assessContentQuality.
    const wordCount = content.split(/\s+/).filter(w => w.length > 0).length;

    // Calculate score based on frequency and length of matches
    const frequency = matches / Math.max(wordCount, 1);
    const lengthRatio = totalLength / Math.max(content.length, 1);

    return Math.min(1, frequency * 10 + lengthRatio * 5);
  }

  /**
   * Calculate simple readability metrics (Flesch Reading Ease).
   *
   * @param {string} text - Text to analyze
   * @returns {Object} - `{ score, level, avgWordsPerSentence, avgSyllablesPerWord }`,
   *   or `{ score: 0, level: 'Unknown' }` when there are no sentences or words
   */
  static calculateSimpleReadability(text) {
    if (typeof text !== 'string') {
      return { score: 0, level: 'Unknown' };
    }

    const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 0);
    const words = text.split(/\s+/).filter(w => w.length > 0);
    const syllables = words.reduce((count, word) => count + this.countSyllables(word), 0);

    if (sentences.length === 0 || words.length === 0) {
      return { score: 0, level: 'Unknown' };
    }

    const avgWordsPerSentence = words.length / sentences.length;
    const avgSyllablesPerWord = syllables / words.length;

    // Flesch Reading Ease Score
    const score = 206.835 - (1.015 * avgWordsPerSentence) - (84.6 * avgSyllablesPerWord);

    return {
      score: Math.round(score * 100) / 100,
      level: this.getReadabilityLevel(score),
      avgWordsPerSentence: Math.round(avgWordsPerSentence * 100) / 100,
      avgSyllablesPerWord: Math.round(avgSyllablesPerWord * 100) / 100
    };
  }

  /**
   * Count syllables in a word (simple approximation).
   *
   * Counts groups of consecutive vowels, then discounts a trailing silent
   * 'e'. Punctuation and digits attached to the token are ignored.
   *
   * @param {string} word - Word to count syllables for
   * @returns {number} - Syllable count (always at least 1)
   */
  static countSyllables(word) {
    if (typeof word !== 'string') return 1;

    // Tokens come from whitespace splitting, so they carry punctuation
    // ("time." / "(table)"). Strip it, otherwise the silent-e rule below
    // never fires on sentence-final words.
    const letters = word.toLowerCase().replace(/[^\p{L}]/gu, '');
    if (letters.length <= 3) return 1;

    const vowels = 'aeiouy';
    let count = 0;
    let prevIsVowel = false;

    for (const char of letters) {
      const isVowel = vowels.includes(char);
      if (isVowel && !prevIsVowel) {
        count++;
      }
      prevIsVowel = isVowel;
    }

    // Adjust for silent 'e', except in consonant + "le" endings ("table",
    // "little") where the final "le" is a syllable of its own.
    const endsWithConsonantLe =
      letters.endsWith('le') && !vowels.includes(letters[letters.length - 3]);
    if (letters.endsWith('e') && !endsWithConsonantLe) {
      count--;
    }

    return Math.max(1, count);
  }

  /**
   * Get readability level from score.
   *
   * @param {number} score - Readability score
   * @returns {string} - Readability level
   */
  static getReadabilityLevel(score) {
    if (score >= 90) return 'Very Easy';
    if (score >= 80) return 'Easy';
    if (score >= 70) return 'Fairly Easy';
    if (score >= 60) return 'Standard';
    if (score >= 50) return 'Fairly Difficult';
    if (score >= 30) return 'Difficult';
    return 'Very Difficult';
  }
}
372
+
373
/**
 * Structured data parsing utilities.
 *
 * Static helpers that validate JSON-LD payloads and project common
 * schema.org types (article, organization, person, product, event, place,
 * website) onto flat summary objects.
 */
export class StructuredDataParser {
  /**
   * Parse and validate JSON-LD data.
   *
   * String entries are run through `JSON.parse`; entries that fail to parse
   * are reported with `isValid: false` rather than thrown. Entries that are
   * neither objects nor parseable to objects are dropped.
   *
   * @param {Array} jsonLdArray - Array of JSON-LD objects or JSON strings
   * @returns {Array} - Validated and parsed JSON-LD data
   */
  static parseJsonLD(jsonLdArray) {
    if (!Array.isArray(jsonLdArray)) {
      return [];
    }

    return jsonLdArray.map(item => {
      try {
        // If item is a string, parse it
        const parsed = typeof item === 'string' ? JSON.parse(item) : item;

        // Validate basic JSON-LD structure
        if (parsed && typeof parsed === 'object') {
          return {
            type: parsed['@type'] || 'Unknown',
            context: parsed['@context'] || null,
            data: parsed,
            isValid: true
          };
        }
        return null;
      } catch (error) {
        return {
          type: 'Invalid',
          context: null,
          data: item,
          isValid: false,
          error: error.message
        };
      }
    }).filter(item => item !== null);
  }

  /**
   * Extract common schema.org types from structured data.
   *
   * Nodes are matched by substring on their lower-cased `@type`. When
   * several nodes match the same category, the last one wins. Nodes nested
   * in arrays or under `@graph` are visited as well.
   *
   * @param {Object} structuredData - Structured data object with a `jsonLd` array
   * @returns {Object} - Extracted common types (each entry is an object or null)
   */
  static extractCommonSchemaTypes(structuredData) {
    const commonTypes = {
      article: null,
      organization: null,
      person: null,
      product: null,
      event: null,
      place: null,
      website: null
    };

    const jsonLd = structuredData && structuredData.jsonLd;
    if (!Array.isArray(jsonLd)) {
      return commonTypes;
    }

    // Process JSON-LD data
    this.flattenJsonLdNodes(jsonLd).forEach(item => {
      const type = this.normalizeSchemaType(item['@type']);

      if (type.includes('article') || type.includes('blogposting') || type.includes('newsarticle')) {
        commonTypes.article = this.extractArticleData(item);
      } else if (type.includes('organization')) {
        commonTypes.organization = this.extractOrganizationData(item);
      } else if (type.includes('person')) {
        commonTypes.person = this.extractPersonData(item);
      } else if (type.includes('product')) {
        commonTypes.product = this.extractProductData(item);
      } else if (type.includes('event')) {
        commonTypes.event = this.extractEventData(item);
      } else if (type.includes('place')) {
        commonTypes.place = this.extractPlaceData(item);
      } else if (type.includes('website')) {
        commonTypes.website = this.extractWebsiteData(item);
      }
    });

    return commonTypes;
  }

  /**
   * Collect every object node from a JSON-LD list, descending into nested
   * arrays and `@graph` containers.
   *
   * @param {Array} items - JSON-LD entries
   * @returns {Object[]} - Flat list of object nodes in document order
   */
  static flattenJsonLdNodes(items) {
    const nodes = [];

    items.forEach(item => {
      if (!item || typeof item !== 'object') return;

      if (Array.isArray(item)) {
        nodes.push(...StructuredDataParser.flattenJsonLdNodes(item));
        return;
      }

      nodes.push(item);
      if (Array.isArray(item['@graph'])) {
        nodes.push(...StructuredDataParser.flattenJsonLdNodes(item['@graph']));
      }
    });

    return nodes;
  }

  /**
   * Normalize an `@type` value to one lower-cased string.
   *
   * JSON-LD allows `@type` to be a string or an array of strings; calling
   * `.toLowerCase()` directly on the array form throws.
   *
   * @param {string|string[]|*} rawType - Raw `@type` value
   * @returns {string} - Space-joined, lower-cased type names ('' if none)
   */
  static normalizeSchemaType(rawType) {
    const types = Array.isArray(rawType) ? rawType : [rawType];
    return types.filter(entry => typeof entry === 'string').join(' ').toLowerCase();
  }

  /**
   * Resolve a display name from a value that may be a plain string, an
   * object with a `name`, or an array of either.
   *
   * @param {*} value - Raw property value (e.g. `author`, `publisher`)
   * @returns {string|null} - Name, comma-joined names for arrays, or null
   */
  static resolveName(value) {
    if (!value) return null;
    if (typeof value === 'string') return value;

    if (Array.isArray(value)) {
      const names = value
        .map(entry => StructuredDataParser.resolveName(entry))
        .filter(Boolean);
      return names.length > 0 ? names.join(', ') : null;
    }

    return value.name || null;
  }

  /**
   * Pick the offer object to read price data from; for an array of offers
   * the first entry is used.
   *
   * @param {*} offers - Raw `offers` value
   * @returns {Object|null} - Offer object or null
   */
  static resolveOffer(offers) {
    const offer = Array.isArray(offers) ? offers[0] : offers;
    return offer && typeof offer === 'object' ? offer : null;
  }

  /**
   * Extract article data from structured data.
   *
   * @param {Object} data - Structured data item
   * @returns {Object} - Extracted article data
   */
  static extractArticleData(data) {
    // Helpers are referenced through the class name (not `this`) so this
    // method keeps working when passed around detached, e.g. as a callback.
    return {
      headline: data.headline || data.name || null,
      author: StructuredDataParser.resolveName(data.author),
      datePublished: data.datePublished || null,
      dateModified: data.dateModified || null,
      description: data.description || null,
      image: data.image || null,
      publisher: StructuredDataParser.resolveName(data.publisher),
      wordCount: data.wordCount || null,
      articleSection: data.articleSection || null
    };
  }

  /**
   * Extract organization data from structured data.
   *
   * @param {Object} data - Structured data item
   * @returns {Object} - Extracted organization data
   */
  static extractOrganizationData(data) {
    return {
      name: data.name || null,
      url: data.url || null,
      logo: data.logo || null,
      description: data.description || null,
      address: data.address || null,
      telephone: data.telephone || null,
      email: data.email || null,
      foundingDate: data.foundingDate || null
    };
  }

  /**
   * Extract person data from structured data.
   *
   * @param {Object} data - Structured data item
   * @returns {Object} - Extracted person data
   */
  static extractPersonData(data) {
    return {
      name: data.name || null,
      givenName: data.givenName || null,
      familyName: data.familyName || null,
      jobTitle: data.jobTitle || null,
      worksFor: StructuredDataParser.resolveName(data.worksFor),
      url: data.url || null,
      image: data.image || null,
      description: data.description || null
    };
  }

  /**
   * Extract product data from structured data.
   *
   * @param {Object} data - Structured data item
   * @returns {Object} - Extracted product data
   */
  static extractProductData(data) {
    const offer = StructuredDataParser.resolveOffer(data.offers);

    return {
      name: data.name || null,
      description: data.description || null,
      image: data.image || null,
      brand: StructuredDataParser.resolveName(data.brand),
      price: offer ? offer.price : null,
      currency: offer ? offer.priceCurrency : null,
      availability: offer ? offer.availability : null,
      sku: data.sku || null,
      gtin: data.gtin || data.gtin13 || data.gtin12 || data.gtin8 || null
    };
  }

  /**
   * Extract event data from structured data.
   *
   * @param {Object} data - Structured data item
   * @returns {Object} - Extracted event data
   */
  static extractEventData(data) {
    const offer = StructuredDataParser.resolveOffer(data.offers);

    return {
      name: data.name || null,
      description: data.description || null,
      startDate: data.startDate || null,
      endDate: data.endDate || null,
      location: StructuredDataParser.resolveName(data.location),
      organizer: StructuredDataParser.resolveName(data.organizer),
      price: offer ? offer.price : null,
      url: data.url || null
    };
  }

  /**
   * Extract place data from structured data.
   *
   * @param {Object} data - Structured data item
   * @returns {Object} - Extracted place data
   */
  static extractPlaceData(data) {
    return {
      name: data.name || null,
      address: data.address || null,
      telephone: data.telephone || null,
      url: data.url || null,
      description: data.description || null,
      geo: data.geo || null,
      openingHours: data.openingHours || null
    };
  }

  /**
   * Extract website data from structured data.
   *
   * @param {Object} data - Structured data item
   * @returns {Object} - Extracted website data
   */
  static extractWebsiteData(data) {
    return {
      name: data.name || null,
      url: data.url || null,
      description: data.description || null,
      publisher: StructuredDataParser.resolveName(data.publisher),
      inLanguage: data.inLanguage || null,
      potentialAction: data.potentialAction || null
    };
  }
}
583
+
584
// Aggregate default export: bundles the three utility classes so consumers
// can import them as a single object in addition to the named exports.
const contentUtils = { HTMLCleaner, ContentQualityAssessor, StructuredDataParser };

export default contentUtils;