crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,612 @@
1
+ import { normalizeUrl } from './urlNormalizer.js';
2
+
3
+ /**
4
+ * Advanced domain filtering system with whitelist/blacklist management,
5
+ * subdomain handling, pattern matching, and domain-specific rules
6
+ */
7
+ export class DomainFilter {
8
+ constructor(options = {}) {
9
+ const {
10
+ allowSubdomains = true,
11
+ defaultMaxDepth = 5,
12
+ defaultRateLimit = 10
13
+ } = options;
14
+
15
+ this.allowSubdomains = allowSubdomains;
16
+ this.defaultMaxDepth = defaultMaxDepth;
17
+ this.defaultRateLimit = defaultRateLimit;
18
+
19
+ // Core filtering lists
20
+ this.whitelist = new Map(); // domain -> options
21
+ this.blacklist = new Map(); // domain -> options
22
+ this.patterns = {
23
+ include: [], // { pattern: RegExp, options: Object }
24
+ exclude: [] // { pattern: RegExp, options: Object }
25
+ };
26
+
27
+ // Domain-specific rules
28
+ this.domainRules = new Map(); // domain -> rules object
29
+
30
+ // Cache for performance
31
+ this.cache = new Map(); // url -> decision
32
+ this.cacheSize = 10000;
33
+ this.cacheHits = 0;
34
+ this.cacheMisses = 0;
35
+ }
36
+
37
+ /**
38
+ * Add domain to whitelist with options
39
+ * @param {string} domain - Domain to whitelist
40
+ * @param {Object} options - Configuration options
41
+ */
42
+ addWhitelistDomain(domain, options = {}) {
43
+ const normalizedDomain = this.normalizeDomain(domain);
44
+ const config = {
45
+ includeSubdomains: options.includeSubdomains ?? this.allowSubdomains,
46
+ maxDepth: options.maxDepth ?? this.defaultMaxDepth,
47
+ rateLimit: options.rateLimit ?? this.defaultRateLimit,
48
+ customHeaders: options.customHeaders || {},
49
+ timeout: options.timeout || 30000,
50
+ priority: options.priority || 1,
51
+ addedAt: new Date().toISOString()
52
+ };
53
+
54
+ this.whitelist.set(normalizedDomain, config);
55
+ this.clearCache();
56
+ return this;
57
+ }
58
+
59
+ /**
60
+ * Add domain to blacklist with options
61
+ * @param {string} domain - Domain to blacklist
62
+ * @param {Object} options - Configuration options
63
+ */
64
+ addBlacklistDomain(domain, options = {}) {
65
+ const normalizedDomain = this.normalizeDomain(domain);
66
+ const config = {
67
+ includeSubdomains: options.includeSubdomains ?? this.allowSubdomains,
68
+ reason: options.reason || 'Blacklisted',
69
+ permanent: options.permanent ?? false,
70
+ addedAt: new Date().toISOString()
71
+ };
72
+
73
+ this.blacklist.set(normalizedDomain, config);
74
+ this.clearCache();
75
+ return this;
76
+ }
77
+
78
+ /**
79
+ * Add pattern-based filter
80
+ * @param {string} pattern - RegExp pattern string
81
+ * @param {string} type - 'include' or 'exclude'
82
+ * @param {Object} options - Pattern options
83
+ */
84
+ addPattern(pattern, type = 'exclude', options = {}) {
85
+ if (!['include', 'exclude'].includes(type)) {
86
+ throw new Error('Pattern type must be "include" or "exclude"');
87
+ }
88
+
89
+ const config = {
90
+ pattern: new RegExp(pattern, options.flags || 'i'),
91
+ rawPattern: pattern,
92
+ priority: options.priority || 1,
93
+ description: options.description || '',
94
+ addedAt: new Date().toISOString()
95
+ };
96
+
97
+ this.patterns[type].push(config);
98
+
99
+ // Sort by priority (higher first)
100
+ this.patterns[type].sort((a, b) => b.priority - a.priority);
101
+
102
+ this.clearCache();
103
+ return this;
104
+ }
105
+
106
+ /**
107
+ * Remove domain from whitelist
108
+ * @param {string} domain - Domain to remove
109
+ */
110
+ removeWhitelistDomain(domain) {
111
+ const normalizedDomain = this.normalizeDomain(domain);
112
+ const removed = this.whitelist.delete(normalizedDomain);
113
+ if (removed) this.clearCache();
114
+ return removed;
115
+ }
116
+
117
+ /**
118
+ * Remove domain from blacklist
119
+ * @param {string} domain - Domain to remove
120
+ */
121
+ removeBlacklistDomain(domain) {
122
+ const normalizedDomain = this.normalizeDomain(domain);
123
+ const removed = this.blacklist.delete(normalizedDomain);
124
+ if (removed) this.clearCache();
125
+ return removed;
126
+ }
127
+
128
+ /**
129
+ * Remove pattern by index
130
+ * @param {string} type - 'include' or 'exclude'
131
+ * @param {number} index - Pattern index
132
+ */
133
+ removePattern(type, index) {
134
+ if (!['include', 'exclude'].includes(type)) {
135
+ throw new Error('Pattern type must be "include" or "exclude"');
136
+ }
137
+
138
+ if (index >= 0 && index < this.patterns[type].length) {
139
+ this.patterns[type].splice(index, 1);
140
+ this.clearCache();
141
+ return true;
142
+ }
143
+ return false;
144
+ }
145
+
146
+ /**
147
+ * Set domain-specific crawling rules
148
+ * @param {string} domain - Domain for rules
149
+ * @param {Object} rules - Domain-specific rules
150
+ */
151
+ setDomainRules(domain, rules) {
152
+ const normalizedDomain = this.normalizeDomain(domain);
153
+ const config = {
154
+ maxDepth: rules.maxDepth ?? this.defaultMaxDepth,
155
+ rateLimit: rules.rateLimit ?? this.defaultRateLimit,
156
+ respectRobots: rules.respectRobots ?? true,
157
+ allowedPaths: rules.allowedPaths || [],
158
+ blockedPaths: rules.blockedPaths || [],
159
+ customHeaders: rules.customHeaders || {},
160
+ timeout: rules.timeout || 30000,
161
+ maxPages: rules.maxPages || 100,
162
+ concurrency: rules.concurrency || 10,
163
+ updatedAt: new Date().toISOString()
164
+ };
165
+
166
+ this.domainRules.set(normalizedDomain, config);
167
+ return this;
168
+ }
169
+
170
+ /**
171
+ * Get domain-specific rules
172
+ * @param {string} domain - Domain to get rules for
173
+ * @returns {Object} Domain rules or defaults
174
+ */
175
+ getDomainRules(domain) {
176
+ const normalizedDomain = this.normalizeDomain(domain);
177
+
178
+ // Check exact match first
179
+ if (this.domainRules.has(normalizedDomain)) {
180
+ return { ...this.domainRules.get(normalizedDomain) };
181
+ }
182
+
183
+ // Check parent domains for subdomain inheritance
184
+ const parts = normalizedDomain.split('.');
185
+ for (let i = 1; i < parts.length; i++) {
186
+ const parentDomain = parts.slice(i).join('.');
187
+ if (this.domainRules.has(parentDomain)) {
188
+ const parentRules = this.domainRules.get(parentDomain);
189
+ if (parentRules.inheritToSubdomains !== false) {
190
+ return { ...parentRules };
191
+ }
192
+ }
193
+ }
194
+
195
+ // Return defaults
196
+ return {
197
+ maxDepth: this.defaultMaxDepth,
198
+ rateLimit: this.defaultRateLimit,
199
+ respectRobots: true,
200
+ allowedPaths: [],
201
+ blockedPaths: [],
202
+ customHeaders: {},
203
+ timeout: 30000,
204
+ maxPages: 100,
205
+ concurrency: 10
206
+ };
207
+ }
208
+
209
+ /**
210
+ * Check if URL is allowed based on all filtering rules
211
+ * @param {string} url - URL to check
212
+ * @returns {Object} Decision object with allowed status and metadata
213
+ */
214
+ isAllowed(url) {
215
+ try {
216
+ const normalizedUrl = normalizeUrl(url);
217
+
218
+ // Check cache first
219
+ if (this.cache.has(normalizedUrl)) {
220
+ this.cacheHits++;
221
+ return this.cache.get(normalizedUrl);
222
+ }
223
+
224
+ this.cacheMisses++;
225
+ const decision = this.evaluateUrl(normalizedUrl);
226
+
227
+ // Cache the decision
228
+ this.addToCache(normalizedUrl, decision);
229
+
230
+ return decision;
231
+ } catch (error) {
232
+ return {
233
+ allowed: false,
234
+ reason: `Invalid URL: ${error.message}`,
235
+ confidence: 1.0,
236
+ metadata: { error: error.message }
237
+ };
238
+ }
239
+ }
240
+
241
+ /**
242
+ * Internal URL evaluation logic
243
+ * @param {string} url - Normalized URL to evaluate
244
+ * @returns {Object} Decision object
245
+ */
246
+ evaluateUrl(url) {
247
+ const urlObj = new URL(url);
248
+ const domain = urlObj.hostname;
249
+ const path = urlObj.pathname;
250
+
251
+ // 1. Check blacklist first (highest priority)
252
+ const blacklistResult = this.checkBlacklist(domain, path);
253
+ if (!blacklistResult.allowed) {
254
+ return blacklistResult;
255
+ }
256
+
257
+ // 2. Check exclude patterns
258
+ const excludePatternResult = this.checkExcludePatterns(url);
259
+ if (!excludePatternResult.allowed) {
260
+ return excludePatternResult;
261
+ }
262
+
263
+ // 3. Check whitelist
264
+ const whitelistResult = this.checkWhitelist(domain, path);
265
+ if (whitelistResult.allowed) {
266
+ return whitelistResult;
267
+ }
268
+
269
+ // 4. Check include patterns
270
+ const includePatternResult = this.checkIncludePatterns(url);
271
+ if (includePatternResult.allowed) {
272
+ return includePatternResult;
273
+ }
274
+
275
+ // 5. Default behavior - if no whitelist exists, allow; if whitelist exists, deny
276
+ const hasWhitelist = this.whitelist.size > 0 || this.patterns.include.length > 0;
277
+
278
+ return {
279
+ allowed: !hasWhitelist,
280
+ reason: hasWhitelist ? 'Not in whitelist or include patterns' : 'No restrictions',
281
+ confidence: hasWhitelist ? 0.9 : 0.5,
282
+ metadata: {
283
+ domain,
284
+ path,
285
+ hasWhitelist,
286
+ evaluatedAt: new Date().toISOString()
287
+ }
288
+ };
289
+ }
290
+
291
+ /**
292
+ * Check blacklist rules
293
+ * @param {string} domain - Domain to check
294
+ * @param {string} path - URL path
295
+ * @returns {Object} Decision object
296
+ */
297
+ checkBlacklist(domain, path) {
298
+ // Check exact domain match
299
+ if (this.blacklist.has(domain)) {
300
+ const config = this.blacklist.get(domain);
301
+ return {
302
+ allowed: false,
303
+ reason: `Blacklisted domain: ${domain} (${config.reason})`,
304
+ confidence: 1.0,
305
+ metadata: { blacklistConfig: config, matchType: 'exact' }
306
+ };
307
+ }
308
+
309
+ // Check subdomain matches
310
+ const parts = domain.split('.');
311
+ for (let i = 1; i < parts.length; i++) {
312
+ const parentDomain = parts.slice(i).join('.');
313
+ if (this.blacklist.has(parentDomain)) {
314
+ const config = this.blacklist.get(parentDomain);
315
+ if (config.includeSubdomains) {
316
+ return {
317
+ allowed: false,
318
+ reason: `Blacklisted parent domain: ${parentDomain} (${config.reason})`,
319
+ confidence: 0.9,
320
+ metadata: { blacklistConfig: config, matchType: 'subdomain', parentDomain }
321
+ };
322
+ }
323
+ }
324
+ }
325
+
326
+ return { allowed: true, reason: 'Not blacklisted', confidence: 0.5 };
327
+ }
328
+
329
+ /**
330
+ * Check whitelist rules
331
+ * @param {string} domain - Domain to check
332
+ * @param {string} path - URL path
333
+ * @returns {Object} Decision object
334
+ */
335
+ checkWhitelist(domain, path) {
336
+ // Check exact domain match
337
+ if (this.whitelist.has(domain)) {
338
+ const config = this.whitelist.get(domain);
339
+ return {
340
+ allowed: true,
341
+ reason: `Whitelisted domain: ${domain}`,
342
+ confidence: 1.0,
343
+ metadata: { whitelistConfig: config, matchType: 'exact' }
344
+ };
345
+ }
346
+
347
+ // Check subdomain matches
348
+ const parts = domain.split('.');
349
+ for (let i = 1; i < parts.length; i++) {
350
+ const parentDomain = parts.slice(i).join('.');
351
+ if (this.whitelist.has(parentDomain)) {
352
+ const config = this.whitelist.get(parentDomain);
353
+ if (config.includeSubdomains) {
354
+ return {
355
+ allowed: true,
356
+ reason: `Whitelisted parent domain: ${parentDomain}`,
357
+ confidence: 0.9,
358
+ metadata: { whitelistConfig: config, matchType: 'subdomain', parentDomain }
359
+ };
360
+ }
361
+ }
362
+ }
363
+
364
+ return { allowed: false, reason: 'Not whitelisted', confidence: 0.5 };
365
+ }
366
+
367
+ /**
368
+ * Check exclude patterns
369
+ * @param {string} url - URL to check
370
+ * @returns {Object} Decision object
371
+ */
372
+ checkExcludePatterns(url) {
373
+ for (const patternConfig of this.patterns.exclude) {
374
+ if (patternConfig.pattern.test(url)) {
375
+ return {
376
+ allowed: false,
377
+ reason: `Matches exclude pattern: ${patternConfig.rawPattern}`,
378
+ confidence: 0.95,
379
+ metadata: {
380
+ patternConfig,
381
+ matchType: 'exclude_pattern',
382
+ description: patternConfig.description
383
+ }
384
+ };
385
+ }
386
+ }
387
+
388
+ return { allowed: true, reason: 'No exclude pattern match', confidence: 0.5 };
389
+ }
390
+
391
+ /**
392
+ * Check include patterns
393
+ * @param {string} url - URL to check
394
+ * @returns {Object} Decision object
395
+ */
396
+ checkIncludePatterns(url) {
397
+ for (const patternConfig of this.patterns.include) {
398
+ if (patternConfig.pattern.test(url)) {
399
+ return {
400
+ allowed: true,
401
+ reason: `Matches include pattern: ${patternConfig.rawPattern}`,
402
+ confidence: 0.95,
403
+ metadata: {
404
+ patternConfig,
405
+ matchType: 'include_pattern',
406
+ description: patternConfig.description
407
+ }
408
+ };
409
+ }
410
+ }
411
+
412
+ return { allowed: false, reason: 'No include pattern match', confidence: 0.5 };
413
+ }
414
+
415
+ /**
416
+ * Export filter configuration
417
+ * @returns {Object} Serializable filter configuration
418
+ */
419
+ exportConfig() {
420
+ return {
421
+ version: '1.0',
422
+ exportedAt: new Date().toISOString(),
423
+ options: {
424
+ allowSubdomains: this.allowSubdomains,
425
+ defaultMaxDepth: this.defaultMaxDepth,
426
+ defaultRateLimit: this.defaultRateLimit
427
+ },
428
+ whitelist: Object.fromEntries(this.whitelist),
429
+ blacklist: Object.fromEntries(this.blacklist),
430
+ patterns: {
431
+ include: this.patterns.include.map(p => ({
432
+ ...p,
433
+ pattern: p.rawPattern // Store raw pattern for re-import
434
+ })),
435
+ exclude: this.patterns.exclude.map(p => ({
436
+ ...p,
437
+ pattern: p.rawPattern
438
+ }))
439
+ },
440
+ domainRules: Object.fromEntries(this.domainRules),
441
+ stats: this.getStats()
442
+ };
443
+ }
444
+
445
+ /**
446
+ * Import filter configuration
447
+ * @param {Object} config - Configuration to import
448
+ */
449
+ importConfig(config) {
450
+ if (!config || config.version !== '1.0') {
451
+ throw new Error('Invalid or unsupported configuration format');
452
+ }
453
+
454
+ // Clear existing configuration
455
+ this.clearAll();
456
+
457
+ // Import options
458
+ if (config.options) {
459
+ this.allowSubdomains = config.options.allowSubdomains ?? true;
460
+ this.defaultMaxDepth = config.options.defaultMaxDepth ?? 5;
461
+ this.defaultRateLimit = config.options.defaultRateLimit ?? 10;
462
+ }
463
+
464
+ // Import whitelist
465
+ if (config.whitelist) {
466
+ for (const [domain, options] of Object.entries(config.whitelist)) {
467
+ this.whitelist.set(domain, options);
468
+ }
469
+ }
470
+
471
+ // Import blacklist
472
+ if (config.blacklist) {
473
+ for (const [domain, options] of Object.entries(config.blacklist)) {
474
+ this.blacklist.set(domain, options);
475
+ }
476
+ }
477
+
478
+ // Import patterns
479
+ if (config.patterns) {
480
+ if (config.patterns.include) {
481
+ this.patterns.include = config.patterns.include.map(p => ({
482
+ ...p,
483
+ pattern: new RegExp(p.pattern, p.flags || 'i'),
484
+ rawPattern: p.pattern
485
+ }));
486
+ }
487
+ if (config.patterns.exclude) {
488
+ this.patterns.exclude = config.patterns.exclude.map(p => ({
489
+ ...p,
490
+ pattern: new RegExp(p.pattern, p.flags || 'i'),
491
+ rawPattern: p.pattern
492
+ }));
493
+ }
494
+ }
495
+
496
+ // Import domain rules
497
+ if (config.domainRules) {
498
+ for (const [domain, rules] of Object.entries(config.domainRules)) {
499
+ this.domainRules.set(domain, rules);
500
+ }
501
+ }
502
+
503
+ this.clearCache();
504
+ return this;
505
+ }
506
+
507
+ /**
508
+ * Get filter statistics
509
+ * @returns {Object} Statistics object
510
+ */
511
+ getStats() {
512
+ return {
513
+ whitelist: {
514
+ domains: this.whitelist.size,
515
+ withSubdomains: Array.from(this.whitelist.values()).filter(c => c.includeSubdomains).length
516
+ },
517
+ blacklist: {
518
+ domains: this.blacklist.size,
519
+ withSubdomains: Array.from(this.blacklist.values()).filter(c => c.includeSubdomains).length
520
+ },
521
+ patterns: {
522
+ include: this.patterns.include.length,
523
+ exclude: this.patterns.exclude.length
524
+ },
525
+ domainRules: this.domainRules.size,
526
+ cache: {
527
+ size: this.cache.size,
528
+ hits: this.cacheHits,
529
+ misses: this.cacheMisses,
530
+ hitRate: this.cacheHits / (this.cacheHits + this.cacheMisses) || 0
531
+ }
532
+ };
533
+ }
534
+
535
+ /**
536
+ * Clear all filters and rules
537
+ */
538
+ clearAll() {
539
+ this.whitelist.clear();
540
+ this.blacklist.clear();
541
+ this.patterns.include = [];
542
+ this.patterns.exclude = [];
543
+ this.domainRules.clear();
544
+ this.clearCache();
545
+ return this;
546
+ }
547
+
548
+ /**
549
+ * Clear the decision cache
550
+ */
551
+ clearCache() {
552
+ this.cache.clear();
553
+ this.cacheHits = 0;
554
+ this.cacheMisses = 0;
555
+ }
556
+
557
+ /**
558
+ * Add decision to cache with size management
559
+ * @param {string} url - URL key
560
+ * @param {Object} decision - Decision object
561
+ */
562
+ addToCache(url, decision) {
563
+ if (this.cache.size >= this.cacheSize) {
564
+ // Remove oldest entries (simple FIFO)
565
+ const firstKey = this.cache.keys().next().value;
566
+ this.cache.delete(firstKey);
567
+ }
568
+ this.cache.set(url, decision);
569
+ }
570
+
571
+ /**
572
+ * Normalize domain name for consistent lookup
573
+ * @param {string} domain - Domain to normalize
574
+ * @returns {string} Normalized domain
575
+ */
576
+ normalizeDomain(domain) {
577
+ return domain.toLowerCase().trim();
578
+ }
579
+
580
+ /**
581
+ * Validate and get effective configuration for a crawl operation
582
+ * @param {string} startUrl - Starting URL for crawl
583
+ * @param {Object} crawlOptions - Crawl options
584
+ * @returns {Object} Effective configuration
585
+ */
586
+ getEffectiveConfig(startUrl, crawlOptions = {}) {
587
+ try {
588
+ const urlObj = new URL(startUrl);
589
+ const domain = urlObj.hostname;
590
+ const domainRules = this.getDomainRules(domain);
591
+
592
+ return {
593
+ domain,
594
+ isAllowed: this.isAllowed(startUrl),
595
+ domainRules,
596
+ effectiveOptions: {
597
+ maxDepth: crawlOptions.maxDepth ?? domainRules.maxDepth,
598
+ maxPages: crawlOptions.maxPages ?? domainRules.maxPages,
599
+ rateLimit: crawlOptions.rateLimit ?? domainRules.rateLimit,
600
+ concurrency: crawlOptions.concurrency ?? domainRules.concurrency,
601
+ timeout: crawlOptions.timeout ?? domainRules.timeout,
602
+ respectRobots: crawlOptions.respectRobots ?? domainRules.respectRobots,
603
+ customHeaders: { ...domainRules.customHeaders, ...(crawlOptions.customHeaders || {}) }
604
+ }
605
+ };
606
+ } catch (error) {
607
+ throw new Error(`Invalid start URL: ${error.message}`);
608
+ }
609
+ }
610
+ }
611
+
612
+ export default DomainFilter;