crawlforge-mcp-server 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +315 -0
- package/LICENSE +21 -0
- package/README.md +181 -0
- package/package.json +115 -0
- package/server.js +1963 -0
- package/setup.js +112 -0
- package/src/constants/config.js +615 -0
- package/src/core/ActionExecutor.js +1104 -0
- package/src/core/AlertNotificationSystem.js +601 -0
- package/src/core/AuthManager.js +315 -0
- package/src/core/ChangeTracker.js +2306 -0
- package/src/core/JobManager.js +687 -0
- package/src/core/LLMsTxtAnalyzer.js +753 -0
- package/src/core/LocalizationManager.js +1615 -0
- package/src/core/PerformanceManager.js +828 -0
- package/src/core/ResearchOrchestrator.js +1327 -0
- package/src/core/SnapshotManager.js +1037 -0
- package/src/core/StealthBrowserManager.js +1795 -0
- package/src/core/WebhookDispatcher.js +745 -0
- package/src/core/analysis/ContentAnalyzer.js +749 -0
- package/src/core/analysis/LinkAnalyzer.js +972 -0
- package/src/core/cache/CacheManager.js +821 -0
- package/src/core/connections/ConnectionPool.js +553 -0
- package/src/core/crawlers/BFSCrawler.js +845 -0
- package/src/core/integrations/PerformanceIntegration.js +377 -0
- package/src/core/llm/AnthropicProvider.js +135 -0
- package/src/core/llm/LLMManager.js +415 -0
- package/src/core/llm/LLMProvider.js +97 -0
- package/src/core/llm/OpenAIProvider.js +127 -0
- package/src/core/processing/BrowserProcessor.js +986 -0
- package/src/core/processing/ContentProcessor.js +505 -0
- package/src/core/processing/PDFProcessor.js +448 -0
- package/src/core/processing/StreamProcessor.js +673 -0
- package/src/core/queue/QueueManager.js +98 -0
- package/src/core/workers/WorkerPool.js +585 -0
- package/src/core/workers/worker.js +743 -0
- package/src/monitoring/healthCheck.js +600 -0
- package/src/monitoring/metrics.js +761 -0
- package/src/optimization/wave3-optimizations.js +932 -0
- package/src/security/security-patches.js +120 -0
- package/src/security/security-tests.js +355 -0
- package/src/security/wave3-security.js +652 -0
- package/src/tools/advanced/BatchScrapeTool.js +1089 -0
- package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
- package/src/tools/crawl/crawlDeep.js +449 -0
- package/src/tools/crawl/mapSite.js +400 -0
- package/src/tools/extract/analyzeContent.js +624 -0
- package/src/tools/extract/extractContent.js +329 -0
- package/src/tools/extract/processDocument.js +503 -0
- package/src/tools/extract/summarizeContent.js +376 -0
- package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
- package/src/tools/research/deepResearch.js +706 -0
- package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
- package/src/tools/search/adapters/googleSearch.js +236 -0
- package/src/tools/search/adapters/searchProviderFactory.js +96 -0
- package/src/tools/search/queryExpander.js +543 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
- package/src/tools/search/ranking/ResultRanker.js +497 -0
- package/src/tools/search/searchWeb.js +482 -0
- package/src/tools/tracking/trackChanges.js +1355 -0
- package/src/utils/CircuitBreaker.js +515 -0
- package/src/utils/ErrorHandlingConfig.js +342 -0
- package/src/utils/HumanBehaviorSimulator.js +569 -0
- package/src/utils/Logger.js +568 -0
- package/src/utils/MemoryMonitor.js +173 -0
- package/src/utils/RetryManager.js +386 -0
- package/src/utils/contentUtils.js +588 -0
- package/src/utils/domainFilter.js +612 -0
- package/src/utils/inputValidation.js +766 -0
- package/src/utils/rateLimiter.js +196 -0
- package/src/utils/robotsChecker.js +91 -0
- package/src/utils/securityMiddleware.js +416 -0
- package/src/utils/sitemapParser.js +678 -0
- package/src/utils/ssrfProtection.js +640 -0
- package/src/utils/urlNormalizer.js +168 -0
|
@@ -0,0 +1,612 @@
|
|
|
1
|
+
import { normalizeUrl } from './urlNormalizer.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Advanced domain filtering system with whitelist/blacklist management,
 * subdomain handling, pattern matching, and domain-specific rules.
 *
 * Decisions are evaluated in this order (see evaluateUrl): blacklist,
 * exclude patterns, whitelist, include patterns, then a default that allows
 * the URL only when no whitelist entry and no include pattern is configured.
 */
export class DomainFilter {
  /**
   * @param {Object} [options] - Filter-wide defaults
   * @param {boolean} [options.allowSubdomains=true] - Default for `includeSubdomains` on list entries
   * @param {number} [options.defaultMaxDepth=5] - Default `maxDepth` for entries and rules
   * @param {number} [options.defaultRateLimit=10] - Default `rateLimit` for entries and rules
   */
  constructor(options = {}) {
    const {
      allowSubdomains = true,
      defaultMaxDepth = 5,
      defaultRateLimit = 10
    } = options;

    this.allowSubdomains = allowSubdomains;
    this.defaultMaxDepth = defaultMaxDepth;
    this.defaultRateLimit = defaultRateLimit;

    // Core filtering lists
    this.whitelist = new Map(); // domain -> options
    this.blacklist = new Map(); // domain -> options
    this.patterns = {
      include: [], // { pattern: RegExp, rawPattern, flags, priority, ... }
      exclude: []  // { pattern: RegExp, rawPattern, flags, priority, ... }
    };

    // Domain-specific rules
    this.domainRules = new Map(); // domain -> rules object

    // Cache for performance
    this.cache = new Map(); // url -> decision
    this.cacheSize = 10000;
    this.cacheHits = 0;
    this.cacheMisses = 0;
  }

  /**
   * Add domain to whitelist with options
   * @param {string} domain - Domain to whitelist
   * @param {Object} options - Configuration options
   * @returns {DomainFilter} This instance, for chaining
   */
  addWhitelistDomain(domain, options = {}) {
    const normalizedDomain = this.normalizeDomain(domain);
    const config = {
      includeSubdomains: options.includeSubdomains ?? this.allowSubdomains,
      maxDepth: options.maxDepth ?? this.defaultMaxDepth,
      rateLimit: options.rateLimit ?? this.defaultRateLimit,
      customHeaders: options.customHeaders || {},
      timeout: options.timeout || 30000,
      // `??` so an explicit priority of 0 is kept rather than replaced by 1.
      priority: options.priority ?? 1,
      addedAt: new Date().toISOString()
    };

    this.whitelist.set(normalizedDomain, config);
    this.clearCache();
    return this;
  }

  /**
   * Add domain to blacklist with options
   * @param {string} domain - Domain to blacklist
   * @param {Object} options - Configuration options
   * @returns {DomainFilter} This instance, for chaining
   */
  addBlacklistDomain(domain, options = {}) {
    const normalizedDomain = this.normalizeDomain(domain);
    const config = {
      includeSubdomains: options.includeSubdomains ?? this.allowSubdomains,
      reason: options.reason || 'Blacklisted',
      permanent: options.permanent ?? false,
      addedAt: new Date().toISOString()
    };

    this.blacklist.set(normalizedDomain, config);
    this.clearCache();
    return this;
  }

  /**
   * Add pattern-based filter
   * @param {string} pattern - RegExp pattern string
   * @param {string} type - 'include' or 'exclude'
   * @param {Object} options - Pattern options (`flags`, `priority`, `description`)
   * @returns {DomainFilter} This instance, for chaining
   * @throws {Error} If `type` is not 'include' or 'exclude'
   * @throws {SyntaxError} If `pattern` or `options.flags` is not a valid RegExp
   */
  addPattern(pattern, type = 'exclude', options = {}) {
    if (!['include', 'exclude'].includes(type)) {
      throw new Error('Pattern type must be "include" or "exclude"');
    }

    // NOTE(review): the pattern is compiled as-is; if patterns can come from
    // untrusted input, they should be vetted for catastrophic backtracking.
    // `??` (not `||`) so that flags: '' yields a case-sensitive pattern.
    const compiled = new RegExp(pattern, options.flags ?? 'i');
    const config = {
      pattern: compiled,
      rawPattern: pattern,
      flags: compiled.flags, // kept so exportConfig/importConfig round-trip the flags
      priority: options.priority ?? 1,
      description: options.description || '',
      addedAt: new Date().toISOString()
    };

    this.patterns[type].push(config);

    // Sort by priority (higher first)
    this.patterns[type].sort((a, b) => b.priority - a.priority);

    this.clearCache();
    return this;
  }

  /**
   * Remove domain from whitelist
   * @param {string} domain - Domain to remove
   * @returns {boolean} True if an entry was removed
   */
  removeWhitelistDomain(domain) {
    const normalizedDomain = this.normalizeDomain(domain);
    const removed = this.whitelist.delete(normalizedDomain);
    if (removed) this.clearCache();
    return removed;
  }

  /**
   * Remove domain from blacklist
   * @param {string} domain - Domain to remove
   * @returns {boolean} True if an entry was removed
   */
  removeBlacklistDomain(domain) {
    const normalizedDomain = this.normalizeDomain(domain);
    const removed = this.blacklist.delete(normalizedDomain);
    if (removed) this.clearCache();
    return removed;
  }

  /**
   * Remove pattern by index
   * @param {string} type - 'include' or 'exclude'
   * @param {number} index - Pattern index (position in the priority-sorted list)
   * @returns {boolean} True if a pattern was removed
   * @throws {Error} If `type` is not 'include' or 'exclude'
   */
  removePattern(type, index) {
    if (!['include', 'exclude'].includes(type)) {
      throw new Error('Pattern type must be "include" or "exclude"');
    }

    if (index >= 0 && index < this.patterns[type].length) {
      this.patterns[type].splice(index, 1);
      this.clearCache();
      return true;
    }
    return false;
  }

  /**
   * Set domain-specific crawling rules
   * @param {string} domain - Domain for rules
   * @param {Object} rules - Domain-specific rules
   * @returns {DomainFilter} This instance, for chaining
   */
  setDomainRules(domain, rules = {}) {
    const normalizedDomain = this.normalizeDomain(domain);
    const config = {
      maxDepth: rules.maxDepth ?? this.defaultMaxDepth,
      rateLimit: rules.rateLimit ?? this.defaultRateLimit,
      respectRobots: rules.respectRobots ?? true,
      allowedPaths: rules.allowedPaths || [],
      blockedPaths: rules.blockedPaths || [],
      customHeaders: rules.customHeaders || {},
      timeout: rules.timeout || 30000,
      maxPages: rules.maxPages || 100,
      concurrency: rules.concurrency || 10,
      // getDomainRules() consults this flag, so it must be stored here;
      // otherwise subdomain inheritance could never be switched off.
      inheritToSubdomains: rules.inheritToSubdomains ?? true,
      updatedAt: new Date().toISOString()
    };

    this.domainRules.set(normalizedDomain, config);
    return this;
  }

  /**
   * Get domain-specific rules
   * @param {string} domain - Domain to get rules for
   * @returns {Object} Shallow copy of the matching rules, or defaults
   */
  getDomainRules(domain) {
    const normalizedDomain = this.normalizeDomain(domain);

    // Check exact match first
    if (this.domainRules.has(normalizedDomain)) {
      return { ...this.domainRules.get(normalizedDomain) };
    }

    // Check parent domains for subdomain inheritance (closest parent first)
    const parts = normalizedDomain.split('.');
    for (let i = 1; i < parts.length; i++) {
      const parentDomain = parts.slice(i).join('.');
      if (this.domainRules.has(parentDomain)) {
        const parentRules = this.domainRules.get(parentDomain);
        if (parentRules.inheritToSubdomains !== false) {
          return { ...parentRules };
        }
      }
    }

    // Return defaults
    return {
      maxDepth: this.defaultMaxDepth,
      rateLimit: this.defaultRateLimit,
      respectRobots: true,
      allowedPaths: [],
      blockedPaths: [],
      customHeaders: {},
      timeout: 30000,
      maxPages: 100,
      concurrency: 10
    };
  }

  /**
   * Check if URL is allowed based on all filtering rules
   * @param {string} url - URL to check
   * @returns {Object} Decision object with allowed status and metadata
   */
  isAllowed(url) {
    try {
      // normalizeUrl comes from ./urlNormalizer.js; its result is used both
      // as the cache key and as the URL that gets evaluated.
      const normalizedUrl = normalizeUrl(url);

      // Check cache first
      if (this.cache.has(normalizedUrl)) {
        this.cacheHits++;
        return this.cache.get(normalizedUrl);
      }

      this.cacheMisses++;
      const decision = this.evaluateUrl(normalizedUrl);

      // Cache the decision
      this.addToCache(normalizedUrl, decision);

      return decision;
    } catch (error) {
      // Any failure while normalizing or evaluating is reported as a denial.
      return {
        allowed: false,
        reason: `Invalid URL: ${error.message}`,
        confidence: 1.0,
        metadata: { error: error.message }
      };
    }
  }

  /**
   * Internal URL evaluation logic
   * @param {string} url - Normalized URL to evaluate
   * @returns {Object} Decision object
   * @throws {TypeError} If `url` cannot be parsed by the URL constructor
   */
  evaluateUrl(url) {
    const urlObj = new URL(url);
    // URL.hostname keeps a trailing dot ("evil.com."); normalize it so the
    // fully-qualified spelling cannot slip past a list entry for "evil.com".
    const domain = this.normalizeDomain(urlObj.hostname);
    const path = urlObj.pathname;

    // 1. Check blacklist first (highest priority)
    const blacklistResult = this.checkBlacklist(domain, path);
    if (!blacklistResult.allowed) {
      return blacklistResult;
    }

    // 2. Check exclude patterns
    const excludePatternResult = this.checkExcludePatterns(url);
    if (!excludePatternResult.allowed) {
      return excludePatternResult;
    }

    // 3. Check whitelist
    const whitelistResult = this.checkWhitelist(domain, path);
    if (whitelistResult.allowed) {
      return whitelistResult;
    }

    // 4. Check include patterns
    const includePatternResult = this.checkIncludePatterns(url);
    if (includePatternResult.allowed) {
      return includePatternResult;
    }

    // 5. Default behavior - if no whitelist exists, allow; if whitelist exists, deny
    const hasWhitelist = this.whitelist.size > 0 || this.patterns.include.length > 0;

    return {
      allowed: !hasWhitelist,
      reason: hasWhitelist ? 'Not in whitelist or include patterns' : 'No restrictions',
      confidence: hasWhitelist ? 0.9 : 0.5,
      metadata: {
        domain,
        path,
        hasWhitelist,
        evaluatedAt: new Date().toISOString()
      }
    };
  }

  /**
   * Look up a domain in a whitelist/blacklist map: exact entry first, then the
   * closest parent domain whose entry has `includeSubdomains` enabled.
   * @param {Map<string, Object>} list - Domain -> config map to search
   * @param {string} domain - Normalized domain to look up
   * @returns {{config: Object, matchType: string, parentDomain?: string}|null} Match details, or null
   */
  findDomainEntry(list, domain) {
    if (list.has(domain)) {
      return { config: list.get(domain), matchType: 'exact' };
    }

    const labels = domain.split('.');
    for (let i = 1; i < labels.length; i++) {
      const parentDomain = labels.slice(i).join('.');
      const config = list.get(parentDomain);
      if (config?.includeSubdomains) {
        return { config, matchType: 'subdomain', parentDomain };
      }
    }

    return null;
  }

  /**
   * Return the first pattern config in `patternList` whose RegExp matches `url`.
   * @param {Array<Object>} patternList - Pattern configs, already priority-sorted
   * @param {string} url - URL to test
   * @returns {Object|null} Matching pattern config, or null
   */
  findMatchingPattern(patternList, url) {
    for (const patternConfig of patternList) {
      // RegExps with the g or y flag remember lastIndex between test() calls;
      // reset it so the same URL always produces the same answer.
      patternConfig.pattern.lastIndex = 0;
      if (patternConfig.pattern.test(url)) {
        return patternConfig;
      }
    }
    return null;
  }

  /**
   * Check blacklist rules
   * @param {string} domain - Domain to check
   * @param {string} path - URL path (currently not used by the check)
   * @returns {Object} Decision object
   */
  checkBlacklist(domain, path) {
    const match = this.findDomainEntry(this.blacklist, domain);

    if (match === null) {
      return { allowed: true, reason: 'Not blacklisted', confidence: 0.5 };
    }

    const { config, matchType, parentDomain } = match;
    if (matchType === 'exact') {
      return {
        allowed: false,
        reason: `Blacklisted domain: ${domain} (${config.reason})`,
        confidence: 1.0,
        metadata: { blacklistConfig: config, matchType: 'exact' }
      };
    }

    return {
      allowed: false,
      reason: `Blacklisted parent domain: ${parentDomain} (${config.reason})`,
      confidence: 0.9,
      metadata: { blacklistConfig: config, matchType: 'subdomain', parentDomain }
    };
  }

  /**
   * Check whitelist rules
   * @param {string} domain - Domain to check
   * @param {string} path - URL path (currently not used by the check)
   * @returns {Object} Decision object
   */
  checkWhitelist(domain, path) {
    const match = this.findDomainEntry(this.whitelist, domain);

    if (match === null) {
      return { allowed: false, reason: 'Not whitelisted', confidence: 0.5 };
    }

    const { config, matchType, parentDomain } = match;
    if (matchType === 'exact') {
      return {
        allowed: true,
        reason: `Whitelisted domain: ${domain}`,
        confidence: 1.0,
        metadata: { whitelistConfig: config, matchType: 'exact' }
      };
    }

    return {
      allowed: true,
      reason: `Whitelisted parent domain: ${parentDomain}`,
      confidence: 0.9,
      metadata: { whitelistConfig: config, matchType: 'subdomain', parentDomain }
    };
  }

  /**
   * Check exclude patterns
   * @param {string} url - URL to check
   * @returns {Object} Decision object
   */
  checkExcludePatterns(url) {
    const patternConfig = this.findMatchingPattern(this.patterns.exclude, url);

    if (patternConfig === null) {
      return { allowed: true, reason: 'No exclude pattern match', confidence: 0.5 };
    }

    return {
      allowed: false,
      reason: `Matches exclude pattern: ${patternConfig.rawPattern}`,
      confidence: 0.95,
      metadata: {
        patternConfig,
        matchType: 'exclude_pattern',
        description: patternConfig.description
      }
    };
  }

  /**
   * Check include patterns
   * @param {string} url - URL to check
   * @returns {Object} Decision object
   */
  checkIncludePatterns(url) {
    const patternConfig = this.findMatchingPattern(this.patterns.include, url);

    if (patternConfig === null) {
      return { allowed: false, reason: 'No include pattern match', confidence: 0.5 };
    }

    return {
      allowed: true,
      reason: `Matches include pattern: ${patternConfig.rawPattern}`,
      confidence: 0.95,
      metadata: {
        patternConfig,
        matchType: 'include_pattern',
        description: patternConfig.description
      }
    };
  }

  /**
   * Export filter configuration
   * @returns {Object} Serializable filter configuration
   */
  exportConfig() {
    // Replace the compiled RegExp with its source string and record the
    // flags, so importConfig() can rebuild an equivalent pattern.
    const serializePattern = (p) => ({
      ...p,
      pattern: p.rawPattern,
      flags: p.pattern.flags
    });

    return {
      version: '1.0',
      exportedAt: new Date().toISOString(),
      options: {
        allowSubdomains: this.allowSubdomains,
        defaultMaxDepth: this.defaultMaxDepth,
        defaultRateLimit: this.defaultRateLimit
      },
      whitelist: Object.fromEntries(this.whitelist),
      blacklist: Object.fromEntries(this.blacklist),
      patterns: {
        include: this.patterns.include.map(serializePattern),
        exclude: this.patterns.exclude.map(serializePattern)
      },
      domainRules: Object.fromEntries(this.domainRules),
      stats: this.getStats()
    };
  }

  /**
   * Import filter configuration, replacing the current lists, patterns and rules.
   *
   * Everything that can fail (pattern compilation) is done before the current
   * configuration is cleared, so a bad import leaves this filter unchanged.
   * @param {Object} config - Configuration to import (as produced by exportConfig)
   * @returns {DomainFilter} This instance, for chaining
   * @throws {Error} If `config` is missing or its version is not '1.0'
   * @throws {SyntaxError} If an imported pattern is not a valid RegExp
   */
  importConfig(config) {
    if (!config || config.version !== '1.0') {
      throw new Error('Invalid or unsupported configuration format');
    }

    const compilePattern = (p) => {
      // `??` so exported case-sensitive patterns (flags: '') stay case-sensitive.
      const compiled = new RegExp(p.pattern, p.flags ?? 'i');
      return {
        ...p,
        pattern: compiled,
        rawPattern: p.pattern,
        flags: compiled.flags
      };
    };

    // Normalize keys so hand-edited configs ("Example.COM") still match lookups.
    const toEntries = (section) =>
      Object.entries(section || {}).map(([domain, value]) => [
        this.normalizeDomain(domain),
        { ...value }
      ]);

    // Stage everything first; nothing below this block can throw.
    const includePatterns = (config.patterns?.include || []).map(compilePattern);
    const excludePatterns = (config.patterns?.exclude || []).map(compilePattern);
    const whitelistEntries = toEntries(config.whitelist);
    const blacklistEntries = toEntries(config.blacklist);
    const ruleEntries = toEntries(config.domainRules);

    // Clear existing configuration
    this.clearAll();

    // Import options
    if (config.options) {
      this.allowSubdomains = config.options.allowSubdomains ?? true;
      this.defaultMaxDepth = config.options.defaultMaxDepth ?? 5;
      this.defaultRateLimit = config.options.defaultRateLimit ?? 10;
    }

    for (const [domain, options] of whitelistEntries) {
      this.whitelist.set(domain, options);
    }
    for (const [domain, options] of blacklistEntries) {
      this.blacklist.set(domain, options);
    }
    for (const [domain, rules] of ruleEntries) {
      this.domainRules.set(domain, rules);
    }

    this.patterns.include = includePatterns;
    this.patterns.exclude = excludePatterns;

    this.clearCache();
    return this;
  }

  /**
   * Get filter statistics
   * @returns {Object} Statistics object
   */
  getStats() {
    return {
      whitelist: {
        domains: this.whitelist.size,
        withSubdomains: Array.from(this.whitelist.values()).filter(c => c.includeSubdomains).length
      },
      blacklist: {
        domains: this.blacklist.size,
        withSubdomains: Array.from(this.blacklist.values()).filter(c => c.includeSubdomains).length
      },
      patterns: {
        include: this.patterns.include.length,
        exclude: this.patterns.exclude.length
      },
      domainRules: this.domainRules.size,
      cache: {
        size: this.cache.size,
        hits: this.cacheHits,
        misses: this.cacheMisses,
        // 0 / 0 is NaN, which `|| 0` turns into 0 when nothing was looked up yet.
        hitRate: this.cacheHits / (this.cacheHits + this.cacheMisses) || 0
      }
    };
  }

  /**
   * Clear all filters and rules
   * @returns {DomainFilter} This instance, for chaining
   */
  clearAll() {
    this.whitelist.clear();
    this.blacklist.clear();
    this.patterns.include = [];
    this.patterns.exclude = [];
    this.domainRules.clear();
    this.clearCache();
    return this;
  }

  /**
   * Clear the decision cache and reset the hit/miss counters
   */
  clearCache() {
    this.cache.clear();
    this.cacheHits = 0;
    this.cacheMisses = 0;
  }

  /**
   * Add decision to cache with size management
   * @param {string} url - URL key
   * @param {Object} decision - Decision object
   */
  addToCache(url, decision) {
    if (this.cache.size >= this.cacheSize) {
      // Remove oldest entry (simple FIFO): a Map iterates in insertion order,
      // so the first key is the one that was added earliest.
      const firstKey = this.cache.keys().next().value;
      this.cache.delete(firstKey);
    }
    this.cache.set(url, decision);
  }

  /**
   * Normalize domain name for consistent lookup: trims whitespace, lowercases,
   * and drops trailing dots so "Example.com." and "example.com" are one key.
   * @param {string} domain - Domain to normalize
   * @returns {string} Normalized domain
   * @throws {TypeError} If `domain` is not a string
   */
  normalizeDomain(domain) {
    if (typeof domain !== 'string') {
      throw new TypeError('Domain must be a string');
    }

    let normalized = domain.trim().toLowerCase();
    while (normalized.endsWith('.')) {
      normalized = normalized.slice(0, -1);
    }
    return normalized;
  }

  /**
   * Validate and get effective configuration for a crawl operation
   * @param {string} startUrl - Starting URL for crawl
   * @param {Object} crawlOptions - Crawl options; non-nullish values override the domain rules
   * @returns {Object} Effective configuration
   * @throws {Error} If `startUrl` cannot be parsed
   */
  getEffectiveConfig(startUrl, crawlOptions = {}) {
    try {
      const urlObj = new URL(startUrl);
      const domain = urlObj.hostname;
      const domainRules = this.getDomainRules(domain);

      return {
        domain,
        isAllowed: this.isAllowed(startUrl),
        domainRules,
        effectiveOptions: {
          maxDepth: crawlOptions.maxDepth ?? domainRules.maxDepth,
          maxPages: crawlOptions.maxPages ?? domainRules.maxPages,
          rateLimit: crawlOptions.rateLimit ?? domainRules.rateLimit,
          concurrency: crawlOptions.concurrency ?? domainRules.concurrency,
          timeout: crawlOptions.timeout ?? domainRules.timeout,
          respectRobots: crawlOptions.respectRobots ?? domainRules.respectRobots,
          // Per-crawl headers win over domain-level headers on key collisions.
          customHeaders: { ...domainRules.customHeaders, ...(crawlOptions.customHeaders || {}) }
        }
      };
    } catch (error) {
      throw new Error(`Invalid start URL: ${error.message}`, { cause: error });
    }
  }
}
|
|
611
|
+
|
|
612
|
+
export default DomainFilter;
|