crawlforge-mcp-server 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +315 -0
- package/LICENSE +21 -0
- package/README.md +181 -0
- package/package.json +115 -0
- package/server.js +1963 -0
- package/setup.js +112 -0
- package/src/constants/config.js +615 -0
- package/src/core/ActionExecutor.js +1104 -0
- package/src/core/AlertNotificationSystem.js +601 -0
- package/src/core/AuthManager.js +315 -0
- package/src/core/ChangeTracker.js +2306 -0
- package/src/core/JobManager.js +687 -0
- package/src/core/LLMsTxtAnalyzer.js +753 -0
- package/src/core/LocalizationManager.js +1615 -0
- package/src/core/PerformanceManager.js +828 -0
- package/src/core/ResearchOrchestrator.js +1327 -0
- package/src/core/SnapshotManager.js +1037 -0
- package/src/core/StealthBrowserManager.js +1795 -0
- package/src/core/WebhookDispatcher.js +745 -0
- package/src/core/analysis/ContentAnalyzer.js +749 -0
- package/src/core/analysis/LinkAnalyzer.js +972 -0
- package/src/core/cache/CacheManager.js +821 -0
- package/src/core/connections/ConnectionPool.js +553 -0
- package/src/core/crawlers/BFSCrawler.js +845 -0
- package/src/core/integrations/PerformanceIntegration.js +377 -0
- package/src/core/llm/AnthropicProvider.js +135 -0
- package/src/core/llm/LLMManager.js +415 -0
- package/src/core/llm/LLMProvider.js +97 -0
- package/src/core/llm/OpenAIProvider.js +127 -0
- package/src/core/processing/BrowserProcessor.js +986 -0
- package/src/core/processing/ContentProcessor.js +505 -0
- package/src/core/processing/PDFProcessor.js +448 -0
- package/src/core/processing/StreamProcessor.js +673 -0
- package/src/core/queue/QueueManager.js +98 -0
- package/src/core/workers/WorkerPool.js +585 -0
- package/src/core/workers/worker.js +743 -0
- package/src/monitoring/healthCheck.js +600 -0
- package/src/monitoring/metrics.js +761 -0
- package/src/optimization/wave3-optimizations.js +932 -0
- package/src/security/security-patches.js +120 -0
- package/src/security/security-tests.js +355 -0
- package/src/security/wave3-security.js +652 -0
- package/src/tools/advanced/BatchScrapeTool.js +1089 -0
- package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
- package/src/tools/crawl/crawlDeep.js +449 -0
- package/src/tools/crawl/mapSite.js +400 -0
- package/src/tools/extract/analyzeContent.js +624 -0
- package/src/tools/extract/extractContent.js +329 -0
- package/src/tools/extract/processDocument.js +503 -0
- package/src/tools/extract/summarizeContent.js +376 -0
- package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
- package/src/tools/research/deepResearch.js +706 -0
- package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
- package/src/tools/search/adapters/googleSearch.js +236 -0
- package/src/tools/search/adapters/searchProviderFactory.js +96 -0
- package/src/tools/search/queryExpander.js +543 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
- package/src/tools/search/ranking/ResultRanker.js +497 -0
- package/src/tools/search/searchWeb.js +482 -0
- package/src/tools/tracking/trackChanges.js +1355 -0
- package/src/utils/CircuitBreaker.js +515 -0
- package/src/utils/ErrorHandlingConfig.js +342 -0
- package/src/utils/HumanBehaviorSimulator.js +569 -0
- package/src/utils/Logger.js +568 -0
- package/src/utils/MemoryMonitor.js +173 -0
- package/src/utils/RetryManager.js +386 -0
- package/src/utils/contentUtils.js +588 -0
- package/src/utils/domainFilter.js +612 -0
- package/src/utils/inputValidation.js +766 -0
- package/src/utils/rateLimiter.js +196 -0
- package/src/utils/robotsChecker.js +91 -0
- package/src/utils/securityMiddleware.js +416 -0
- package/src/utils/sitemapParser.js +678 -0
- package/src/utils/ssrfProtection.js +640 -0
- package/src/utils/urlNormalizer.js +168 -0
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
/**
 * Produce a canonical string form of a URL so that equivalent URLs compare
 * equal as strings.
 *
 * Steps applied: lowercase the hostname, strip one trailing slash from a
 * non-root path, sort query parameters by name, and drop the fragment.
 *
 * @param {string} url - Absolute URL to normalize.
 * @returns {string} The normalized URL.
 * @throws {Error} `Invalid URL: <url>` when the input cannot be parsed.
 */
export function normalizeUrl(url) {
  try {
    const urlObj = new URL(url);

    // The parser already lowercases hosts of http(s) URLs; the assignment
    // covers the remaining schemes.
    urlObj.hostname = urlObj.hostname.toLowerCase();

    // No explicit default-port handling: the URL parser itself drops :80 for
    // http and :443 for https, so `port` is already '' in those cases.

    // Remove trailing slash from pathname (but keep the root "/").
    if (urlObj.pathname.endsWith('/') && urlObj.pathname.length > 1) {
      urlObj.pathname = urlObj.pathname.slice(0, -1);
    }

    // Sort query parameters for consistency. The sort is stable, so repeated
    // keys (?a=1&a=2) keep every value in its original relative order instead
    // of being collapsed to the first value.
    if (urlObj.search) {
      urlObj.searchParams.sort();
    }

    // Remove fragment
    urlObj.hash = '';

    return urlObj.toString();
  } catch (error) {
    // Keep the original message; attach the parse error for debugging.
    throw new Error(`Invalid URL: ${url}`, { cause: error });
  }
}
|
|
37
|
+
|
|
38
|
+
/**
 * Report whether a value can be parsed as an absolute URL.
 *
 * @param {string} url - Candidate URL.
 * @returns {boolean} `true` when `new URL(url)` succeeds, otherwise `false`.
 */
export function isValidUrl(url) {
  let parses = true;

  try {
    // Only the side effect matters: the constructor throws on invalid input.
    void new URL(url);
  } catch {
    parses = false;
  }

  return parses;
}
|
|
46
|
+
|
|
47
|
+
/**
 * Check whether two URLs share exactly the same hostname.
 *
 * Only the hostname is compared; scheme, port and path are ignored.
 *
 * @param {string} url1 - First absolute URL.
 * @param {string} url2 - Second absolute URL.
 * @returns {boolean} `true` when both parse and their hostnames are equal;
 *   `false` otherwise, including when either URL is invalid.
 */
export function isSameDomain(url1, url2) {
  let hostnames;

  try {
    hostnames = [url1, url2].map((candidate) => new URL(candidate).hostname);
  } catch {
    return false;
  }

  const [firstHost, secondHost] = hostnames;
  return firstHost === secondHost;
}
|
|
56
|
+
|
|
57
|
+
/**
 * Check whether the hostname of `url` ends with the full hostname of
 * `baseUrl`, compared label by label.
 *
 * An identical hostname also counts as a match.
 *
 * @param {string} url - Absolute URL to test.
 * @param {string} baseUrl - Absolute URL whose hostname is the base domain.
 * @returns {boolean} `true` when the trailing labels of `url`'s hostname equal
 *   the labels of `baseUrl`'s hostname; `false` otherwise, including when
 *   either URL is invalid.
 */
export function isSubdomain(url, baseUrl) {
  let hostLabels;
  let baseLabels;

  try {
    hostLabels = new URL(url).hostname.split('.');
    baseLabels = new URL(baseUrl).hostname.split('.');
  } catch {
    return false;
  }

  // Align the base labels against the tail of the candidate hostname.
  const offset = hostLabels.length - baseLabels.length;
  if (offset < 0) {
    return false;
  }

  return baseLabels.every((label, index) => hostLabels[offset + index] === label);
}
|
|
77
|
+
|
|
78
|
+
/**
 * Return the hostname portion of a URL.
 *
 * @param {string} url - Absolute URL.
 * @returns {string|null} The hostname, or `null` when the URL cannot be parsed.
 */
export function extractDomain(url) {
  let hostname = null;

  try {
    ({ hostname } = new URL(url));
  } catch {
    // Unparseable input: fall through with the null default.
  }

  return hostname;
}
|
|
86
|
+
|
|
87
|
+
/**
 * Collect the distinct absolute URLs referenced by quoted `href` attributes
 * in an HTML string.
 *
 * Uses a regular expression rather than an HTML parser, so any
 * `href="..."` / `href='...'` text in the markup is considered. Fragment-only
 * links and `javascript:` links are skipped; values that cannot be resolved
 * against `baseUrl` are ignored.
 *
 * @param {string} html - Markup to scan.
 * @param {string} baseUrl - Base URL used to resolve relative hrefs.
 * @returns {string[]} Unique absolute URLs in order of first appearance.
 */
export function extractLinks(html, baseUrl) {
  const links = new Set();

  // Each alternative closes on the same quote character that opened the
  // value, so an apostrophe inside a double-quoted href (or vice versa) no
  // longer truncates the captured URL.
  const hrefRegex = /href\s*=\s*(?:"([^"]*)"|'([^']*)')/gi;
  let match;

  while ((match = hrefRegex.exec(html)) !== null) {
    const rawHref = match[1] !== undefined ? match[1] : match[2];
    // Surrounding whitespace is not part of the URL; trimming first makes the
    // prefix checks below reliable.
    const href = rawHref.trim();

    if (!href || href.startsWith('#')) {
      continue;
    }

    // URL schemes are case-insensitive, so "JavaScript:" must be skipped too.
    if (href.toLowerCase().startsWith('javascript:')) {
      continue;
    }

    try {
      const absoluteUrl = new URL(href, baseUrl);
      links.add(absoluteUrl.toString());
    } catch {
      // Invalid URL, skip it
    }
  }

  return Array.from(links);
}
|
|
109
|
+
|
|
110
|
+
/**
 * Count the non-empty path segments of a URL.
 *
 * @param {string} url - Absolute URL.
 * @returns {number} Number of non-empty segments in the pathname; `0` for the
 *   root path or when the URL cannot be parsed.
 */
export function getUrlDepth(url) {
  let pathname;

  try {
    ({ pathname } = new URL(url));
  } catch {
    return 0;
  }

  let depth = 0;
  for (const segment of pathname.split('/')) {
    // Leading, trailing and doubled slashes produce empty segments; skip them.
    if (segment.length > 0) {
      depth += 1;
    }
  }

  return depth;
}
|
|
119
|
+
|
|
120
|
+
/**
 * Report whether a URL's path ends with one of a fixed list of document,
 * archive, image, media or installer file extensions.
 *
 * The comparison is case-insensitive and looks only at the pathname, so query
 * strings and fragments are ignored.
 *
 * @param {string} url - Absolute URL.
 * @returns {boolean} `true` when the pathname ends with a listed extension;
 *   `false` otherwise, including when the URL cannot be parsed.
 */
export function isFileUrl(url) {
  const knownExtensions = [
    '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
    '.zip', '.rar', '.tar', '.gz', '.7z',
    '.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp',
    '.mp3', '.mp4', '.avi', '.mov', '.wmv',
    '.exe', '.dmg', '.pkg', '.deb', '.rpm'
  ];

  let lowerPath;
  try {
    lowerPath = new URL(url).pathname.toLowerCase();
  } catch {
    return false;
  }

  for (const extension of knownExtensions) {
    if (lowerPath.endsWith(extension)) {
      return true;
    }
  }

  return false;
}
|
|
137
|
+
|
|
138
|
+
/**
 * Strip the query string from a URL.
 *
 * @param {string} url - Absolute URL.
 * @returns {string} The serialized URL without its query string, or the input
 *   unchanged when it cannot be parsed.
 */
export function removeQueryParameters(url) {
  let parsed;

  try {
    parsed = new URL(url);
  } catch {
    // Not a parseable URL: hand the input back untouched.
    return url;
  }

  parsed.search = '';
  return parsed.href;
}
|
|
147
|
+
|
|
148
|
+
/**
 * Build the `protocol//host` prefix of a URL (scheme, hostname and any
 * non-default port), without path, query or fragment.
 *
 * @param {string} url - Absolute URL.
 * @returns {string|null} The base URL, or `null` when the URL cannot be parsed.
 */
export function getBaseUrl(url) {
  let parsed;

  try {
    parsed = new URL(url);
  } catch {
    return null;
  }

  const { protocol, host } = parsed;
  return [protocol, host].join('//');
}
|
|
156
|
+
|
|
157
|
+
// Default export: the same helpers that are exported by name from this
// module, gathered into one object for callers that import the module as a
// single namespace.
const urlNormalizerApi = {
  normalizeUrl,
  isValidUrl,
  isSameDomain,
  isSubdomain,
  extractDomain,
  extractLinks,
  getUrlDepth,
  isFileUrl,
  removeQueryParameters,
  getBaseUrl,
};

export default urlNormalizerApi;
|