crawlforge-mcp-server 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +315 -0
- package/LICENSE +21 -0
- package/README.md +181 -0
- package/package.json +115 -0
- package/server.js +1963 -0
- package/setup.js +112 -0
- package/src/constants/config.js +615 -0
- package/src/core/ActionExecutor.js +1104 -0
- package/src/core/AlertNotificationSystem.js +601 -0
- package/src/core/AuthManager.js +315 -0
- package/src/core/ChangeTracker.js +2306 -0
- package/src/core/JobManager.js +687 -0
- package/src/core/LLMsTxtAnalyzer.js +753 -0
- package/src/core/LocalizationManager.js +1615 -0
- package/src/core/PerformanceManager.js +828 -0
- package/src/core/ResearchOrchestrator.js +1327 -0
- package/src/core/SnapshotManager.js +1037 -0
- package/src/core/StealthBrowserManager.js +1795 -0
- package/src/core/WebhookDispatcher.js +745 -0
- package/src/core/analysis/ContentAnalyzer.js +749 -0
- package/src/core/analysis/LinkAnalyzer.js +972 -0
- package/src/core/cache/CacheManager.js +821 -0
- package/src/core/connections/ConnectionPool.js +553 -0
- package/src/core/crawlers/BFSCrawler.js +845 -0
- package/src/core/integrations/PerformanceIntegration.js +377 -0
- package/src/core/llm/AnthropicProvider.js +135 -0
- package/src/core/llm/LLMManager.js +415 -0
- package/src/core/llm/LLMProvider.js +97 -0
- package/src/core/llm/OpenAIProvider.js +127 -0
- package/src/core/processing/BrowserProcessor.js +986 -0
- package/src/core/processing/ContentProcessor.js +505 -0
- package/src/core/processing/PDFProcessor.js +448 -0
- package/src/core/processing/StreamProcessor.js +673 -0
- package/src/core/queue/QueueManager.js +98 -0
- package/src/core/workers/WorkerPool.js +585 -0
- package/src/core/workers/worker.js +743 -0
- package/src/monitoring/healthCheck.js +600 -0
- package/src/monitoring/metrics.js +761 -0
- package/src/optimization/wave3-optimizations.js +932 -0
- package/src/security/security-patches.js +120 -0
- package/src/security/security-tests.js +355 -0
- package/src/security/wave3-security.js +652 -0
- package/src/tools/advanced/BatchScrapeTool.js +1089 -0
- package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
- package/src/tools/crawl/crawlDeep.js +449 -0
- package/src/tools/crawl/mapSite.js +400 -0
- package/src/tools/extract/analyzeContent.js +624 -0
- package/src/tools/extract/extractContent.js +329 -0
- package/src/tools/extract/processDocument.js +503 -0
- package/src/tools/extract/summarizeContent.js +376 -0
- package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
- package/src/tools/research/deepResearch.js +706 -0
- package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
- package/src/tools/search/adapters/googleSearch.js +236 -0
- package/src/tools/search/adapters/searchProviderFactory.js +96 -0
- package/src/tools/search/queryExpander.js +543 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
- package/src/tools/search/ranking/ResultRanker.js +497 -0
- package/src/tools/search/searchWeb.js +482 -0
- package/src/tools/tracking/trackChanges.js +1355 -0
- package/src/utils/CircuitBreaker.js +515 -0
- package/src/utils/ErrorHandlingConfig.js +342 -0
- package/src/utils/HumanBehaviorSimulator.js +569 -0
- package/src/utils/Logger.js +568 -0
- package/src/utils/MemoryMonitor.js +173 -0
- package/src/utils/RetryManager.js +386 -0
- package/src/utils/contentUtils.js +588 -0
- package/src/utils/domainFilter.js +612 -0
- package/src/utils/inputValidation.js +766 -0
- package/src/utils/rateLimiter.js +196 -0
- package/src/utils/robotsChecker.js +91 -0
- package/src/utils/securityMiddleware.js +416 -0
- package/src/utils/sitemapParser.js +678 -0
- package/src/utils/ssrfProtection.js +640 -0
- package/src/utils/urlNormalizer.js +168 -0
|
@@ -0,0 +1,753 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import { load } from 'cheerio';
|
|
3
|
+
import { MapSiteTool } from '../tools/crawl/mapSite.js';
|
|
4
|
+
import { CrawlDeepTool } from '../tools/crawl/crawlDeep.js';
|
|
5
|
+
import { normalizeUrl, getBaseUrl } from '../utils/urlNormalizer.js';
|
|
6
|
+
import { Logger } from '../utils/Logger.js';
|
|
7
|
+
|
|
8
|
+
// Module-level logger, tagged 'LLMsTxtAnalyzer', used by the analysis phases below.
const logger = new Logger('LLMsTxtAnalyzer');
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* LLMsTxtAnalyzer - Comprehensive website analysis for LLMs.txt generation
|
|
12
|
+
*
|
|
13
|
+
* This analyzer performs deep website analysis to understand:
|
|
14
|
+
* - Site structure and navigation patterns
|
|
15
|
+
* - API endpoints and data sources
|
|
16
|
+
* - Content types and classification
|
|
17
|
+
* - Security boundaries and sensitive areas
|
|
18
|
+
* - Rate limiting recommendations
|
|
19
|
+
* - Usage guidelines for AI models
|
|
20
|
+
*/
|
|
21
|
+
export class LLMsTxtAnalyzer {
  /**
   * Merge caller-supplied options over the built-in defaults.
   *
   * Defaults are applied AFTER the caller's keys are copied, and only when the
   * caller's value is null/undefined. This way an explicit `undefined` can no
   * longer erase a default (e.g. disable the timeout or robots handling).
   * Unknown keys are preserved untouched.
   *
   * @param {object} [options] - Raw constructor options.
   * @returns {object} Fully populated options object.
   */
  static resolveOptions(options = {}) {
    return {
      ...options,
      maxDepth: options.maxDepth ?? 3,
      maxPages: options.maxPages ?? 100,
      timeout: options.timeout ?? 30000,
      userAgent: options.userAgent ?? 'LLMs.txt-Analyzer/1.0',
      respectRobots: options.respectRobots ?? true,
      detectAPIs: options.detectAPIs ?? true,
      analyzeContent: options.analyzeContent ?? true,
      checkSecurity: options.checkSecurity ?? true
    };
  }

  /**
   * Build an empty analysis result with every section in its initial state.
   *
   * @returns {object} Fresh result container.
   */
  static createEmptyAnalysis() {
    return {
      structure: {},
      apis: [],
      contentTypes: {},
      securityAreas: [],
      rateLimit: {},
      guidelines: {},
      metadata: {},
      errors: []
    };
  }

  /**
   * @param {object} [options] - Analyzer configuration; see resolveOptions for
   *   the recognised keys and their defaults.
   */
  constructor(options = {}) {
    this.options = LLMsTxtAnalyzer.resolveOptions(options);

    const toolOptions = {
      timeout: this.options.timeout,
      userAgent: this.options.userAgent
    };
    this.mapSiteTool = new MapSiteTool({ ...toolOptions });
    this.crawlDeepTool = new CrawlDeepTool({ ...toolOptions });

    this.analysis = LLMsTxtAnalyzer.createEmptyAnalysis();
  }

  /**
   * Append a failure entry for the given phase to `this.analysis.errors`.
   *
   * @param {string} phase - Phase identifier (e.g. 'structure', 'apis').
   * @param {Error} error - The error that interrupted the phase.
   */
  recordError(phase, error) {
    this.analysis.errors.push({
      phase,
      error: error.message,
      timestamp: new Date().toISOString()
    });
  }

  /**
   * Run every analysis phase against a website and return the combined result.
   *
   * The result container is recreated on each call so that errors and data
   * from a previous run on the same instance never leak into this one.
   *
   * @param {string} url - Website URL to analyze.
   * @param {object} [options] - Per-call options; recorded in the result
   *   metadata and forwarded to the structure phase.
   * @returns {Promise<object>} The populated analysis object.
   * @throws Re-throws any error that escapes the individual phases.
   */
  async analyzeWebsite(url, options = {}) {
    const startTime = Date.now();
    logger.info(`Starting comprehensive analysis for: ${url}`);

    this.analysis = LLMsTxtAnalyzer.createEmptyAnalysis();

    try {
      const baseUrl = getBaseUrl(url);
      this.analysis.metadata = {
        baseUrl,
        analyzedAt: new Date().toISOString(),
        analyzer: 'LLMs.txt-Analyzer/1.0',
        analysisOptions: { ...this.options, ...options }
      };

      // Phase 1: Site Structure Analysis
      await this.analyzeSiteStructure(url, options);

      // Phase 2: API Detection
      if (this.options.detectAPIs) {
        await this.detectAPIEndpoints(url);
      }

      // Phase 3: Content Classification
      if (this.options.analyzeContent) {
        await this.classifyContent();
      }

      // Phase 4: Security Analysis
      if (this.options.checkSecurity) {
        await this.analyzeSecurity(url);
      }

      // Phase 5: Rate Limiting Analysis
      await this.analyzeRateLimiting(url);

      // Phase 6: Generate Guidelines
      await this.generateUsageGuidelines();

      const analysisTime = Date.now() - startTime;
      this.analysis.metadata.analysisTimeMs = analysisTime;

      logger.info(`Analysis completed in ${analysisTime}ms`);
      return this.analysis;
    } catch (error) {
      logger.error(`Analysis failed: ${error.message}`);
      this.recordError('general', error);
      throw error;
    }
  }

  /**
   * Map and crawl the site, then store the structural summary in
   * `this.analysis.structure`. Failures are logged and recorded, not thrown.
   *
   * @param {string} url - Website URL to map and crawl.
   * @param {object} [options] - Currently unused by this phase.
   * @returns {Promise<void>}
   */
  async analyzeSiteStructure(url, options = {}) {
    logger.info('Analyzing site structure...');

    try {
      // Get comprehensive site map
      const siteMap = await this.mapSiteTool.execute({
        url,
        include_sitemap: true,
        max_urls: this.options.maxPages,
        group_by_path: true,
        include_metadata: true
      });

      // Perform targeted crawl for deeper analysis (capped at depth 3 / 50 pages)
      const crawlResult = await this.crawlDeepTool.execute({
        url,
        max_depth: Math.min(this.options.maxDepth, 3),
        max_pages: Math.min(this.options.maxPages, 50),
        extract_content: true,
        respect_robots: this.options.respectRobots
      });

      this.analysis.structure = {
        siteMap: siteMap.site_map,
        totalPages: siteMap.total_urls,
        sections: this.categorizeSections(siteMap.urls),
        navigation: this.analyzeNavigation(crawlResult?.pages),
        hierarchy: this.buildHierarchy(siteMap.urls),
        robotsTxt: await this.fetchRobotsTxt(url),
        sitemap: siteMap.urls || []
      };

      logger.info(`Analyzed ${siteMap.total_urls} pages in site structure`);
    } catch (error) {
      logger.error(`Site structure analysis failed: ${error.message}`);
      this.recordError('structure', error);
    }
  }

  /**
   * Probe well-known API paths and scan the main page for API documentation
   * links; the findings are stored in `this.analysis.apis`.
   *
   * Probe paths are resolved against the site origin with `new URL`, so a
   * base URL with a trailing slash or a page path still yields `/api` etc.
   * A failure while scanning the main page is recorded but no longer discards
   * the endpoints already found by probing.
   *
   * @param {string} baseUrl - Website URL used as the resolution base.
   * @returns {Promise<void>}
   */
  async detectAPIEndpoints(baseUrl) {
    logger.info('Detecting API endpoints...');

    try {
      const apis = [];
      const commonPaths = [
        '/api', '/v1', '/v2', '/v3', '/rest', '/graphql',
        '/data', '/feed', '/json', '/xml', '/rss',
        '/.well-known', '/openapi', '/swagger'
      ];

      // Check common API paths
      for (const path of commonPaths) {
        try {
          const apiUrl = new URL(path, baseUrl).toString();
          const response = await this.fetchWithTimeout(apiUrl, { timeout: 5000 });
          if (response.ok) {
            const contentType = response.headers.get('content-type') || '';
            apis.push({
              url: apiUrl,
              type: this.determineAPIType(apiUrl, contentType),
              status: response.status,
              contentType,
              accessible: true
            });
          }
          await this.discardBody(response);
        } catch {
          // API endpoint not accessible, doesn't exist, or URL could not be built
        }
      }

      // Look for API documentation references
      try {
        const mainPageResponse = await this.fetchWithTimeout(baseUrl);
        if (mainPageResponse.ok) {
          const html = await mainPageResponse.text();
          const $ = load(html);
          const seenDocUrls = new Set();

          // Find API documentation links
          $('a[href*="api"], a[href*="developer"], a[href*="docs"]').each((_, element) => {
            const href = $(element).attr('href');
            const text = $(element).text().toLowerCase();
            if (!href || !(text.includes('api') || text.includes('developer'))) {
              return;
            }

            let docUrl;
            try {
              docUrl = new URL(href, baseUrl);
            } catch {
              return; // malformed href: skip this link, keep scanning
            }
            if (docUrl.protocol !== 'http:' && docUrl.protocol !== 'https:') {
              return; // mailto:, javascript:, etc. are not documentation pages
            }

            const resolved = docUrl.toString();
            if (seenDocUrls.has(resolved)) {
              return;
            }
            seenDocUrls.add(resolved);

            apis.push({
              url: resolved,
              type: 'documentation',
              description: text.trim()
            });
          });
        } else {
          await this.discardBody(mainPageResponse);
        }
      } catch (error) {
        logger.error(`API detection failed: ${error.message}`);
        this.recordError('apis', error);
      }

      this.analysis.apis = apis;
      logger.info(`Detected ${apis.length} API endpoints`);
    } catch (error) {
      logger.error(`API detection failed: ${error.message}`);
      this.recordError('apis', error);
    }
  }

  /**
   * Classify up to 20 URLs from the structure phase and group them by
   * category in `this.analysis.contentTypes`.
   *
   * An `other` bucket receives pages that classifyPage reports as
   * inaccessible or failed, instead of dropping them.
   *
   * @returns {Promise<void>}
   */
  async classifyContent() {
    logger.info('Classifying content types...');

    try {
      const contentTypes = {
        public: [],
        restricted: [],
        dynamic: [],
        static: [],
        forms: [],
        media: [],
        documents: [],
        other: []
      };

      // Analyze pages from structure analysis (flat list or path-grouped map)
      const sitemapUrls = this.analysis.structure?.sitemap || [];
      const urlsToAnalyze = Array.isArray(sitemapUrls) ? sitemapUrls :
        (typeof sitemapUrls === 'object' ? Object.values(sitemapUrls).flat() : []);

      for (const url of urlsToAnalyze.slice(0, 20)) {
        try {
          const classification = await this.classifyPage(url);
          const bucket = contentTypes[classification.category] || contentTypes.other;
          bucket.push({
            url,
            type: classification.type,
            confidence: classification.confidence,
            metadata: classification.metadata
          });
        } catch (error) {
          logger.warn(`Failed to classify page ${url}: ${error.message}`);
        }
      }

      this.analysis.contentTypes = contentTypes;
      logger.info('Content classification completed');
    } catch (error) {
      logger.error(`Content classification failed: ${error.message}`);
      this.recordError('content', error);
    }
  }

  /**
   * Probe common sensitive paths and collect security-related response
   * headers from the main page.
   *
   * The probed areas are stored before the header request, so a failure of
   * that request no longer discards them.
   *
   * @param {string} baseUrl - Website URL used as the resolution base.
   * @returns {Promise<void>}
   */
  async analyzeSecurity(baseUrl) {
    logger.info('Analyzing security boundaries...');

    try {
      const securityAreas = [];

      // Check for common sensitive paths
      const sensitivePaths = [
        '/admin', '/administrator', '/wp-admin', '/cms',
        '/login', '/signin', '/auth', '/oauth',
        '/user', '/account', '/profile', '/dashboard',
        '/private', '/internal', '/secure',
        '/config', '/settings', '/env'
      ];

      for (const path of sensitivePaths) {
        try {
          const testUrl = new URL(path, baseUrl).toString();
          const response = await this.fetchWithTimeout(testUrl, { timeout: 3000 });
          // NOTE(review): fetch follows redirects by default, so 302 is
          // normally never observed here; the check is kept for parity.
          if (response.status === 200 || response.status === 302 || response.status === 401) {
            securityAreas.push({
              path,
              url: testUrl,
              status: response.status,
              type: this.classifySecurityArea(path),
              recommendation: 'restrict'
            });
          }
          await this.discardBody(response);
        } catch {
          // Area not accessible
        }
      }

      this.analysis.securityAreas = securityAreas;

      // Check for security headers
      const mainResponse = await this.fetchWithTimeout(baseUrl);
      this.analysis.securityHeaders = this.analyzeSecurityHeaders(mainResponse.headers);
      await this.discardBody(mainResponse);

      logger.info(`Identified ${securityAreas.length} security areas`);
    } catch (error) {
      logger.error(`Security analysis failed: ${error.message}`);
      this.recordError('security', error);
    }
  }

  /**
   * Time five sequential requests to the site and derive rate-limit
   * recommendations from the average; stored in `this.analysis.rateLimit`.
   *
   * @param {string} baseUrl - URL to request.
   * @returns {Promise<void>}
   */
  async analyzeRateLimiting(baseUrl) {
    logger.info('Analyzing rate limiting requirements...');

    try {
      // Test response times and determine appropriate limits
      const testRequests = 5;
      const responseTimes = [];

      for (let i = 0; i < testRequests; i++) {
        const start = Date.now();
        try {
          const response = await this.fetchWithTimeout(baseUrl, { timeout: 10000 });
          responseTimes.push(Date.now() - start);
          await this.discardBody(response);
        } catch {
          responseTimes.push(10000); // Failed request counts as the max timeout
        }
      }

      const avgResponseTime = responseTimes.reduce((a, b) => a + b, 0) / responseTimes.length;

      this.analysis.rateLimit = {
        averageResponseTime: avgResponseTime,
        recommendedDelay: Math.max(100, Math.floor(avgResponseTime * 0.5)),
        maxConcurrency: avgResponseTime > 2000 ? 2 : (avgResponseTime > 1000 ? 5 : 10),
        recommendedRPM: avgResponseTime > 2000 ? 10 : (avgResponseTime > 1000 ? 30 : 60),
        reasoning: this.generateRateLimitReasoning(avgResponseTime)
      };

      logger.info(`Rate limiting analysis completed. Avg response: ${avgResponseTime}ms`);
    } catch (error) {
      logger.error(`Rate limiting analysis failed: ${error.message}`);
      this.recordError('rateLimit', error);
    }
  }

  /**
   * Assemble `this.analysis.guidelines` from the results of earlier phases.
   *
   * @returns {Promise<void>}
   */
  async generateUsageGuidelines() {
    logger.info('Generating usage guidelines...');

    try {
      this.analysis.guidelines = {
        crawling: this.generateCrawlingGuidelines(),
        apis: this.generateAPIGuidelines(),
        rateLimit: this.generateRateLimitGuidelines(),
        content: this.generateContentGuidelines(),
        security: this.generateSecurityGuidelines(),
        compliance: this.generateComplianceGuidelines()
      };

      logger.info('Usage guidelines generated');
    } catch (error) {
      logger.error(`Guidelines generation failed: ${error.message}`);
      this.recordError('guidelines', error);
    }
  }

  // Helper methods

  /**
   * `fetch` wrapper that aborts the request after `timeout` milliseconds.
   *
   * Caller headers are merged with (not substituted for) the analyzer's
   * User-Agent, a caller-supplied `signal` is forwarded to the internal
   * controller so the timeout keeps working, and `timeout` itself is not
   * passed on to `fetch`.
   *
   * @param {string} url - URL to request.
   * @param {object} [options] - fetch init options plus optional `timeout` (ms).
   * @returns {Promise<Response>} The fetch response.
   * @throws Whatever `fetch` rejects with, including the abort error on timeout.
   */
  async fetchWithTimeout(url, options = {}) {
    const { timeout = this.options.timeout, headers, signal, ...init } = options;

    const controller = new AbortController();
    const forwardAbort = () => controller.abort(signal.reason);
    if (signal) {
      if (signal.aborted) {
        forwardAbort();
      } else {
        signal.addEventListener('abort', forwardAbort, { once: true });
      }
    }

    const requestHeaders = new Headers(headers);
    if (!requestHeaders.has('user-agent') && this.options.userAgent != null) {
      requestHeaders.set('User-Agent', this.options.userAgent);
    }

    const timeoutId = setTimeout(() => controller.abort(), timeout);
    try {
      return await fetch(url, {
        ...init,
        headers: requestHeaders,
        signal: controller.signal
      });
    } finally {
      clearTimeout(timeoutId);
      signal?.removeEventListener('abort', forwardAbort);
    }
  }

  /**
   * Cancel a response body that will not be read. Never throws.
   *
   * @param {Response} response - Response whose body is no longer needed.
   * @returns {Promise<void>}
   */
  async discardBody(response) {
    try {
      await response?.body?.cancel();
    } catch {
      // Body already consumed or locked: nothing left to release
    }
  }

  /**
   * Fetch `/robots.txt` from the origin of the given URL.
   *
   * @param {string} baseUrl - Any URL on the target site.
   * @returns {Promise<string|null>} The file contents, or null when the file
   *   is missing, the request fails, or the URL is invalid.
   */
  async fetchRobotsTxt(baseUrl) {
    try {
      const robotsUrl = new URL('/robots.txt', baseUrl).toString();
      const response = await this.fetchWithTimeout(robotsUrl);
      if (response.ok) {
        return await response.text();
      }
      await this.discardBody(response);
    } catch {
      // No robots.txt found
    }
    return null;
  }

  /**
   * Group URLs into coarse section categories.
   *
   * @param {string[]|object|null|undefined} urls - Flat URL list, or an object
   *   mapping a path to its URL list. Anything else yields empty categories.
   * @returns {object} Category name -> entries. Grouped input produces
   *   `{ path, urls }` entries; flat input produces the URL strings.
   */
  categorizeSections(urls) {
    const categories = {
      content: [],
      navigation: [],
      media: [],
      tools: [],
      documentation: [],
      other: []
    };

    if (Array.isArray(urls)) {
      // Handle flat URL list
      for (const url of urls) {
        try {
          const { pathname } = new URL(url);
          categories[this.categorizeSection(pathname)].push(url);
        } catch {
          categories.other.push(url);
        }
      }
    } else if (urls !== null && typeof urls === 'object') {
      // Handle grouped URLs (null is excluded: typeof null === 'object')
      for (const [path, urlList] of Object.entries(urls)) {
        categories[this.categorizeSection(path)].push({ path, urls: urlList });
      }
    }

    return categories;
  }

  /**
   * Map a URL path to a section category by substring match, checked in the
   * order content, navigation, media, tools, documentation.
   *
   * @param {string} path - URL path (or group key).
   * @returns {string} The matching category, or 'other'.
   */
  categorizeSection(path) {
    const rules = [
      ['content', ['/blog', '/news', '/articles', '/posts']],
      ['navigation', ['/about', '/contact', '/help', '/support']],
      ['media', ['/images', '/media', '/gallery', '/downloads']],
      ['tools', ['/tools', '/utilities', '/calculator', '/converter']],
      ['documentation', ['/docs', '/documentation', '/api', '/guide']]
    ];

    for (const [category, prefixes] of rules) {
      if (prefixes.some(p => path.includes(p))) {
        return category;
      }
    }
    return 'other';
  }

  /**
   * Extract navigation links from the first three crawled pages.
   *
   * Only `mainMenu` is populated; the other keys are returned empty. Links
   * are de-duplicated on (href, text) because nested navigation containers
   * and repeated page chrome would otherwise list each link several times.
   *
   * @param {Array<{content?: string}>} [pages] - Crawled pages; `content` is
   *   parsed as HTML.
   * @returns {{mainMenu: Array, breadcrumbs: Array, footer: Array, sideNav: Array}}
   */
  analyzeNavigation(pages) {
    const navigation = {
      mainMenu: [],
      breadcrumbs: [],
      footer: [],
      sideNav: []
    };

    if (!pages || pages.length === 0) {
      return navigation;
    }

    const seenLinks = new Set();

    // Analyze first few pages for common navigation patterns
    for (const page of pages.slice(0, 3)) {
      if (!page.content) {
        continue;
      }
      const $ = load(page.content);

      // Extract main navigation
      $('nav, .nav, #nav, .navigation, .menu').each((_, element) => {
        $(element).find('a').each((__, link) => {
          const href = $(link).attr('href');
          const text = $(link).text().trim();
          if (!href || !text) {
            return;
          }
          const key = `${href}\n${text}`;
          if (seenLinks.has(key)) {
            return;
          }
          seenLinks.add(key);
          navigation.mainMenu.push({ href, text });
        });
      });
    }

    return navigation;
  }

  /**
   * Index URLs by path depth and by every ancestor path.
   *
   * @param {string[]|object|null|undefined} urls - Flat URL list or an object
   *   whose values are URL lists. Invalid URLs are skipped.
   * @returns {{depth: object, paths: object}} `depth[n]` lists URLs with n
   *   path segments; `paths['/a/b']` lists URLs under that path.
   */
  buildHierarchy(urls) {
    const hierarchy = { depth: {}, paths: {} };

    let urlArray = [];
    if (Array.isArray(urls)) {
      urlArray = urls;
    } else if (urls !== null && typeof urls === 'object') {
      urlArray = Object.values(urls).flat();
    }

    for (const url of urlArray) {
      try {
        const pathSegments = new URL(url).pathname.split('/').filter(s => s);
        const depth = pathSegments.length;

        if (!hierarchy.depth[depth]) {
          hierarchy.depth[depth] = [];
        }
        hierarchy.depth[depth].push(url);

        // Build path hierarchy
        let currentPath = '';
        for (const segment of pathSegments) {
          currentPath += '/' + segment;
          if (!hierarchy.paths[currentPath]) {
            hierarchy.paths[currentPath] = [];
          }
          hierarchy.paths[currentPath].push(url);
        }
      } catch {
        // Skip invalid URLs
      }
    }

    return hierarchy;
  }

  /**
   * Fetch a page and classify it with simple heuristics.
   *
   * The file-extension check runs before the body is read, so media and
   * document responses are not downloaded and parsed as HTML.
   *
   * @param {string} url - Page URL.
   * @returns {Promise<object>} `{ category, type, confidence }` plus optional
   *   `metadata` or `error`. Never rejects; failures yield category 'other'.
   */
  async classifyPage(url) {
    try {
      const response = await this.fetchWithTimeout(url, { timeout: 5000 });
      if (!response.ok) {
        await this.discardBody(response);
        return { category: 'other', type: 'inaccessible', confidence: 1.0 };
      }

      // Check file extensions for media/documents
      const extension = new URL(url).pathname.split('.').pop().toLowerCase();
      const mediaExts = ['jpg', 'jpeg', 'png', 'gif', 'svg', 'mp4', 'mp3'];
      const docExts = ['pdf', 'doc', 'docx', 'txt', 'csv', 'xml'];

      if (mediaExts.includes(extension)) {
        await this.discardBody(response);
        return { category: 'media', type: extension, confidence: 1.0 };
      }
      if (docExts.includes(extension)) {
        await this.discardBody(response);
        return { category: 'documents', type: extension, confidence: 1.0 };
      }

      const contentType = response.headers.get('content-type') || '';
      const html = await response.text();
      const $ = load(html);

      // Check for forms
      const formCount = $('form').length;
      if (formCount > 0) {
        return {
          category: 'forms',
          type: 'interactive',
          confidence: 0.9,
          metadata: { formCount }
        };
      }

      // Check for login/auth indicators
      if (html.includes('login') || html.includes('password') || $('input[type="password"]').length > 0) {
        return { category: 'restricted', type: 'authentication', confidence: 0.8 };
      }

      // Check for dynamic content indicators
      if (html.includes('application/json') || contentType.includes('json')) {
        return { category: 'dynamic', type: 'api', confidence: 0.9 };
      }

      // Default to public static content
      return {
        category: 'public',
        type: 'static',
        confidence: 0.7,
        metadata: {
          title: $('title').text().trim(),
          contentLength: html.length
        }
      };
    } catch (error) {
      return {
        category: 'other',
        type: 'error',
        confidence: 1.0,
        error: error.message
      };
    }
  }

  /**
   * Guess the kind of API behind a URL from its path and content type.
   *
   * Only the path is inspected, so a host such as `capital.example.com`
   * is not mistaken for an API because it contains "api". RSS is tested
   * before the generic JSON/XML content types, which would otherwise always
   * win for feeds.
   *
   * @param {string} url - Endpoint URL (absolute, or a bare path).
   * @param {string} contentType - Value of the content-type header.
   * @returns {string} 'GraphQL', 'REST', 'RSS Feed', 'JSON API', 'XML API'
   *   or 'Unknown API'.
   */
  determineAPIType(url, contentType) {
    let target = String(url);
    try {
      target = new URL(target).pathname;
    } catch {
      // Not an absolute URL: match against the raw string
    }
    target = target.toLowerCase();
    const type = String(contentType || '').toLowerCase();

    if (target.includes('graphql')) return 'GraphQL';
    if (target.includes('rest') || target.includes('api')) return 'REST';
    if (target.includes('rss') || type.includes('rss')) return 'RSS Feed';
    if (type.includes('json')) return 'JSON API';
    if (type.includes('xml')) return 'XML API';
    return 'Unknown API';
  }

  /**
   * Label a sensitive path with the kind of area it represents.
   *
   * Every path probed by analyzeSecurity maps to a specific label.
   *
   * @param {string} path - Sensitive path such as '/admin'.
   * @returns {string} 'admin', 'authentication', 'user_area', 'private',
   *   'configuration', or 'sensitive' when nothing matches.
   */
  classifySecurityArea(path) {
    const rules = [
      ['admin', ['admin', 'cms']],
      ['authentication', ['login', 'signin', 'auth']],
      ['user_area', ['user', 'account', 'profile', 'dashboard']],
      ['private', ['private', 'internal', 'secure']],
      ['configuration', ['config', 'settings', 'env']]
    ];

    for (const [label, keywords] of rules) {
      if (keywords.some(keyword => path.includes(keyword))) {
        return label;
      }
    }
    return 'sensitive';
  }

  /**
   * Pick the security-relevant headers out of a response.
   *
   * @param {Headers} headers - Response headers (anything with `get`).
   * @returns {object} Lower-case header name -> value, present headers only.
   */
  analyzeSecurityHeaders(headers) {
    const importantHeaders = [
      'x-frame-options',
      'x-content-type-options',
      'x-xss-protection',
      'strict-transport-security',
      'content-security-policy',
      'x-robots-tag'
    ];

    const securityHeaders = {};
    for (const header of importantHeaders) {
      const value = headers.get(header);
      if (value) {
        securityHeaders[header] = value;
      }
    }
    return securityHeaders;
  }

  /**
   * Explain the rate-limit recommendation for an average response time.
   *
   * @param {number} avgResponseTime - Average response time in milliseconds.
   * @returns {string} Human-readable reasoning.
   */
  generateRateLimitReasoning(avgResponseTime) {
    if (avgResponseTime > 2000) {
      return 'High response times suggest limited server capacity. Conservative rate limiting recommended.';
    }
    if (avgResponseTime > 1000) {
      return 'Moderate response times indicate standard server capacity. Moderate rate limiting appropriate.';
    }
    return 'Fast response times suggest good server capacity. Higher rate limits may be acceptable.';
  }

  /**
   * Crawling guidance based on robots.txt presence and detected sensitive areas.
   *
   * @returns {object} Guidelines with a `recommendations` list.
   */
  generateCrawlingGuidelines() {
    const guidelines = {
      allowed: true,
      respectRobots: true,
      recommendations: []
    };

    if (this.analysis.structure?.robotsTxt) {
      guidelines.robotsTxtFound = true;
      guidelines.recommendations.push('Follow robots.txt directives');
    }

    if (this.analysis.securityAreas && this.analysis.securityAreas.length > 0) {
      guidelines.recommendations.push('Avoid crawling administrative and user-specific areas');
    }

    return guidelines;
  }

  /**
   * API guidance based on how many endpoints were detected.
   *
   * @returns {{endpoints: number, recommendations: string[]}}
   */
  generateAPIGuidelines() {
    const apiCount = this.analysis.apis ? this.analysis.apis.length : 0;
    const recommendations = apiCount > 0
      ? ['Use APIs when available instead of scraping', 'Check API documentation for rate limits']
      : ['No public APIs detected', 'Web scraping may be the only option'];

    return { endpoints: apiCount, recommendations };
  }

  /**
   * Rate-limit guidance, falling back to conservative defaults when the
   * rate-limit phase produced no data.
   *
   * @returns {{delay: number, maxConcurrency: number, requestsPerMinute: number, reasoning: string}}
   */
  generateRateLimitGuidelines() {
    const rateLimit = this.analysis.rateLimit || {};
    return {
      delay: rateLimit.recommendedDelay || 1000,
      maxConcurrency: rateLimit.maxConcurrency || 5,
      requestsPerMinute: rateLimit.recommendedRPM || 30,
      reasoning: rateLimit.reasoning || 'Default conservative rate limiting applied'
    };
  }

  /**
   * Content guidance summarising how many pages were classified.
   *
   * @returns {object} Totals plus a fixed `recommendations` list.
   */
  generateContentGuidelines() {
    const contentTypes = this.analysis.contentTypes || {};
    const totalContent = Object.values(contentTypes).reduce(
      (sum, arr) => sum + (arr ? arr.length : 0), 0
    );

    return {
      totalPagesAnalyzed: totalContent,
      publicContent: contentTypes.public ? contentTypes.public.length : 0,
      restrictedContent: contentTypes.restricted ? contentTypes.restricted.length : 0,
      recommendations: [
        'Focus on public content areas',
        'Respect form submissions and user data',
        'Avoid restricted and private sections'
      ]
    };
  }

  /**
   * Security guidance with the number of sensitive areas detected.
   *
   * @returns {{sensitiveAreas: number, recommendations: string[]}}
   */
  generateSecurityGuidelines() {
    const securityAreas = this.analysis.securityAreas || [];
    return {
      sensitiveAreas: securityAreas.length,
      recommendations: [
        'Do not attempt to access administrative areas',
        'Respect authentication requirements',
        'Avoid sensitive paths and user data'
      ]
    };
  }

  /**
   * Static data-protection and ethics guidance.
   *
   * @returns {{dataProtection: string[], ethical: string[]}}
   */
  generateComplianceGuidelines() {
    return {
      dataProtection: [
        'Respect user privacy and data protection laws',
        'Do not collect personal information',
        'Follow GDPR, CCPA, and other applicable regulations'
      ],
      ethical: [
        'Use data responsibly and ethically',
        'Respect website terms of service',
        'Credit sources appropriately'
      ]
    };
  }
}
|
|
752
|
+
|
|
753
|
+
// Default export mirrors the named LLMsTxtAnalyzer export above.
export default LLMsTxtAnalyzer;
|