crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,753 @@
1
+ import { z } from 'zod';
2
+ import { load } from 'cheerio';
3
+ import { MapSiteTool } from '../tools/crawl/mapSite.js';
4
+ import { CrawlDeepTool } from '../tools/crawl/crawlDeep.js';
5
+ import { normalizeUrl, getBaseUrl } from '../utils/urlNormalizer.js';
6
+ import { Logger } from '../utils/Logger.js';
7
+
8
+ const logger = new Logger('LLMsTxtAnalyzer');
9
+
10
+ /**
11
+ * LLMsTxtAnalyzer - Comprehensive website analysis for LLMs.txt generation
12
+ *
13
+ * This analyzer performs deep website analysis to understand:
14
+ * - Site structure and navigation patterns
15
+ * - API endpoints and data sources
16
+ * - Content types and classification
17
+ * - Security boundaries and sensitive areas
18
+ * - Rate limiting recommendations
19
+ * - Usage guidelines for AI models
20
+ */
21
export class LLMsTxtAnalyzer {
  /**
   * Create an analyzer.
   *
   * @param {object} [options]
   * @param {number} [options.maxDepth=3] - Crawl depth (the crawl itself is capped at 3).
   * @param {number} [options.maxPages=100] - Max URLs to map (the crawl itself is capped at 50).
   * @param {number} [options.timeout=30000] - Default request timeout in milliseconds.
   * @param {string} [options.userAgent='LLMs.txt-Analyzer/1.0'] - User-Agent header value.
   * @param {boolean} [options.respectRobots=true] - Forwarded to the crawl tool.
   * @param {boolean} [options.detectAPIs=true] - Run the API detection phase.
   * @param {boolean} [options.analyzeContent=true] - Run the content classification phase.
   * @param {boolean} [options.checkSecurity=true] - Run the security analysis phase.
   */
  constructor(options = {}) {
    this.options = {
      // Extra caller keys are preserved, but spread FIRST so the normalized
      // values below always win. Spreading last let an explicit `undefined`
      // (e.g. `{ detectAPIs: undefined }`) silently replace a default.
      ...options,
      maxDepth: options.maxDepth ?? 3,
      maxPages: options.maxPages ?? 100,
      timeout: options.timeout ?? 30000,
      userAgent: options.userAgent ?? 'LLMs.txt-Analyzer/1.0',
      respectRobots: options.respectRobots ?? true,
      detectAPIs: options.detectAPIs ?? true,
      analyzeContent: options.analyzeContent ?? true,
      checkSecurity: options.checkSecurity ?? true
    };

    this.mapSiteTool = new MapSiteTool({
      timeout: this.options.timeout,
      userAgent: this.options.userAgent
    });

    this.crawlDeepTool = new CrawlDeepTool({
      timeout: this.options.timeout,
      userAgent: this.options.userAgent
    });

    this.analysis = this.createEmptyAnalysis();
  }

  /**
   * Build a blank result object with every section present.
   *
   * @returns {object} Fresh analysis state.
   */
  createEmptyAnalysis() {
    return {
      structure: {},
      apis: [],
      contentTypes: {},
      securityAreas: [],
      rateLimit: {},
      guidelines: {},
      metadata: {},
      errors: []
    };
  }

  /**
   * Perform comprehensive website analysis.
   *
   * Runs the phases in order (structure, APIs, content, security, rate
   * limiting, guidelines). Each phase records its own failures in
   * `analysis.errors`; only failures outside a phase's own handler reach the
   * catch block here and are re-thrown.
   *
   * Results live on `this.analysis`, so concurrent calls on the same instance
   * would overwrite each other.
   *
   * @param {string} url - Absolute URL of the site to analyze.
   * @param {object} [options] - Recorded in `metadata.analysisOptions` and
   *   forwarded to `analyzeSiteStructure`.
   * @returns {Promise<object>} The populated analysis object.
   * @throws {Error} Re-throws any error that escapes the individual phases.
   */
  async analyzeWebsite(url, options = {}) {
    const startTime = Date.now();
    logger.info(`Starting comprehensive analysis for: ${url}`);

    // Start from a clean slate: otherwise a second run inherits the previous
    // run's errors/sections and mutates the object returned earlier.
    this.analysis = this.createEmptyAnalysis();

    try {
      const baseUrl = getBaseUrl(url);
      this.analysis.metadata = {
        baseUrl,
        analyzedAt: new Date().toISOString(),
        analyzer: 'LLMs.txt-Analyzer/1.0',
        analysisOptions: { ...this.options, ...options }
      };

      // Phase 1: Site Structure Analysis
      await this.analyzeSiteStructure(url, options);

      // Phase 2: API Detection
      if (this.options.detectAPIs) {
        await this.detectAPIEndpoints(url);
      }

      // Phase 3: Content Classification
      if (this.options.analyzeContent) {
        await this.classifyContent();
      }

      // Phase 4: Security Analysis
      if (this.options.checkSecurity) {
        await this.analyzeSecurity(url);
      }

      // Phase 5: Rate Limiting Analysis
      await this.analyzeRateLimiting(url);

      // Phase 6: Generate Guidelines
      await this.generateUsageGuidelines();

      const analysisTime = Date.now() - startTime;
      this.analysis.metadata.analysisTimeMs = analysisTime;

      logger.info(`Analysis completed in ${analysisTime}ms`);
      return this.analysis;

    } catch (error) {
      logger.error(`Analysis failed: ${error.message}`);
      this.analysis.errors.push({
        phase: 'general',
        error: error.message,
        timestamp: new Date().toISOString()
      });
      throw error;
    }
  }

  /**
   * Analyze website structure using site mapping and crawling.
   *
   * Failures are recorded under phase `'structure'` and not re-thrown.
   *
   * @param {string} url - Absolute URL of the site.
   * @param {object} [options] - Accepted for API compatibility; not consulted here.
   * @returns {Promise<void>}
   */
  async analyzeSiteStructure(url, options = {}) {
    logger.info('Analyzing site structure...');

    try {
      // Get comprehensive site map
      const siteMap = await this.mapSiteTool.execute({
        url,
        include_sitemap: true,
        max_urls: this.options.maxPages,
        group_by_path: true,
        include_metadata: true
      });

      // Perform targeted crawl for deeper analysis
      const crawlResult = await this.crawlDeepTool.execute({
        url,
        max_depth: Math.min(this.options.maxDepth, 3),
        max_pages: Math.min(this.options.maxPages, 50),
        extract_content: true,
        respect_robots: this.options.respectRobots
      });

      this.analysis.structure = {
        siteMap: siteMap.site_map,
        totalPages: siteMap.total_urls,
        sections: this.categorizeSections(siteMap.urls),
        navigation: this.analyzeNavigation(crawlResult.pages),
        hierarchy: this.buildHierarchy(siteMap.urls),
        robotsTxt: await this.fetchRobotsTxt(url),
        sitemap: siteMap.urls || []
      };

      logger.info(`Analyzed ${siteMap.total_urls} pages in site structure`);

    } catch (error) {
      logger.error(`Site structure analysis failed: ${error.message}`);
      this.analysis.errors.push({
        phase: 'structure',
        error: error.message,
        timestamp: new Date().toISOString()
      });
    }
  }

  /**
   * Detect API endpoints and API documentation links.
   *
   * Probes a fixed list of well-known paths at the site root, then scans the
   * page at `baseUrl` for links whose text mentions "api" or "developer".
   * Failures are recorded under phase `'apis'` and not re-thrown.
   *
   * @param {string} baseUrl - Absolute URL of the site; probe paths are
   *   resolved against its origin.
   * @returns {Promise<void>}
   */
  async detectAPIEndpoints(baseUrl) {
    logger.info('Detecting API endpoints...');

    try {
      const apis = [];
      const commonPaths = [
        '/api', '/v1', '/v2', '/v3', '/rest', '/graphql',
        '/data', '/feed', '/json', '/xml', '/rss',
        '/.well-known', '/openapi', '/swagger'
      ];

      // Check common API paths
      for (const path of commonPaths) {
        try {
          // URL resolution avoids "//api" for a trailing-slash base and
          // "/some/page/api" when the input URL is not the site root.
          const apiUrl = new URL(path, baseUrl).toString();
          const response = await this.fetchWithTimeout(apiUrl, { timeout: 5000 });
          if (response.ok) {
            const contentType = response.headers.get('content-type') || '';
            apis.push({
              url: apiUrl,
              type: this.determineAPIType(apiUrl, contentType),
              status: response.status,
              contentType,
              accessible: true
            });
          }
          await this.discardBody(response);
        } catch {
          // API endpoint not accessible or doesn't exist
        }
      }

      // Publish probe results now so a failure while scanning the main page
      // below cannot discard them (the array is shared, later pushes show up).
      this.analysis.apis = apis;

      // Look for API documentation references
      const mainPageResponse = await this.fetchWithTimeout(baseUrl);
      if (mainPageResponse.ok) {
        const html = await mainPageResponse.text();
        const $ = load(html);
        const seenDocUrls = new Set();

        // Find API documentation links
        $('a[href*="api"], a[href*="developer"], a[href*="docs"]').each((_, element) => {
          const href = $(element).attr('href');
          const text = $(element).text().toLowerCase();
          if (!href || !(text.includes('api') || text.includes('developer'))) {
            return;
          }

          let resolved;
          try {
            resolved = new URL(href, baseUrl);
          } catch {
            // A malformed href must not abort the whole detection phase.
            return;
          }
          // Skip mailto:, javascript:, etc. -- they are not documentation pages.
          if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') {
            return;
          }

          const docUrl = resolved.toString();
          if (seenDocUrls.has(docUrl)) {
            return; // same link repeated in header/footer
          }
          seenDocUrls.add(docUrl);

          apis.push({
            url: docUrl,
            type: 'documentation',
            description: text.trim()
          });
        });
      } else {
        await this.discardBody(mainPageResponse);
      }

      logger.info(`Detected ${apis.length} API endpoints`);

    } catch (error) {
      logger.error(`API detection failed: ${error.message}`);
      this.analysis.errors.push({
        phase: 'apis',
        error: error.message,
        timestamp: new Date().toISOString()
      });
    }
  }

  /**
   * Classify content types across the website.
   *
   * Classifies at most the first 20 URLs found by the structure phase.
   * Failures are recorded under phase `'content'` and not re-thrown.
   *
   * @returns {Promise<void>}
   */
  async classifyContent() {
    logger.info('Classifying content types...');

    try {
      const contentTypes = {
        public: [],
        restricted: [],
        dynamic: [],
        static: [],
        forms: [],
        media: [],
        documents: []
      };

      // Analyze pages from structure analysis
      const sitemapUrls = this.analysis.structure?.sitemap || [];
      const urlsToAnalyze = Array.isArray(sitemapUrls) ? sitemapUrls :
        (typeof sitemapUrls === 'object' ? Object.values(sitemapUrls).flat() : []);

      for (const url of urlsToAnalyze.slice(0, 20)) {
        try {
          const classification = await this.classifyPage(url);
          const bucket = contentTypes[classification.category];
          if (!bucket) {
            // classifyPage reports inaccessible/failed pages as 'other', which
            // has no bucket. Skip explicitly instead of relying on a TypeError.
            const reason = classification.error ? `: ${classification.error}` : '';
            logger.warn(`Skipping unclassifiable page ${url} (${classification.type})${reason}`);
            continue;
          }
          bucket.push({
            url,
            type: classification.type,
            confidence: classification.confidence,
            metadata: classification.metadata
          });
        } catch (error) {
          logger.warn(`Failed to classify page ${url}: ${error.message}`);
        }
      }

      this.analysis.contentTypes = contentTypes;
      logger.info('Content classification completed');

    } catch (error) {
      logger.error(`Content classification failed: ${error.message}`);
      this.analysis.errors.push({
        phase: 'content',
        error: error.message,
        timestamp: new Date().toISOString()
      });
    }
  }

  /**
   * Analyze security boundaries and sensitive areas.
   *
   * Probes a fixed list of sensitive paths at the site root and records those
   * answering 200, 302 or 401, then captures security-related response headers
   * of `baseUrl`. Failures are recorded under phase `'security'`.
   *
   * @param {string} baseUrl - Absolute URL of the site; probe paths are
   *   resolved against its origin.
   * @returns {Promise<void>}
   */
  async analyzeSecurity(baseUrl) {
    logger.info('Analyzing security boundaries...');

    try {
      const securityAreas = [];

      // Check for common sensitive paths
      const sensitivePaths = [
        '/admin', '/administrator', '/wp-admin', '/cms',
        '/login', '/signin', '/auth', '/oauth',
        '/user', '/account', '/profile', '/dashboard',
        '/private', '/internal', '/secure',
        '/config', '/settings', '/env'
      ];

      for (const path of sensitivePaths) {
        try {
          const testUrl = new URL(path, baseUrl).toString();
          const response = await this.fetchWithTimeout(testUrl, { timeout: 3000 });
          // NOTE(review): fetch follows redirects by default, so a 302 is only
          // observed here if the caller's fetch is configured otherwise.
          if (response.status === 200 || response.status === 302 || response.status === 401) {
            securityAreas.push({
              path,
              url: testUrl,
              status: response.status,
              type: this.classifySecurityArea(path),
              recommendation: 'restrict'
            });
          }
          await this.discardBody(response);
        } catch {
          // Area not accessible
        }
      }

      // Publish before the header check so a failure there keeps the probes.
      this.analysis.securityAreas = securityAreas;

      // Check for security headers
      const mainResponse = await this.fetchWithTimeout(baseUrl);
      const securityHeaders = this.analyzeSecurityHeaders(mainResponse.headers);
      await this.discardBody(mainResponse);

      this.analysis.securityHeaders = securityHeaders;
      logger.info(`Identified ${securityAreas.length} security areas`);

    } catch (error) {
      logger.error(`Security analysis failed: ${error.message}`);
      this.analysis.errors.push({
        phase: 'security',
        error: error.message,
        timestamp: new Date().toISOString()
      });
    }
  }

  /**
   * Analyze and recommend rate limiting.
   *
   * Issues five sequential requests to `baseUrl`, averages the time until the
   * response is available (failed requests count as 10000 ms) and derives
   * delay / concurrency / requests-per-minute recommendations from it.
   *
   * @param {string} baseUrl - URL to time.
   * @returns {Promise<void>}
   */
  async analyzeRateLimiting(baseUrl) {
    logger.info('Analyzing rate limiting requirements...');

    try {
      // Test response times and determine appropriate limits
      const testRequests = 5;
      const responseTimes = [];

      for (let i = 0; i < testRequests; i++) {
        const start = Date.now();
        try {
          const response = await this.fetchWithTimeout(baseUrl, { timeout: 10000 });
          responseTimes.push(Date.now() - start);
          // Only the timing matters; release the body outside the measurement.
          await this.discardBody(response);
        } catch {
          responseTimes.push(10000); // Max timeout
        }
      }

      const avgResponseTime = responseTimes.reduce((a, b) => a + b, 0) / responseTimes.length;

      this.analysis.rateLimit = {
        averageResponseTime: avgResponseTime,
        recommendedDelay: Math.max(100, Math.floor(avgResponseTime * 0.5)),
        maxConcurrency: avgResponseTime > 2000 ? 2 : (avgResponseTime > 1000 ? 5 : 10),
        recommendedRPM: avgResponseTime > 2000 ? 10 : (avgResponseTime > 1000 ? 30 : 60),
        reasoning: this.generateRateLimitReasoning(avgResponseTime)
      };

      logger.info(`Rate limiting analysis completed. Avg response: ${avgResponseTime}ms`);

    } catch (error) {
      logger.error(`Rate limiting analysis failed: ${error.message}`);
      this.analysis.errors.push({
        phase: 'rateLimit',
        error: error.message,
        timestamp: new Date().toISOString()
      });
    }
  }

  /**
   * Generate comprehensive usage guidelines from the sections gathered so far.
   *
   * Failures are recorded under phase `'guidelines'` and not re-thrown.
   *
   * @returns {Promise<void>}
   */
  async generateUsageGuidelines() {
    logger.info('Generating usage guidelines...');

    try {
      this.analysis.guidelines = {
        crawling: this.generateCrawlingGuidelines(),
        apis: this.generateAPIGuidelines(),
        rateLimit: this.generateRateLimitGuidelines(),
        content: this.generateContentGuidelines(),
        security: this.generateSecurityGuidelines(),
        compliance: this.generateComplianceGuidelines()
      };

      logger.info('Usage guidelines generated');

    } catch (error) {
      logger.error(`Guidelines generation failed: ${error.message}`);
      this.analysis.errors.push({
        phase: 'guidelines',
        error: error.message,
        timestamp: new Date().toISOString()
      });
    }
  }

  // Helper methods

  /**
   * `fetch` with an abort-based timeout and the analyzer's User-Agent.
   *
   * The timer is cleared once `fetch` settles, so reading the body afterwards
   * is not covered by the timeout.
   *
   * @param {string} url - URL to request.
   * @param {object} [options] - `timeout` (ms, defaults to `this.options.timeout`),
   *   optional `headers` and `signal`; all other keys are passed to `fetch`.
   * @returns {Promise<Response>} The fetch response.
   * @throws {Error} Whatever `fetch` rejects with, including the abort error on timeout.
   */
  async fetchWithTimeout(url, options = {}) {
    const {
      timeout = this.options.timeout,
      headers,
      signal: callerSignal,
      ...fetchOptions
    } = options;

    // Merge rather than replace: caller headers used to drop the User-Agent.
    // A caller-provided User-Agent still takes precedence.
    const requestHeaders = new Headers(headers);
    if (!requestHeaders.has('User-Agent')) {
      requestHeaders.set('User-Agent', this.options.userAgent);
    }

    const controller = new AbortController();
    // Keep honoring a caller signal without letting it replace the timeout signal.
    const forwardAbort = () => controller.abort(callerSignal.reason);
    if (callerSignal) {
      if (callerSignal.aborted) {
        forwardAbort();
      } else {
        callerSignal.addEventListener('abort', forwardAbort, { once: true });
      }
    }

    const timeoutId = setTimeout(() => controller.abort(), timeout);

    try {
      return await fetch(url, {
        ...fetchOptions,
        headers: requestHeaders,
        signal: controller.signal
      });
    } finally {
      clearTimeout(timeoutId);
      if (callerSignal) {
        callerSignal.removeEventListener('abort', forwardAbort);
      }
    }
  }

  /**
   * Cancel an unread response body so the connection can be released.
   * Never throws; a missing, locked or already-consumed body is ignored.
   *
   * @param {Response} response - Response whose body will not be read.
   * @returns {Promise<void>}
   */
  async discardBody(response) {
    try {
      await response?.body?.cancel();
    } catch {
      // Body already consumed or locked -- nothing to release.
    }
  }

  /**
   * Fetch the site's robots.txt.
   *
   * @param {string} baseUrl - Any absolute URL on the site; robots.txt is
   *   resolved against its origin.
   * @returns {Promise<string|null>} File contents, or null when unavailable.
   */
  async fetchRobotsTxt(baseUrl) {
    try {
      // robots.txt lives at the origin root, whatever path the input URL has.
      const robotsUrl = new URL('/robots.txt', baseUrl).toString();
      const response = await this.fetchWithTimeout(robotsUrl);
      if (response.ok) {
        return await response.text();
      }
      await this.discardBody(response);
    } catch {
      // No robots.txt found
    }
    return null;
  }

  /**
   * Group URLs into coarse site sections.
   *
   * @param {string[]|Object<string, string[]>|null|undefined} urls - Flat URL
   *   list, or an object keyed by path whose values are URL lists.
   * @returns {{content: Array, navigation: Array, media: Array, tools: Array,
   *   documentation: Array, other: Array}} URLs (flat input) or
   *   `{ path, urls }` entries (grouped input) per category.
   */
  categorizeSections(urls) {
    const categories = {
      content: [],
      navigation: [],
      media: [],
      tools: [],
      documentation: [],
      other: []
    };

    if (Array.isArray(urls)) {
      // Handle flat URL list
      for (const url of urls) {
        try {
          const { pathname } = new URL(url);
          categories[this.categorizeSection(pathname)].push(url);
        } catch {
          categories.other.push(url);
        }
      }
    } else if (urls !== null && typeof urls === 'object') {
      // Handle grouped URLs (`typeof null` is 'object', hence the null guard)
      for (const [path, urlList] of Object.entries(urls)) {
        categories[this.categorizeSection(path)].push({ path, urls: urlList });
      }
    }

    return categories;
  }

  /**
   * Map a URL path to a section category by substring match.
   * The first matching group wins, in the order listed below.
   *
   * @param {string} path - URL pathname.
   * @returns {'content'|'navigation'|'media'|'tools'|'documentation'|'other'}
   */
  categorizeSection(path) {
    const contentPaths = ['/blog', '/news', '/articles', '/posts'];
    const navPaths = ['/about', '/contact', '/help', '/support'];
    const mediaPaths = ['/images', '/media', '/gallery', '/downloads'];
    const toolPaths = ['/tools', '/utilities', '/calculator', '/converter'];
    const docPaths = ['/docs', '/documentation', '/api', '/guide'];

    if (contentPaths.some(p => path.includes(p))) return 'content';
    if (navPaths.some(p => path.includes(p))) return 'navigation';
    if (mediaPaths.some(p => path.includes(p))) return 'media';
    if (toolPaths.some(p => path.includes(p))) return 'tools';
    if (docPaths.some(p => path.includes(p))) return 'documentation';

    return 'other';
  }

  /**
   * Extract navigation links from the first three crawled pages.
   * Only `mainMenu` is populated; the other lists are returned empty.
   *
   * @param {Array<{content?: string}>} pages - Crawled pages; `content` is
   *   parsed as HTML.
   * @returns {{mainMenu: Array<{href: string, text: string}>, breadcrumbs: Array,
   *   footer: Array, sideNav: Array}}
   */
  analyzeNavigation(pages) {
    const navigation = {
      mainMenu: [],
      breadcrumbs: [],
      footer: [],
      sideNav: []
    };

    if (!pages || pages.length === 0) {
      return navigation;
    }

    // The same menu appears on every page and nested containers (e.g.
    // `nav .menu`) match the selector twice, so de-duplicate href+text pairs.
    const seenLinks = new Set();

    // Analyze first few pages for common navigation patterns
    for (const page of pages.slice(0, 3)) {
      if (!page.content) {
        continue;
      }
      const $ = load(page.content);

      // Extract main navigation
      $('nav, .nav, #nav, .navigation, .menu').each((_, element) => {
        $(element).find('a').each((__, link) => {
          const href = $(link).attr('href');
          const text = $(link).text().trim();
          if (!href || !text) {
            return;
          }
          const key = `${href}\u0000${text}`;
          if (seenLinks.has(key)) {
            return;
          }
          seenLinks.add(key);
          navigation.mainMenu.push({ href, text });
        });
      });
    }

    return navigation;
  }

  /**
   * Index URLs by path depth and by every path prefix.
   *
   * @param {string[]|Object<string, string[]>|null|undefined} urls - Flat URL
   *   list or grouped object; invalid URLs are skipped.
   * @returns {{depth: Object<number, string[]>, paths: Object<string, string[]>}}
   */
  buildHierarchy(urls) {
    const hierarchy = { depth: {}, paths: {} };

    let urlArray = [];
    if (Array.isArray(urls)) {
      urlArray = urls;
    } else if (urls !== null && typeof urls === 'object') {
      // Null guard: `typeof null` is 'object' and Object.values(null) throws.
      urlArray = Object.values(urls).flat();
    }

    for (const url of urlArray) {
      try {
        const pathSegments = new URL(url).pathname.split('/').filter(s => s);
        const depth = pathSegments.length;

        if (!hierarchy.depth[depth]) {
          hierarchy.depth[depth] = [];
        }
        hierarchy.depth[depth].push(url);

        // Build path hierarchy
        let currentPath = '';
        for (const segment of pathSegments) {
          currentPath += '/' + segment;
          if (!hierarchy.paths[currentPath]) {
            hierarchy.paths[currentPath] = [];
          }
          hierarchy.paths[currentPath].push(url);
        }
      } catch {
        // Skip invalid URLs
      }
    }

    return hierarchy;
  }

  /**
   * Fetch a page and classify it heuristically.
   *
   * Checks that need no body (file extension, JSON content type) run first so
   * binary or JSON responses are not parsed as HTML. Never throws: failures
   * are returned as `{ category: 'other', type: 'error', ... }`.
   *
   * @param {string} url - Absolute page URL.
   * @returns {Promise<{category: string, type: string, confidence: number,
   *   metadata?: object, error?: string}>}
   */
  async classifyPage(url) {
    try {
      const response = await this.fetchWithTimeout(url, { timeout: 5000 });
      if (!response.ok) {
        await this.discardBody(response);
        return { category: 'other', type: 'inaccessible', confidence: 1.0 };
      }

      const contentType = response.headers.get('content-type') || '';

      // Check file extensions for media/documents
      const extension = new URL(url).pathname.split('.').pop().toLowerCase();
      const mediaExts = ['jpg', 'jpeg', 'png', 'gif', 'svg', 'mp4', 'mp3'];
      const docExts = ['pdf', 'doc', 'docx', 'txt', 'csv', 'xml'];

      if (mediaExts.includes(extension)) {
        await this.discardBody(response);
        return { category: 'media', type: extension, confidence: 1.0 };
      }
      if (docExts.includes(extension)) {
        await this.discardBody(response);
        return { category: 'documents', type: extension, confidence: 1.0 };
      }

      // A JSON response is an API payload, not an HTML page.
      if (contentType.includes('json')) {
        await this.discardBody(response);
        return { category: 'dynamic', type: 'api', confidence: 0.9 };
      }

      const html = await response.text();
      const $ = load(html);

      // A password field is the strongest auth signal; it must be tested
      // before the generic form check, which would otherwise shadow it.
      if ($('input[type="password"]').length > 0) {
        return { category: 'restricted', type: 'authentication', confidence: 0.8 };
      }

      // Check for forms
      const formCount = $('form').length;
      if (formCount > 0) {
        return {
          category: 'forms',
          type: 'interactive',
          confidence: 0.9,
          metadata: { formCount }
        };
      }

      // Check for login/auth indicators
      if (html.includes('login') || html.includes('password')) {
        return { category: 'restricted', type: 'authentication', confidence: 0.8 };
      }

      // Check for dynamic content indicators
      if (html.includes('application/json')) {
        return { category: 'dynamic', type: 'api', confidence: 0.9 };
      }

      // Default to public static content
      return {
        category: 'public',
        type: 'static',
        confidence: 0.7,
        metadata: {
          title: $('title').text().trim(),
          contentLength: html.length
        }
      };

    } catch (error) {
      return {
        category: 'other',
        type: 'error',
        confidence: 1.0,
        error: error.message
      };
    }
  }

  /**
   * Label a detected endpoint from its URL and content type.
   *
   * @param {string} url - Endpoint URL.
   * @param {string} contentType - Response content-type header ('' if absent).
   * @returns {string} 'GraphQL', 'REST', 'JSON API', 'RSS Feed', 'XML API' or 'Unknown API'.
   */
  determineAPIType(url, contentType) {
    if (url.includes('graphql')) return 'GraphQL';
    if (url.includes('rest') || url.includes('api')) return 'REST';
    if (contentType.includes('json')) return 'JSON API';
    // RSS must be tested before the generic XML check: feeds are served as
    // XML, which previously made the 'RSS Feed' label unreachable for them.
    if (contentType.includes('rss') || url.includes('rss')) return 'RSS Feed';
    if (contentType.includes('xml')) return 'XML API';
    return 'Unknown API';
  }

  /**
   * Label a sensitive path by substring match. The groups mirror the probe
   * list used by `analyzeSecurity`.
   *
   * @param {string} path - URL path such as '/admin'.
   * @returns {'admin'|'authentication'|'user_area'|'private'|'configuration'|'sensitive'}
   */
  classifySecurityArea(path) {
    const hasAny = (...needles) => needles.some(needle => path.includes(needle));

    if (hasAny('admin', 'cms')) return 'admin';
    if (hasAny('login', 'signin', 'auth')) return 'authentication';
    if (hasAny('user', 'account', 'profile', 'dashboard')) return 'user_area';
    if (hasAny('private', 'internal', 'secure')) return 'private';
    if (hasAny('config', 'settings', 'env')) return 'configuration';
    return 'sensitive';
  }

  /**
   * Pick the security-related headers that are present on a response.
   *
   * @param {Headers} headers - Response headers (anything with `get(name)`).
   * @returns {Object<string, string>} Header name to value, present headers only.
   */
  analyzeSecurityHeaders(headers) {
    const securityHeaders = {};
    const importantHeaders = [
      'x-frame-options',
      'x-content-type-options',
      'x-xss-protection',
      'strict-transport-security',
      'content-security-policy',
      'x-robots-tag'
    ];

    for (const header of importantHeaders) {
      const value = headers.get(header);
      if (value) {
        securityHeaders[header] = value;
      }
    }

    return securityHeaders;
  }

  /**
   * Explain the rate-limit recommendation for an average response time.
   *
   * @param {number} avgResponseTime - Average response time in milliseconds.
   * @returns {string} Human-readable reasoning.
   */
  generateRateLimitReasoning(avgResponseTime) {
    if (avgResponseTime > 2000) {
      return 'High response times suggest limited server capacity. Conservative rate limiting recommended.';
    }
    if (avgResponseTime > 1000) {
      return 'Moderate response times indicate standard server capacity. Moderate rate limiting appropriate.';
    }
    return 'Fast response times suggest good server capacity. Higher rate limits may be acceptable.';
  }

  /**
   * Crawling guidelines derived from robots.txt presence and detected
   * sensitive areas.
   *
   * @returns {{allowed: boolean, respectRobots: boolean, recommendations: string[],
   *   robotsTxtFound?: boolean}}
   */
  generateCrawlingGuidelines() {
    const guidelines = {
      allowed: true,
      respectRobots: true,
      recommendations: []
    };

    if (this.analysis.structure?.robotsTxt) {
      guidelines.robotsTxtFound = true;
      guidelines.recommendations.push('Follow robots.txt directives');
    }

    if (this.analysis.securityAreas && this.analysis.securityAreas.length > 0) {
      guidelines.recommendations.push('Avoid crawling administrative and user-specific areas');
    }

    return guidelines;
  }

  /**
   * API guidelines based on how many endpoints were detected.
   *
   * @returns {{endpoints: number, recommendations: string[]}}
   */
  generateAPIGuidelines() {
    const apiCount = this.analysis.apis ? this.analysis.apis.length : 0;
    return {
      endpoints: apiCount,
      recommendations: apiCount > 0 ?
        ['Use APIs when available instead of scraping', 'Check API documentation for rate limits'] :
        ['No public APIs detected', 'Web scraping may be the only option']
    };
  }

  /**
   * Rate-limit guidelines, using fixed fallbacks when the rate-limit phase
   * produced no values.
   *
   * @returns {{delay: number, maxConcurrency: number, requestsPerMinute: number,
   *   reasoning: string}}
   */
  generateRateLimitGuidelines() {
    const rateLimit = this.analysis.rateLimit || {};
    return {
      delay: rateLimit.recommendedDelay ?? 1000,
      maxConcurrency: rateLimit.maxConcurrency ?? 5,
      requestsPerMinute: rateLimit.recommendedRPM ?? 30,
      reasoning: rateLimit.reasoning ?? 'Default conservative rate limiting applied'
    };
  }

  /**
   * Content guidelines with counts taken from the classification phase.
   *
   * @returns {{totalPagesAnalyzed: number, publicContent: number,
   *   restrictedContent: number, recommendations: string[]}}
   */
  generateContentGuidelines() {
    const contentTypes = this.analysis.contentTypes || {};
    const totalContent = Object.values(contentTypes).reduce(
      (sum, entries) => sum + (entries ? entries.length : 0), 0
    );

    return {
      totalPagesAnalyzed: totalContent,
      publicContent: contentTypes.public ? contentTypes.public.length : 0,
      restrictedContent: contentTypes.restricted ? contentTypes.restricted.length : 0,
      recommendations: [
        'Focus on public content areas',
        'Respect form submissions and user data',
        'Avoid restricted and private sections'
      ]
    };
  }

  /**
   * Security guidelines with the number of detected sensitive areas.
   *
   * @returns {{sensitiveAreas: number, recommendations: string[]}}
   */
  generateSecurityGuidelines() {
    const securityAreas = this.analysis.securityAreas || [];
    return {
      sensitiveAreas: securityAreas.length,
      recommendations: [
        'Do not attempt to access administrative areas',
        'Respect authentication requirements',
        'Avoid sensitive paths and user data'
      ]
    };
  }

  /**
   * Static data-protection and ethics guidelines.
   *
   * @returns {{dataProtection: string[], ethical: string[]}}
   */
  generateComplianceGuidelines() {
    return {
      dataProtection: [
        'Respect user privacy and data protection laws',
        'Do not collect personal information',
        'Follow GDPR, CCPA, and other applicable regulations'
      ],
      ethical: [
        'Use data responsibly and ethically',
        'Respect website terms of service',
        'Credit sources appropriately'
      ]
    };
  }
}
752
+
753
+ export default LLMsTxtAnalyzer;