crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75):
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,570 @@
1
+ import { z } from 'zod';
2
+ import { LLMsTxtAnalyzer } from '../../core/LLMsTxtAnalyzer.js';
3
+ import { Logger } from '../../utils/Logger.js';
4
+ import { getBaseUrl } from '../../utils/urlNormalizer.js';
5
+
// Module-scoped logger; every message from this tool is tagged 'GenerateLLMsTxtTool'.
const logger = new Logger('GenerateLLMsTxtTool');

/**
 * Validation schema for the parameters accepted by GenerateLLMsTxtTool.execute().
 *
 * Only `url` is required; every other field is optional and falls back to the
 * default declared below.
 *
 * - `analysisOptions` is handed to the analyzer unchanged.
 * - `outputOptions` controls what is written into the generated files.
 * - `complianceLevel` selects the list of standard `Disallow` patterns.
 * - `format` selects which of the two files are produced.
 *
 * `maxDepth` and `maxPages` are counts, so they are constrained to integers;
 * a fractional depth or page budget has no meaningful interpretation.
 *
 * NOTE(review): `outputOptions.includeDetailed` is validated here but is not read
 * anywhere in this file; `format` alone decides whether llms-full.txt is
 * generated. Confirm whether the flag is still needed.
 */
const GenerateLLMsTxtSchema = z.object({
  url: z.string().url().describe('The website URL to analyze and generate LLMs.txt for'),

  analysisOptions: z.object({
    maxDepth: z.number().int().min(1).max(5).optional().default(3).describe('Maximum crawl depth for analysis'),
    maxPages: z.number().int().min(10).max(500).optional().default(100).describe('Maximum pages to analyze'),
    detectAPIs: z.boolean().optional().default(true).describe('Whether to detect API endpoints'),
    analyzeContent: z.boolean().optional().default(true).describe('Whether to analyze content types'),
    checkSecurity: z.boolean().optional().default(true).describe('Whether to check security boundaries'),
    respectRobots: z.boolean().optional().default(true).describe('Whether to respect robots.txt')
  }).optional().default({}),

  outputOptions: z.object({
    includeDetailed: z.boolean().optional().default(true).describe('Generate detailed LLMs-full.txt'),
    includeAnalysis: z.boolean().optional().default(false).describe('Include raw analysis data'),
    contactEmail: z.string().email().optional().describe('Contact email for the LLMs.txt'),
    organizationName: z.string().optional().describe('Organization name'),
    customGuidelines: z.array(z.string()).optional().describe('Additional custom guidelines'),
    customRestrictions: z.array(z.string()).optional().describe('Additional restrictions')
  }).optional().default({}),

  complianceLevel: z.enum(['basic', 'standard', 'strict']).optional().default('standard').describe('Compliance level for generated guidelines'),

  format: z.enum(['both', 'llms-txt', 'llms-full-txt']).optional().default('both').describe('Which files to generate')
});
33
+
/**
 * Collapse a caller-supplied value onto one line.
 *
 * llms.txt is line-oriented: a value containing a line break would otherwise end
 * the current comment/directive and start a new, unintended one.
 *
 * @param {*} value - Value to render (coerced with String()).
 * @returns {string} The value with every run of CR/LF replaced by a single space, trimmed.
 */
function toSingleLine(value) {
  return String(value).replace(/[\r\n]+/g, ' ').trim();
}

/**
 * Resolve the path to list for a detected security area.
 *
 * @param {{path?: string, url?: string}} area - One entry of `analysis.securityAreas`.
 * @returns {string|null} `area.path` when set, otherwise the pathname of `area.url`;
 *   `null` when neither yields a path (missing or unparseable URL).
 */
function resolveAreaPath(area) {
  if (!area) {
    return null;
  }
  if (area.path) {
    return area.path;
  }
  if (!area.url) {
    return null;
  }
  try {
    return new URL(area.url).pathname;
  } catch {
    // new URL() throws on relative or malformed input; such an entry is skipped
    // by the callers instead of aborting the whole generation.
    return null;
  }
}

/**
 * Resolve the listable paths of all detected security areas.
 *
 * @param {object} analysis - Analyzer result.
 * @returns {Array<{type: string, path: string}>} One entry per area that has a
 *   resolvable path; areas without a `type` are labelled 'other'.
 */
function collectRestrictedAreas(analysis) {
  if (!Array.isArray(analysis.securityAreas)) {
    return [];
  }
  const resolved = [];
  for (const area of analysis.securityAreas) {
    const path = resolveAreaPath(area);
    if (path) {
      resolved.push({ type: String(area.type ?? 'other'), path });
    }
  }
  return resolved;
}

/**
 * GenerateLLMsTxtTool - builds LLMs.txt and LLMs-full.txt documents for a website.
 *
 * The tool asks an LLMsTxtAnalyzer to analyze the site and then renders the
 * analysis into two text documents:
 *
 * - llms.txt: a compact, robots.txt-like file (rate limits, Disallow rules,
 *   detected APIs, custom guidelines/restrictions, sitemap reference)
 * - llms-full.txt: a long-form Markdown document with eight numbered sections
 *
 * execute() additionally returns analysis statistics, recommendations and warnings.
 */
export class GenerateLLMsTxtTool {
  /**
   * @param {object} [options] - Tool options; all keys are kept on `this.options`.
   * @param {number} [options.timeout=30000] - Timeout forwarded to the analyzer.
   * @param {string} [options.userAgent='LLMs.txt-Generator/1.0'] - User agent forwarded to the analyzer.
   */
  constructor(options = {}) {
    // Defaults are applied after the spread so that an explicitly passed
    // `undefined`/`null` cannot wipe them out.
    this.options = {
      ...options,
      timeout: options.timeout ?? 30000,
      userAgent: options.userAgent ?? 'LLMs.txt-Generator/1.0'
    };

    this.analyzer = new LLMsTxtAnalyzer({
      timeout: this.options.timeout,
      userAgent: this.options.userAgent
    });
  }

  /**
   * Validate the parameters, analyze the website and generate the requested files.
   *
   * @param {object} params - Raw parameters; validated against GenerateLLMsTxtSchema.
   * @returns {Promise<object>} Result with `baseUrl`, `generatedAt`, `analysisStats`,
   *   `files` (keyed 'llms.txt' / 'llms-full.txt' depending on `format`),
   *   `recommendations`, `complianceLevel`, `warnings` and, when
   *   `outputOptions.includeAnalysis` is set, the raw `analysis`.
   * @throws {Error} "LLMs.txt generation failed: <reason>" for any validation,
   *   analysis or rendering failure; the original error is attached as `cause`.
   */
  async execute(params) {
    const startTime = Date.now();
    logger.info('Starting LLMs.txt generation...');

    try {
      const { url, analysisOptions, outputOptions, complianceLevel, format } =
        GenerateLLMsTxtSchema.parse(params);

      const baseUrl = getBaseUrl(url);

      // Step 1: comprehensive website analysis
      logger.info(`Analyzing website: ${baseUrl}`);
      const analysis = await this.analyzer.analyzeWebsite(url, analysisOptions);

      // Step 2: render only the files that were actually requested
      const wantsBasic = format === 'both' || format === 'llms-txt';
      const wantsFull = format === 'both' || format === 'llms-full-txt';

      const files = {};
      if (wantsBasic) {
        files['llms.txt'] = this.generateLLMsTxt(analysis, outputOptions, complianceLevel);
      }
      if (wantsFull) {
        files['llms-full.txt'] = this.generateLLMsFullTxt(analysis, outputOptions, complianceLevel);
      }

      // The generators treat structure/apis/securityAreas as optional, so the
      // statistics must not assume they are present either.
      const result = {
        baseUrl,
        generatedAt: new Date().toISOString(),
        analysisStats: {
          pagesAnalyzed: analysis.structure?.totalPages ?? 0,
          apisDetected: analysis.apis?.length ?? 0,
          securityAreasFound: analysis.securityAreas?.length ?? 0,
          analysisTimeMs: analysis.metadata.analysisTimeMs,
          totalTimeMs: Date.now() - startTime
        },
        files,
        recommendations: this.generateRecommendations(analysis),
        complianceLevel,
        warnings: this.generateWarnings(analysis)
      };

      // Include raw analysis if requested
      if (outputOptions.includeAnalysis) {
        result.analysis = analysis;
      }

      logger.info(`LLMs.txt generation completed in ${Date.now() - startTime}ms`);
      return result;
    } catch (error) {
      logger.error(`LLMs.txt generation failed: ${error.message}`);
      throw new Error(`LLMs.txt generation failed: ${error.message}`, { cause: error });
    }
  }

  /**
   * Generate standard LLMs.txt content.
   *
   * @param {object} analysis - Analyzer result; `analysis.metadata.baseUrl` is required,
   *   `rateLimit`, `securityAreas`, `apis` and `structure` are optional.
   * @param {object} [outputOptions] - Contact details and custom guidelines/restrictions.
   * @param {string} [complianceLevel] - 'basic', 'standard' or 'strict'.
   * @returns {string} The file content, lines joined with '\n'.
   */
  generateLLMsTxt(analysis, outputOptions = {}, complianceLevel) {
    const lines = [];
    const baseUrl = analysis.metadata.baseUrl;

    // Header
    lines.push(
      '# LLMs.txt',
      `# AI Model Usage Guidelines for ${baseUrl}`,
      `# Generated on ${new Date().toISOString()}`,
      ''
    );

    // Contact Information
    const { organizationName, contactEmail } = outputOptions;
    if (contactEmail || organizationName) {
      lines.push('# Contact Information');
      if (organizationName) {
        lines.push(`# Organization: ${toSingleLine(organizationName)}`);
      }
      if (contactEmail) {
        lines.push(`# Contact: ${toSingleLine(contactEmail)}`);
      }
      lines.push('');
    }

    // Usage Policy
    lines.push('# Usage Policy', 'User-agent: *');

    // Rate limiting based on analysis; a directive is only written when its
    // value is usable, so the file never contains "NaN" or "undefined".
    if (analysis.rateLimit) {
      const { recommendedDelay, recommendedRPM } = analysis.rateLimit;
      const delayMs = Number(recommendedDelay);
      if (recommendedDelay != null && Number.isFinite(delayMs)) {
        lines.push(`Crawl-delay: ${Math.ceil(delayMs / 1000)}`);
      }
      if (recommendedRPM != null) {
        lines.push(`Request-rate: ${recommendedRPM}/60`);
      }
    }
    lines.push('');

    // Disallowed paths based on security analysis
    const restrictedAreas = collectRestrictedAreas(analysis);
    if (restrictedAreas.length > 0) {
      lines.push('# Restricted Areas');
      for (const { path } of restrictedAreas) {
        lines.push(`Disallow: ${path}`);
      }
      lines.push('');
    }

    // Common restriction patterns based on compliance level
    lines.push('# Standard Restrictions');
    for (const pattern of this.getStandardRestrictions(complianceLevel)) {
      lines.push(`Disallow: ${pattern}`);
    }
    lines.push('');

    // API Information (documentation links are not data endpoints and are left out)
    if (analysis.apis && analysis.apis.length > 0) {
      lines.push('# Preferred Data Access', '# Use these APIs instead of scraping when possible:');
      for (const api of analysis.apis) {
        if (api.type !== 'documentation') {
          lines.push(`# API: ${api.url} (${api.type})`);
        }
      }
      lines.push('');
    }

    // Custom guidelines
    const { customGuidelines, customRestrictions } = outputOptions;
    if (customGuidelines && customGuidelines.length > 0) {
      lines.push('# Custom Guidelines');
      for (const guideline of customGuidelines) {
        lines.push(`# ${toSingleLine(guideline)}`);
      }
      lines.push('');
    }

    // Custom restrictions
    if (customRestrictions && customRestrictions.length > 0) {
      lines.push('# Additional Restrictions');
      for (const restriction of customRestrictions) {
        lines.push(`Disallow: ${toSingleLine(restriction)}`);
      }
      lines.push('');
    }

    // Sitemap reference; trailing slashes are dropped to avoid "//sitemap.xml"
    if (analysis.structure && analysis.structure.sitemap) {
      const sitemapBase = String(baseUrl).replace(/\/+$/, '');
      lines.push('# Site Structure', `Sitemap: ${sitemapBase}/sitemap.xml`, '');
    }

    // Footer
    lines.push(
      '# This file was generated by LLMs.txt Generator',
      '# For detailed guidelines, see llms-full.txt'
    );

    return lines.join('\n');
  }

  /**
   * Generate detailed LLMs-full.txt content.
   *
   * @param {object} analysis - Analyzer result; `analysis.metadata` is required,
   *   every other part (`rateLimit`, `contentTypes`, `apis`, `securityAreas`,
   *   `structure`, `errors`) is optional.
   * @param {object} [outputOptions] - `organizationName` / `contactEmail` are rendered when set.
   * @param {string} [complianceLevel] - Printed verbatim in the overview section.
   * @returns {string} Markdown document, lines joined with '\n'.
   */
  generateLLMsFullTxt(analysis, outputOptions = {}, complianceLevel) {
    const lines = [];
    const baseUrl = analysis.metadata.baseUrl;

    // Header
    lines.push(
      '# LLMs-full.txt',
      `# Comprehensive AI Model Usage Guidelines for ${baseUrl}`,
      `# Generated on ${new Date().toISOString()}`,
      ''
    );

    // Table of Contents
    lines.push(
      '## Table of Contents',
      '1. Overview and Contact Information',
      '2. Rate Limiting and Technical Guidelines',
      '3. Content Access Policy',
      '4. API and Data Sources',
      '5. Security and Privacy Guidelines',
      '6. Compliance and Legal Requirements',
      '7. Best Practices and Examples',
      '8. Support and Contact Information',
      ''
    );

    // Section 1: Overview
    lines.push('## 1. Overview and Contact Information', '');
    if (outputOptions.organizationName) {
      lines.push(`**Organization:** ${outputOptions.organizationName}`);
    }
    if (outputOptions.contactEmail) {
      lines.push(`**Contact:** ${outputOptions.contactEmail}`);
    }
    lines.push(
      `**Website:** ${baseUrl}`,
      `**Compliance Level:** ${complianceLevel}`,
      '',
      'This document provides comprehensive guidelines for AI models accessing this website.',
      'All automated access should follow these guidelines to ensure responsible usage.',
      ''
    );

    // Section 2: Rate Limiting
    lines.push('## 2. Rate Limiting and Technical Guidelines', '');
    if (analysis.rateLimit) {
      const rate = analysis.rateLimit;
      lines.push(
        '### Recommended Rate Limits',
        `- **Delay between requests:** ${rate.recommendedDelay}ms minimum`,
        `- **Maximum concurrent requests:** ${rate.maxConcurrency}`,
        `- **Requests per minute:** ${rate.recommendedRPM} maximum`,
        '',
        '### Technical Justification',
        `${rate.reasoning}`,
        `Average response time: ${rate.averageResponseTime}ms`,
        ''
      );
    }

    lines.push(
      '### User Agent Requirements',
      '- Use descriptive User-Agent strings identifying your AI model/service',
      '- Include contact information in User-Agent when possible',
      '- Example: "MyAI-Bot/1.0 (+https://example.com/bot-info)"',
      ''
    );

    // Section 3: Content Access Policy
    lines.push('## 3. Content Access Policy', '');

    if (analysis.contentTypes) {
      // A category the analyzer did not report counts as zero instead of throwing.
      const types = analysis.contentTypes;
      lines.push(
        '### Content Classification',
        `- **Public content pages:** ${types.public?.length ?? 0} identified`,
        `- **Restricted content:** ${types.restricted?.length ?? 0} areas`,
        `- **Interactive forms:** ${types.forms?.length ?? 0} detected`,
        `- **Media files:** ${types.media?.length ?? 0} found`,
        ''
      );
    }

    lines.push(
      '### Allowed Content Types',
      '- Public articles, blog posts, and informational content',
      '- Product information and descriptions',
      '- Published documentation and help content',
      '- Publicly available media with proper attribution',
      '',
      '### Restricted Content',
      '- User-generated content requiring authentication',
      '- Personal information and private data',
      '- Form submissions and interactive content',
      '- Administrative and configuration areas',
      ''
    );

    // Section 4: APIs and Data Sources
    lines.push('## 4. API and Data Sources', '');

    if (analysis.apis && analysis.apis.length > 0) {
      lines.push('### Available APIs');
      for (const api of analysis.apis) {
        lines.push(`- **${api.type}:** ${api.url}`);
        if (api.description) {
          lines.push(` - Description: ${api.description}`);
        }
        if (api.contentType) {
          lines.push(` - Content-Type: ${api.contentType}`);
        }
      }
      lines.push(
        '',
        '**Recommendation:** Use APIs instead of web scraping when available for better performance and reliability.'
      );
    } else {
      lines.push(
        '### No Public APIs Detected',
        'No public APIs were found during analysis. Web scraping may be necessary but should follow all guidelines in this document.'
      );
    }
    lines.push('');

    // Section 5: Security and Privacy
    lines.push('## 5. Security and Privacy Guidelines', '');

    // Paths are resolved exactly as in llms.txt (path, else URL pathname) so both
    // files list the same areas; a Map keeps the groups in first-seen order.
    const pathsByType = new Map();
    for (const { type, path } of collectRestrictedAreas(analysis)) {
      if (!pathsByType.has(type)) {
        pathsByType.set(type, []);
      }
      pathsByType.get(type).push(path);
    }

    if (pathsByType.size > 0) {
      lines.push('### Restricted Areas Detected');
      for (const [type, paths] of pathsByType) {
        lines.push(`**${type.charAt(0).toUpperCase() + type.slice(1)} Areas:**`);
        for (const path of paths) {
          lines.push(`- ${path}`);
        }
        lines.push('');
      }
    }

    lines.push(
      '### Privacy Requirements',
      '- Do not collect, store, or process personal information',
      '- Respect user privacy and data protection regulations',
      '- Avoid accessing user-specific content or accounts',
      '- Do not attempt to bypass authentication mechanisms',
      ''
    );

    // Section 6: Compliance
    lines.push(
      '## 6. Compliance and Legal Requirements',
      '',
      '### Data Protection Compliance',
      '- **GDPR:** Respect European data protection requirements',
      '- **CCPA:** Follow California Consumer Privacy Act guidelines',
      '- **COPPA:** Extra caution with any content that might involve minors',
      '',
      '### Terms of Service',
      '- Review and comply with website Terms of Service',
      '- Respect intellectual property and copyright',
      '- Provide proper attribution when using content',
      ''
    );

    // Section 7: Best Practices
    lines.push(
      '## 7. Best Practices and Examples',
      '',
      '### Recommended Practices',
      '1. **Start with robots.txt:** Always check and follow robots.txt directives',
      '2. **Use structured data:** Look for JSON-LD, microdata, and other structured formats',
      '3. **Respect meta tags:** Pay attention to meta robots tags and directives',
      '4. **Cache responsibly:** Cache content appropriately to reduce server load',
      '5. **Handle errors gracefully:** Implement proper error handling and retries',
      ''
    );

    const robotsTxt = analysis.structure?.robotsTxt;
    if (typeof robotsTxt === 'string' && robotsTxt) {
      lines.push(
        '### Robots.txt Status',
        'A robots.txt file was found and should be respected. Key directives:'
      );
      // Only the first 10 raw lines are considered. Each is trimmed before the
      // comment test so indented comments and CRLF endings are handled.
      for (const rawLine of robotsTxt.split('\n').slice(0, 10)) {
        const directive = rawLine.trim();
        if (directive && !directive.startsWith('#')) {
          lines.push(`- ${directive}`);
        }
      }
      lines.push('');
    }

    // Falls back to a 1 second sleep when no recommended delay is available.
    const sleepSeconds = Math.ceil((analysis.rateLimit?.recommendedDelay || 1000) / 1000);
    lines.push(
      '### Example Usage Patterns',
      '```',
      '# Good: Respectful crawling with delays',
      'for url in urls:',
      ' response = fetch(url, headers={"User-Agent": "MyBot/1.0"})',
      ` time.sleep(${sleepSeconds})`,
      ' process(response)',
      '',
      '# Bad: Aggressive crawling without delays',
      '# for url in urls:',
      '# response = fetch(url) # No delay, no user agent',
      '# process(response)',
      '```',
      ''
    );

    // Section 8: Support
    lines.push('## 8. Support and Contact Information', '');
    if (outputOptions.contactEmail) {
      lines.push(`**Primary Contact:** ${outputOptions.contactEmail}`);
    }
    lines.push(
      '',
      '### Reporting Issues',
      'If you encounter issues or need clarification on these guidelines:',
      '1. Check this document for guidance',
      '2. Review the basic llms.txt file',
      '3. Contact us using the information above',
      '',
      '### Updates',
      'These guidelines may be updated periodically. Check the generation date above',
      'and consider regenerating this file if accessing the site after significant changes.',
      ''
    );

    // Footer
    lines.push(
      '---',
      '',
      '**Generated by:** LLMs.txt Generator v1.0',
      '**Analysis Date:** ' + analysis.metadata.analyzedAt,
      '**Analysis Coverage:** ' + (analysis.structure?.totalPages || 'N/A') + ' pages analyzed'
    );
    if (analysis.errors && analysis.errors.length > 0) {
      lines.push('**Note:** Some analysis errors occurred. Contact support if needed.');
    }

    return lines.join('\n');
  }

  /**
   * Get standard restriction patterns based on compliance level.
   *
   * Each level extends the previous one: basic, then standard, then strict.
   *
   * @param {string} complianceLevel - 'basic', 'standard' or 'strict'.
   * @returns {string[]} A new array of Disallow patterns; any unrecognized level
   *   yields the 'standard' list.
   */
  getStandardRestrictions(complianceLevel) {
    const basic = ['/admin', '/wp-admin', '/login', '/user', '/account'];

    const standard = [
      ...basic,
      '/private',
      '/internal',
      '/config',
      '/settings',
      '/auth',
      '/oauth',
      '/signin',
      '/*?password=*',
      '/*?token=*'
    ];

    if (complianceLevel === 'basic') {
      return basic;
    }
    if (complianceLevel !== 'strict') {
      return standard;
    }

    return [
      ...standard,
      '/api/*/private',
      '/dashboard',
      '/profile',
      '/*?session=*',
      '/*?key=*',
      '/cms',
      '/administrator',
      '/*.env',
      '/*.config'
    ];
  }

  /**
   * Generate deployment and implementation recommendations.
   *
   * @param {object} analysis - Analyzer result.
   * @returns {Array<{type: string, priority: string, message: string}>} Zero or more
   *   recommendations, triggered by: average response time above 2000ms, more than
   *   5 security areas, at least one detected API, more than 1000 pages.
   */
  generateRecommendations(analysis) {
    const recommendations = [];

    // Rate limiting recommendations
    if (analysis.rateLimit?.averageResponseTime > 2000) {
      recommendations.push({
        type: 'performance',
        priority: 'high',
        message: 'Server response times are slow. Consider very conservative rate limiting.'
      });
    }

    // Security recommendations
    if (analysis.securityAreas?.length > 5) {
      recommendations.push({
        type: 'security',
        priority: 'medium',
        message: 'Multiple security areas detected. Review restrictions carefully.'
      });
    }

    // API recommendations
    if (analysis.apis?.length > 0) {
      recommendations.push({
        type: 'integration',
        priority: 'high',
        message: 'APIs detected. Strongly recommend using APIs instead of scraping.'
      });
    }

    // Structure recommendations
    if (analysis.structure?.totalPages > 1000) {
      recommendations.push({
        type: 'scale',
        priority: 'medium',
        message: 'Large website detected. Consider focused crawling strategies.'
      });
    }

    return recommendations;
  }

  /**
   * Generate warnings about potential issues.
   *
   * @param {object} analysis - Analyzer result.
   * @returns {Array<{type: string, message: string}>} Zero or more warnings, triggered
   *   by: recorded analysis errors, a missing robots.txt, more than 10 security areas.
   */
  generateWarnings(analysis) {
    const warnings = [];

    // Analysis errors
    const errorCount = analysis.errors?.length ?? 0;
    if (errorCount > 0) {
      warnings.push({
        type: 'analysis',
        message: `${errorCount} errors occurred during analysis. Guidelines may be incomplete.`
      });
    }

    // Missing robots.txt
    if (!analysis.structure?.robotsTxt) {
      warnings.push({
        type: 'robots',
        message: 'No robots.txt found. Extra caution recommended.'
      });
    }

    // High security areas
    if (analysis.securityAreas?.length > 10) {
      warnings.push({
        type: 'security',
        message: 'Many restricted areas detected. This site may not be suitable for broad crawling.'
      });
    }

    return warnings;
  }
}

export default GenerateLLMsTxtTool;