crawlforge-mcp-server 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +315 -0
- package/LICENSE +21 -0
- package/README.md +181 -0
- package/package.json +115 -0
- package/server.js +1963 -0
- package/setup.js +112 -0
- package/src/constants/config.js +615 -0
- package/src/core/ActionExecutor.js +1104 -0
- package/src/core/AlertNotificationSystem.js +601 -0
- package/src/core/AuthManager.js +315 -0
- package/src/core/ChangeTracker.js +2306 -0
- package/src/core/JobManager.js +687 -0
- package/src/core/LLMsTxtAnalyzer.js +753 -0
- package/src/core/LocalizationManager.js +1615 -0
- package/src/core/PerformanceManager.js +828 -0
- package/src/core/ResearchOrchestrator.js +1327 -0
- package/src/core/SnapshotManager.js +1037 -0
- package/src/core/StealthBrowserManager.js +1795 -0
- package/src/core/WebhookDispatcher.js +745 -0
- package/src/core/analysis/ContentAnalyzer.js +749 -0
- package/src/core/analysis/LinkAnalyzer.js +972 -0
- package/src/core/cache/CacheManager.js +821 -0
- package/src/core/connections/ConnectionPool.js +553 -0
- package/src/core/crawlers/BFSCrawler.js +845 -0
- package/src/core/integrations/PerformanceIntegration.js +377 -0
- package/src/core/llm/AnthropicProvider.js +135 -0
- package/src/core/llm/LLMManager.js +415 -0
- package/src/core/llm/LLMProvider.js +97 -0
- package/src/core/llm/OpenAIProvider.js +127 -0
- package/src/core/processing/BrowserProcessor.js +986 -0
- package/src/core/processing/ContentProcessor.js +505 -0
- package/src/core/processing/PDFProcessor.js +448 -0
- package/src/core/processing/StreamProcessor.js +673 -0
- package/src/core/queue/QueueManager.js +98 -0
- package/src/core/workers/WorkerPool.js +585 -0
- package/src/core/workers/worker.js +743 -0
- package/src/monitoring/healthCheck.js +600 -0
- package/src/monitoring/metrics.js +761 -0
- package/src/optimization/wave3-optimizations.js +932 -0
- package/src/security/security-patches.js +120 -0
- package/src/security/security-tests.js +355 -0
- package/src/security/wave3-security.js +652 -0
- package/src/tools/advanced/BatchScrapeTool.js +1089 -0
- package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
- package/src/tools/crawl/crawlDeep.js +449 -0
- package/src/tools/crawl/mapSite.js +400 -0
- package/src/tools/extract/analyzeContent.js +624 -0
- package/src/tools/extract/extractContent.js +329 -0
- package/src/tools/extract/processDocument.js +503 -0
- package/src/tools/extract/summarizeContent.js +376 -0
- package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
- package/src/tools/research/deepResearch.js +706 -0
- package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
- package/src/tools/search/adapters/googleSearch.js +236 -0
- package/src/tools/search/adapters/searchProviderFactory.js +96 -0
- package/src/tools/search/queryExpander.js +543 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
- package/src/tools/search/ranking/ResultRanker.js +497 -0
- package/src/tools/search/searchWeb.js +482 -0
- package/src/tools/tracking/trackChanges.js +1355 -0
- package/src/utils/CircuitBreaker.js +515 -0
- package/src/utils/ErrorHandlingConfig.js +342 -0
- package/src/utils/HumanBehaviorSimulator.js +569 -0
- package/src/utils/Logger.js +568 -0
- package/src/utils/MemoryMonitor.js +173 -0
- package/src/utils/RetryManager.js +386 -0
- package/src/utils/contentUtils.js +588 -0
- package/src/utils/domainFilter.js +612 -0
- package/src/utils/inputValidation.js +766 -0
- package/src/utils/rateLimiter.js +196 -0
- package/src/utils/robotsChecker.js +91 -0
- package/src/utils/securityMiddleware.js +416 -0
- package/src/utils/sitemapParser.js +678 -0
- package/src/utils/ssrfProtection.js +640 -0
- package/src/utils/urlNormalizer.js +168 -0
|
@@ -0,0 +1,570 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import { LLMsTxtAnalyzer } from '../../core/LLMsTxtAnalyzer.js';
|
|
3
|
+
import { Logger } from '../../utils/Logger.js';
|
|
4
|
+
import { getBaseUrl } from '../../utils/urlNormalizer.js';
|
|
5
|
+
|
|
6
|
+
// Module-level logger instance, constructed with this tool's name.
const logger = new Logger('GenerateLLMsTxtTool');

/**
 * Options controlling how the target website is analyzed.
 * Every field is optional and carries its own default.
 */
const AnalysisOptionsSchema = z.object({
  maxDepth: z.number().min(1).max(5).optional().default(3).describe('Maximum crawl depth for analysis'),
  maxPages: z.number().min(10).max(500).optional().default(100).describe('Maximum pages to analyze'),
  detectAPIs: z.boolean().optional().default(true).describe('Whether to detect API endpoints'),
  analyzeContent: z.boolean().optional().default(true).describe('Whether to analyze content types'),
  checkSecurity: z.boolean().optional().default(true).describe('Whether to check security boundaries'),
  respectRobots: z.boolean().optional().default(true).describe('Whether to respect robots.txt')
});

/**
 * Options controlling what goes into the generated files
 * (contact details, extra guidelines/restrictions, raw analysis data).
 */
const OutputOptionsSchema = z.object({
  includeDetailed: z.boolean().optional().default(true).describe('Generate detailed LLMs-full.txt'),
  includeAnalysis: z.boolean().optional().default(false).describe('Include raw analysis data'),
  contactEmail: z.string().email().optional().describe('Contact email for the LLMs.txt'),
  organizationName: z.string().optional().describe('Organization name'),
  customGuidelines: z.array(z.string()).optional().describe('Additional custom guidelines'),
  customRestrictions: z.array(z.string()).optional().describe('Additional restrictions')
});

/**
 * Input schema for GenerateLLMsTxtTool.execute().
 * Only `url` is required; both option groups default to an empty object.
 */
const GenerateLLMsTxtSchema = z.object({
  url: z.string().url().describe('The website URL to analyze and generate LLMs.txt for'),

  analysisOptions: AnalysisOptionsSchema.optional().default({}),

  outputOptions: OutputOptionsSchema.optional().default({}),

  complianceLevel: z.enum(['basic', 'standard', 'strict']).optional().default('standard').describe('Compliance level for generated guidelines'),

  format: z.enum(['both', 'llms-txt', 'llms-full-txt']).optional().default('both').describe('Which files to generate')
});
|
|
33
|
+
|
|
34
|
+
/**
 * Resolve the path that a detected security area refers to.
 *
 * Prefers the explicit `path`; otherwise falls back to the pathname parsed
 * from `url`. Returns `null` when neither yields a usable path, so that one
 * malformed entry cannot abort generation of the whole file.
 *
 * @param {{path?: string, url?: string}} area - Security area entry.
 * @returns {string|null} The path, or null when it cannot be determined.
 */
function resolveSecurityAreaPath(area) {
  if (area?.path) {
    return area.path;
  }
  if (!area?.url) {
    return null;
  }
  try {
    return new URL(area.url).pathname;
  } catch {
    // Not an absolute URL; nothing sensible to list for this entry.
    return null;
  }
}

/**
 * GenerateLLMsTxtTool - Generate standard-compliant LLMs.txt and LLMs-full.txt files
 *
 * This tool analyzes websites comprehensively and generates LLMs.txt files that define
 * how AI models should interact with the site, including:
 *
 * - Usage guidelines and boundaries
 * - Rate limiting recommendations
 * - Allowed/disallowed paths
 * - API endpoints and preferred access methods
 * - Content classification and restrictions
 * - Contact information and support details
 */
export class GenerateLLMsTxtTool {
  /**
   * @param {object} [options] - Tool options; unknown keys are kept on `this.options`.
   * @param {number} [options.timeout=30000] - Timeout forwarded to the analyzer.
   * @param {string} [options.userAgent='LLMs.txt-Generator/1.0'] - User agent forwarded to the analyzer.
   */
  constructor(options = {}) {
    // Spread first so the defaults below cannot be clobbered by an explicit
    // `undefined`/`null` value; any other caller-supplied value (including
    // 0 or '') still wins.
    this.options = {
      ...options,
      timeout: options.timeout ?? 30000,
      userAgent: options.userAgent ?? 'LLMs.txt-Generator/1.0'
    };

    this.analyzer = new LLMsTxtAnalyzer({
      timeout: this.options.timeout,
      userAgent: this.options.userAgent
    });
  }

  /**
   * Validate the input, analyze the website and build the requested files.
   *
   * @param {object} params - Raw input, validated against GenerateLLMsTxtSchema.
   * @returns {Promise<object>} Result with `baseUrl`, `generatedAt`,
   *   `analysisStats`, `files` (keyed by file name), `recommendations`,
   *   `complianceLevel`, `warnings` and, when requested, the raw `analysis`.
   * @throws {Error} Wrapping any validation/analysis/generation failure; the
   *   original error is attached as `cause`.
   */
  async execute(params) {
    const startTime = Date.now();
    logger.info('Starting LLMs.txt generation...');

    try {
      const { url, analysisOptions, outputOptions, complianceLevel, format } =
        GenerateLLMsTxtSchema.parse(params);

      // NOTE(review): outputOptions.includeDetailed is accepted by the schema
      // but is not consulted here; `format` alone decides which files are built.
      const wantsSummaryFile = format === 'both' || format === 'llms-txt';
      const wantsFullFile = format === 'both' || format === 'llms-full-txt';

      const baseUrl = getBaseUrl(url);

      // Step 1: Comprehensive Website Analysis
      logger.info(`Analyzing website: ${baseUrl}`);
      const analysis = await this.analyzer.analyzeWebsite(url, analysisOptions);

      // Step 2/3: Generate only the files that were actually requested
      const files = {};
      if (wantsSummaryFile) {
        files['llms.txt'] = this.generateLLMsTxt(analysis, outputOptions, complianceLevel);
      }
      if (wantsFullFile) {
        files['llms-full.txt'] = this.generateLLMsFullTxt(analysis, outputOptions, complianceLevel);
      }

      const result = {
        baseUrl,
        generatedAt: new Date().toISOString(),
        analysisStats: {
          // apis / securityAreas / structure are treated as optional by the
          // generators below, so they are guarded here as well.
          pagesAnalyzed: analysis.structure?.totalPages || 0,
          apisDetected: analysis.apis?.length ?? 0,
          securityAreasFound: analysis.securityAreas?.length ?? 0,
          analysisTimeMs: analysis.metadata.analysisTimeMs,
          totalTimeMs: Date.now() - startTime
        },
        files,
        recommendations: this.generateRecommendations(analysis),
        complianceLevel,
        warnings: this.generateWarnings(analysis)
      };

      // Include raw analysis if requested
      if (outputOptions.includeAnalysis) {
        result.analysis = analysis;
      }

      logger.info(`LLMs.txt generation completed in ${Date.now() - startTime}ms`);
      return result;
    } catch (error) {
      logger.error(`LLMs.txt generation failed: ${error.message}`);
      // Keep the original error reachable for callers that need the stack.
      throw new Error(`LLMs.txt generation failed: ${error.message}`, { cause: error });
    }
  }

  /**
   * Generate standard LLMs.txt content
   *
   * @param {object} analysis - Analyzer result; `metadata.baseUrl` is read
   *   unconditionally, `rateLimit`, `securityAreas`, `apis` and `structure`
   *   are optional.
   * @param {object} outputOptions - Contact details and custom guidelines/restrictions.
   * @param {string} complianceLevel - 'basic', 'standard' or 'strict'.
   * @returns {string} The file content, lines joined with '\n'.
   */
  generateLLMsTxt(analysis, outputOptions, complianceLevel) {
    const baseUrl = analysis.metadata.baseUrl;

    // Header
    const lines = [
      '# LLMs.txt',
      `# AI Model Usage Guidelines for ${baseUrl}`,
      `# Generated on ${new Date().toISOString()}`,
      ''
    ];

    // Contact Information
    if (outputOptions.contactEmail || outputOptions.organizationName) {
      lines.push('# Contact Information');
      if (outputOptions.organizationName) {
        lines.push(`# Organization: ${outputOptions.organizationName}`);
      }
      if (outputOptions.contactEmail) {
        lines.push(`# Contact: ${outputOptions.contactEmail}`);
      }
      lines.push('');
    }

    // Usage Policy
    lines.push('# Usage Policy', 'User-agent: *');

    // Rate limiting based on analysis; a directive is only written when its
    // value is actually available (avoids "Crawl-delay: NaN").
    if (analysis.rateLimit) {
      const { recommendedDelay, recommendedRPM } = analysis.rateLimit;
      const delaySeconds = Math.ceil(recommendedDelay / 1000);
      if (Number.isFinite(delaySeconds)) {
        lines.push(`Crawl-delay: ${delaySeconds}`);
      }
      if (recommendedRPM != null) {
        lines.push(`Request-rate: ${recommendedRPM}/60`);
      }
    }

    lines.push('');

    // Disallowed paths based on security analysis
    const restrictedPaths = (analysis.securityAreas ?? [])
      .map(resolveSecurityAreaPath)
      .filter(Boolean);
    if (restrictedPaths.length > 0) {
      lines.push('# Restricted Areas');
      for (const restrictedPath of restrictedPaths) {
        lines.push(`Disallow: ${restrictedPath}`);
      }
      lines.push('');
    }

    // Common restriction patterns based on compliance level
    lines.push('# Standard Restrictions');
    for (const restriction of this.getStandardRestrictions(complianceLevel)) {
      lines.push(`Disallow: ${restriction}`);
    }
    lines.push('');

    // API Information
    if (analysis.apis && analysis.apis.length > 0) {
      lines.push('# Preferred Data Access');
      lines.push('# Use these APIs instead of scraping when possible:');
      for (const api of analysis.apis) {
        if (api.type !== 'documentation') {
          lines.push(`# API: ${api.url} (${api.type})`);
        }
      }
      lines.push('');
    }

    // Custom guidelines
    if (outputOptions.customGuidelines && outputOptions.customGuidelines.length > 0) {
      lines.push('# Custom Guidelines');
      for (const guideline of outputOptions.customGuidelines) {
        lines.push(`# ${guideline}`);
      }
      lines.push('');
    }

    // Custom restrictions
    if (outputOptions.customRestrictions && outputOptions.customRestrictions.length > 0) {
      lines.push('# Additional Restrictions');
      for (const restriction of outputOptions.customRestrictions) {
        lines.push(`Disallow: ${restriction}`);
      }
      lines.push('');
    }

    // Sitemap reference
    if (analysis.structure?.sitemap) {
      lines.push('# Site Structure');
      lines.push(`Sitemap: ${baseUrl}/sitemap.xml`);
      lines.push('');
    }

    // Footer
    lines.push('# This file was generated by LLMs.txt Generator');
    lines.push('# For detailed guidelines, see llms-full.txt');

    return lines.join('\n');
  }

  /**
   * Generate detailed LLMs-full.txt content
   *
   * @param {object} analysis - Analyzer result; `metadata.baseUrl` and
   *   `metadata.analyzedAt` are read unconditionally, the remaining sections
   *   (`rateLimit`, `contentTypes`, `apis`, `securityAreas`, `structure`,
   *   `errors`) are optional.
   * @param {object} outputOptions - Contact details (organizationName, contactEmail).
   * @param {string} complianceLevel - Echoed in the overview section.
   * @returns {string} The markdown document, lines joined with '\n'.
   */
  generateLLMsFullTxt(analysis, outputOptions, complianceLevel) {
    const baseUrl = analysis.metadata.baseUrl;

    // Header
    const lines = [
      '# LLMs-full.txt',
      `# Comprehensive AI Model Usage Guidelines for ${baseUrl}`,
      `# Generated on ${new Date().toISOString()}`,
      ''
    ];

    // Table of Contents
    lines.push(
      '## Table of Contents',
      '1. Overview and Contact Information',
      '2. Rate Limiting and Technical Guidelines',
      '3. Content Access Policy',
      '4. API and Data Sources',
      '5. Security and Privacy Guidelines',
      '6. Compliance and Legal Requirements',
      '7. Best Practices and Examples',
      '8. Support and Contact Information',
      ''
    );

    // Section 1: Overview
    lines.push('## 1. Overview and Contact Information', '');
    if (outputOptions.organizationName) {
      lines.push(`**Organization:** ${outputOptions.organizationName}`);
    }
    if (outputOptions.contactEmail) {
      lines.push(`**Contact:** ${outputOptions.contactEmail}`);
    }
    lines.push(
      `**Website:** ${baseUrl}`,
      `**Compliance Level:** ${complianceLevel}`,
      '',
      'This document provides comprehensive guidelines for AI models accessing this website.',
      'All automated access should follow these guidelines to ensure responsible usage.',
      ''
    );

    // Section 2: Rate Limiting
    lines.push('## 2. Rate Limiting and Technical Guidelines', '');
    if (analysis.rateLimit) {
      const rateLimit = analysis.rateLimit;
      lines.push(
        '### Recommended Rate Limits',
        `- **Delay between requests:** ${rateLimit.recommendedDelay}ms minimum`,
        `- **Maximum concurrent requests:** ${rateLimit.maxConcurrency}`,
        `- **Requests per minute:** ${rateLimit.recommendedRPM} maximum`,
        '',
        '### Technical Justification',
        `${rateLimit.reasoning}`,
        `Average response time: ${rateLimit.averageResponseTime}ms`,
        ''
      );
    }

    lines.push(
      '### User Agent Requirements',
      '- Use descriptive User-Agent strings identifying your AI model/service',
      '- Include contact information in User-Agent when possible',
      '- Example: "MyAI-Bot/1.0 (+https://example.com/bot-info)"',
      ''
    );

    // Section 3: Content Access Policy
    lines.push('## 3. Content Access Policy', '');

    if (analysis.contentTypes) {
      const contentTypes = analysis.contentTypes;
      lines.push(
        '### Content Classification',
        `- **Public content pages:** ${contentTypes.public.length} identified`,
        `- **Restricted content:** ${contentTypes.restricted.length} areas`,
        `- **Interactive forms:** ${contentTypes.forms.length} detected`,
        `- **Media files:** ${contentTypes.media.length} found`,
        ''
      );
    }

    lines.push(
      '### Allowed Content Types',
      '- Public articles, blog posts, and informational content',
      '- Product information and descriptions',
      '- Published documentation and help content',
      '- Publicly available media with proper attribution',
      ''
    );

    lines.push(
      '### Restricted Content',
      '- User-generated content requiring authentication',
      '- Personal information and private data',
      '- Form submissions and interactive content',
      '- Administrative and configuration areas',
      ''
    );

    // Section 4: APIs and Data Sources
    lines.push('## 4. API and Data Sources', '');

    if (analysis.apis && analysis.apis.length > 0) {
      lines.push('### Available APIs');
      for (const api of analysis.apis) {
        lines.push(`- **${api.type}:** ${api.url}`);
        if (api.description) {
          lines.push(` - Description: ${api.description}`);
        }
        if (api.contentType) {
          lines.push(` - Content-Type: ${api.contentType}`);
        }
      }
      lines.push('');
      lines.push('**Recommendation:** Use APIs instead of web scraping when available for better performance and reliability.');
    } else {
      lines.push('### No Public APIs Detected');
      lines.push('No public APIs were found during analysis. Web scraping may be necessary but should follow all guidelines in this document.');
    }
    lines.push('');

    // Section 5: Security and Privacy
    lines.push('## 5. Security and Privacy Guidelines', '');

    if (analysis.securityAreas && analysis.securityAreas.length > 0) {
      lines.push('### Restricted Areas Detected');

      // Group paths by area type. A Map keeps insertion order and is safe
      // for arbitrary type names; the path is resolved the same way as in
      // llms.txt so URL-only areas are listed instead of "undefined".
      const pathsByType = new Map();
      for (const area of analysis.securityAreas) {
        const areaPath = resolveSecurityAreaPath(area);
        if (!areaPath) {
          continue;
        }
        const typeName = String(area.type);
        if (!pathsByType.has(typeName)) {
          pathsByType.set(typeName, []);
        }
        pathsByType.get(typeName).push(areaPath);
      }

      for (const [typeName, areaPaths] of pathsByType) {
        lines.push(`**${typeName.charAt(0).toUpperCase() + typeName.slice(1)} Areas:**`);
        for (const areaPath of areaPaths) {
          lines.push(`- ${areaPath}`);
        }
        lines.push('');
      }
    }

    lines.push(
      '### Privacy Requirements',
      '- Do not collect, store, or process personal information',
      '- Respect user privacy and data protection regulations',
      '- Avoid accessing user-specific content or accounts',
      '- Do not attempt to bypass authentication mechanisms',
      ''
    );

    // Section 6: Compliance
    lines.push(
      '## 6. Compliance and Legal Requirements',
      '',
      '### Data Protection Compliance',
      '- **GDPR:** Respect European data protection requirements',
      '- **CCPA:** Follow California Consumer Privacy Act guidelines',
      '- **COPPA:** Extra caution with any content that might involve minors',
      '',
      '### Terms of Service',
      '- Review and comply with website Terms of Service',
      '- Respect intellectual property and copyright',
      '- Provide proper attribution when using content',
      ''
    );

    // Section 7: Best Practices
    lines.push(
      '## 7. Best Practices and Examples',
      '',
      '### Recommended Practices',
      '1. **Start with robots.txt:** Always check and follow robots.txt directives',
      '2. **Use structured data:** Look for JSON-LD, microdata, and other structured formats',
      '3. **Respect meta tags:** Pay attention to meta robots tags and directives',
      '4. **Cache responsibly:** Cache content appropriately to reduce server load',
      '5. **Handle errors gracefully:** Implement proper error handling and retries',
      ''
    );

    const robotsTxt = analysis.structure?.robotsTxt;
    if (robotsTxt) {
      lines.push('### Robots.txt Status');
      lines.push('A robots.txt file was found and should be respected. Key directives:');
      for (const rawLine of robotsTxt.split('\n').slice(0, 10)) {
        // Trim before testing for '#', so indented comments are skipped too.
        const directive = rawLine.trim();
        if (directive && !directive.startsWith('#')) {
          lines.push(`- ${directive}`);
        }
      }
      lines.push('');
    }

    lines.push(
      '### Example Usage Patterns',
      '```',
      '# Good: Respectful crawling with delays',
      'for url in urls:',
      ' response = fetch(url, headers={"User-Agent": "MyBot/1.0"})',
      ` time.sleep(${Math.ceil((analysis.rateLimit?.recommendedDelay || 1000) / 1000)})`,
      ' process(response)',
      '',
      '# Bad: Aggressive crawling without delays',
      '# for url in urls:',
      '# response = fetch(url) # No delay, no user agent',
      '# process(response)',
      '```',
      ''
    );

    // Section 8: Support
    lines.push('## 8. Support and Contact Information', '');
    if (outputOptions.contactEmail) {
      lines.push(`**Primary Contact:** ${outputOptions.contactEmail}`);
    }
    lines.push(
      '',
      '### Reporting Issues',
      'If you encounter issues or need clarification on these guidelines:',
      '1. Check this document for guidance',
      '2. Review the basic llms.txt file',
      '3. Contact us using the information above',
      '',
      '### Updates',
      'These guidelines may be updated periodically. Check the generation date above',
      'and consider regenerating this file if accessing the site after significant changes.',
      ''
    );

    // Footer
    lines.push(
      '---',
      '',
      '**Generated by:** LLMs.txt Generator v1.0',
      `**Analysis Date:** ${analysis.metadata.analyzedAt}`,
      `**Analysis Coverage:** ${analysis.structure?.totalPages || 'N/A'} pages analyzed`
    );
    if (analysis.errors && analysis.errors.length > 0) {
      lines.push('**Note:** Some analysis errors occurred. Contact support if needed.');
    }

    return lines.join('\n');
  }

  /**
   * Get standard restriction patterns based on compliance level
   *
   * Each level extends the previous one: basic < standard < strict.
   *
   * @param {string} complianceLevel - 'basic', 'standard' or 'strict'; any
   *   other value yields the 'standard' list.
   * @returns {string[]} Path patterns to disallow.
   */
  getStandardRestrictions(complianceLevel) {
    const basic = [
      '/admin',
      '/wp-admin',
      '/login',
      '/user',
      '/account'
    ];

    const standard = [
      ...basic,
      '/private',
      '/internal',
      '/config',
      '/settings',
      '/auth',
      '/oauth',
      '/signin',
      '/*?password=*',
      '/*?token=*'
    ];

    if (complianceLevel === 'basic') {
      return basic;
    }
    if (complianceLevel !== 'strict') {
      return standard;
    }

    return [
      ...standard,
      '/api/*/private',
      '/dashboard',
      '/profile',
      '/*?session=*',
      '/*?key=*',
      '/cms',
      '/administrator',
      '/*.env',
      '/*.config'
    ];
  }

  /**
   * Generate deployment and implementation recommendations
   *
   * @param {object} analysis - Analyzer result; every inspected section is optional.
   * @returns {Array<{type: string, priority: string, message: string}>}
   *   Recommendations in a fixed order: performance, security, integration, scale.
   */
  generateRecommendations(analysis) {
    const recommendations = [];

    // Rate limiting recommendations
    if (analysis.rateLimit?.averageResponseTime > 2000) {
      recommendations.push({
        type: 'performance',
        priority: 'high',
        message: 'Server response times are slow. Consider very conservative rate limiting.'
      });
    }

    // Security recommendations
    if (analysis.securityAreas?.length > 5) {
      recommendations.push({
        type: 'security',
        priority: 'medium',
        message: 'Multiple security areas detected. Review restrictions carefully.'
      });
    }

    // API recommendations
    if (analysis.apis?.length > 0) {
      recommendations.push({
        type: 'integration',
        priority: 'high',
        message: 'APIs detected. Strongly recommend using APIs instead of scraping.'
      });
    }

    // Structure recommendations
    if (analysis.structure?.totalPages > 1000) {
      recommendations.push({
        type: 'scale',
        priority: 'medium',
        message: 'Large website detected. Consider focused crawling strategies.'
      });
    }

    return recommendations;
  }

  /**
   * Generate warnings about potential issues
   *
   * @param {object} analysis - Analyzer result; `errors`, `structure` and
   *   `securityAreas` are optional.
   * @returns {Array<{type: string, message: string}>} Warnings in a fixed
   *   order: analysis errors, missing robots.txt, many restricted areas.
   */
  generateWarnings(analysis) {
    const warnings = [];

    // Analysis errors
    if (analysis.errors?.length > 0) {
      warnings.push({
        type: 'analysis',
        message: `${analysis.errors.length} errors occurred during analysis. Guidelines may be incomplete.`
      });
    }

    // Missing robots.txt
    if (!analysis.structure?.robotsTxt) {
      warnings.push({
        type: 'robots',
        message: 'No robots.txt found. Extra caution recommended.'
      });
    }

    // High security areas
    if (analysis.securityAreas?.length > 10) {
      warnings.push({
        type: 'security',
        message: 'Many restricted areas detected. This site may not be suitable for broad crawling.'
      });
    }

    return warnings;
  }
}
|
|
569
|
+
|
|
570
|
+
export default GenerateLLMsTxtTool;
|