@gulibs/safe-coder 0.0.25 → 0.0.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +682 -902
- package/dist/cache/cache-manager.d.ts +71 -0
- package/dist/cache/cache-manager.d.ts.map +1 -0
- package/dist/cache/cache-manager.js +244 -0
- package/dist/cache/cache-manager.js.map +1 -0
- package/dist/executor/cli-executor.d.ts +106 -0
- package/dist/executor/cli-executor.d.ts.map +1 -0
- package/dist/executor/cli-executor.js +133 -0
- package/dist/executor/cli-executor.js.map +1 -0
- package/dist/executor/dependency-checker.d.ts +23 -0
- package/dist/executor/dependency-checker.d.ts.map +1 -0
- package/dist/executor/dependency-checker.js +62 -0
- package/dist/executor/dependency-checker.js.map +1 -0
- package/dist/index.js +3 -4
- package/dist/index.js.map +1 -1
- package/dist/processor/content-processor.d.ts +76 -0
- package/dist/processor/content-processor.d.ts.map +1 -0
- package/dist/processor/content-processor.js +182 -0
- package/dist/processor/content-processor.js.map +1 -0
- package/dist/processor/guide-generator.d.ts +68 -0
- package/dist/processor/guide-generator.d.ts.map +1 -0
- package/dist/processor/guide-generator.js +189 -0
- package/dist/processor/guide-generator.js.map +1 -0
- package/dist/server/safe-coder-mcp.d.ts +18 -0
- package/dist/server/safe-coder-mcp.d.ts.map +1 -0
- package/dist/server/safe-coder-mcp.js +164 -0
- package/dist/server/safe-coder-mcp.js.map +1 -0
- package/dist/tools/cache-tools.d.ts +42 -0
- package/dist/tools/cache-tools.d.ts.map +1 -0
- package/dist/tools/cache-tools.js +70 -0
- package/dist/tools/cache-tools.js.map +1 -0
- package/dist/tools/crawl-documentation.d.ts +57 -0
- package/dist/tools/crawl-documentation.d.ts.map +1 -0
- package/dist/tools/crawl-documentation.js +96 -0
- package/dist/tools/crawl-documentation.js.map +1 -0
- package/dist/tools/index.d.ts +4 -0
- package/dist/tools/index.d.ts.map +1 -0
- package/dist/tools/index.js +4 -0
- package/dist/tools/index.js.map +1 -0
- package/dist/tools/save-skill.d.ts +49 -0
- package/dist/tools/save-skill.d.ts.map +1 -0
- package/dist/tools/save-skill.js +207 -0
- package/dist/tools/save-skill.js.map +1 -0
- package/package.json +18 -22
- package/dist/documentation/browser-doc-browser.d.ts +0 -41
- package/dist/documentation/browser-doc-browser.d.ts.map +0 -1
- package/dist/documentation/browser-doc-browser.js +0 -357
- package/dist/documentation/browser-doc-browser.js.map +0 -1
- package/dist/documentation/cache.d.ts +0 -13
- package/dist/documentation/cache.d.ts.map +0 -1
- package/dist/documentation/cache.js +0 -48
- package/dist/documentation/cache.js.map +0 -1
- package/dist/documentation/checkpoint-manager.d.ts +0 -38
- package/dist/documentation/checkpoint-manager.d.ts.map +0 -1
- package/dist/documentation/checkpoint-manager.js +0 -101
- package/dist/documentation/checkpoint-manager.js.map +0 -1
- package/dist/documentation/doc-crawler.d.ts +0 -185
- package/dist/documentation/doc-crawler.d.ts.map +0 -1
- package/dist/documentation/doc-crawler.js +0 -1162
- package/dist/documentation/doc-crawler.js.map +0 -1
- package/dist/documentation/github-client.d.ts +0 -13
- package/dist/documentation/github-client.d.ts.map +0 -1
- package/dist/documentation/github-client.js +0 -90
- package/dist/documentation/github-client.js.map +0 -1
- package/dist/documentation/http-fetcher.d.ts +0 -8
- package/dist/documentation/http-fetcher.d.ts.map +0 -1
- package/dist/documentation/http-fetcher.js +0 -31
- package/dist/documentation/http-fetcher.js.map +0 -1
- package/dist/documentation/index.d.ts +0 -16
- package/dist/documentation/index.d.ts.map +0 -1
- package/dist/documentation/index.js +0 -159
- package/dist/documentation/index.js.map +0 -1
- package/dist/documentation/llms-txt/detector.d.ts +0 -31
- package/dist/documentation/llms-txt/detector.d.ts.map +0 -1
- package/dist/documentation/llms-txt/detector.js +0 -77
- package/dist/documentation/llms-txt/detector.js.map +0 -1
- package/dist/documentation/llms-txt/downloader.d.ts +0 -30
- package/dist/documentation/llms-txt/downloader.d.ts.map +0 -1
- package/dist/documentation/llms-txt/downloader.js +0 -84
- package/dist/documentation/llms-txt/downloader.js.map +0 -1
- package/dist/documentation/llms-txt/index.d.ts +0 -4
- package/dist/documentation/llms-txt/index.d.ts.map +0 -1
- package/dist/documentation/llms-txt/index.js +0 -4
- package/dist/documentation/llms-txt/index.js.map +0 -1
- package/dist/documentation/llms-txt/parser.d.ts +0 -43
- package/dist/documentation/llms-txt/parser.d.ts.map +0 -1
- package/dist/documentation/llms-txt/parser.js +0 -177
- package/dist/documentation/llms-txt/parser.js.map +0 -1
- package/dist/documentation/normalizer.d.ts +0 -6
- package/dist/documentation/normalizer.d.ts.map +0 -1
- package/dist/documentation/normalizer.js +0 -38
- package/dist/documentation/normalizer.js.map +0 -1
- package/dist/documentation/npm-client.d.ts +0 -19
- package/dist/documentation/npm-client.d.ts.map +0 -1
- package/dist/documentation/npm-client.js +0 -182
- package/dist/documentation/npm-client.js.map +0 -1
- package/dist/documentation/skill-generator.d.ts +0 -108
- package/dist/documentation/skill-generator.d.ts.map +0 -1
- package/dist/documentation/skill-generator.js +0 -642
- package/dist/documentation/skill-generator.js.map +0 -1
- package/dist/documentation/web-doc-browser.d.ts +0 -67
- package/dist/documentation/web-doc-browser.d.ts.map +0 -1
- package/dist/documentation/web-doc-browser.js +0 -555
- package/dist/documentation/web-doc-browser.js.map +0 -1
- package/dist/errors/api-validator.d.ts +0 -9
- package/dist/errors/api-validator.d.ts.map +0 -1
- package/dist/errors/api-validator.js +0 -57
- package/dist/errors/api-validator.js.map +0 -1
- package/dist/errors/contextual-analysis.d.ts +0 -14
- package/dist/errors/contextual-analysis.d.ts.map +0 -1
- package/dist/errors/contextual-analysis.js +0 -173
- package/dist/errors/contextual-analysis.js.map +0 -1
- package/dist/errors/cross-file-analyzer.d.ts +0 -16
- package/dist/errors/cross-file-analyzer.d.ts.map +0 -1
- package/dist/errors/cross-file-analyzer.js +0 -172
- package/dist/errors/cross-file-analyzer.js.map +0 -1
- package/dist/errors/eslint-integration.d.ts +0 -9
- package/dist/errors/eslint-integration.d.ts.map +0 -1
- package/dist/errors/eslint-integration.js +0 -131
- package/dist/errors/eslint-integration.js.map +0 -1
- package/dist/errors/framework-detector.d.ts +0 -10
- package/dist/errors/framework-detector.d.ts.map +0 -1
- package/dist/errors/framework-detector.js +0 -126
- package/dist/errors/framework-detector.js.map +0 -1
- package/dist/errors/index.d.ts +0 -18
- package/dist/errors/index.d.ts.map +0 -1
- package/dist/errors/index.js +0 -134
- package/dist/errors/index.js.map +0 -1
- package/dist/errors/pattern-matcher.d.ts +0 -25
- package/dist/errors/pattern-matcher.d.ts.map +0 -1
- package/dist/errors/pattern-matcher.js +0 -44
- package/dist/errors/pattern-matcher.js.map +0 -1
- package/dist/errors/patterns.d.ts +0 -11
- package/dist/errors/patterns.d.ts.map +0 -1
- package/dist/errors/patterns.js +0 -351
- package/dist/errors/patterns.js.map +0 -1
- package/dist/errors/performance-detector.d.ts +0 -11
- package/dist/errors/performance-detector.d.ts.map +0 -1
- package/dist/errors/performance-detector.js +0 -119
- package/dist/errors/performance-detector.js.map +0 -1
- package/dist/errors/runtime-detector.d.ts +0 -7
- package/dist/errors/runtime-detector.d.ts.map +0 -1
- package/dist/errors/runtime-detector.js +0 -86
- package/dist/errors/runtime-detector.js.map +0 -1
- package/dist/errors/security-detector.d.ts +0 -6
- package/dist/errors/security-detector.d.ts.map +0 -1
- package/dist/errors/security-detector.js +0 -75
- package/dist/errors/security-detector.js.map +0 -1
- package/dist/errors/typescript-integration.d.ts +0 -6
- package/dist/errors/typescript-integration.d.ts.map +0 -1
- package/dist/errors/typescript-integration.js +0 -46
- package/dist/errors/typescript-integration.js.map +0 -1
- package/dist/server/mcp-server.d.ts +0 -14
- package/dist/server/mcp-server.d.ts.map +0 -1
- package/dist/server/mcp-server.js +0 -776
- package/dist/server/mcp-server.js.map +0 -1
- package/dist/types/documentation.d.ts +0 -26
- package/dist/types/documentation.d.ts.map +0 -1
- package/dist/types/documentation.js +0 -2
- package/dist/types/documentation.js.map +0 -1
- package/dist/utils/config.d.ts +0 -21
- package/dist/utils/config.d.ts.map +0 -1
- package/dist/utils/config.js +0 -34
- package/dist/utils/config.js.map +0 -1
- package/dist/utils/http-client.d.ts +0 -17
- package/dist/utils/http-client.d.ts.map +0 -1
- package/dist/utils/http-client.js +0 -62
- package/dist/utils/http-client.js.map +0 -1
- package/dist/utils/logger.d.ts +0 -36
- package/dist/utils/logger.d.ts.map +0 -1
- package/dist/utils/logger.js +0 -128
- package/dist/utils/logger.js.map +0 -1
- package/dist/utils/rate-limiter.d.ts +0 -9
- package/dist/utils/rate-limiter.d.ts.map +0 -1
- package/dist/utils/rate-limiter.js +0 -26
- package/dist/utils/rate-limiter.js.map +0 -1
- package/dist/validation/auto-fix.d.ts +0 -15
- package/dist/validation/auto-fix.d.ts.map +0 -1
- package/dist/validation/auto-fix.js +0 -49
- package/dist/validation/auto-fix.js.map +0 -1
- package/dist/validation/index.d.ts +0 -21
- package/dist/validation/index.d.ts.map +0 -1
- package/dist/validation/index.js +0 -45
- package/dist/validation/index.js.map +0 -1
- package/dist/validation/resolution-db.d.ts +0 -15
- package/dist/validation/resolution-db.d.ts.map +0 -1
- package/dist/validation/resolution-db.js +0 -62
- package/dist/validation/resolution-db.js.map +0 -1
|
@@ -1,1162 +0,0 @@
|
|
|
1
|
-
import { HttpClient } from '../utils/http-client.js';
|
|
2
|
-
import { logger } from '../utils/logger.js';
|
|
3
|
-
import { WebDocumentationBrowser } from './web-doc-browser.js';
|
|
4
|
-
import { LlmsTxtDetector, LlmsTxtDownloader, LlmsTxtParser } from './llms-txt/index.js';
|
|
5
|
-
import { CheckpointManager } from './checkpoint-manager.js';
|
|
6
|
-
import { join } from 'path';
|
|
7
|
-
import { tmpdir } from 'os';
|
|
8
|
-
export class DocumentationCrawler {
|
|
9
|
-
browser;
|
|
10
|
-
visitedUrls;
|
|
11
|
-
urlQueue;
|
|
12
|
-
crawledPages;
|
|
13
|
-
errors;
|
|
14
|
-
options;
|
|
15
|
-
baseUrl;
|
|
16
|
-
linkDiscoveryStats;
|
|
17
|
-
checkpointManager;
|
|
18
|
-
pagesSinceLastCheckpoint;
|
|
19
|
-
DOCUMENTATION_PATTERNS = [
|
|
20
|
-
/\/docs?\//i,
|
|
21
|
-
/\/documentation/i,
|
|
22
|
-
/\/guide/i,
|
|
23
|
-
/\/tutorial/i,
|
|
24
|
-
/\/api/i,
|
|
25
|
-
/\/reference/i,
|
|
26
|
-
/\/manual/i,
|
|
27
|
-
/\/help/i,
|
|
28
|
-
/\/about/i,
|
|
29
|
-
/\/getting-started/i,
|
|
30
|
-
];
|
|
31
|
-
EXCLUDED_PATTERNS = [
|
|
32
|
-
/\/login/i,
|
|
33
|
-
/\/signup/i,
|
|
34
|
-
/\/register/i,
|
|
35
|
-
/\/checkout/i,
|
|
36
|
-
/\/cart/i,
|
|
37
|
-
/\/payment/i,
|
|
38
|
-
/\/home$/i,
|
|
39
|
-
// Don't exclude root path - it might be documentation
|
|
40
|
-
// /^\/$/,
|
|
41
|
-
];
|
|
42
|
-
constructor(httpClient) {
|
|
43
|
-
this.browser = new WebDocumentationBrowser(httpClient);
|
|
44
|
-
this.visitedUrls = new Set();
|
|
45
|
-
this.urlQueue = [];
|
|
46
|
-
this.crawledPages = [];
|
|
47
|
-
this.errors = [];
|
|
48
|
-
this.options = {
|
|
49
|
-
crawlStrategy: 'bfs', // Default to breadth-first search
|
|
50
|
-
maxDepth: 3,
|
|
51
|
-
maxPages: 50,
|
|
52
|
-
includePaths: [],
|
|
53
|
-
excludePaths: [],
|
|
54
|
-
rateLimit: 500, // 500ms default delay
|
|
55
|
-
maxRetries: 2, // Default 2 retries
|
|
56
|
-
retryDelay: 1000, // Default 1 second delay before retry
|
|
57
|
-
useBrowserAutomation: false, // Default to HTTP-only for backward compatibility
|
|
58
|
-
skipLlmsTxt: false, // Enable llms.txt detection by default
|
|
59
|
-
workers: 1, // Default to single-threaded crawling
|
|
60
|
-
};
|
|
61
|
-
this.baseUrl = new URL('https://example.com');
|
|
62
|
-
this.linkDiscoveryStats = {
|
|
63
|
-
totalLinksFound: 0,
|
|
64
|
-
linksFiltered: {
|
|
65
|
-
notContent: 0,
|
|
66
|
-
externalDomain: 0,
|
|
67
|
-
alreadyVisited: 0,
|
|
68
|
-
excludedPattern: 0,
|
|
69
|
-
depthLimit: 0,
|
|
70
|
-
},
|
|
71
|
-
linksQueued: 0,
|
|
72
|
-
pagesDiscovered: 0,
|
|
73
|
-
pagesCrawled: 0,
|
|
74
|
-
};
|
|
75
|
-
this.pagesSinceLastCheckpoint = 0;
|
|
76
|
-
}
|
|
77
|
-
/**
|
|
78
|
-
* Crawl documentation starting from a root URL
|
|
79
|
-
* Uses HTTP client (axios) exclusively - no browser automation
|
|
80
|
-
* For SPA sites that require JavaScript rendering, use Cursor/Claude's built-in browser tools
|
|
81
|
-
* Supports both BFS (breadth-first) and DFS (depth-first) crawl strategies
|
|
82
|
-
*/
|
|
83
|
-
async crawl(rootUrl, options = {}) {
|
|
84
|
-
const strategy = options.crawlStrategy || 'bfs';
|
|
85
|
-
logger.info('Starting documentation crawl using HTTP client (axios)', {
|
|
86
|
-
url: rootUrl,
|
|
87
|
-
strategy,
|
|
88
|
-
method: 'HTTP GET',
|
|
89
|
-
client: 'axios/HttpClient',
|
|
90
|
-
note: 'For SPA sites, use Cursor/Claude browser tools to get rendered content first',
|
|
91
|
-
});
|
|
92
|
-
// Reset state
|
|
93
|
-
this.visitedUrls.clear();
|
|
94
|
-
this.urlQueue = [];
|
|
95
|
-
this.crawledPages = [];
|
|
96
|
-
this.errors = [];
|
|
97
|
-
this.linkDiscoveryStats = {
|
|
98
|
-
totalLinksFound: 0,
|
|
99
|
-
linksFiltered: {
|
|
100
|
-
notContent: 0,
|
|
101
|
-
externalDomain: 0,
|
|
102
|
-
alreadyVisited: 0,
|
|
103
|
-
excludedPattern: 0,
|
|
104
|
-
depthLimit: 0,
|
|
105
|
-
},
|
|
106
|
-
linksQueued: 0,
|
|
107
|
-
pagesDiscovered: 0,
|
|
108
|
-
pagesCrawled: 0,
|
|
109
|
-
};
|
|
110
|
-
// Merge options
|
|
111
|
-
this.options = {
|
|
112
|
-
...this.options,
|
|
113
|
-
...options,
|
|
114
|
-
};
|
|
115
|
-
// Parse and validate root URL
|
|
116
|
-
try {
|
|
117
|
-
this.baseUrl = new URL(rootUrl);
|
|
118
|
-
}
|
|
119
|
-
catch (error) {
|
|
120
|
-
throw new Error(`Invalid root URL: ${rootUrl}`);
|
|
121
|
-
}
|
|
122
|
-
// No longer require documentation-only pages - allow any website with extractable content
|
|
123
|
-
logger.debug('Starting crawl from URL (permissive mode)', { url: rootUrl });
|
|
124
|
-
// Setup checkpoint manager if enabled
|
|
125
|
-
if (this.options.checkpoint?.enabled) {
|
|
126
|
-
const checkpointFile = this.options.checkpoint.file ||
|
|
127
|
-
join(tmpdir(), `safe-coder-checkpoint-${this.sanitizeFilename(rootUrl)}.json`);
|
|
128
|
-
this.checkpointManager = new CheckpointManager(checkpointFile);
|
|
129
|
-
// Try to resume from checkpoint if requested
|
|
130
|
-
if (this.options.resume) {
|
|
131
|
-
const loaded = await this.loadCheckpoint();
|
|
132
|
-
if (loaded) {
|
|
133
|
-
logger.info('Resumed from checkpoint', {
|
|
134
|
-
pagesCrawled: this.crawledPages.length,
|
|
135
|
-
pendingUrls: this.urlQueue.length,
|
|
136
|
-
visitedUrls: this.visitedUrls.size,
|
|
137
|
-
});
|
|
138
|
-
}
|
|
139
|
-
}
|
|
140
|
-
}
|
|
141
|
-
// Try to detect and use llms.txt if available (unless explicitly disabled)
|
|
142
|
-
if (!this.options.skipLlmsTxt) {
|
|
143
|
-
await this.tryLlmsTxt(rootUrl);
|
|
144
|
-
}
|
|
145
|
-
// Detect SPA and provide warning
|
|
146
|
-
try {
|
|
147
|
-
const spaDetection = await this.browser.detectSPA(rootUrl);
|
|
148
|
-
if (spaDetection.isSPA && spaDetection.confidence !== 'low') {
|
|
149
|
-
logger.warn('SPA detected at root URL - crawling may be limited', {
|
|
150
|
-
url: rootUrl,
|
|
151
|
-
confidence: spaDetection.confidence,
|
|
152
|
-
indicators: spaDetection.indicators,
|
|
153
|
-
suggestion: spaDetection.suggestion,
|
|
154
|
-
});
|
|
155
|
-
// Add warning to first page if SPA detected
|
|
156
|
-
if (spaDetection.suggestion) {
|
|
157
|
-
logger.info('SPA Detection Warning', {
|
|
158
|
-
message: spaDetection.suggestion,
|
|
159
|
-
recommendation: 'Consider using browser automation tools to get fully rendered content before crawling.',
|
|
160
|
-
});
|
|
161
|
-
}
|
|
162
|
-
}
|
|
163
|
-
}
|
|
164
|
-
catch (error) {
|
|
165
|
-
// SPA detection failure is not critical, continue crawling
|
|
166
|
-
logger.debug('SPA detection failed, continuing with crawl', {
|
|
167
|
-
url: rootUrl,
|
|
168
|
-
error: error instanceof Error ? error.message : String(error),
|
|
169
|
-
});
|
|
170
|
-
}
|
|
171
|
-
// Start crawling from root
|
|
172
|
-
this.urlQueue.push({ url: rootUrl, depth: 0 });
|
|
173
|
-
let maxDepthReached = 0;
|
|
174
|
-
// Process queue - use parallel workers if specified
|
|
175
|
-
const startTime = Date.now();
|
|
176
|
-
const workerCount = this.options.workers || 1;
|
|
177
|
-
if (workerCount > 1) {
|
|
178
|
-
logger.info('Using parallel crawling', { workers: workerCount });
|
|
179
|
-
maxDepthReached = await this.crawlWithWorkers(startTime);
|
|
180
|
-
}
|
|
181
|
-
else {
|
|
182
|
-
maxDepthReached = await this.crawlSequential(startTime);
|
|
183
|
-
}
|
|
184
|
-
// Update final statistics
|
|
185
|
-
this.linkDiscoveryStats.pagesDiscovered = this.visitedUrls.size;
|
|
186
|
-
// Calculate final statistics
|
|
187
|
-
const totalTime = ((Date.now() - startTime) / 1000).toFixed(2);
|
|
188
|
-
const avgTimePerPage = this.crawledPages.length > 0
|
|
189
|
-
? ((Date.now() - startTime) / this.crawledPages.length / 1000).toFixed(2)
|
|
190
|
-
: '0';
|
|
191
|
-
const successRate = this.linkDiscoveryStats.pagesDiscovered > 0
|
|
192
|
-
? ((this.crawledPages.length / this.linkDiscoveryStats.pagesDiscovered) * 100).toFixed(1)
|
|
193
|
-
: '0';
|
|
194
|
-
// Log crawl completion with comprehensive statistics
|
|
195
|
-
logger.info('Documentation crawl completed using HTTP client (axios)', {
|
|
196
|
-
totalPages: this.crawledPages.length,
|
|
197
|
-
maxDepthReached,
|
|
198
|
-
errors: this.errors.length,
|
|
199
|
-
totalTimeSeconds: totalTime,
|
|
200
|
-
avgTimePerPageSeconds: avgTimePerPage,
|
|
201
|
-
successRate: `${successRate}%`,
|
|
202
|
-
method: 'HTTP GET',
|
|
203
|
-
client: 'axios/HttpClient',
|
|
204
|
-
linkStats: {
|
|
205
|
-
totalLinksFound: this.linkDiscoveryStats.totalLinksFound,
|
|
206
|
-
linksQueued: this.linkDiscoveryStats.linksQueued,
|
|
207
|
-
linksFiltered: this.linkDiscoveryStats.linksFiltered,
|
|
208
|
-
pagesDiscovered: this.linkDiscoveryStats.pagesDiscovered,
|
|
209
|
-
pagesCrawled: this.linkDiscoveryStats.pagesCrawled,
|
|
210
|
-
},
|
|
211
|
-
errorBreakdown: this.getErrorBreakdown(),
|
|
212
|
-
});
|
|
213
|
-
// Validate if content is sufficient for skill generation
|
|
214
|
-
const validation = this.canGenerateSkill(this.crawledPages);
|
|
215
|
-
const abandoned = !validation.canGenerate;
|
|
216
|
-
const abandonReason = validation.reason;
|
|
217
|
-
if (abandoned) {
|
|
218
|
-
logger.warn('Crawl completed but content is insufficient for skill generation', {
|
|
219
|
-
reason: abandonReason,
|
|
220
|
-
pagesCrawled: this.crawledPages.length,
|
|
221
|
-
suggestion: 'Consider crawling more pages or a different website',
|
|
222
|
-
});
|
|
223
|
-
}
|
|
224
|
-
// Clear checkpoint after successful completion
|
|
225
|
-
if (this.checkpointManager && !abandoned) {
|
|
226
|
-
await this.clearCheckpoint();
|
|
227
|
-
}
|
|
228
|
-
return {
|
|
229
|
-
pages: this.crawledPages,
|
|
230
|
-
totalPages: this.crawledPages.length,
|
|
231
|
-
maxDepthReached,
|
|
232
|
-
errors: this.errors,
|
|
233
|
-
linkDiscoveryStats: this.linkDiscoveryStats,
|
|
234
|
-
abandoned,
|
|
235
|
-
abandonReason,
|
|
236
|
-
};
|
|
237
|
-
}
|
|
238
|
-
/**
|
|
239
|
-
* Sequential crawling (single-threaded)
|
|
240
|
-
*/
|
|
241
|
-
async crawlSequential(startTime) {
|
|
242
|
-
let maxDepthReached = 0;
|
|
243
|
-
let lastProgressLog = Date.now();
|
|
244
|
-
const PROGRESS_LOG_INTERVAL = 5000; // Log progress every 5 seconds
|
|
245
|
-
while (this.urlQueue.length > 0 && this.crawledPages.length < this.options.maxPages) {
|
|
246
|
-
// Use different strategies for getting next URL
|
|
247
|
-
// BFS: shift() - take from front (queue behavior)
|
|
248
|
-
// DFS: pop() - take from back (stack behavior)
|
|
249
|
-
const queued = this.options.crawlStrategy === 'dfs' ? this.urlQueue.pop() : this.urlQueue.shift();
|
|
250
|
-
if (!queued)
|
|
251
|
-
break;
|
|
252
|
-
const { url, depth } = queued;
|
|
253
|
-
// Log progress periodically
|
|
254
|
-
const now = Date.now();
|
|
255
|
-
if (now - lastProgressLog >= PROGRESS_LOG_INTERVAL) {
|
|
256
|
-
const elapsed = ((now - startTime) / 1000).toFixed(1);
|
|
257
|
-
const pagesPerSecond = (this.crawledPages.length / elapsed).toFixed(2);
|
|
258
|
-
logger.info('Crawl progress', {
|
|
259
|
-
pagesCrawled: this.crawledPages.length,
|
|
260
|
-
pagesRemaining: this.urlQueue.length,
|
|
261
|
-
maxPages: this.options.maxPages,
|
|
262
|
-
errors: this.errors.length,
|
|
263
|
-
elapsedSeconds: elapsed,
|
|
264
|
-
pagesPerSecond,
|
|
265
|
-
currentDepth: depth,
|
|
266
|
-
maxDepth: this.options.maxDepth,
|
|
267
|
-
});
|
|
268
|
-
lastProgressLog = now;
|
|
269
|
-
}
|
|
270
|
-
// Skip if already visited
|
|
271
|
-
if (this.visitedUrls.has(url)) {
|
|
272
|
-
continue;
|
|
273
|
-
}
|
|
274
|
-
// Check depth limit
|
|
275
|
-
if (depth > this.options.maxDepth) {
|
|
276
|
-
continue;
|
|
277
|
-
}
|
|
278
|
-
// Mark as visited
|
|
279
|
-
this.visitedUrls.add(url);
|
|
280
|
-
maxDepthReached = Math.max(maxDepthReached, depth);
|
|
281
|
-
await this.processPage(url, depth);
|
|
282
|
-
// Rate limiting
|
|
283
|
-
if (this.options.rateLimit > 0 && this.urlQueue.length > 0) {
|
|
284
|
-
await this.delay(this.options.rateLimit);
|
|
285
|
-
}
|
|
286
|
-
}
|
|
287
|
-
return maxDepthReached;
|
|
288
|
-
}
|
|
289
|
-
/**
|
|
290
|
-
* Parallel crawling with multiple workers
|
|
291
|
-
*/
|
|
292
|
-
async crawlWithWorkers(startTime) {
|
|
293
|
-
let maxDepthReached = 0;
|
|
294
|
-
let lastProgressLog = Date.now();
|
|
295
|
-
const PROGRESS_LOG_INTERVAL = 5000;
|
|
296
|
-
const workerCount = this.options.workers || 1;
|
|
297
|
-
while (this.urlQueue.length > 0 && this.crawledPages.length < this.options.maxPages) {
|
|
298
|
-
// Log progress periodically
|
|
299
|
-
const now = Date.now();
|
|
300
|
-
if (now - lastProgressLog >= PROGRESS_LOG_INTERVAL) {
|
|
301
|
-
const elapsed = ((now - startTime) / 1000).toFixed(1);
|
|
302
|
-
const pagesPerSecond = (this.crawledPages.length / elapsed).toFixed(2);
|
|
303
|
-
logger.info('Crawl progress (parallel)', {
|
|
304
|
-
pagesCrawled: this.crawledPages.length,
|
|
305
|
-
pagesRemaining: this.urlQueue.length,
|
|
306
|
-
maxPages: this.options.maxPages,
|
|
307
|
-
errors: this.errors.length,
|
|
308
|
-
elapsedSeconds: elapsed,
|
|
309
|
-
pagesPerSecond,
|
|
310
|
-
workers: workerCount,
|
|
311
|
-
});
|
|
312
|
-
lastProgressLog = now;
|
|
313
|
-
}
|
|
314
|
-
// Get batch of URLs to process in parallel
|
|
315
|
-
const batch = [];
|
|
316
|
-
const batchSize = Math.min(workerCount, this.urlQueue.length, this.options.maxPages - this.crawledPages.length);
|
|
317
|
-
for (let i = 0; i < batchSize; i++) {
|
|
318
|
-
const queued = this.options.crawlStrategy === 'dfs' ? this.urlQueue.pop() : this.urlQueue.shift();
|
|
319
|
-
if (!queued)
|
|
320
|
-
break;
|
|
321
|
-
// Skip if already visited
|
|
322
|
-
if (this.visitedUrls.has(queued.url)) {
|
|
323
|
-
continue;
|
|
324
|
-
}
|
|
325
|
-
// Check depth limit
|
|
326
|
-
if (queued.depth > this.options.maxDepth) {
|
|
327
|
-
continue;
|
|
328
|
-
}
|
|
329
|
-
// Mark as visited
|
|
330
|
-
this.visitedUrls.add(queued.url);
|
|
331
|
-
maxDepthReached = Math.max(maxDepthReached, queued.depth);
|
|
332
|
-
batch.push(queued);
|
|
333
|
-
}
|
|
334
|
-
if (batch.length === 0) {
|
|
335
|
-
break;
|
|
336
|
-
}
|
|
337
|
-
// Process batch in parallel
|
|
338
|
-
await Promise.all(batch.map(async (queued) => {
|
|
339
|
-
await this.processPage(queued.url, queued.depth);
|
|
340
|
-
// Rate limiting (per worker)
|
|
341
|
-
if (this.options.rateLimit > 0) {
|
|
342
|
-
await this.delay(this.options.rateLimit);
|
|
343
|
-
}
|
|
344
|
-
}));
|
|
345
|
-
}
|
|
346
|
-
return maxDepthReached;
|
|
347
|
-
}
|
|
348
|
-
/**
|
|
349
|
-
* Process a single page (shared by both sequential and parallel crawling)
|
|
350
|
-
*/
|
|
351
|
-
async processPage(url, depth) {
|
|
352
|
-
try {
|
|
353
|
-
// Crawl the page using HTTP GET with retry logic
|
|
354
|
-
logger.debug('Fetching page via HTTP GET', { url, depth, method: 'HTTP GET', client: 'axios' });
|
|
355
|
-
const page = await this.fetchPageWithRetry(url);
|
|
356
|
-
// Check if page has minimal content (possible SPA issue)
|
|
357
|
-
const contentLength = page.content.length;
|
|
358
|
-
const linksCount = page.navigationLinks.length;
|
|
359
|
-
if (contentLength < 200 && linksCount < 3) {
|
|
360
|
-
logger.warn('Page has minimal content - may be SPA', {
|
|
361
|
-
url,
|
|
362
|
-
contentLength,
|
|
363
|
-
linksCount,
|
|
364
|
-
suggestion: 'This page may require JavaScript rendering. Consider using browser automation tools.',
|
|
365
|
-
});
|
|
366
|
-
}
|
|
367
|
-
// Convert to CrawledPage format
|
|
368
|
-
const crawledPage = {
|
|
369
|
-
url: page.url,
|
|
370
|
-
title: page.title,
|
|
371
|
-
content: page.content,
|
|
372
|
-
depth,
|
|
373
|
-
sections: page.sections,
|
|
374
|
-
navigationLinks: page.navigationLinks,
|
|
375
|
-
headings: page.headings,
|
|
376
|
-
codeSamples: page.codeSamples,
|
|
377
|
-
};
|
|
378
|
-
this.crawledPages.push(crawledPage);
|
|
379
|
-
this.linkDiscoveryStats.pagesCrawled++;
|
|
380
|
-
this.pagesSinceLastCheckpoint++;
|
|
381
|
-
// Save checkpoint if interval reached
|
|
382
|
-
if (this.checkpointManager && this.options.checkpoint?.enabled) {
|
|
383
|
-
const interval = this.options.checkpoint.interval || 10;
|
|
384
|
-
if (this.pagesSinceLastCheckpoint >= interval) {
|
|
385
|
-
await this.saveCheckpoint();
|
|
386
|
-
this.pagesSinceLastCheckpoint = 0;
|
|
387
|
-
}
|
|
388
|
-
}
|
|
389
|
-
const totalLinksOnPage = page.navigationLinks.length;
|
|
390
|
-
this.linkDiscoveryStats.totalLinksFound += totalLinksOnPage;
|
|
391
|
-
logger.debug('Page fetched and parsed successfully', {
|
|
392
|
-
url,
|
|
393
|
-
title: page.title.substring(0, 50),
|
|
394
|
-
linksFound: totalLinksOnPage,
|
|
395
|
-
depth,
|
|
396
|
-
});
|
|
397
|
-
// Discover and queue new URLs
|
|
398
|
-
if (depth < this.options.maxDepth) {
|
|
399
|
-
const discoveryResult = this.discoverDocumentationLinks(page, depth + 1);
|
|
400
|
-
const newUrls = discoveryResult.discovered;
|
|
401
|
-
logger.debug('Link discovery completed', {
|
|
402
|
-
url,
|
|
403
|
-
totalLinksOnPage,
|
|
404
|
-
discovered: newUrls.length,
|
|
405
|
-
filtered: discoveryResult.filtered,
|
|
406
|
-
});
|
|
407
|
-
let queuedCount = 0;
|
|
408
|
-
let skippedAlreadyVisited = 0;
|
|
409
|
-
for (const newUrl of newUrls) {
|
|
410
|
-
if (!this.visitedUrls.has(newUrl.url)) {
|
|
411
|
-
// Also check if it's already in the queue to avoid duplicates
|
|
412
|
-
const alreadyInQueue = this.urlQueue.some(q => q.url === newUrl.url);
|
|
413
|
-
if (!alreadyInQueue) {
|
|
414
|
-
this.urlQueue.push(newUrl);
|
|
415
|
-
this.linkDiscoveryStats.linksQueued++;
|
|
416
|
-
queuedCount++;
|
|
417
|
-
}
|
|
418
|
-
else {
|
|
419
|
-
skippedAlreadyVisited++;
|
|
420
|
-
}
|
|
421
|
-
}
|
|
422
|
-
else {
|
|
423
|
-
skippedAlreadyVisited++;
|
|
424
|
-
}
|
|
425
|
-
}
|
|
426
|
-
logger.debug('Links queued', {
|
|
427
|
-
url,
|
|
428
|
-
queued: queuedCount,
|
|
429
|
-
skippedAlreadyVisited,
|
|
430
|
-
queueLengthAfter: this.urlQueue.length,
|
|
431
|
-
});
|
|
432
|
-
}
|
|
433
|
-
else {
|
|
434
|
-
this.linkDiscoveryStats.linksFiltered.depthLimit += totalLinksOnPage;
|
|
435
|
-
}
|
|
436
|
-
}
|
|
437
|
-
catch (error) {
|
|
438
|
-
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
439
|
-
const errorType = this.classifyError(error);
|
|
440
|
-
this.errors.push({
|
|
441
|
-
url,
|
|
442
|
-
error: `${errorType}: ${errorMessage}`,
|
|
443
|
-
});
|
|
444
|
-
logger.warn('Page crawl failed', {
|
|
445
|
-
url,
|
|
446
|
-
error: errorMessage,
|
|
447
|
-
errorType,
|
|
448
|
-
depth,
|
|
449
|
-
willContinue: true,
|
|
450
|
-
});
|
|
451
|
-
}
|
|
452
|
-
}
|
|
453
|
-
/**
|
|
454
|
-
* Discover documentation links from a crawled page
|
|
455
|
-
*/
|
|
456
|
-
discoverDocumentationLinks(page, nextDepth) {
|
|
457
|
-
const discovered = [];
|
|
458
|
-
const filtered = {
|
|
459
|
-
notContent: 0, // Renamed from notDocumentation
|
|
460
|
-
externalDomain: 0,
|
|
461
|
-
alreadyVisited: 0,
|
|
462
|
-
excludedPattern: 0,
|
|
463
|
-
};
|
|
464
|
-
const linkDetails = [];
|
|
465
|
-
for (const link of page.navigationLinks) {
|
|
466
|
-
// Only follow internal links
|
|
467
|
-
if (!link.isInternal) {
|
|
468
|
-
filtered.externalDomain++;
|
|
469
|
-
this.linkDiscoveryStats.linksFiltered.externalDomain++;
|
|
470
|
-
linkDetails.push({ url: link.url, reason: 'not_internal' });
|
|
471
|
-
continue;
|
|
472
|
-
}
|
|
473
|
-
try {
|
|
474
|
-
const linkUrl = new URL(link.url);
|
|
475
|
-
// Must be same origin
|
|
476
|
-
if (linkUrl.origin !== this.baseUrl.origin) {
|
|
477
|
-
filtered.externalDomain++;
|
|
478
|
-
this.linkDiscoveryStats.linksFiltered.externalDomain++;
|
|
479
|
-
linkDetails.push({ url: link.url, reason: 'different_origin' });
|
|
480
|
-
continue;
|
|
481
|
-
}
|
|
482
|
-
// Check if already visited
|
|
483
|
-
const normalizedUrl = linkUrl.href.split('#')[0];
|
|
484
|
-
if (this.visitedUrls.has(normalizedUrl)) {
|
|
485
|
-
filtered.alreadyVisited++;
|
|
486
|
-
this.linkDiscoveryStats.linksFiltered.alreadyVisited++;
|
|
487
|
-
linkDetails.push({ url: link.url, reason: 'already_visited' });
|
|
488
|
-
continue;
|
|
489
|
-
}
|
|
490
|
-
// Check if it's a valid content path (permissive - only exclude clearly non-content)
|
|
491
|
-
if (!this.isDocumentationPath(linkUrl.pathname)) {
|
|
492
|
-
filtered.notContent++;
|
|
493
|
-
this.linkDiscoveryStats.linksFiltered.notContent++;
|
|
494
|
-
linkDetails.push({ url: link.url, reason: 'not_content_path', pathname: linkUrl.pathname });
|
|
495
|
-
continue;
|
|
496
|
-
}
|
|
497
|
-
// Check exclude patterns
|
|
498
|
-
if (this.shouldExclude(linkUrl.pathname)) {
|
|
499
|
-
filtered.excludedPattern++;
|
|
500
|
-
this.linkDiscoveryStats.linksFiltered.excludedPattern++;
|
|
501
|
-
linkDetails.push({ url: link.url, reason: 'excluded_pattern', pathname: linkUrl.pathname });
|
|
502
|
-
continue;
|
|
503
|
-
}
|
|
504
|
-
// Check include patterns (if specified)
|
|
505
|
-
if (this.options.includePaths.length > 0) {
|
|
506
|
-
const matchesInclude = this.options.includePaths.some(pattern => linkUrl.pathname.includes(pattern));
|
|
507
|
-
if (!matchesInclude) {
|
|
508
|
-
filtered.notContent++;
|
|
509
|
-
this.linkDiscoveryStats.linksFiltered.notContent++;
|
|
510
|
-
linkDetails.push({ url: link.url, reason: 'not_in_include_paths', pathname: linkUrl.pathname });
|
|
511
|
-
continue;
|
|
512
|
-
}
|
|
513
|
-
}
|
|
514
|
-
// Check exclude patterns (if specified)
|
|
515
|
-
if (this.options.excludePaths.length > 0) {
|
|
516
|
-
const matchesExclude = this.options.excludePaths.some(pattern => linkUrl.pathname.includes(pattern));
|
|
517
|
-
if (matchesExclude) {
|
|
518
|
-
filtered.excludedPattern++;
|
|
519
|
-
this.linkDiscoveryStats.linksFiltered.excludedPattern++;
|
|
520
|
-
linkDetails.push({ url: link.url, reason: 'matches_exclude_paths', pathname: linkUrl.pathname });
|
|
521
|
-
continue;
|
|
522
|
-
}
|
|
523
|
-
}
|
|
524
|
-
// Add to discovered links
|
|
525
|
-
discovered.push({
|
|
526
|
-
url: normalizedUrl,
|
|
527
|
-
depth: nextDepth,
|
|
528
|
-
});
|
|
529
|
-
linkDetails.push({ url: link.url, reason: 'accepted' });
|
|
530
|
-
}
|
|
531
|
-
catch (error) {
|
|
532
|
-
// Invalid URL, skip
|
|
533
|
-
linkDetails.push({ url: link.url, reason: 'invalid_url', error: error instanceof Error ? error.message : String(error) });
|
|
534
|
-
continue;
|
|
535
|
-
}
|
|
536
|
-
}
|
|
537
|
-
// Log detailed filtering information
|
|
538
|
-
const filteredLinksByReason = linkDetails
|
|
539
|
-
.filter(d => d.reason !== 'accepted')
|
|
540
|
-
.reduce((acc, d) => {
|
|
541
|
-
if (!acc[d.reason])
|
|
542
|
-
acc[d.reason] = [];
|
|
543
|
-
acc[d.reason].push({ url: d.url, pathname: d.pathname, error: d.error });
|
|
544
|
-
return acc;
|
|
545
|
-
}, {});
|
|
546
|
-
logger.info('Link filtering details', {
|
|
547
|
-
totalLinks: page.navigationLinks.length,
|
|
548
|
-
discovered: discovered.length,
|
|
549
|
-
filtered: {
|
|
550
|
-
notContent: filtered.notContent,
|
|
551
|
-
externalDomain: filtered.externalDomain,
|
|
552
|
-
alreadyVisited: filtered.alreadyVisited,
|
|
553
|
-
excludedPattern: filtered.excludedPattern,
|
|
554
|
-
},
|
|
555
|
-
filteredLinksByReason,
|
|
556
|
-
sampleAcceptedLinks: linkDetails
|
|
557
|
-
.filter(d => d.reason === 'accepted')
|
|
558
|
-
.slice(0, 10)
|
|
559
|
-
.map(d => d.url),
|
|
560
|
-
});
|
|
561
|
-
return {
|
|
562
|
-
discovered,
|
|
563
|
-
filtered,
|
|
564
|
-
alreadyVisited: filtered.alreadyVisited,
|
|
565
|
-
notContent: filtered.notContent,
|
|
566
|
-
externalDomain: filtered.externalDomain,
|
|
567
|
-
excludedPattern: filtered.excludedPattern,
|
|
568
|
-
};
|
|
569
|
-
}
|
|
570
|
-
/**
|
|
571
|
-
* Check if a path should be crawled (permissive - only exclude clearly non-content paths)
|
|
572
|
-
*/
|
|
573
|
-
isDocumentationPath(pathname) {
|
|
574
|
-
// Exclude clearly non-content pages
|
|
575
|
-
if (this.shouldExclude(pathname)) {
|
|
576
|
-
return false;
|
|
577
|
-
}
|
|
578
|
-
// Exclude static resources
|
|
579
|
-
const looksLikeStaticResource = /\.(?:css|js|json|xml|png|jpg|jpeg|gif|svg|ico|woff|woff2|ttf|eot|pdf|zip|exe|dmg)$/i.test(pathname);
|
|
580
|
-
if (looksLikeStaticResource) {
|
|
581
|
-
return false;
|
|
582
|
-
}
|
|
583
|
-
// Exclude API endpoints that are clearly not content (unless they're documentation APIs)
|
|
584
|
-
// Keep API endpoints that might be documentation (e.g., /api/docs, /docs/api)
|
|
585
|
-
const looksLikeApiEndpoint = /^\/api\/[^/]+$/i.test(pathname);
|
|
586
|
-
if (looksLikeApiEndpoint && !pathname.includes('/docs') && !pathname.includes('/documentation')) {
|
|
587
|
-
return false;
|
|
588
|
-
}
|
|
589
|
-
// Allow root path
|
|
590
|
-
if (pathname === '/' || pathname === '') {
|
|
591
|
-
return true;
|
|
592
|
-
}
|
|
593
|
-
// Exclude paths with file extensions (unless they're HTML pages)
|
|
594
|
-
const hasFileExtension = /\.[a-z]{2,4}$/i.test(pathname.split('?')[0]);
|
|
595
|
-
if (hasFileExtension && !pathname.match(/\.(html?|htm)$/i)) {
|
|
596
|
-
return false;
|
|
597
|
-
}
|
|
598
|
-
// Permissive: allow any path that doesn't match exclusion patterns
|
|
599
|
-
// This allows crawling any website, not just documentation
|
|
600
|
-
return true;
|
|
601
|
-
}
|
|
602
|
-
/**
|
|
603
|
-
* Check if a path should be excluded
|
|
604
|
-
*/
|
|
605
|
-
shouldExclude(pathname) {
|
|
606
|
-
return this.EXCLUDED_PATTERNS.some(pattern => pattern.test(pathname));
|
|
607
|
-
}
|
|
608
|
-
/**
|
|
609
|
-
* Check if crawled content is sufficient for skill generation
|
|
610
|
-
* Enhanced with multi-dimensional quality metrics
|
|
611
|
-
*/
|
|
612
|
-
canGenerateSkill(pages) {
|
|
613
|
-
if (pages.length === 0) {
|
|
614
|
-
return { canGenerate: false, reason: 'empty_pages' };
|
|
615
|
-
}
|
|
616
|
-
const metrics = this.evaluateContentQuality(pages);
|
|
617
|
-
// All pages are media-only
|
|
618
|
-
if (metrics.mediaOnlyPages === pages.length && !metrics.hasTextContent) {
|
|
619
|
-
return { canGenerate: false, reason: 'media_only' };
|
|
620
|
-
}
|
|
621
|
-
// No pages have sufficient content
|
|
622
|
-
if (!metrics.hasSufficientContent) {
|
|
623
|
-
return { canGenerate: false, reason: 'insufficient_content' };
|
|
624
|
-
}
|
|
625
|
-
// No structured content (headings, sections)
|
|
626
|
-
if (!metrics.hasStructuredContent) {
|
|
627
|
-
return { canGenerate: false, reason: 'no_structured_content' };
|
|
628
|
-
}
|
|
629
|
-
return { canGenerate: true };
|
|
630
|
-
}
|
|
631
|
-
/**
|
|
632
|
-
* Evaluate content quality with multi-dimensional metrics
|
|
633
|
-
*/
|
|
634
|
-
evaluateContentQuality(pages) {
|
|
635
|
-
const MIN_CONTENT_LENGTH = 100;
|
|
636
|
-
let hasSufficientContent = false;
|
|
637
|
-
let hasStructuredContent = false;
|
|
638
|
-
let hasTextContent = false;
|
|
639
|
-
let mediaOnlyCount = 0;
|
|
640
|
-
let totalContentLength = 0;
|
|
641
|
-
let totalCodeSamples = 0;
|
|
642
|
-
// Track content diversity
|
|
643
|
-
const urlPatterns = new Set();
|
|
644
|
-
const titlePatterns = new Set();
|
|
645
|
-
for (const page of pages) {
|
|
646
|
-
const contentLength = (page.content || '').trim().length;
|
|
647
|
-
const hasHeadings = page.headings && page.headings.length > 0;
|
|
648
|
-
const hasText = contentLength > 0;
|
|
649
|
-
totalContentLength += contentLength;
|
|
650
|
-
totalCodeSamples += (page.codeSamples || []).length;
|
|
651
|
-
// Check if page is media-only
|
|
652
|
-
const hasImages = /<img[^>]*>/i.test(page.content || '');
|
|
653
|
-
const hasMedia = hasImages || (page.codeSamples && page.codeSamples.length > 0);
|
|
654
|
-
if (hasMedia && contentLength < MIN_CONTENT_LENGTH) {
|
|
655
|
-
mediaOnlyCount++;
|
|
656
|
-
}
|
|
657
|
-
if (contentLength >= MIN_CONTENT_LENGTH) {
|
|
658
|
-
hasSufficientContent = true;
|
|
659
|
-
}
|
|
660
|
-
if (hasHeadings) {
|
|
661
|
-
hasStructuredContent = true;
|
|
662
|
-
}
|
|
663
|
-
if (hasText) {
|
|
664
|
-
hasTextContent = true;
|
|
665
|
-
}
|
|
666
|
-
// Track diversity
|
|
667
|
-
try {
|
|
668
|
-
const urlPath = new URL(page.url).pathname;
|
|
669
|
-
const pathSegments = urlPath.split('/').filter(s => s);
|
|
670
|
-
if (pathSegments.length > 0) {
|
|
671
|
-
urlPatterns.add(pathSegments[0]);
|
|
672
|
-
}
|
|
673
|
-
}
|
|
674
|
-
catch {
|
|
675
|
-
// Invalid URL, skip
|
|
676
|
-
}
|
|
677
|
-
// Track title diversity
|
|
678
|
-
const titleWords = page.title.toLowerCase().split(/\s+/).slice(0, 3);
|
|
679
|
-
titlePatterns.add(titleWords.join(' '));
|
|
680
|
-
}
|
|
681
|
-
// Calculate diversity score (0-1)
|
|
682
|
-
const contentDiversity = Math.min(1, (urlPatterns.size + titlePatterns.size) / (pages.length * 0.5));
|
|
683
|
-
// Calculate API coverage score (0-1)
|
|
684
|
-
const pagesWithCode = pages.filter(p => p.codeSamples && p.codeSamples.length > 0).length;
|
|
685
|
-
const apiCoverage = pages.length > 0 ? pagesWithCode / pages.length : 0;
|
|
686
|
-
const avgContentLength = pages.length > 0 ? totalContentLength / pages.length : 0;
|
|
687
|
-
return {
|
|
688
|
-
hasSufficientContent,
|
|
689
|
-
hasStructuredContent,
|
|
690
|
-
hasTextContent,
|
|
691
|
-
mediaOnlyPages: mediaOnlyCount,
|
|
692
|
-
contentDiversity,
|
|
693
|
-
apiCoverage,
|
|
694
|
-
avgContentLength,
|
|
695
|
-
totalCodeSamples,
|
|
696
|
-
};
|
|
697
|
-
}
|
|
698
|
-
/**
|
|
699
|
-
* Check if should continue crawling based on content quality
|
|
700
|
-
*/
|
|
701
|
-
shouldContinueCrawling(currentPages, maxPages) {
|
|
702
|
-
if (currentPages >= maxPages) {
|
|
703
|
-
return false;
|
|
704
|
-
}
|
|
705
|
-
// Evaluate quality every 10 pages
|
|
706
|
-
if (currentPages % 10 === 0 && currentPages > 0) {
|
|
707
|
-
const metrics = this.evaluateContentQuality(this.crawledPages);
|
|
708
|
-
// High quality content - can stop early if we have enough
|
|
709
|
-
if (metrics.hasSufficientContent &&
|
|
710
|
-
metrics.contentDiversity > 0.7 &&
|
|
711
|
-
metrics.apiCoverage > 0.5 &&
|
|
712
|
-
currentPages >= maxPages * 0.5) {
|
|
713
|
-
logger.info('High quality content detected, considering early stop', {
|
|
714
|
-
currentPages,
|
|
715
|
-
maxPages,
|
|
716
|
-
diversity: metrics.contentDiversity.toFixed(2),
|
|
717
|
-
apiCoverage: metrics.apiCoverage.toFixed(2),
|
|
718
|
-
});
|
|
719
|
-
// Continue but log the possibility
|
|
720
|
-
}
|
|
721
|
-
// Low quality warning
|
|
722
|
-
if (currentPages >= maxPages * 0.8 && !metrics.hasSufficientContent) {
|
|
723
|
-
logger.warn('Approaching page limit but content quality is low', {
|
|
724
|
-
currentPages,
|
|
725
|
-
maxPages,
|
|
726
|
-
diversity: metrics.contentDiversity.toFixed(2),
|
|
727
|
-
apiCoverage: metrics.apiCoverage.toFixed(2),
|
|
728
|
-
suggestion: 'Consider increasing maxPages or refining includePaths',
|
|
729
|
-
});
|
|
730
|
-
}
|
|
731
|
-
}
|
|
732
|
-
return currentPages < maxPages;
|
|
733
|
-
}
|
|
734
|
-
/**
|
|
735
|
-
* Fetch a page with retry logic
|
|
736
|
-
* Supports both HTML pages and Markdown files
|
|
737
|
-
*/
|
|
738
|
-
async fetchPageWithRetry(url, retryCount = 0) {
|
|
739
|
-
try {
|
|
740
|
-
// Check if this is a Markdown file
|
|
741
|
-
if (url.endsWith('.md') || url.includes('.md?') || url.includes('.md#')) {
|
|
742
|
-
return await this.extractMarkdownContent(url);
|
|
743
|
-
}
|
|
744
|
-
// Regular HTML page
|
|
745
|
-
return await this.browser.browsePage(url);
|
|
746
|
-
}
|
|
747
|
-
catch (error) {
|
|
748
|
-
const errorType = this.classifyError(error);
|
|
749
|
-
const isRetryable = this.isRetryableError(error);
|
|
750
|
-
if (isRetryable && retryCount < this.options.maxRetries) {
|
|
751
|
-
const delay = this.options.retryDelay * (retryCount + 1); // Exponential backoff
|
|
752
|
-
logger.info('Retrying page fetch', {
|
|
753
|
-
url,
|
|
754
|
-
retryCount: retryCount + 1,
|
|
755
|
-
maxRetries: this.options.maxRetries,
|
|
756
|
-
delay,
|
|
757
|
-
errorType,
|
|
758
|
-
});
|
|
759
|
-
await this.delay(delay);
|
|
760
|
-
return this.fetchPageWithRetry(url, retryCount + 1);
|
|
761
|
-
}
|
|
762
|
-
// Not retryable or max retries reached
|
|
763
|
-
throw error;
|
|
764
|
-
}
|
|
765
|
-
}
|
|
766
|
-
/**
|
|
767
|
-
* Extract content from Markdown file
|
|
768
|
-
* Converts Markdown structure to WebDocumentationPage format
|
|
769
|
-
*/
|
|
770
|
-
async extractMarkdownContent(url) {
|
|
771
|
-
logger.debug('Extracting Markdown content', { url });
|
|
772
|
-
// Fetch raw markdown content
|
|
773
|
-
const httpClient = new HttpClient();
|
|
774
|
-
const response = await httpClient.get(url, {
|
|
775
|
-
responseType: 'text',
|
|
776
|
-
timeout: 30000,
|
|
777
|
-
});
|
|
778
|
-
const markdownContent = response.data;
|
|
779
|
-
// Parse markdown structure
|
|
780
|
-
const parsed = this.parseMarkdown(markdownContent, url);
|
|
781
|
-
return {
|
|
782
|
-
url,
|
|
783
|
-
title: parsed.title,
|
|
784
|
-
content: parsed.content,
|
|
785
|
-
searchableContent: parsed.content, // Add searchable content for consistency
|
|
786
|
-
sections: parsed.sections,
|
|
787
|
-
navigationLinks: parsed.links,
|
|
788
|
-
headings: parsed.headings,
|
|
789
|
-
codeSamples: parsed.codeSamples,
|
|
790
|
-
isDocumentation: true,
|
|
791
|
-
};
|
|
792
|
-
}
|
|
793
|
-
/**
|
|
794
|
-
* Parse Markdown content into structured data
|
|
795
|
-
*/
|
|
796
|
-
parseMarkdown(content, url) {
|
|
797
|
-
const lines = content.split('\n');
|
|
798
|
-
let title = '';
|
|
799
|
-
const headings = [];
|
|
800
|
-
const codeSamples = [];
|
|
801
|
-
const sections = [];
|
|
802
|
-
const links = [];
|
|
803
|
-
const contentLines = [];
|
|
804
|
-
// Extract title from first h1
|
|
805
|
-
for (const line of lines) {
|
|
806
|
-
if (line.startsWith('# ')) {
|
|
807
|
-
title = line.substring(2).trim();
|
|
808
|
-
break;
|
|
809
|
-
}
|
|
810
|
-
}
|
|
811
|
-
// Extract headings (h2-h6)
|
|
812
|
-
const headingRegex = /^(#{2,6})\s+(.+)$/;
|
|
813
|
-
for (const line of lines) {
|
|
814
|
-
const match = line.match(headingRegex);
|
|
815
|
-
if (match) {
|
|
816
|
-
const level = match[1].length;
|
|
817
|
-
const text = match[2].trim();
|
|
818
|
-
const id = text.toLowerCase().replace(/[^\w\s-]/g, '').replace(/\s+/g, '-');
|
|
819
|
-
headings.push({
|
|
820
|
-
level: `h${level}`,
|
|
821
|
-
text,
|
|
822
|
-
id,
|
|
823
|
-
});
|
|
824
|
-
}
|
|
825
|
-
}
|
|
826
|
-
// Extract code blocks
|
|
827
|
-
const codeBlockRegex = /```(\w+)?\n([\s\S]*?)```/g;
|
|
828
|
-
let match;
|
|
829
|
-
while ((match = codeBlockRegex.exec(content)) !== null) {
|
|
830
|
-
const language = match[1] || 'text';
|
|
831
|
-
const code = match[2].trim();
|
|
832
|
-
if (code.length > 10) {
|
|
833
|
-
codeSamples.push({
|
|
834
|
-
code,
|
|
835
|
-
language,
|
|
836
|
-
});
|
|
837
|
-
}
|
|
838
|
-
}
|
|
839
|
-
// Extract content (remove code blocks and headings)
|
|
840
|
-
let contentWithoutCode = content.replace(codeBlockRegex, '');
|
|
841
|
-
contentWithoutCode = contentWithoutCode.replace(/^#{1,6}\s+.+$/gm, '');
|
|
842
|
-
for (const para of contentWithoutCode.split('\n\n')) {
|
|
843
|
-
const trimmed = para.trim();
|
|
844
|
-
if (trimmed.length > 20) {
|
|
845
|
-
contentLines.push(trimmed);
|
|
846
|
-
}
|
|
847
|
-
}
|
|
848
|
-
// Extract links (markdown format)
|
|
849
|
-
const linkRegex = /\[([^\]]*)\]\(([^)]+)\)/g;
|
|
850
|
-
while ((match = linkRegex.exec(content)) !== null) {
|
|
851
|
-
const text = match[1];
|
|
852
|
-
const linkUrl = match[2].trim();
|
|
853
|
-
// Skip anchors
|
|
854
|
-
if (linkUrl.startsWith('#')) {
|
|
855
|
-
continue;
|
|
856
|
-
}
|
|
857
|
-
// Resolve relative URLs
|
|
858
|
-
let absoluteUrl;
|
|
859
|
-
try {
|
|
860
|
-
if (linkUrl.startsWith('http://') || linkUrl.startsWith('https://')) {
|
|
861
|
-
absoluteUrl = linkUrl;
|
|
862
|
-
}
|
|
863
|
-
else {
|
|
864
|
-
absoluteUrl = new URL(linkUrl, url).href;
|
|
865
|
-
}
|
|
866
|
-
// Remove fragment
|
|
867
|
-
absoluteUrl = absoluteUrl.split('#')[0];
|
|
868
|
-
// Only include .md URLs to avoid client-side rendered HTML pages
|
|
869
|
-
if (absoluteUrl.endsWith('.md') || absoluteUrl.includes('.md?')) {
|
|
870
|
-
const linkOrigin = new URL(absoluteUrl).origin;
|
|
871
|
-
const baseOrigin = this.baseUrl.origin;
|
|
872
|
-
links.push({
|
|
873
|
-
text,
|
|
874
|
-
url: absoluteUrl,
|
|
875
|
-
isInternal: linkOrigin === baseOrigin,
|
|
876
|
-
});
|
|
877
|
-
}
|
|
878
|
-
}
|
|
879
|
-
catch (error) {
|
|
880
|
-
// Invalid URL, skip
|
|
881
|
-
logger.debug('Invalid URL in markdown link', { url: linkUrl });
|
|
882
|
-
}
|
|
883
|
-
}
|
|
884
|
-
// Build sections from headings
|
|
885
|
-
let currentSection = null;
|
|
886
|
-
let currentContent = [];
|
|
887
|
-
for (const line of lines) {
|
|
888
|
-
const headerMatch = line.match(headingRegex);
|
|
889
|
-
if (headerMatch) {
|
|
890
|
-
// Save previous section
|
|
891
|
-
if (currentSection) {
|
|
892
|
-
currentSection.content = currentContent.join('\n').trim();
|
|
893
|
-
if (currentSection.content.length > 0) {
|
|
894
|
-
sections.push(currentSection);
|
|
895
|
-
}
|
|
896
|
-
}
|
|
897
|
-
// Start new section
|
|
898
|
-
const text = headerMatch[2].trim();
|
|
899
|
-
currentSection = {
|
|
900
|
-
title: text,
|
|
901
|
-
content: '',
|
|
902
|
-
anchor: text.toLowerCase().replace(/[^\w\s-]/g, '').replace(/\s+/g, '-'),
|
|
903
|
-
};
|
|
904
|
-
currentContent = [];
|
|
905
|
-
}
|
|
906
|
-
else if (currentSection) {
|
|
907
|
-
currentContent.push(line);
|
|
908
|
-
}
|
|
909
|
-
}
|
|
910
|
-
// Save last section
|
|
911
|
-
if (currentSection) {
|
|
912
|
-
currentSection.content = currentContent.join('\n').trim();
|
|
913
|
-
if (currentSection.content.length > 0) {
|
|
914
|
-
sections.push(currentSection);
|
|
915
|
-
}
|
|
916
|
-
}
|
|
917
|
-
return {
|
|
918
|
-
title: title || 'Untitled',
|
|
919
|
-
content: contentLines.join('\n\n'),
|
|
920
|
-
headings,
|
|
921
|
-
codeSamples,
|
|
922
|
-
sections,
|
|
923
|
-
links,
|
|
924
|
-
};
|
|
925
|
-
}
|
|
926
|
-
/**
|
|
927
|
-
* Classify error type for better error messages
|
|
928
|
-
*/
|
|
929
|
-
classifyError(error) {
|
|
930
|
-
if (!(error instanceof Error)) {
|
|
931
|
-
return 'UnknownError';
|
|
932
|
-
}
|
|
933
|
-
const message = error.message.toLowerCase();
|
|
934
|
-
const errorName = error.name.toLowerCase();
|
|
935
|
-
// Network errors
|
|
936
|
-
if (errorName.includes('timeout') || message.includes('timeout')) {
|
|
937
|
-
return 'TimeoutError';
|
|
938
|
-
}
|
|
939
|
-
if (errorName.includes('network') || message.includes('network') || message.includes('econnrefused')) {
|
|
940
|
-
return 'NetworkError';
|
|
941
|
-
}
|
|
942
|
-
if (message.includes('econnreset') || message.includes('socket')) {
|
|
943
|
-
return 'ConnectionError';
|
|
944
|
-
}
|
|
945
|
-
// HTTP errors
|
|
946
|
-
if (errorName.includes('http') || message.includes('status')) {
|
|
947
|
-
if (message.includes('404'))
|
|
948
|
-
return 'NotFoundError';
|
|
949
|
-
if (message.includes('403'))
|
|
950
|
-
return 'ForbiddenError';
|
|
951
|
-
if (message.includes('401'))
|
|
952
|
-
return 'UnauthorizedError';
|
|
953
|
-
if (message.includes('429'))
|
|
954
|
-
return 'RateLimitError';
|
|
955
|
-
if (message.includes('500') || message.includes('502') || message.includes('503')) {
|
|
956
|
-
return 'ServerError';
|
|
957
|
-
}
|
|
958
|
-
return 'HttpError';
|
|
959
|
-
}
|
|
960
|
-
// Content errors
|
|
961
|
-
if (message.includes('documentation') || message.includes('not appear to be')) {
|
|
962
|
-
return 'NotDocumentationError';
|
|
963
|
-
}
|
|
964
|
-
if (message.includes('spa') || message.includes('javascript')) {
|
|
965
|
-
return 'SPAError';
|
|
966
|
-
}
|
|
967
|
-
return 'UnknownError';
|
|
968
|
-
}
|
|
969
|
-
/**
|
|
970
|
-
* Check if an error is retryable
|
|
971
|
-
*/
|
|
972
|
-
isRetryableError(error) {
|
|
973
|
-
if (!(error instanceof Error)) {
|
|
974
|
-
return false;
|
|
975
|
-
}
|
|
976
|
-
const errorType = this.classifyError(error);
|
|
977
|
-
// Retryable errors
|
|
978
|
-
const retryableTypes = [
|
|
979
|
-
'TimeoutError',
|
|
980
|
-
'NetworkError',
|
|
981
|
-
'ConnectionError',
|
|
982
|
-
'RateLimitError',
|
|
983
|
-
'ServerError', // 500, 502, 503
|
|
984
|
-
];
|
|
985
|
-
return retryableTypes.includes(errorType);
|
|
986
|
-
}
|
|
987
|
-
/**
|
|
988
|
-
* Get error breakdown by type
|
|
989
|
-
*/
|
|
990
|
-
getErrorBreakdown() {
|
|
991
|
-
const breakdown = {};
|
|
992
|
-
for (const error of this.errors) {
|
|
993
|
-
const errorType = error.error.split(':')[0] || 'UnknownError';
|
|
994
|
-
breakdown[errorType] = (breakdown[errorType] || 0) + 1;
|
|
995
|
-
}
|
|
996
|
-
return breakdown;
|
|
997
|
-
}
|
|
998
|
-
/**
|
|
999
|
-
* Try to detect and use llms.txt for optimized crawling
|
|
1000
|
-
*/
|
|
1001
|
-
async tryLlmsTxt(rootUrl) {
|
|
1002
|
-
logger.info('Checking for llms.txt files', { url: rootUrl });
|
|
1003
|
-
try {
|
|
1004
|
-
const detector = new LlmsTxtDetector(rootUrl);
|
|
1005
|
-
const variants = await detector.detectAll();
|
|
1006
|
-
if (variants.length === 0) {
|
|
1007
|
-
logger.info('No llms.txt files found, proceeding with normal crawl');
|
|
1008
|
-
return;
|
|
1009
|
-
}
|
|
1010
|
-
logger.info('Found llms.txt variants', {
|
|
1011
|
-
count: variants.length,
|
|
1012
|
-
variants: variants.map(v => v.variant),
|
|
1013
|
-
});
|
|
1014
|
-
// Download all variants
|
|
1015
|
-
const downloader = new LlmsTxtDownloader();
|
|
1016
|
-
const downloaded = await downloader.downloadAll(variants);
|
|
1017
|
-
if (downloaded.length === 0) {
|
|
1018
|
-
logger.warn('Failed to download any llms.txt variants');
|
|
1019
|
-
return;
|
|
1020
|
-
}
|
|
1021
|
-
// Use the largest variant (most comprehensive)
|
|
1022
|
-
const largest = downloaded.reduce((prev, current) => current.size > prev.size ? current : prev);
|
|
1023
|
-
logger.info('Using llms.txt for URL extraction', {
|
|
1024
|
-
variant: largest.variant,
|
|
1025
|
-
size: largest.size,
|
|
1026
|
-
});
|
|
1027
|
-
// Parse URLs from llms.txt
|
|
1028
|
-
const parser = new LlmsTxtParser(largest.content, rootUrl);
|
|
1029
|
-
const extractedUrls = parser.extractUrls();
|
|
1030
|
-
if (extractedUrls.length > 0) {
|
|
1031
|
-
logger.info('Extracted URLs from llms.txt', {
|
|
1032
|
-
count: extractedUrls.length,
|
|
1033
|
-
});
|
|
1034
|
-
// Add URLs to queue with depth 0
|
|
1035
|
-
for (const url of extractedUrls) {
|
|
1036
|
-
if (this.isValidUrl(url) && !this.visitedUrls.has(url)) {
|
|
1037
|
-
this.urlQueue.push({ url, depth: 0 });
|
|
1038
|
-
}
|
|
1039
|
-
}
|
|
1040
|
-
logger.info('Added llms.txt URLs to crawl queue', {
|
|
1041
|
-
added: this.urlQueue.length,
|
|
1042
|
-
});
|
|
1043
|
-
}
|
|
1044
|
-
else {
|
|
1045
|
-
logger.info('No URLs extracted from llms.txt, using normal crawl');
|
|
1046
|
-
}
|
|
1047
|
-
}
|
|
1048
|
-
catch (error) {
|
|
1049
|
-
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
1050
|
-
logger.warn('llms.txt detection failed, continuing with normal crawl', {
|
|
1051
|
-
error: errorMessage,
|
|
1052
|
-
});
|
|
1053
|
-
// Continue with normal crawling if llms.txt fails
|
|
1054
|
-
}
|
|
1055
|
-
}
|
|
1056
|
-
/**
|
|
1057
|
-
* Check if a URL is valid for crawling
|
|
1058
|
-
*/
|
|
1059
|
-
isValidUrl(url) {
|
|
1060
|
-
try {
|
|
1061
|
-
const parsed = new URL(url);
|
|
1062
|
-
// Must be same origin as base URL
|
|
1063
|
-
if (parsed.origin !== this.baseUrl.origin) {
|
|
1064
|
-
return false;
|
|
1065
|
-
}
|
|
1066
|
-
// Must be http or https
|
|
1067
|
-
if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
|
|
1068
|
-
return false;
|
|
1069
|
-
}
|
|
1070
|
-
return true;
|
|
1071
|
-
}
|
|
1072
|
-
catch {
|
|
1073
|
-
return false;
|
|
1074
|
-
}
|
|
1075
|
-
}
|
|
1076
|
-
/**
|
|
1077
|
-
* Save checkpoint
|
|
1078
|
-
*/
|
|
1079
|
-
async saveCheckpoint() {
|
|
1080
|
-
if (!this.checkpointManager) {
|
|
1081
|
-
return;
|
|
1082
|
-
}
|
|
1083
|
-
const checkpointData = {
|
|
1084
|
-
config: this.options,
|
|
1085
|
-
visitedUrls: Array.from(this.visitedUrls),
|
|
1086
|
-
pendingUrls: this.urlQueue,
|
|
1087
|
-
pagesCrawled: this.crawledPages.length,
|
|
1088
|
-
lastUpdated: new Date().toISOString(),
|
|
1089
|
-
baseUrl: this.baseUrl.href,
|
|
1090
|
-
};
|
|
1091
|
-
try {
|
|
1092
|
-
await this.checkpointManager.saveCheckpoint(checkpointData);
|
|
1093
|
-
}
|
|
1094
|
-
catch (error) {
|
|
1095
|
-
logger.warn('Failed to save checkpoint', {
|
|
1096
|
-
error: error instanceof Error ? error.message : String(error),
|
|
1097
|
-
});
|
|
1098
|
-
}
|
|
1099
|
-
}
|
|
1100
|
-
/**
|
|
1101
|
-
* Load checkpoint and restore state
|
|
1102
|
-
*/
|
|
1103
|
-
async loadCheckpoint() {
|
|
1104
|
-
if (!this.checkpointManager) {
|
|
1105
|
-
return false;
|
|
1106
|
-
}
|
|
1107
|
-
try {
|
|
1108
|
-
const data = await this.checkpointManager.loadCheckpoint();
|
|
1109
|
-
if (!data) {
|
|
1110
|
-
logger.info('No checkpoint found to resume from');
|
|
1111
|
-
return false;
|
|
1112
|
-
}
|
|
1113
|
-
// Restore state
|
|
1114
|
-
this.visitedUrls = new Set(data.visitedUrls);
|
|
1115
|
-
this.urlQueue = data.pendingUrls;
|
|
1116
|
-
// Note: crawledPages are not restored as they will be regenerated
|
|
1117
|
-
logger.info('State restored from checkpoint', {
|
|
1118
|
-
visitedUrls: this.visitedUrls.size,
|
|
1119
|
-
pendingUrls: this.urlQueue.length,
|
|
1120
|
-
lastUpdated: data.lastUpdated,
|
|
1121
|
-
});
|
|
1122
|
-
return true;
|
|
1123
|
-
}
|
|
1124
|
-
catch (error) {
|
|
1125
|
-
logger.warn('Failed to load checkpoint', {
|
|
1126
|
-
error: error instanceof Error ? error.message : String(error),
|
|
1127
|
-
});
|
|
1128
|
-
return false;
|
|
1129
|
-
}
|
|
1130
|
-
}
|
|
1131
|
-
/**
|
|
1132
|
-
* Clear checkpoint after successful crawl
|
|
1133
|
-
*/
|
|
1134
|
-
async clearCheckpoint() {
|
|
1135
|
-
if (this.checkpointManager) {
|
|
1136
|
-
try {
|
|
1137
|
-
await this.checkpointManager.clearCheckpoint();
|
|
1138
|
-
}
|
|
1139
|
-
catch (error) {
|
|
1140
|
-
logger.debug('Failed to clear checkpoint', {
|
|
1141
|
-
error: error instanceof Error ? error.message : String(error),
|
|
1142
|
-
});
|
|
1143
|
-
}
|
|
1144
|
-
}
|
|
1145
|
-
}
|
|
1146
|
-
/**
|
|
1147
|
-
* Sanitize filename for checkpoint
|
|
1148
|
-
*/
|
|
1149
|
-
sanitizeFilename(url) {
|
|
1150
|
-
return url
|
|
1151
|
-
.replace(/[^a-z0-9]/gi, '-')
|
|
1152
|
-
.replace(/-+/g, '-')
|
|
1153
|
-
.substring(0, 64);
|
|
1154
|
-
}
|
|
1155
|
-
/**
|
|
1156
|
-
* Delay helper for rate limiting
|
|
1157
|
-
*/
|
|
1158
|
-
delay(ms) {
|
|
1159
|
-
return new Promise(resolve => setTimeout(resolve, ms));
|
|
1160
|
-
}
|
|
1161
|
-
}
|
|
1162
|
-
//# sourceMappingURL=doc-crawler.js.map
|