@gulibs/safe-coder 0.0.26 → 0.0.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192)
  1. package/README.md +678 -994
  2. package/dist/cache/cache-manager.d.ts +71 -0
  3. package/dist/cache/cache-manager.d.ts.map +1 -0
  4. package/dist/cache/cache-manager.js +244 -0
  5. package/dist/cache/cache-manager.js.map +1 -0
  6. package/dist/executor/cli-executor.d.ts +106 -0
  7. package/dist/executor/cli-executor.d.ts.map +1 -0
  8. package/dist/executor/cli-executor.js +133 -0
  9. package/dist/executor/cli-executor.js.map +1 -0
  10. package/dist/executor/dependency-checker.d.ts +23 -0
  11. package/dist/executor/dependency-checker.d.ts.map +1 -0
  12. package/dist/executor/dependency-checker.js +62 -0
  13. package/dist/executor/dependency-checker.js.map +1 -0
  14. package/dist/index.js +3 -4
  15. package/dist/index.js.map +1 -1
  16. package/dist/processor/content-processor.d.ts +76 -0
  17. package/dist/processor/content-processor.d.ts.map +1 -0
  18. package/dist/processor/content-processor.js +182 -0
  19. package/dist/processor/content-processor.js.map +1 -0
  20. package/dist/processor/guide-generator.d.ts +68 -0
  21. package/dist/processor/guide-generator.d.ts.map +1 -0
  22. package/dist/processor/guide-generator.js +189 -0
  23. package/dist/processor/guide-generator.js.map +1 -0
  24. package/dist/server/safe-coder-mcp.d.ts +18 -0
  25. package/dist/server/safe-coder-mcp.d.ts.map +1 -0
  26. package/dist/server/safe-coder-mcp.js +164 -0
  27. package/dist/server/safe-coder-mcp.js.map +1 -0
  28. package/dist/tools/cache-tools.d.ts +42 -0
  29. package/dist/tools/cache-tools.d.ts.map +1 -0
  30. package/dist/tools/cache-tools.js +70 -0
  31. package/dist/tools/cache-tools.js.map +1 -0
  32. package/dist/tools/crawl-documentation.d.ts +57 -0
  33. package/dist/tools/crawl-documentation.d.ts.map +1 -0
  34. package/dist/tools/crawl-documentation.js +96 -0
  35. package/dist/tools/crawl-documentation.js.map +1 -0
  36. package/dist/tools/index.d.ts +4 -0
  37. package/dist/tools/index.d.ts.map +1 -0
  38. package/dist/tools/index.js +4 -0
  39. package/dist/tools/index.js.map +1 -0
  40. package/dist/tools/save-skill.d.ts +49 -0
  41. package/dist/tools/save-skill.d.ts.map +1 -0
  42. package/dist/tools/save-skill.js +207 -0
  43. package/dist/tools/save-skill.js.map +1 -0
  44. package/package.json +18 -28
  45. package/dist/documentation/browser-doc-browser.d.ts +0 -41
  46. package/dist/documentation/browser-doc-browser.d.ts.map +0 -1
  47. package/dist/documentation/browser-doc-browser.js +0 -357
  48. package/dist/documentation/browser-doc-browser.js.map +0 -1
  49. package/dist/documentation/browser-manager.d.ts +0 -51
  50. package/dist/documentation/browser-manager.d.ts.map +0 -1
  51. package/dist/documentation/browser-manager.js +0 -260
  52. package/dist/documentation/browser-manager.js.map +0 -1
  53. package/dist/documentation/cache.d.ts +0 -13
  54. package/dist/documentation/cache.d.ts.map +0 -1
  55. package/dist/documentation/cache.js +0 -48
  56. package/dist/documentation/cache.js.map +0 -1
  57. package/dist/documentation/checkpoint-manager.d.ts +0 -38
  58. package/dist/documentation/checkpoint-manager.d.ts.map +0 -1
  59. package/dist/documentation/checkpoint-manager.js +0 -101
  60. package/dist/documentation/checkpoint-manager.js.map +0 -1
  61. package/dist/documentation/doc-crawler.d.ts +0 -221
  62. package/dist/documentation/doc-crawler.d.ts.map +0 -1
  63. package/dist/documentation/doc-crawler.js +0 -1415
  64. package/dist/documentation/doc-crawler.js.map +0 -1
  65. package/dist/documentation/github-client.d.ts +0 -13
  66. package/dist/documentation/github-client.d.ts.map +0 -1
  67. package/dist/documentation/github-client.js +0 -90
  68. package/dist/documentation/github-client.js.map +0 -1
  69. package/dist/documentation/http-fetcher.d.ts +0 -8
  70. package/dist/documentation/http-fetcher.d.ts.map +0 -1
  71. package/dist/documentation/http-fetcher.js +0 -31
  72. package/dist/documentation/http-fetcher.js.map +0 -1
  73. package/dist/documentation/index.d.ts +0 -16
  74. package/dist/documentation/index.d.ts.map +0 -1
  75. package/dist/documentation/index.js +0 -159
  76. package/dist/documentation/index.js.map +0 -1
  77. package/dist/documentation/llms-txt/detector.d.ts +0 -31
  78. package/dist/documentation/llms-txt/detector.d.ts.map +0 -1
  79. package/dist/documentation/llms-txt/detector.js +0 -77
  80. package/dist/documentation/llms-txt/detector.js.map +0 -1
  81. package/dist/documentation/llms-txt/downloader.d.ts +0 -30
  82. package/dist/documentation/llms-txt/downloader.d.ts.map +0 -1
  83. package/dist/documentation/llms-txt/downloader.js +0 -84
  84. package/dist/documentation/llms-txt/downloader.js.map +0 -1
  85. package/dist/documentation/llms-txt/index.d.ts +0 -4
  86. package/dist/documentation/llms-txt/index.d.ts.map +0 -1
  87. package/dist/documentation/llms-txt/index.js +0 -4
  88. package/dist/documentation/llms-txt/index.js.map +0 -1
  89. package/dist/documentation/llms-txt/parser.d.ts +0 -43
  90. package/dist/documentation/llms-txt/parser.d.ts.map +0 -1
  91. package/dist/documentation/llms-txt/parser.js +0 -177
  92. package/dist/documentation/llms-txt/parser.js.map +0 -1
  93. package/dist/documentation/normalizer.d.ts +0 -6
  94. package/dist/documentation/normalizer.d.ts.map +0 -1
  95. package/dist/documentation/normalizer.js +0 -38
  96. package/dist/documentation/normalizer.js.map +0 -1
  97. package/dist/documentation/npm-client.d.ts +0 -19
  98. package/dist/documentation/npm-client.d.ts.map +0 -1
  99. package/dist/documentation/npm-client.js +0 -182
  100. package/dist/documentation/npm-client.js.map +0 -1
  101. package/dist/documentation/skill-generator.d.ts +0 -108
  102. package/dist/documentation/skill-generator.d.ts.map +0 -1
  103. package/dist/documentation/skill-generator.js +0 -642
  104. package/dist/documentation/skill-generator.js.map +0 -1
  105. package/dist/documentation/web-doc-browser.d.ts +0 -67
  106. package/dist/documentation/web-doc-browser.d.ts.map +0 -1
  107. package/dist/documentation/web-doc-browser.js +0 -555
  108. package/dist/documentation/web-doc-browser.js.map +0 -1
  109. package/dist/errors/api-validator.d.ts +0 -9
  110. package/dist/errors/api-validator.d.ts.map +0 -1
  111. package/dist/errors/api-validator.js +0 -57
  112. package/dist/errors/api-validator.js.map +0 -1
  113. package/dist/errors/contextual-analysis.d.ts +0 -14
  114. package/dist/errors/contextual-analysis.d.ts.map +0 -1
  115. package/dist/errors/contextual-analysis.js +0 -173
  116. package/dist/errors/contextual-analysis.js.map +0 -1
  117. package/dist/errors/cross-file-analyzer.d.ts +0 -16
  118. package/dist/errors/cross-file-analyzer.d.ts.map +0 -1
  119. package/dist/errors/cross-file-analyzer.js +0 -172
  120. package/dist/errors/cross-file-analyzer.js.map +0 -1
  121. package/dist/errors/eslint-integration.d.ts +0 -9
  122. package/dist/errors/eslint-integration.d.ts.map +0 -1
  123. package/dist/errors/eslint-integration.js +0 -131
  124. package/dist/errors/eslint-integration.js.map +0 -1
  125. package/dist/errors/framework-detector.d.ts +0 -10
  126. package/dist/errors/framework-detector.d.ts.map +0 -1
  127. package/dist/errors/framework-detector.js +0 -126
  128. package/dist/errors/framework-detector.js.map +0 -1
  129. package/dist/errors/index.d.ts +0 -18
  130. package/dist/errors/index.d.ts.map +0 -1
  131. package/dist/errors/index.js +0 -134
  132. package/dist/errors/index.js.map +0 -1
  133. package/dist/errors/pattern-matcher.d.ts +0 -25
  134. package/dist/errors/pattern-matcher.d.ts.map +0 -1
  135. package/dist/errors/pattern-matcher.js +0 -44
  136. package/dist/errors/pattern-matcher.js.map +0 -1
  137. package/dist/errors/patterns.d.ts +0 -11
  138. package/dist/errors/patterns.d.ts.map +0 -1
  139. package/dist/errors/patterns.js +0 -351
  140. package/dist/errors/patterns.js.map +0 -1
  141. package/dist/errors/performance-detector.d.ts +0 -11
  142. package/dist/errors/performance-detector.d.ts.map +0 -1
  143. package/dist/errors/performance-detector.js +0 -119
  144. package/dist/errors/performance-detector.js.map +0 -1
  145. package/dist/errors/runtime-detector.d.ts +0 -7
  146. package/dist/errors/runtime-detector.d.ts.map +0 -1
  147. package/dist/errors/runtime-detector.js +0 -86
  148. package/dist/errors/runtime-detector.js.map +0 -1
  149. package/dist/errors/security-detector.d.ts +0 -6
  150. package/dist/errors/security-detector.d.ts.map +0 -1
  151. package/dist/errors/security-detector.js +0 -75
  152. package/dist/errors/security-detector.js.map +0 -1
  153. package/dist/errors/typescript-integration.d.ts +0 -6
  154. package/dist/errors/typescript-integration.d.ts.map +0 -1
  155. package/dist/errors/typescript-integration.js +0 -46
  156. package/dist/errors/typescript-integration.js.map +0 -1
  157. package/dist/server/mcp-server.d.ts +0 -14
  158. package/dist/server/mcp-server.d.ts.map +0 -1
  159. package/dist/server/mcp-server.js +0 -793
  160. package/dist/server/mcp-server.js.map +0 -1
  161. package/dist/types/documentation.d.ts +0 -26
  162. package/dist/types/documentation.d.ts.map +0 -1
  163. package/dist/types/documentation.js +0 -2
  164. package/dist/types/documentation.js.map +0 -1
  165. package/dist/utils/config.d.ts +0 -21
  166. package/dist/utils/config.d.ts.map +0 -1
  167. package/dist/utils/config.js +0 -34
  168. package/dist/utils/config.js.map +0 -1
  169. package/dist/utils/http-client.d.ts +0 -17
  170. package/dist/utils/http-client.d.ts.map +0 -1
  171. package/dist/utils/http-client.js +0 -62
  172. package/dist/utils/http-client.js.map +0 -1
  173. package/dist/utils/logger.d.ts +0 -36
  174. package/dist/utils/logger.d.ts.map +0 -1
  175. package/dist/utils/logger.js +0 -128
  176. package/dist/utils/logger.js.map +0 -1
  177. package/dist/utils/rate-limiter.d.ts +0 -9
  178. package/dist/utils/rate-limiter.d.ts.map +0 -1
  179. package/dist/utils/rate-limiter.js +0 -26
  180. package/dist/utils/rate-limiter.js.map +0 -1
  181. package/dist/validation/auto-fix.d.ts +0 -15
  182. package/dist/validation/auto-fix.d.ts.map +0 -1
  183. package/dist/validation/auto-fix.js +0 -49
  184. package/dist/validation/auto-fix.js.map +0 -1
  185. package/dist/validation/index.d.ts +0 -21
  186. package/dist/validation/index.d.ts.map +0 -1
  187. package/dist/validation/index.js +0 -45
  188. package/dist/validation/index.js.map +0 -1
  189. package/dist/validation/resolution-db.d.ts +0 -15
  190. package/dist/validation/resolution-db.d.ts.map +0 -1
  191. package/dist/validation/resolution-db.js +0 -62
  192. package/dist/validation/resolution-db.js.map +0 -1
package/dist/documentation/doc-crawler.js
@@ -1,1415 +0,0 @@
- import { HttpClient } from '../utils/http-client.js';
- import { logger } from '../utils/logger.js';
- import { WebDocumentationBrowser } from './web-doc-browser.js';
- import { LlmsTxtDetector, LlmsTxtDownloader, LlmsTxtParser } from './llms-txt/index.js';
- import { CheckpointManager } from './checkpoint-manager.js';
- import { BrowserManager } from './browser-manager.js';
- import { join } from 'path';
- import { tmpdir } from 'os';
- import * as cheerio from 'cheerio';
- export class DocumentationCrawler {
-     browser;
-     browserManager;
-     visitedUrls;
-     urlQueue;
-     crawledPages;
-     errors;
-     options;
-     baseUrl;
-     linkDiscoveryStats;
-     checkpointManager;
-     pagesSinceLastCheckpoint;
-     DOCUMENTATION_PATTERNS = [
-         /\/docs?\//i,
-         /\/documentation/i,
-         /\/guide/i,
-         /\/tutorial/i,
-         /\/api/i,
-         /\/reference/i,
-         /\/manual/i,
-         /\/help/i,
-         /\/about/i,
-         /\/getting-started/i,
-     ];
-     EXCLUDED_PATTERNS = [
-         /\/login/i,
-         /\/signup/i,
-         /\/register/i,
-         /\/checkout/i,
-         /\/cart/i,
-         /\/payment/i,
-         /\/home$/i,
-         // Don't exclude root path - it might be documentation
-         // /^\/$/,
-     ];
-     constructor(httpClient) {
-         this.browser = new WebDocumentationBrowser(httpClient);
-         this.visitedUrls = new Set();
-         this.urlQueue = [];
-         this.crawledPages = [];
-         this.errors = [];
-         this.options = {
-             crawlStrategy: 'bfs', // Default to breadth-first search
-             maxDepth: 3,
-             maxPages: 50,
-             includePaths: [],
-             excludePaths: [],
-             rateLimit: 500, // 500ms default delay
-             maxRetries: 2, // Default 2 retries
-             retryDelay: 1000, // Default 1 second delay before retry
-             useBrowserAutomation: false, // Default to HTTP-only for backward compatibility
-             skipLlmsTxt: false, // Enable llms.txt detection by default
-             workers: 1, // Default to single-threaded crawling
-             spaStrategy: 'smart', // Smart SPA handling by default
-             spaFallback: 'warn', // Warn users when browser rendering fails
-         };
-         this.baseUrl = new URL('https://example.com');
-         this.linkDiscoveryStats = {
-             totalLinksFound: 0,
-             linksFiltered: {
-                 notContent: 0,
-                 externalDomain: 0,
-                 alreadyVisited: 0,
-                 excludedPattern: 0,
-                 depthLimit: 0,
-             },
-             linksQueued: 0,
-             pagesDiscovered: 0,
-             pagesCrawled: 0,
-         };
-         this.pagesSinceLastCheckpoint = 0;
-     }
-     /**
-      * Crawl documentation starting from a root URL
-      * Uses HTTP client (axios) exclusively - no browser automation
-      * For SPA sites that require JavaScript rendering, use Cursor/Claude's built-in browser tools
-      * Supports both BFS (breadth-first) and DFS (depth-first) crawl strategies
-      */
-     async crawl(rootUrl, options = {}) {
-         const strategy = options.crawlStrategy || 'bfs';
-         logger.info('Starting documentation crawl using HTTP client (axios)', {
-             url: rootUrl,
-             strategy,
-             method: 'HTTP GET',
-             client: 'axios/HttpClient',
-             note: 'For SPA sites, use Cursor/Claude browser tools to get rendered content first',
-         });
-         // Reset state
-         this.visitedUrls.clear();
-         this.urlQueue = [];
-         this.crawledPages = [];
-         this.errors = [];
-         this.linkDiscoveryStats = {
-             totalLinksFound: 0,
-             linksFiltered: {
-                 notContent: 0,
-                 externalDomain: 0,
-                 alreadyVisited: 0,
-                 excludedPattern: 0,
-                 depthLimit: 0,
-             },
-             linksQueued: 0,
-             pagesDiscovered: 0,
-             pagesCrawled: 0,
-         };
-         // Merge options
-         this.options = {
-             ...this.options,
-             ...options,
-         };
-         // Parse and validate root URL
-         try {
-             this.baseUrl = new URL(rootUrl);
-         }
-         catch (error) {
-             throw new Error(`Invalid root URL: ${rootUrl}`);
-         }
-         // No longer require documentation-only pages - allow any website with extractable content
-         logger.debug('Starting crawl from URL (permissive mode)', { url: rootUrl });
-         // Setup checkpoint manager if enabled
-         if (this.options.checkpoint?.enabled) {
-             const checkpointFile = this.options.checkpoint.file ||
-                 join(tmpdir(), `safe-coder-checkpoint-${this.sanitizeFilename(rootUrl)}.json`);
-             this.checkpointManager = new CheckpointManager(checkpointFile);
-             // Try to resume from checkpoint if requested
-             if (this.options.resume) {
-                 const loaded = await this.loadCheckpoint();
-                 if (loaded) {
-                     logger.info('Resumed from checkpoint', {
-                         pagesCrawled: this.crawledPages.length,
-                         pendingUrls: this.urlQueue.length,
-                         visitedUrls: this.visitedUrls.size,
-                     });
-                 }
-             }
-         }
-         // Try to detect and use llms.txt if available (unless explicitly disabled)
-         if (!this.options.skipLlmsTxt) {
-             await this.tryLlmsTxt(rootUrl);
-         }
-         // Detect SPA and provide warning
-         try {
-             const spaDetection = await this.browser.detectSPA(rootUrl);
-             if (spaDetection.isSPA && spaDetection.confidence !== 'low') {
-                 logger.warn('SPA detected at root URL - crawling may be limited', {
-                     url: rootUrl,
-                     confidence: spaDetection.confidence,
-                     indicators: spaDetection.indicators,
-                     suggestion: spaDetection.suggestion,
-                 });
-                 // Add warning to first page if SPA detected
-                 if (spaDetection.suggestion) {
-                     logger.info('SPA Detection Warning', {
-                         message: spaDetection.suggestion,
-                         recommendation: 'Consider using browser automation tools to get fully rendered content before crawling.',
-                     });
-                 }
-             }
-         }
-         catch (error) {
-             // SPA detection failure is not critical, continue crawling
-             logger.debug('SPA detection failed, continuing with crawl', {
-                 url: rootUrl,
-                 error: error instanceof Error ? error.message : String(error),
-             });
-         }
-         // Start crawling from root
-         this.urlQueue.push({ url: rootUrl, depth: 0 });
-         let maxDepthReached = 0;
-         // Process queue - use parallel workers if specified
-         const startTime = Date.now();
-         const workerCount = this.options.workers || 1;
-         if (workerCount > 1) {
-             logger.info('Using parallel crawling', { workers: workerCount });
-             maxDepthReached = await this.crawlWithWorkers(startTime);
-         }
-         else {
-             maxDepthReached = await this.crawlSequential(startTime);
-         }
-         // Update final statistics
-         this.linkDiscoveryStats.pagesDiscovered = this.visitedUrls.size;
-         // Calculate final statistics
-         const totalTime = ((Date.now() - startTime) / 1000).toFixed(2);
-         const avgTimePerPage = this.crawledPages.length > 0
-             ? ((Date.now() - startTime) / this.crawledPages.length / 1000).toFixed(2)
-             : '0';
-         const successRate = this.linkDiscoveryStats.pagesDiscovered > 0
-             ? ((this.crawledPages.length / this.linkDiscoveryStats.pagesDiscovered) * 100).toFixed(1)
-             : '0';
-         // Log crawl completion with comprehensive statistics
-         logger.info('Documentation crawl completed using HTTP client (axios)', {
-             totalPages: this.crawledPages.length,
-             maxDepthReached,
-             errors: this.errors.length,
-             totalTimeSeconds: totalTime,
-             avgTimePerPageSeconds: avgTimePerPage,
-             successRate: `${successRate}%`,
-             method: 'HTTP GET',
-             client: 'axios/HttpClient',
-             linkStats: {
-                 totalLinksFound: this.linkDiscoveryStats.totalLinksFound,
-                 linksQueued: this.linkDiscoveryStats.linksQueued,
-                 linksFiltered: this.linkDiscoveryStats.linksFiltered,
-                 pagesDiscovered: this.linkDiscoveryStats.pagesDiscovered,
-                 pagesCrawled: this.linkDiscoveryStats.pagesCrawled,
-             },
-             errorBreakdown: this.getErrorBreakdown(),
-         });
-         // Validate if content is sufficient for skill generation
-         const validation = this.canGenerateSkill(this.crawledPages);
-         const abandoned = !validation.canGenerate;
-         const abandonReason = validation.reason;
-         if (abandoned) {
-             logger.warn('Crawl completed but content is insufficient for skill generation', {
-                 reason: abandonReason,
-                 pagesCrawled: this.crawledPages.length,
-                 suggestion: 'Consider crawling more pages or a different website',
-             });
-         }
-         // Clear checkpoint after successful completion
-         if (this.checkpointManager && !abandoned) {
-             await this.clearCheckpoint();
-         }
-         return {
-             pages: this.crawledPages,
-             totalPages: this.crawledPages.length,
-             maxDepthReached,
-             errors: this.errors,
-             linkDiscoveryStats: this.linkDiscoveryStats,
-             abandoned,
-             abandonReason,
-         };
-     }
-     /**
-      * Sequential crawling (single-threaded)
-      */
-     async crawlSequential(startTime) {
-         let maxDepthReached = 0;
-         let lastProgressLog = Date.now();
-         const PROGRESS_LOG_INTERVAL = 5000; // Log progress every 5 seconds
-         while (this.urlQueue.length > 0 && this.crawledPages.length < this.options.maxPages) {
-             // Use different strategies for getting next URL
-             // BFS: shift() - take from front (queue behavior)
-             // DFS: pop() - take from back (stack behavior)
-             const queued = this.options.crawlStrategy === 'dfs' ? this.urlQueue.pop() : this.urlQueue.shift();
-             if (!queued)
-                 break;
-             const { url, depth } = queued;
-             // Log progress periodically
-             const now = Date.now();
-             if (now - lastProgressLog >= PROGRESS_LOG_INTERVAL) {
-                 const elapsed = ((now - startTime) / 1000).toFixed(1);
-                 const pagesPerSecond = (this.crawledPages.length / elapsed).toFixed(2);
-                 logger.info('Crawl progress', {
-                     pagesCrawled: this.crawledPages.length,
-                     pagesRemaining: this.urlQueue.length,
-                     maxPages: this.options.maxPages,
-                     errors: this.errors.length,
-                     elapsedSeconds: elapsed,
-                     pagesPerSecond,
-                     currentDepth: depth,
-                     maxDepth: this.options.maxDepth,
-                 });
-                 lastProgressLog = now;
-             }
-             // Skip if already visited
-             if (this.visitedUrls.has(url)) {
-                 continue;
-             }
-             // Check depth limit
-             if (depth > this.options.maxDepth) {
-                 continue;
-             }
-             // Mark as visited
-             this.visitedUrls.add(url);
-             maxDepthReached = Math.max(maxDepthReached, depth);
-             await this.processPage(url, depth);
-             // Rate limiting
-             if (this.options.rateLimit > 0 && this.urlQueue.length > 0) {
-                 await this.delay(this.options.rateLimit);
-             }
-         }
-         return maxDepthReached;
-     }
-     /**
-      * Parallel crawling with multiple workers
-      */
-     async crawlWithWorkers(startTime) {
-         let maxDepthReached = 0;
-         let lastProgressLog = Date.now();
-         const PROGRESS_LOG_INTERVAL = 5000;
-         const workerCount = this.options.workers || 1;
-         while (this.urlQueue.length > 0 && this.crawledPages.length < this.options.maxPages) {
-             // Log progress periodically
-             const now = Date.now();
-             if (now - lastProgressLog >= PROGRESS_LOG_INTERVAL) {
-                 const elapsed = ((now - startTime) / 1000).toFixed(1);
-                 const pagesPerSecond = (this.crawledPages.length / elapsed).toFixed(2);
-                 logger.info('Crawl progress (parallel)', {
-                     pagesCrawled: this.crawledPages.length,
-                     pagesRemaining: this.urlQueue.length,
-                     maxPages: this.options.maxPages,
-                     errors: this.errors.length,
-                     elapsedSeconds: elapsed,
-                     pagesPerSecond,
-                     workers: workerCount,
-                 });
-                 lastProgressLog = now;
-             }
-             // Get batch of URLs to process in parallel
-             const batch = [];
-             const batchSize = Math.min(workerCount, this.urlQueue.length, this.options.maxPages - this.crawledPages.length);
-             for (let i = 0; i < batchSize; i++) {
-                 const queued = this.options.crawlStrategy === 'dfs' ? this.urlQueue.pop() : this.urlQueue.shift();
-                 if (!queued)
-                     break;
-                 // Skip if already visited
-                 if (this.visitedUrls.has(queued.url)) {
-                     continue;
-                 }
-                 // Check depth limit
-                 if (queued.depth > this.options.maxDepth) {
-                     continue;
-                 }
-                 // Mark as visited
-                 this.visitedUrls.add(queued.url);
-                 maxDepthReached = Math.max(maxDepthReached, queued.depth);
-                 batch.push(queued);
-             }
-             if (batch.length === 0) {
-                 break;
-             }
-             // Process batch in parallel
-             await Promise.all(batch.map(async (queued) => {
-                 await this.processPage(queued.url, queued.depth);
-                 // Rate limiting (per worker)
-                 if (this.options.rateLimit > 0) {
-                     await this.delay(this.options.rateLimit);
-                 }
-             }));
-         }
-         return maxDepthReached;
-     }
-     /**
-      * Process a single page (shared by both sequential and parallel crawling)
-      */
-     async processPage(url, depth) {
-         try {
-             // Crawl the page using HTTP GET with retry logic
-             logger.debug('Fetching page via HTTP GET', { url, depth, method: 'HTTP GET', client: 'axios' });
-             const page = await this.fetchPageWithRetry(url);
-             // Check if page has minimal content (possible SPA issue)
-             const contentLength = page.content.length;
-             const linksCount = page.navigationLinks.length;
-             if (contentLength < 200 && linksCount < 3) {
-                 logger.warn('Page has minimal content - may be SPA', {
-                     url,
-                     contentLength,
-                     linksCount,
-                     suggestion: 'This page may require JavaScript rendering. Consider using browser automation tools.',
-                 });
-             }
-             // Convert to CrawledPage format
-             const crawledPage = {
-                 url: page.url,
-                 title: page.title,
-                 content: page.content,
-                 depth,
-                 sections: page.sections,
-                 navigationLinks: page.navigationLinks,
-                 headings: page.headings,
-                 codeSamples: page.codeSamples,
-             };
-             this.crawledPages.push(crawledPage);
-             this.linkDiscoveryStats.pagesCrawled++;
-             this.pagesSinceLastCheckpoint++;
-             // Save checkpoint if interval reached
-             if (this.checkpointManager && this.options.checkpoint?.enabled) {
-                 const interval = this.options.checkpoint.interval || 10;
-                 if (this.pagesSinceLastCheckpoint >= interval) {
-                     await this.saveCheckpoint();
-                     this.pagesSinceLastCheckpoint = 0;
-                 }
-             }
-             const totalLinksOnPage = page.navigationLinks.length;
-             this.linkDiscoveryStats.totalLinksFound += totalLinksOnPage;
-             logger.debug('Page fetched and parsed successfully', {
-                 url,
-                 title: page.title.substring(0, 50),
-                 linksFound: totalLinksOnPage,
-                 depth,
-             });
-             // Discover and queue new URLs
-             if (depth < this.options.maxDepth) {
-                 const discoveryResult = this.discoverDocumentationLinks(page, depth + 1);
-                 const newUrls = discoveryResult.discovered;
-                 logger.debug('Link discovery completed', {
-                     url,
-                     totalLinksOnPage,
-                     discovered: newUrls.length,
-                     filtered: discoveryResult.filtered,
-                 });
-                 let queuedCount = 0;
-                 let skippedAlreadyVisited = 0;
-                 for (const newUrl of newUrls) {
-                     if (!this.visitedUrls.has(newUrl.url)) {
-                         // Also check if it's already in the queue to avoid duplicates
-                         const alreadyInQueue = this.urlQueue.some(q => q.url === newUrl.url);
-                         if (!alreadyInQueue) {
-                             this.urlQueue.push(newUrl);
-                             this.linkDiscoveryStats.linksQueued++;
-                             queuedCount++;
-                         }
-                         else {
-                             skippedAlreadyVisited++;
-                         }
-                     }
-                     else {
-                         skippedAlreadyVisited++;
-                     }
-                 }
-                 logger.debug('Links queued', {
-                     url,
-                     queued: queuedCount,
-                     skippedAlreadyVisited,
-                     queueLengthAfter: this.urlQueue.length,
-                 });
-             }
-             else {
-                 this.linkDiscoveryStats.linksFiltered.depthLimit += totalLinksOnPage;
-             }
-         }
-         catch (error) {
-             const errorMessage = error instanceof Error ? error.message : String(error);
-             const errorType = this.classifyError(error);
-             this.errors.push({
-                 url,
-                 error: `${errorType}: ${errorMessage}`,
-             });
-             logger.warn('Page crawl failed', {
-                 url,
-                 error: errorMessage,
-                 errorType,
-                 depth,
-                 willContinue: true,
-             });
-         }
-     }
-     /**
-      * Discover documentation links from a crawled page
-      */
-     discoverDocumentationLinks(page, nextDepth) {
-         const discovered = [];
-         const filtered = {
-             notContent: 0, // Renamed from notDocumentation
-             externalDomain: 0,
-             alreadyVisited: 0,
-             excludedPattern: 0,
-         };
-         const linkDetails = [];
-         for (const link of page.navigationLinks) {
-             // Only follow internal links
-             if (!link.isInternal) {
-                 filtered.externalDomain++;
-                 this.linkDiscoveryStats.linksFiltered.externalDomain++;
-                 linkDetails.push({ url: link.url, reason: 'not_internal' });
-                 continue;
-             }
-             try {
-                 const linkUrl = new URL(link.url);
-                 // Must be same origin
-                 if (linkUrl.origin !== this.baseUrl.origin) {
-                     filtered.externalDomain++;
-                     this.linkDiscoveryStats.linksFiltered.externalDomain++;
-                     linkDetails.push({ url: link.url, reason: 'different_origin' });
-                     continue;
-                 }
-                 // Check if already visited
-                 const normalizedUrl = linkUrl.href.split('#')[0];
-                 if (this.visitedUrls.has(normalizedUrl)) {
-                     filtered.alreadyVisited++;
-                     this.linkDiscoveryStats.linksFiltered.alreadyVisited++;
-                     linkDetails.push({ url: link.url, reason: 'already_visited' });
-                     continue;
-                 }
-                 // Check if it's a valid content path (permissive - only exclude clearly non-content)
-                 if (!this.isDocumentationPath(linkUrl.pathname)) {
-                     filtered.notContent++;
-                     this.linkDiscoveryStats.linksFiltered.notContent++;
-                     linkDetails.push({ url: link.url, reason: 'not_content_path', pathname: linkUrl.pathname });
-                     continue;
-                 }
-                 // Check exclude patterns
-                 if (this.shouldExclude(linkUrl.pathname)) {
-                     filtered.excludedPattern++;
-                     this.linkDiscoveryStats.linksFiltered.excludedPattern++;
-                     linkDetails.push({ url: link.url, reason: 'excluded_pattern', pathname: linkUrl.pathname });
-                     continue;
-                 }
-                 // Check include patterns (if specified)
-                 if (this.options.includePaths.length > 0) {
-                     const matchesInclude = this.options.includePaths.some(pattern => linkUrl.pathname.includes(pattern));
-                     if (!matchesInclude) {
-                         filtered.notContent++;
-                         this.linkDiscoveryStats.linksFiltered.notContent++;
-                         linkDetails.push({ url: link.url, reason: 'not_in_include_paths', pathname: linkUrl.pathname });
-                         continue;
-                     }
-                 }
-                 // Check exclude patterns (if specified)
-                 if (this.options.excludePaths.length > 0) {
-                     const matchesExclude = this.options.excludePaths.some(pattern => linkUrl.pathname.includes(pattern));
-                     if (matchesExclude) {
-                         filtered.excludedPattern++;
-                         this.linkDiscoveryStats.linksFiltered.excludedPattern++;
-                         linkDetails.push({ url: link.url, reason: 'matches_exclude_paths', pathname: linkUrl.pathname });
-                         continue;
-                     }
-                 }
-                 // Add to discovered links
-                 discovered.push({
-                     url: normalizedUrl,
-                     depth: nextDepth,
-                 });
-                 linkDetails.push({ url: link.url, reason: 'accepted' });
-             }
-             catch (error) {
-                 // Invalid URL, skip
-                 linkDetails.push({ url: link.url, reason: 'invalid_url', error: error instanceof Error ? error.message : String(error) });
-                 continue;
-             }
-         }
-         // Log detailed filtering information
-         const filteredLinksByReason = linkDetails
-             .filter(d => d.reason !== 'accepted')
-             .reduce((acc, d) => {
-                 if (!acc[d.reason])
-                     acc[d.reason] = [];
-                 acc[d.reason].push({ url: d.url, pathname: d.pathname, error: d.error });
-                 return acc;
-             }, {});
-         logger.info('Link filtering details', {
-             totalLinks: page.navigationLinks.length,
-             discovered: discovered.length,
-             filtered: {
-                 notContent: filtered.notContent,
-                 externalDomain: filtered.externalDomain,
-                 alreadyVisited: filtered.alreadyVisited,
-                 excludedPattern: filtered.excludedPattern,
-             },
-             filteredLinksByReason,
-             sampleAcceptedLinks: linkDetails
-                 .filter(d => d.reason === 'accepted')
-                 .slice(0, 10)
-                 .map(d => d.url),
-         });
-         return {
-             discovered,
-             filtered,
-             alreadyVisited: filtered.alreadyVisited,
-             notContent: filtered.notContent,
-             externalDomain: filtered.externalDomain,
-             excludedPattern: filtered.excludedPattern,
-         };
-     }
-     /**
-      * Check if a path should be crawled (permissive - only exclude clearly non-content paths)
-      */
-     isDocumentationPath(pathname) {
-         // Exclude clearly non-content pages
-         if (this.shouldExclude(pathname)) {
-             return false;
-         }
-         // Exclude static resources
-         const looksLikeStaticResource = /\.(?:css|js|json|xml|png|jpg|jpeg|gif|svg|ico|woff|woff2|ttf|eot|pdf|zip|exe|dmg)$/i.test(pathname);
-         if (looksLikeStaticResource) {
-             return false;
-         }
-         // Exclude API endpoints that are clearly not content (unless they're documentation APIs)
-         // Keep API endpoints that might be documentation (e.g., /api/docs, /docs/api)
-         const looksLikeApiEndpoint = /^\/api\/[^/]+$/i.test(pathname);
-         if (looksLikeApiEndpoint && !pathname.includes('/docs') && !pathname.includes('/documentation')) {
-             return false;
-         }
-         // Allow root path
-         if (pathname === '/' || pathname === '') {
-             return true;
-         }
-         // Exclude paths with file extensions (unless they're HTML pages)
-         const hasFileExtension = /\.[a-z]{2,4}$/i.test(pathname.split('?')[0]);
-         if (hasFileExtension && !pathname.match(/\.(html?|htm)$/i)) {
-             return false;
-         }
-         // Permissive: allow any path that doesn't match exclusion patterns
-         // This allows crawling any website, not just documentation
-         return true;
-     }
-     /**
-      * Check if a path should be excluded
-      */
-     shouldExclude(pathname) {
-         return this.EXCLUDED_PATTERNS.some(pattern => pattern.test(pathname));
-     }
-     /**
-      * Check if crawled content is sufficient for skill generation
-      * Enhanced with multi-dimensional quality metrics
-      */
-     canGenerateSkill(pages) {
-         if (pages.length === 0) {
-             return { canGenerate: false, reason: 'empty_pages' };
-         }
-         const metrics = this.evaluateContentQuality(pages);
-         // All pages are media-only
-         if (metrics.mediaOnlyPages === pages.length && !metrics.hasTextContent) {
-             return { canGenerate: false, reason: 'media_only' };
-         }
-         // No pages have sufficient content
-         if (!metrics.hasSufficientContent) {
-             return { canGenerate: false, reason: 'insufficient_content' };
-         }
-         // No structured content (headings, sections)
-         if (!metrics.hasStructuredContent) {
-             return { canGenerate: false, reason: 'no_structured_content' };
-         }
-         return { canGenerate: true };
-     }
-     /**
-      * Evaluate content quality with multi-dimensional metrics
-      */
-     evaluateContentQuality(pages) {
-         const MIN_CONTENT_LENGTH = 100;
-         let hasSufficientContent = false;
-         let hasStructuredContent = false;
-         let hasTextContent = false;
-         let mediaOnlyCount = 0;
-         let totalContentLength = 0;
-         let totalCodeSamples = 0;
-         // Track content diversity
-         const urlPatterns = new Set();
-         const titlePatterns = new Set();
-         for (const page of pages) {
-             const contentLength = (page.content || '').trim().length;
-             const hasHeadings = page.headings && page.headings.length > 0;
-             const hasText = contentLength > 0;
-             totalContentLength += contentLength;
-             totalCodeSamples += (page.codeSamples || []).length;
-             // Check if page is media-only
-             const hasImages = /<img[^>]*>/i.test(page.content || '');
-             const hasMedia = hasImages || (page.codeSamples && page.codeSamples.length > 0);
-             if (hasMedia && contentLength < MIN_CONTENT_LENGTH) {
-                 mediaOnlyCount++;
-             }
-             if (contentLength >= MIN_CONTENT_LENGTH) {
-                 hasSufficientContent = true;
-             }
-             if (hasHeadings) {
-                 hasStructuredContent = true;
-             }
-             if (hasText) {
-                 hasTextContent = true;
-             }
-             // Track diversity
-             try {
-                 const urlPath = new URL(page.url).pathname;
-                 const pathSegments = urlPath.split('/').filter(s => s);
-                 if (pathSegments.length > 0) {
-                     urlPatterns.add(pathSegments[0]);
-                 }
-             }
-             catch {
-                 // Invalid URL, skip
-             }
-             // Track title diversity
-             const titleWords = page.title.toLowerCase().split(/\s+/).slice(0, 3);
-             titlePatterns.add(titleWords.join(' '));
-         }
-         // Calculate diversity score (0-1)
-         const contentDiversity = Math.min(1, (urlPatterns.size + titlePatterns.size) / (pages.length * 0.5));
-         // Calculate API coverage score (0-1)
-         const pagesWithCode = pages.filter(p => p.codeSamples && p.codeSamples.length > 0).length;
-         const apiCoverage = pages.length > 0 ? pagesWithCode / pages.length : 0;
-         const avgContentLength = pages.length > 0 ? totalContentLength / pages.length : 0;
-         return {
-             hasSufficientContent,
-             hasStructuredContent,
-             hasTextContent,
-             mediaOnlyPages: mediaOnlyCount,
-             contentDiversity,
-             apiCoverage,
-             avgContentLength,
-             totalCodeSamples,
-         };
-     }
-     /**
-      * Check if should continue crawling based on content quality
-      */
-     shouldContinueCrawling(currentPages, maxPages) {
-         if (currentPages >= maxPages) {
-             return false;
-         }
-         // Evaluate quality every 10 pages
-         if (currentPages % 10 === 0 && currentPages > 0) {
-             const metrics = this.evaluateContentQuality(this.crawledPages);
-             // High quality content - can stop early if we have enough
-             if (metrics.hasSufficientContent &&
-                 metrics.contentDiversity > 0.7 &&
-                 metrics.apiCoverage > 0.5 &&
-                 currentPages >= maxPages * 0.5) {
-                 logger.info('High quality content detected, considering early stop', {
-                     currentPages,
-                     maxPages,
-                     diversity: metrics.contentDiversity.toFixed(2),
-                     apiCoverage: metrics.apiCoverage.toFixed(2),
-                 });
-                 // Continue but log the possibility
-             }
-             // Low quality warning
-             if (currentPages >= maxPages * 0.8 && !metrics.hasSufficientContent) {
-                 logger.warn('Approaching page limit but content quality is low', {
-                     currentPages,
-                     maxPages,
-                     diversity: metrics.contentDiversity.toFixed(2),
-                     apiCoverage: metrics.apiCoverage.toFixed(2),
-                     suggestion: 'Consider increasing maxPages or refining includePaths',
-                 });
-             }
-         }
-         return currentPages < maxPages;
-     }
-     /**
-      * Fetch a page with retry logic
-      * Supports HTML pages, Markdown files, and SPA rendering
-      */
-     async fetchPageWithRetry(url, retryCount = 0) {
-         try {
-             // 1. Check if this is a Markdown file
-             if (url.endsWith('.md') || url.includes('.md?') || url.includes('.md#')) {
-                 return await this.extractMarkdownContent(url);
-             }
-             // 2. Try HTTP crawl first
-             const page = await this.browser.browsePage(url);
-             // 3. Smart strategy: check if content is sufficient
-             if (this.options.spaStrategy === 'smart') {
-                 const needsBrowser = await this.shouldUseBrowser(page, url);
-                 if (needsBrowser) {
-                     logger.info('Content insufficient, switching to browser rendering', {
-                         url,
-                         contentLength: page.content.length,
-                         linksCount: page.navigationLinks.length,
-                     });
-                     return await this.fetchWithBrowser(url);
-                 }
-             }
-             // 4. Auto strategy: use browser for detected SPA
-             if (this.options.spaStrategy === 'auto') {
-                 const spaDetection = await this.browser.detectSPA(url, page.content);
-                 if (spaDetection.isSPA && spaDetection.confidence !== 'low') {
-                     logger.info('SPA detected, using browser rendering', {
-                         url,
-                         confidence: spaDetection.confidence,
-                         indicators: spaDetection.indicators,
-                     });
-                     return await this.fetchWithBrowser(url);
-                 }
-             }
-             return page;
-         }
-         catch (error) {
-             const errorType = this.classifyError(error);
-             const isRetryable = this.isRetryableError(error);
-             if (isRetryable && retryCount < this.options.maxRetries) {
-                 const delay = this.options.retryDelay * (retryCount + 1); // Exponential backoff
-                 logger.info('Retrying page fetch', {
-                     url,
-                     retryCount: retryCount + 1,
-                     maxRetries: this.options.maxRetries,
-                     delay,
-                     errorType,
-                 });
-                 await this.delay(delay);
-                 return this.fetchPageWithRetry(url, retryCount + 1);
-             }
-             // Not retryable or max retries reached
-             throw error;
-         }
-     }
-     /**
-      * Extract content from Markdown file
-      * Converts Markdown structure to WebDocumentationPage format
-      */
-     async extractMarkdownContent(url) {
-         logger.debug('Extracting Markdown content', { url });
-         // Fetch raw markdown content
-         const httpClient = new HttpClient();
-         const response = await httpClient.get(url, {
-             responseType: 'text',
-             timeout: 30000,
-         });
-         const markdownContent = response.data;
-         // Parse markdown structure
-         const parsed = this.parseMarkdown(markdownContent, url);
-         return {
-             url,
-             title: parsed.title,
-             content: parsed.content,
-             searchableContent: parsed.content, // Add searchable content for consistency
-             sections: parsed.sections,
-             navigationLinks: parsed.links,
-             headings: parsed.headings,
-             codeSamples: parsed.codeSamples,
-             isDocumentation: true,
-         };
-     }
-     /**
-      * Parse Markdown content into structured data
-      */
-     parseMarkdown(content, url) {
-         const lines = content.split('\n');
-         let title = '';
-         const headings = [];
-         const codeSamples = [];
-         const sections = [];
-         const links = [];
-         const contentLines = [];
-         // Extract title from first h1
-         for (const line of lines) {
-             if (line.startsWith('# ')) {
-                 title = line.substring(2).trim();
-                 break;
-             }
-         }
-         // Extract headings (h2-h6)
-         const headingRegex = /^(#{2,6})\s+(.+)$/;
-         for (const line of lines) {
-             const match = line.match(headingRegex);
-             if (match) {
-                 const level = match[1].length;
-                 const text = match[2].trim();
-                 const id = text.toLowerCase().replace(/[^\w\s-]/g, '').replace(/\s+/g, '-');
-                 headings.push({
-                     level: `h${level}`,
-                     text,
-                     id,
-                 });
-             }
-         }
-         // Extract code blocks
-         const codeBlockRegex = /```(\w+)?\n([\s\S]*?)```/g;
-         let match;
-         while ((match = codeBlockRegex.exec(content)) !== null) {
-             const language = match[1] || 'text';
-             const code = match[2].trim();
-             if (code.length > 10) {
-                 codeSamples.push({
-                     code,
-                     language,
-                 });
-             }
-         }
-         // Extract content (remove code blocks and headings)
-         let contentWithoutCode = content.replace(codeBlockRegex, '');
-         contentWithoutCode = contentWithoutCode.replace(/^#{1,6}\s+.+$/gm, '');
-         for (const para of contentWithoutCode.split('\n\n')) {
-             const trimmed = para.trim();
-             if (trimmed.length > 20) {
-                 contentLines.push(trimmed);
-             }
-         }
-         // Extract links (markdown format)
-         const linkRegex = /\[([^\]]*)\]\(([^)]+)\)/g;
-         while ((match = linkRegex.exec(content)) !== null) {
-             const text = match[1];
-             const linkUrl = match[2].trim();
-             // Skip anchors
-             if (linkUrl.startsWith('#')) {
-                 continue;
-             }
-             // Resolve relative URLs
-             let absoluteUrl;
-             try {
-                 if (linkUrl.startsWith('http://') || linkUrl.startsWith('https://')) {
-                     absoluteUrl = linkUrl;
-                 }
-                 else {
-                     absoluteUrl = new URL(linkUrl, url).href;
-                 }
-                 // Remove fragment
-                 absoluteUrl = absoluteUrl.split('#')[0];
-                 // Only include .md URLs to avoid client-side rendered HTML pages
-                 if (absoluteUrl.endsWith('.md') || absoluteUrl.includes('.md?')) {
-                     const linkOrigin = new URL(absoluteUrl).origin;
-                     const baseOrigin = this.baseUrl.origin;
-                     links.push({
-                         text,
-                         url: absoluteUrl,
-                         isInternal: linkOrigin === baseOrigin,
-                     });
-                 }
-             }
-             catch (error) {
-                 // Invalid URL, skip
-                 logger.debug('Invalid URL in markdown link', { url: linkUrl });
-             }
-         }
-         // Build sections from headings
-         let currentSection = null;
-         let currentContent = [];
-         for (const line of lines) {
-             const headerMatch = line.match(headingRegex);
-             if (headerMatch) {
-                 // Save previous section
-                 if (currentSection) {
-                     currentSection.content = currentContent.join('\n').trim();
-                     if (currentSection.content.length > 0) {
-                         sections.push(currentSection);
-                     }
-                 }
-                 // Start new section
-                 const text = headerMatch[2].trim();
-                 currentSection = {
-                     title: text,
-                     content: '',
-                     anchor: text.toLowerCase().replace(/[^\w\s-]/g, '').replace(/\s+/g, '-'),
-                 };
-                 currentContent = [];
-             }
-             else if (currentSection) {
-                 currentContent.push(line);
-             }
-         }
-         // Save last section
-         if (currentSection) {
-             currentSection.content = currentContent.join('\n').trim();
-             if (currentSection.content.length > 0) {
-                 sections.push(currentSection);
-             }
-         }
-         return {
-             title: title || 'Untitled',
-             content: contentLines.join('\n\n'),
-             headings,
-             codeSamples,
-             sections,
-             links,
-         };
-     }
-     /**
-      * Classify error type for better error messages
-      */
-     classifyError(error) {
-         if (!(error instanceof Error)) {
-             return 'UnknownError';
-         }
-         const message = error.message.toLowerCase();
-         const errorName = error.name.toLowerCase();
-         // Network errors
-         if (errorName.includes('timeout') || message.includes('timeout')) {
-             return 'TimeoutError';
-         }
-         if (errorName.includes('network') || message.includes('network') || message.includes('econnrefused')) {
-             return 'NetworkError';
-         }
-         if (message.includes('econnreset') || message.includes('socket')) {
-             return 'ConnectionError';
-         }
-         // HTTP errors
-         if (errorName.includes('http') || message.includes('status')) {
-             if (message.includes('404'))
-                 return 'NotFoundError';
-             if (message.includes('403'))
-                 return 'ForbiddenError';
-             if (message.includes('401'))
-                 return 'UnauthorizedError';
-             if (message.includes('429'))
-                 return 'RateLimitError';
-             if (message.includes('500') || message.includes('502') || message.includes('503')) {
-                 return 'ServerError';
-             }
-             return 'HttpError';
-         }
-         // Content errors
-         if (message.includes('documentation') || message.includes('not appear to be')) {
-             return 'NotDocumentationError';
-         }
-         if (message.includes('spa') || message.includes('javascript')) {
-             return 'SPAError';
-         }
-         return 'UnknownError';
-     }
-     /**
-      * Check if an error is retryable
-      */
-     isRetryableError(error) {
-         if (!(error instanceof Error)) {
-             return false;
-         }
-         const errorType = this.classifyError(error);
-         // Retryable errors
-         const retryableTypes = [
-             'TimeoutError',
-             'NetworkError',
-             'ConnectionError',
-             'RateLimitError',
-             'ServerError', // 500, 502, 503
-         ];
-         return retryableTypes.includes(errorType);
-     }
-     /**
-      * Get error breakdown by type
-      */
-     getErrorBreakdown() {
-         const breakdown = {};
-         for (const error of this.errors) {
-             const errorType = error.error.split(':')[0] || 'UnknownError';
-             breakdown[errorType] = (breakdown[errorType] || 0) + 1;
-         }
-         return breakdown;
-     }
-     /**
-      * Try to detect and use llms.txt for optimized crawling
-      */
-     async tryLlmsTxt(rootUrl) {
-         logger.info('Checking for llms.txt files', { url: rootUrl });
-         try {
-             const detector = new LlmsTxtDetector(rootUrl);
-             const variants = await detector.detectAll();
-             if (variants.length === 0) {
-                 logger.info('No llms.txt files found, proceeding with normal crawl');
-                 return;
-             }
-             logger.info('Found llms.txt variants', {
-                 count: variants.length,
-                 variants: variants.map(v => v.variant),
-             });
-             // Download all variants
-             const downloader = new LlmsTxtDownloader();
-             const downloaded = await downloader.downloadAll(variants);
-             if (downloaded.length === 0) {
-                 logger.warn('Failed to download any llms.txt variants');
-                 return;
-             }
-             // Use the largest variant (most comprehensive)
-             const largest = downloaded.reduce((prev, current) => current.size > prev.size ? current : prev);
-             logger.info('Using llms.txt for URL extraction', {
-                 variant: largest.variant,
-                 size: largest.size,
-             });
-             // Parse URLs from llms.txt
-             const parser = new LlmsTxtParser(largest.content, rootUrl);
-             const extractedUrls = parser.extractUrls();
-             if (extractedUrls.length > 0) {
-                 logger.info('Extracted URLs from llms.txt', {
-                     count: extractedUrls.length,
-                 });
-                 // Add URLs to queue with depth 0
-                 for (const url of extractedUrls) {
-                     if (this.isValidUrl(url) && !this.visitedUrls.has(url)) {
-                         this.urlQueue.push({ url, depth: 0 });
-                     }
-                 }
-                 logger.info('Added llms.txt URLs to crawl queue', {
-                     added: this.urlQueue.length,
-                 });
-             }
-             else {
-                 logger.info('No URLs extracted from llms.txt, using normal crawl');
-             }
-         }
-         catch (error) {
-             const errorMessage = error instanceof Error ? error.message : String(error);
-             logger.warn('llms.txt detection failed, continuing with normal crawl', {
-                 error: errorMessage,
-             });
-             // Continue with normal crawling if llms.txt fails
-         }
-     }
-     /**
-      * Check if a URL is valid for crawling
-      */
-     isValidUrl(url) {
-         try {
-             const parsed = new URL(url);
-             // Must be same origin as base URL
-             if (parsed.origin !== this.baseUrl.origin) {
-                 return false;
-             }
-             // Must be http or https
-             if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
-                 return false;
-             }
-             return true;
-         }
-         catch {
-             return false;
-         }
-     }
-     /**
-      * Save checkpoint
-      */
-     async saveCheckpoint() {
-         if (!this.checkpointManager) {
-             return;
-         }
-         const checkpointData = {
-             config: this.options,
-             visitedUrls: Array.from(this.visitedUrls),
-             pendingUrls: this.urlQueue,
-             pagesCrawled: this.crawledPages.length,
-             lastUpdated: new Date().toISOString(),
-             baseUrl: this.baseUrl.href,
-         };
-         try {
-             await this.checkpointManager.saveCheckpoint(checkpointData);
-         }
-         catch (error) {
-             logger.warn('Failed to save checkpoint', {
-                 error: error instanceof Error ? error.message : String(error),
-             });
-         }
-     }
-     /**
-      * Load checkpoint and restore state
-      */
-     async loadCheckpoint() {
-         if (!this.checkpointManager) {
-             return false;
-         }
-         try {
-             const data = await this.checkpointManager.loadCheckpoint();
-             if (!data) {
-                 logger.info('No checkpoint found to resume from');
-                 return false;
-             }
-             // Restore state
-             this.visitedUrls = new Set(data.visitedUrls);
-             this.urlQueue = data.pendingUrls;
-             // Note: crawledPages are not restored as they will be regenerated
-             logger.info('State restored from checkpoint', {
-                 visitedUrls: this.visitedUrls.size,
-                 pendingUrls: this.urlQueue.length,
-                 lastUpdated: data.lastUpdated,
-             });
-             return true;
-         }
-         catch (error) {
-             logger.warn('Failed to load checkpoint', {
-                 error: error instanceof Error ? error.message : String(error),
-             });
-             return false;
-         }
-     }
-     /**
-      * Clear checkpoint after successful crawl
-      */
-     async clearCheckpoint() {
-         if (this.checkpointManager) {
-             try {
-                 await this.checkpointManager.clearCheckpoint();
-             }
-             catch (error) {
-                 logger.debug('Failed to clear checkpoint', {
-                     error: error instanceof Error ? error.message : String(error),
-                 });
-             }
-         }
-     }
-     /**
-      * Sanitize filename for checkpoint
-      */
-     sanitizeFilename(url) {
-         return url
-             .replace(/[^a-z0-9]/gi, '-')
-             .replace(/-+/g, '-')
-             .substring(0, 64);
-     }
-     /**
-      * Check if browser rendering is needed
-      */
-     async shouldUseBrowser(page, url) {
-         // 1. Content too short
-         if (page.content.length < 200) {
-             logger.debug('Content too short, may need browser', {
-                 url,
-                 length: page.content.length
-             });
-             return true;
-         }
-         // 2. No navigation links
-         if (page.navigationLinks.length < 3) {
-             logger.debug('Few navigation links, may need browser', {
-                 url,
-                 links: page.navigationLinks.length
-             });
-             return true;
-         }
-         // 3. SPA detected but content insufficient
-         const spaDetection = await this.browser.detectSPA(url, page.content);
-         if (spaDetection.isSPA && page.content.length < 500) {
-             logger.debug('SPA detected with insufficient content', {
-                 url,
-                 confidence: spaDetection.confidence,
-                 length: page.content.length
-             });
-             return true;
-         }
-         return false;
-     }
-     /**
-      * Fetch page using browser rendering
-      */
-     async fetchWithBrowser(url) {
-         try {
-             // Lazy initialize browser
-             if (!this.browserManager) {
-                 this.browserManager = new BrowserManager();
-                 await this.browserManager.launch(this.options.browserConfig);
-             }
-             // Render page
-             const result = await this.browserManager.renderPage(url);
-             // Convert to WebDocumentationPage format
-             return this.parseRenderedPage(result);
-         }
-         catch (error) {
-             const errorMsg = error instanceof Error ? error.message : String(error);
-             logger.error('Browser rendering failed', { url, error: errorMsg });
-             // Handle failure based on fallback strategy
-             return this.handleBrowserFailure(url, errorMsg);
-         }
-     }
-     /**
-      * Parse browser-rendered HTML into WebDocumentationPage
-      */
-     parseRenderedPage(result) {
-         const $ = cheerio.load(result.html);
-         // Extract text content (remove scripts and styles)
-         $('script, style, noscript').remove();
-         const bodyText = $('body').text().trim();
-         // Extract headings
-         const headings = [];
-         $('h1, h2, h3, h4, h5, h6').each((_, elem) => {
-             const $elem = $(elem);
-             const tagName = elem.tagName.toLowerCase();
-             const text = $elem.text().trim();
-             const id = $elem.attr('id');
-             if (text) {
-                 headings.push({
-                     level: tagName,
-                     text,
-                     id,
-                 });
-             }
-         });
-         // Extract code samples
-         const codeSamples = [];
-         $('pre code, code.hljs, .highlight code').each((_, elem) => {
-             const $elem = $(elem);
-             const code = $elem.text().trim();
-             const language = $elem.attr('class')?.match(/language-(\w+)/)?.[1] || 'text';
-             if (code.length > 10) {
-                 codeSamples.push({ code, language });
-             }
-         });
-         // Extract sections
-         const sections = [];
-         $('section, article, .section, .content-section').each((_, elem) => {
-             const $elem = $(elem);
-             const heading = $elem.find('h1, h2, h3').first();
-             const title = heading.text().trim() || 'Section';
-             const content = $elem.text().trim();
-             const anchor = heading.attr('id');
-             if (content.length > 50) {
-                 sections.push({ title, content, anchor });
-             }
-         });
-         return {
-             url: result.url,
-             title: result.title,
-             content: bodyText,
-             searchableContent: bodyText,
-             sections,
-             navigationLinks: result.links
-                 .filter(link => link.url && link.url.startsWith('http'))
-                 .map(link => {
-                     try {
-                         const linkUrl = new URL(link.url);
-                         return {
-                             text: link.text,
-                             url: link.url,
-                             isInternal: linkUrl.origin === this.baseUrl.origin,
-                         };
-                     }
-                     catch {
-                         return null;
-                     }
-                 })
-                 .filter((link) => link !== null),
-             headings,
-             codeSamples,
-             isDocumentation: true,
-         };
-     }
-     /**
-      * Handle browser rendering failure based on fallback strategy
-      */
-     async handleBrowserFailure(url, error) {
-         const strategy = this.options.spaFallback || 'warn';
-         switch (strategy) {
-             case 'error':
-                 throw new Error(`Browser rendering failed for ${url}: ${error}`);
-             case 'skip':
-                 logger.warn('Skipping page due to browser failure', { url });
-                 return this.createEmptyPage(url);
-             case 'warn':
-             default:
-                 logger.warn('Browser rendering failed, returning page with installation guide', { url, error });
-                 return this.createPageWithGuide(url, error);
-         }
-     }
-     /**
-      * Create empty page placeholder
-      */
-     createEmptyPage(url) {
-         return {
-             url,
-             title: 'Page Skipped',
-             content: '',
-             searchableContent: '',
-             sections: [],
-             navigationLinks: [],
-             headings: [],
-             codeSamples: [],
-             isDocumentation: false,
-         };
-     }
-     /**
-      * Create page with browser installation guide
-      */
-     createPageWithGuide(url, error) {
-         const guide = `
- # Browser Rendering Required
-
- This page appears to be a Single Page Application (SPA) that requires JavaScript rendering.
-
- ## Error
- ${error}
-
- ## Solution
-
- To crawl SPA sites, you need Chrome/Chromium browser installed:
-
- ### macOS
- \`\`\`bash
- brew install --cask google-chrome
- \`\`\`
-
- ### Windows
- \`\`\`bash
- winget install Google.Chrome
- \`\`\`
-
- ### Linux
- \`\`\`bash
- sudo apt install google-chrome-stable
- \`\`\`
-
- ### Alternative: Install puppeteer (includes bundled Chromium)
- \`\`\`bash
- npm install puppeteer
- \`\`\`
-
- ### Alternative: Set browser path
- \`\`\`bash
- export CHROME_PATH=/path/to/chrome
- \`\`\`
-
- See docs/SPA_BROWSER_SETUP.md for detailed instructions.
- `.trim();
-         return {
-             url,
-             title: 'Browser Setup Required',
-             content: guide,
-             searchableContent: guide,
-             sections: [{ title: 'Browser Rendering Required', content: guide }],
-             navigationLinks: [],
-             headings: [{ level: 'h1', text: 'Browser Rendering Required' }],
-             codeSamples: [],
-             isDocumentation: false,
-         };
-     }
-     /**
-      * Cleanup resources (browser, checkpoint, etc.)
-      */
-     async cleanup() {
-         if (this.browserManager) {
-             await this.browserManager.close();
-             this.browserManager = undefined;
-         }
-     }
-     /**
-      * Delay helper for rate limiting
-      */
-     delay(ms) {
-         return new Promise(resolve => setTimeout(resolve, ms));
-     }
- }
- //# sourceMappingURL=doc-crawler.js.map
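
The `DocumentationCrawler` deleted above was the 0.0.26 crawling entry point. As a minimal sketch of the API that disappears with this release, here is a plausible call site reconstructed from the deleted source: the option names and defaults come from the constructor and `crawl()` shown in the hunk, while the deep-dist import paths are an assumption rather than a documented entry point.

```js
// Sketch only: a hypothetical 0.0.26 call site inferred from the deleted code.
// Import paths are assumptions; option names and defaults are from the diff above.
import { HttpClient } from '@gulibs/safe-coder/dist/utils/http-client.js';
import { DocumentationCrawler } from '@gulibs/safe-coder/dist/documentation/doc-crawler.js';

const crawler = new DocumentationCrawler(new HttpClient());
try {
    const result = await crawler.crawl('https://example.com/docs', {
        crawlStrategy: 'bfs',                        // 'dfs' pops from the back of the queue instead
        maxDepth: 3,
        maxPages: 50,
        rateLimit: 500,                              // ms delay between requests
        workers: 1,                                  // >1 takes the Promise.all batch path
        checkpoint: { enabled: true, interval: 10 }, // checkpoint every 10 pages under tmpdir()
        spaStrategy: 'smart',                        // fall back to browser rendering on thin pages
    });
    console.log(`crawled ${result.totalPages} pages`,
        result.abandoned ? `abandoned: ${result.abandonReason}` : 'ok');
}
finally {
    await crawler.cleanup(); // closes the lazily launched BrowserManager, if any
}
```

None of this API survives in 0.0.27; the file list above suggests the functionality moves toward the new `dist/tools/crawl-documentation` and `dist/executor` modules.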