@gulibs/safe-coder 0.0.24 → 0.0.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. package/README.md +351 -15
  2. package/dist/documentation/checkpoint-manager.d.ts +38 -0
  3. package/dist/documentation/checkpoint-manager.d.ts.map +1 -0
  4. package/dist/documentation/checkpoint-manager.js +101 -0
  5. package/dist/documentation/checkpoint-manager.js.map +1 -0
  6. package/dist/documentation/doc-crawler.d.ts +67 -1
  7. package/dist/documentation/doc-crawler.d.ts.map +1 -1
  8. package/dist/documentation/doc-crawler.js +677 -150
  9. package/dist/documentation/doc-crawler.js.map +1 -1
  10. package/dist/documentation/llms-txt/detector.d.ts +31 -0
  11. package/dist/documentation/llms-txt/detector.d.ts.map +1 -0
  12. package/dist/documentation/llms-txt/detector.js +77 -0
  13. package/dist/documentation/llms-txt/detector.js.map +1 -0
  14. package/dist/documentation/llms-txt/downloader.d.ts +30 -0
  15. package/dist/documentation/llms-txt/downloader.d.ts.map +1 -0
  16. package/dist/documentation/llms-txt/downloader.js +84 -0
  17. package/dist/documentation/llms-txt/downloader.js.map +1 -0
  18. package/dist/documentation/llms-txt/index.d.ts +4 -0
  19. package/dist/documentation/llms-txt/index.d.ts.map +1 -0
  20. package/dist/documentation/llms-txt/index.js +4 -0
  21. package/dist/documentation/llms-txt/index.js.map +1 -0
  22. package/dist/documentation/llms-txt/parser.d.ts +43 -0
  23. package/dist/documentation/llms-txt/parser.d.ts.map +1 -0
  24. package/dist/documentation/llms-txt/parser.js +177 -0
  25. package/dist/documentation/llms-txt/parser.js.map +1 -0
  26. package/dist/index.js +0 -0
  27. package/dist/server/mcp-server.d.ts.map +1 -1
  28. package/dist/server/mcp-server.js +31 -3
  29. package/dist/server/mcp-server.js.map +1 -1
  30. package/package.json +10 -11
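Before the file-by-file diff, here is a minimal usage sketch of the options this release adds to the crawler (crawlStrategy, workers, skipLlmsTxt, checkpoint, resume). The option names and defaults are taken from the constructor and crawl() changes shown below; the entry-point import and the exact way crawl() merges caller options into its defaults are assumptions, not documented API.

    // Hedged sketch: option names come from the diff below; the import path is hypothetical.
    import { DocumentationCrawler } from '@gulibs/safe-coder';

    const crawler = new DocumentationCrawler();
    const result = await crawler.crawl('https://example.com/docs', {
      crawlStrategy: 'dfs',                        // 'bfs' (default) or 'dfs'
      workers: 4,                                  // > 1 switches to the parallel batch path
      maxDepth: 3,
      maxPages: 100,
      skipLlmsTxt: false,                          // leave llms.txt detection enabled
      checkpoint: { enabled: true, interval: 10 }, // persist crawl state every 10 pages
      resume: true,                                // reload an existing checkpoint if present
    });
    console.log(`crawled ${result.totalPages} pages`);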
@@ -1,5 +1,10 @@
+ import { HttpClient } from '../utils/http-client.js';
  import { logger } from '../utils/logger.js';
  import { WebDocumentationBrowser } from './web-doc-browser.js';
+ import { LlmsTxtDetector, LlmsTxtDownloader, LlmsTxtParser } from './llms-txt/index.js';
+ import { CheckpointManager } from './checkpoint-manager.js';
+ import { join } from 'path';
+ import { tmpdir } from 'os';
  export class DocumentationCrawler {
  browser;
  visitedUrls;
@@ -9,6 +14,8 @@ export class DocumentationCrawler {
  options;
  baseUrl;
  linkDiscoveryStats;
+ checkpointManager;
+ pagesSinceLastCheckpoint;
  DOCUMENTATION_PATTERNS = [
  /\/docs?\//i,
  /\/documentation/i,
@@ -39,6 +46,7 @@ export class DocumentationCrawler {
  this.crawledPages = [];
  this.errors = [];
  this.options = {
+ crawlStrategy: 'bfs', // Default to breadth-first search
  maxDepth: 3,
  maxPages: 50,
  includePaths: [],
@@ -47,6 +55,8 @@ export class DocumentationCrawler {
  maxRetries: 2, // Default 2 retries
  retryDelay: 1000, // Default 1 second delay before retry
  useBrowserAutomation: false, // Default to HTTP-only for backward compatibility
+ skipLlmsTxt: false, // Enable llms.txt detection by default
+ workers: 1, // Default to single-threaded crawling
  };
  this.baseUrl = new URL('https://example.com');
  this.linkDiscoveryStats = {
@@ -62,15 +72,19 @@ export class DocumentationCrawler {
  pagesDiscovered: 0,
  pagesCrawled: 0,
  };
+ this.pagesSinceLastCheckpoint = 0;
  }
  /**
  * Crawl documentation starting from a root URL
  * Uses HTTP client (axios) exclusively - no browser automation
  * For SPA sites that require JavaScript rendering, use Cursor/Claude's built-in browser tools
+ * Supports both BFS (breadth-first) and DFS (depth-first) crawl strategies
  */
  async crawl(rootUrl, options = {}) {
+ const strategy = options.crawlStrategy || 'bfs';
  logger.info('Starting documentation crawl using HTTP client (axios)', {
  url: rootUrl,
+ strategy,
  method: 'HTTP GET',
  client: 'axios/HttpClient',
  note: 'For SPA sites, use Cursor/Claude browser tools to get rendered content first',
@@ -107,6 +121,27 @@ export class DocumentationCrawler {
  }
  // No longer require documentation-only pages - allow any website with extractable content
  logger.debug('Starting crawl from URL (permissive mode)', { url: rootUrl });
+ // Setup checkpoint manager if enabled
+ if (this.options.checkpoint?.enabled) {
+ const checkpointFile = this.options.checkpoint.file ||
+ join(tmpdir(), `safe-coder-checkpoint-${this.sanitizeFilename(rootUrl)}.json`);
+ this.checkpointManager = new CheckpointManager(checkpointFile);
+ // Try to resume from checkpoint if requested
+ if (this.options.resume) {
+ const loaded = await this.loadCheckpoint();
+ if (loaded) {
+ logger.info('Resumed from checkpoint', {
+ pagesCrawled: this.crawledPages.length,
+ pendingUrls: this.urlQueue.length,
+ visitedUrls: this.visitedUrls.size,
+ });
+ }
+ }
+ }
+ // Try to detect and use llms.txt if available (unless explicitly disabled)
+ if (!this.options.skipLlmsTxt) {
+ await this.tryLlmsTxt(rootUrl);
+ }
  // Detect SPA and provide warning
  try {
  const spaDetection = await this.browser.detectSPA(rootUrl);
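When checkpointing is enabled, the crawler writes its state to a JSON file, by default under the OS temp directory with a name derived from the sanitized root URL. A sketch of that payload, mirroring the checkpointData object assembled in saveCheckpoint() further down in this diff; all values are illustrative.

    // Illustrative checkpoint payload matching the fields saveCheckpoint() writes below.
    // Every value here is made up for the example.
    const exampleCheckpoint = {
      config: { crawlStrategy: 'bfs', maxDepth: 3, maxPages: 50, workers: 1 },
      visitedUrls: ['https://example.com/docs', 'https://example.com/docs/intro'],
      pendingUrls: [{ url: 'https://example.com/docs/api', depth: 1 }],
      pagesCrawled: 2,
      lastUpdated: new Date().toISOString(),
      baseUrl: 'https://example.com/',
    };
    // Default file location, as in the setup code above:
    // join(tmpdir(), `safe-coder-checkpoint-${sanitizeFilename(rootUrl)}.json`)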
@@ -136,144 +171,15 @@ export class DocumentationCrawler {
  // Start crawling from root
  this.urlQueue.push({ url: rootUrl, depth: 0 });
  let maxDepthReached = 0;
- // Process queue
+ // Process queue - use parallel workers if specified
  const startTime = Date.now();
- let lastProgressLog = Date.now();
- const PROGRESS_LOG_INTERVAL = 5000; // Log progress every 5 seconds
- while (this.urlQueue.length > 0 && this.crawledPages.length < this.options.maxPages) {
- const queued = this.urlQueue.shift();
- if (!queued)
- break;
- const { url, depth } = queued;
- // Log progress periodically
- const now = Date.now();
- if (now - lastProgressLog >= PROGRESS_LOG_INTERVAL) {
- const elapsed = ((now - startTime) / 1000).toFixed(1);
- const pagesPerSecond = (this.crawledPages.length / elapsed).toFixed(2);
- logger.info('Crawl progress', {
- pagesCrawled: this.crawledPages.length,
- pagesRemaining: this.urlQueue.length,
- maxPages: this.options.maxPages,
- errors: this.errors.length,
- elapsedSeconds: elapsed,
- pagesPerSecond,
- currentDepth: depth,
- maxDepth: this.options.maxDepth,
- });
- lastProgressLog = now;
- }
- // Skip if already visited
- if (this.visitedUrls.has(url)) {
- continue;
- }
- // Check depth limit
- if (depth > this.options.maxDepth) {
- continue;
- }
- // Mark as visited
- this.visitedUrls.add(url);
- maxDepthReached = Math.max(maxDepthReached, depth);
- try {
- // Crawl the page using HTTP GET with retry logic
- logger.debug('Fetching page via HTTP GET', { url, depth, method: 'HTTP GET', client: 'axios' });
- const page = await this.fetchPageWithRetry(url);
- // Check if page has minimal content (possible SPA issue)
- const contentLength = page.content.length;
- const linksCount = page.navigationLinks.length;
- if (contentLength < 200 && linksCount < 3) {
- logger.warn('Page has minimal content - may be SPA', {
- url,
- contentLength,
- linksCount,
- suggestion: 'This page may require JavaScript rendering. Consider using browser automation tools.',
- });
- }
- // Convert to CrawledPage format
- const crawledPage = {
- url: page.url,
- title: page.title,
- content: page.content,
- depth,
- sections: page.sections,
- navigationLinks: page.navigationLinks,
- headings: page.headings,
- codeSamples: page.codeSamples,
- };
- this.crawledPages.push(crawledPage);
- this.linkDiscoveryStats.pagesCrawled++;
- const totalLinksOnPage = page.navigationLinks.length;
- this.linkDiscoveryStats.totalLinksFound += totalLinksOnPage;
- logger.debug('Page fetched and parsed successfully', {
- url,
- title: page.title.substring(0, 50),
- linksFound: totalLinksOnPage,
- depth,
- });
- // Discover and queue new URLs
- if (depth < this.options.maxDepth) {
- const discoveryResult = this.discoverDocumentationLinks(page, depth + 1);
- const newUrls = discoveryResult.discovered;
- logger.debug('Link discovery completed', {
- url,
- totalLinksOnPage,
- discovered: newUrls.length,
- filtered: discoveryResult.filtered,
- alreadyVisited: discoveryResult.alreadyVisited,
- notContent: discoveryResult.notContent,
- externalDomain: discoveryResult.externalDomain,
- excludedPattern: discoveryResult.excludedPattern,
- queueLengthBefore: this.urlQueue.length,
- });
- let queuedCount = 0;
- let skippedAlreadyVisited = 0;
- for (const newUrl of newUrls) {
- if (!this.visitedUrls.has(newUrl.url)) {
- // Also check if it's already in the queue to avoid duplicates
- const alreadyInQueue = this.urlQueue.some(q => q.url === newUrl.url);
- if (!alreadyInQueue) {
- this.urlQueue.push(newUrl);
- this.linkDiscoveryStats.linksQueued++;
- queuedCount++;
- }
- else {
- skippedAlreadyVisited++;
- }
- }
- else {
- skippedAlreadyVisited++;
- }
- }
- logger.debug('Links queued', {
- url,
- queued: queuedCount,
- skippedAlreadyVisited,
- queueLengthAfter: this.urlQueue.length,
- });
- }
- else {
- this.linkDiscoveryStats.linksFiltered.depthLimit += totalLinksOnPage;
- }
- // Rate limiting
- if (this.options.rateLimit > 0 && this.urlQueue.length > 0) {
- await this.delay(this.options.rateLimit);
- }
- }
- catch (error) {
- const errorMessage = error instanceof Error ? error.message : String(error);
- const errorType = this.classifyError(error);
- this.errors.push({
- url,
- error: `${errorType}: ${errorMessage}`,
- });
- logger.warn('Page crawl failed', {
- url,
- error: errorMessage,
- errorType,
- depth,
- willContinue: true,
- });
- // Continue crawling other pages
- }
+ const workerCount = this.options.workers || 1;
+ if (workerCount > 1) {
+ logger.info('Using parallel crawling', { workers: workerCount });
+ maxDepthReached = await this.crawlWithWorkers(startTime);
+ }
+ else {
+ maxDepthReached = await this.crawlSequential(startTime);
  }
  // Update final statistics
  this.linkDiscoveryStats.pagesDiscovered = this.visitedUrls.size;
@@ -315,6 +221,10 @@ export class DocumentationCrawler {
  suggestion: 'Consider crawling more pages or a different website',
  });
  }
+ // Clear checkpoint after successful completion
+ if (this.checkpointManager && !abandoned) {
+ await this.clearCheckpoint();
+ }
  return {
  pages: this.crawledPages,
  totalPages: this.crawledPages.length,
@@ -325,6 +235,221 @@ export class DocumentationCrawler {
  abandonReason,
  };
  }
+ /**
+ * Sequential crawling (single-threaded)
+ */
+ async crawlSequential(startTime) {
+ let maxDepthReached = 0;
+ let lastProgressLog = Date.now();
+ const PROGRESS_LOG_INTERVAL = 5000; // Log progress every 5 seconds
+ while (this.urlQueue.length > 0 && this.crawledPages.length < this.options.maxPages) {
+ // Use different strategies for getting next URL
+ // BFS: shift() - take from front (queue behavior)
+ // DFS: pop() - take from back (stack behavior)
+ const queued = this.options.crawlStrategy === 'dfs' ? this.urlQueue.pop() : this.urlQueue.shift();
+ if (!queued)
+ break;
+ const { url, depth } = queued;
+ // Log progress periodically
+ const now = Date.now();
+ if (now - lastProgressLog >= PROGRESS_LOG_INTERVAL) {
+ const elapsed = ((now - startTime) / 1000).toFixed(1);
+ const pagesPerSecond = (this.crawledPages.length / elapsed).toFixed(2);
+ logger.info('Crawl progress', {
+ pagesCrawled: this.crawledPages.length,
+ pagesRemaining: this.urlQueue.length,
+ maxPages: this.options.maxPages,
+ errors: this.errors.length,
+ elapsedSeconds: elapsed,
+ pagesPerSecond,
+ currentDepth: depth,
+ maxDepth: this.options.maxDepth,
+ });
+ lastProgressLog = now;
+ }
+ // Skip if already visited
+ if (this.visitedUrls.has(url)) {
+ continue;
+ }
+ // Check depth limit
+ if (depth > this.options.maxDepth) {
+ continue;
+ }
+ // Mark as visited
+ this.visitedUrls.add(url);
+ maxDepthReached = Math.max(maxDepthReached, depth);
+ await this.processPage(url, depth);
+ // Rate limiting
+ if (this.options.rateLimit > 0 && this.urlQueue.length > 0) {
+ await this.delay(this.options.rateLimit);
+ }
+ }
+ return maxDepthReached;
+ }
+ /**
+ * Parallel crawling with multiple workers
+ */
+ async crawlWithWorkers(startTime) {
+ let maxDepthReached = 0;
+ let lastProgressLog = Date.now();
+ const PROGRESS_LOG_INTERVAL = 5000;
+ const workerCount = this.options.workers || 1;
+ while (this.urlQueue.length > 0 && this.crawledPages.length < this.options.maxPages) {
+ // Log progress periodically
+ const now = Date.now();
+ if (now - lastProgressLog >= PROGRESS_LOG_INTERVAL) {
+ const elapsed = ((now - startTime) / 1000).toFixed(1);
+ const pagesPerSecond = (this.crawledPages.length / elapsed).toFixed(2);
+ logger.info('Crawl progress (parallel)', {
+ pagesCrawled: this.crawledPages.length,
+ pagesRemaining: this.urlQueue.length,
+ maxPages: this.options.maxPages,
+ errors: this.errors.length,
+ elapsedSeconds: elapsed,
+ pagesPerSecond,
+ workers: workerCount,
+ });
+ lastProgressLog = now;
+ }
+ // Get batch of URLs to process in parallel
+ const batch = [];
+ const batchSize = Math.min(workerCount, this.urlQueue.length, this.options.maxPages - this.crawledPages.length);
+ for (let i = 0; i < batchSize; i++) {
+ const queued = this.options.crawlStrategy === 'dfs' ? this.urlQueue.pop() : this.urlQueue.shift();
+ if (!queued)
+ break;
+ // Skip if already visited
+ if (this.visitedUrls.has(queued.url)) {
+ continue;
+ }
+ // Check depth limit
+ if (queued.depth > this.options.maxDepth) {
+ continue;
+ }
+ // Mark as visited
+ this.visitedUrls.add(queued.url);
+ maxDepthReached = Math.max(maxDepthReached, queued.depth);
+ batch.push(queued);
+ }
+ if (batch.length === 0) {
+ break;
+ }
+ // Process batch in parallel
+ await Promise.all(batch.map(async (queued) => {
+ await this.processPage(queued.url, queued.depth);
+ // Rate limiting (per worker)
+ if (this.options.rateLimit > 0) {
+ await this.delay(this.options.rateLimit);
+ }
+ }));
+ }
+ return maxDepthReached;
+ }
+ /**
+ * Process a single page (shared by both sequential and parallel crawling)
+ */
+ async processPage(url, depth) {
+ try {
+ // Crawl the page using HTTP GET with retry logic
+ logger.debug('Fetching page via HTTP GET', { url, depth, method: 'HTTP GET', client: 'axios' });
+ const page = await this.fetchPageWithRetry(url);
+ // Check if page has minimal content (possible SPA issue)
+ const contentLength = page.content.length;
+ const linksCount = page.navigationLinks.length;
+ if (contentLength < 200 && linksCount < 3) {
+ logger.warn('Page has minimal content - may be SPA', {
+ url,
+ contentLength,
+ linksCount,
+ suggestion: 'This page may require JavaScript rendering. Consider using browser automation tools.',
+ });
+ }
+ // Convert to CrawledPage format
+ const crawledPage = {
+ url: page.url,
+ title: page.title,
+ content: page.content,
+ depth,
+ sections: page.sections,
+ navigationLinks: page.navigationLinks,
+ headings: page.headings,
+ codeSamples: page.codeSamples,
+ };
+ this.crawledPages.push(crawledPage);
+ this.linkDiscoveryStats.pagesCrawled++;
+ this.pagesSinceLastCheckpoint++;
+ // Save checkpoint if interval reached
+ if (this.checkpointManager && this.options.checkpoint?.enabled) {
+ const interval = this.options.checkpoint.interval || 10;
+ if (this.pagesSinceLastCheckpoint >= interval) {
+ await this.saveCheckpoint();
+ this.pagesSinceLastCheckpoint = 0;
+ }
+ }
+ const totalLinksOnPage = page.navigationLinks.length;
+ this.linkDiscoveryStats.totalLinksFound += totalLinksOnPage;
+ logger.debug('Page fetched and parsed successfully', {
+ url,
+ title: page.title.substring(0, 50),
+ linksFound: totalLinksOnPage,
+ depth,
+ });
+ // Discover and queue new URLs
+ if (depth < this.options.maxDepth) {
+ const discoveryResult = this.discoverDocumentationLinks(page, depth + 1);
+ const newUrls = discoveryResult.discovered;
+ logger.debug('Link discovery completed', {
+ url,
+ totalLinksOnPage,
+ discovered: newUrls.length,
+ filtered: discoveryResult.filtered,
+ });
+ let queuedCount = 0;
+ let skippedAlreadyVisited = 0;
+ for (const newUrl of newUrls) {
+ if (!this.visitedUrls.has(newUrl.url)) {
+ // Also check if it's already in the queue to avoid duplicates
+ const alreadyInQueue = this.urlQueue.some(q => q.url === newUrl.url);
+ if (!alreadyInQueue) {
+ this.urlQueue.push(newUrl);
+ this.linkDiscoveryStats.linksQueued++;
+ queuedCount++;
+ }
+ else {
+ skippedAlreadyVisited++;
+ }
+ }
+ else {
+ skippedAlreadyVisited++;
+ }
+ }
+ logger.debug('Links queued', {
+ url,
+ queued: queuedCount,
+ skippedAlreadyVisited,
+ queueLengthAfter: this.urlQueue.length,
+ });
+ }
+ else {
+ this.linkDiscoveryStats.linksFiltered.depthLimit += totalLinksOnPage;
+ }
+ }
+ catch (error) {
+ const errorMessage = error instanceof Error ? error.message : String(error);
+ const errorType = this.classifyError(error);
+ this.errors.push({
+ url,
+ error: `${errorType}: ${errorMessage}`,
+ });
+ logger.warn('Page crawl failed', {
+ url,
+ error: errorMessage,
+ errorType,
+ depth,
+ willContinue: true,
+ });
+ }
+ }
  /**
  * Discover documentation links from a crawled page
  */
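crawlSequential() and crawlWithWorkers() both draw from the same urlQueue array; the only thing the crawlStrategy option changes is which end feeds the next fetch, shift() for BFS (siblings before children) versus pop() for DFS (newest discovery first). A standalone illustration of that ordering, separate from the crawler itself:

    // Standalone illustration of the queue discipline used above, not the crawler itself.
    // Suppose the root page /a has been processed and linked to /a/1 and /a/2,
    // while its sibling /b is still waiting:
    const queue = [
      { url: '/b', depth: 0 },
      { url: '/a/1', depth: 1 },
      { url: '/a/2', depth: 1 },
    ];

    const nextBfs = queue.slice().shift(); // { url: '/b', depth: 0 }   - breadth-first: finish depth 0 first
    const nextDfs = queue.slice().pop();   // { url: '/a/2', depth: 1 } - depth-first: follow the latest link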
@@ -482,22 +607,48 @@ export class DocumentationCrawler {
  }
  /**
  * Check if crawled content is sufficient for skill generation
- * Similar logic to SkillGenerator but here for early validation
+ * Enhanced with multi-dimensional quality metrics
  */
  canGenerateSkill(pages) {
  if (pages.length === 0) {
  return { canGenerate: false, reason: 'empty_pages' };
  }
+ const metrics = this.evaluateContentQuality(pages);
+ // All pages are media-only
+ if (metrics.mediaOnlyPages === pages.length && !metrics.hasTextContent) {
+ return { canGenerate: false, reason: 'media_only' };
+ }
+ // No pages have sufficient content
+ if (!metrics.hasSufficientContent) {
+ return { canGenerate: false, reason: 'insufficient_content' };
+ }
+ // No structured content (headings, sections)
+ if (!metrics.hasStructuredContent) {
+ return { canGenerate: false, reason: 'no_structured_content' };
+ }
+ return { canGenerate: true };
+ }
+ /**
+ * Evaluate content quality with multi-dimensional metrics
+ */
+ evaluateContentQuality(pages) {
  const MIN_CONTENT_LENGTH = 100;
  let hasSufficientContent = false;
  let hasStructuredContent = false;
  let hasTextContent = false;
  let mediaOnlyCount = 0;
+ let totalContentLength = 0;
+ let totalCodeSamples = 0;
+ // Track content diversity
+ const urlPatterns = new Set();
+ const titlePatterns = new Set();
  for (const page of pages) {
  const contentLength = (page.content || '').trim().length;
  const hasHeadings = page.headings && page.headings.length > 0;
  const hasText = contentLength > 0;
- // Check if page is media-only (has images but no text)
+ totalContentLength += contentLength;
+ totalCodeSamples += (page.codeSamples || []).length;
+ // Check if page is media-only
  const hasImages = /<img[^>]*>/i.test(page.content || '');
  const hasMedia = hasImages || (page.codeSamples && page.codeSamples.length > 0);
  if (hasMedia && contentLength < MIN_CONTENT_LENGTH) {
@@ -512,26 +663,85 @@ export class DocumentationCrawler {
  if (hasText) {
  hasTextContent = true;
  }
+ // Track diversity
+ try {
+ const urlPath = new URL(page.url).pathname;
+ const pathSegments = urlPath.split('/').filter(s => s);
+ if (pathSegments.length > 0) {
+ urlPatterns.add(pathSegments[0]);
+ }
+ }
+ catch {
+ // Invalid URL, skip
+ }
+ // Track title diversity
+ const titleWords = page.title.toLowerCase().split(/\s+/).slice(0, 3);
+ titlePatterns.add(titleWords.join(' '));
  }
- // All pages are media-only
- if (mediaOnlyCount === pages.length && !hasTextContent) {
- return { canGenerate: false, reason: 'media_only' };
- }
- // No pages have sufficient content
- if (!hasSufficientContent) {
- return { canGenerate: false, reason: 'insufficient_content' };
+ // Calculate diversity score (0-1)
+ const contentDiversity = Math.min(1, (urlPatterns.size + titlePatterns.size) / (pages.length * 0.5));
+ // Calculate API coverage score (0-1)
+ const pagesWithCode = pages.filter(p => p.codeSamples && p.codeSamples.length > 0).length;
+ const apiCoverage = pages.length > 0 ? pagesWithCode / pages.length : 0;
+ const avgContentLength = pages.length > 0 ? totalContentLength / pages.length : 0;
+ return {
+ hasSufficientContent,
+ hasStructuredContent,
+ hasTextContent,
+ mediaOnlyPages: mediaOnlyCount,
+ contentDiversity,
+ apiCoverage,
+ avgContentLength,
+ totalCodeSamples,
+ };
+ }
+ /**
+ * Check if should continue crawling based on content quality
+ */
+ shouldContinueCrawling(currentPages, maxPages) {
+ if (currentPages >= maxPages) {
+ return false;
  }
- // No structured content (headings, sections)
- if (!hasStructuredContent) {
- return { canGenerate: false, reason: 'no_structured_content' };
+ // Evaluate quality every 10 pages
+ if (currentPages % 10 === 0 && currentPages > 0) {
+ const metrics = this.evaluateContentQuality(this.crawledPages);
+ // High quality content - can stop early if we have enough
+ if (metrics.hasSufficientContent &&
+ metrics.contentDiversity > 0.7 &&
+ metrics.apiCoverage > 0.5 &&
+ currentPages >= maxPages * 0.5) {
+ logger.info('High quality content detected, considering early stop', {
+ currentPages,
+ maxPages,
+ diversity: metrics.contentDiversity.toFixed(2),
+ apiCoverage: metrics.apiCoverage.toFixed(2),
+ });
+ // Continue but log the possibility
+ }
+ // Low quality warning
+ if (currentPages >= maxPages * 0.8 && !metrics.hasSufficientContent) {
+ logger.warn('Approaching page limit but content quality is low', {
+ currentPages,
+ maxPages,
+ diversity: metrics.contentDiversity.toFixed(2),
+ apiCoverage: metrics.apiCoverage.toFixed(2),
+ suggestion: 'Consider increasing maxPages or refining includePaths',
+ });
+ }
  }
- return { canGenerate: true };
+ return currentPages < maxPages;
  }
  /**
  * Fetch a page with retry logic
+ * Supports both HTML pages and Markdown files
  */
  async fetchPageWithRetry(url, retryCount = 0) {
  try {
+ // Check if this is a Markdown file
+ if (url.endsWith('.md') || url.includes('.md?') || url.includes('.md#')) {
+ return await this.extractMarkdownContent(url);
+ }
+ // Regular HTML page
  return await this.browser.browsePage(url);
  }
  catch (error) {
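The quality metrics added above reduce to two scores: contentDiversity = min(1, (distinct first URL path segments + distinct three-word title prefixes) / (pages * 0.5)), and apiCoverage = the fraction of pages carrying code samples. A short worked example with illustrative numbers:

    // Worked example of the two scores defined in evaluateContentQuality() (numbers are illustrative).
    const pagesCount = 10;
    const urlPatterns = new Set(['docs', 'guides', 'api']);                        // 3 distinct first path segments
    const titlePatterns = new Set(['getting started with', 'api reference for']);  // 2 distinct title prefixes
    const pagesWithCode = 6;

    const contentDiversity = Math.min(1, (urlPatterns.size + titlePatterns.size) / (pagesCount * 0.5)); // (3 + 2) / 5 = 1
    const apiCoverage = pagesWithCode / pagesCount;                                                     // 0.6
    // With hasSufficientContent, diversity > 0.7 and coverage > 0.5, shouldContinueCrawling()
    // logs its "high quality" message once at least half of maxPages has been crawled.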
@@ -553,6 +763,166 @@ export class DocumentationCrawler {
  throw error;
  }
  }
+ /**
+ * Extract content from Markdown file
+ * Converts Markdown structure to WebDocumentationPage format
+ */
+ async extractMarkdownContent(url) {
+ logger.debug('Extracting Markdown content', { url });
+ // Fetch raw markdown content
+ const httpClient = new HttpClient();
+ const response = await httpClient.get(url, {
+ responseType: 'text',
+ timeout: 30000,
+ });
+ const markdownContent = response.data;
+ // Parse markdown structure
+ const parsed = this.parseMarkdown(markdownContent, url);
+ return {
+ url,
+ title: parsed.title,
+ content: parsed.content,
+ searchableContent: parsed.content, // Add searchable content for consistency
+ sections: parsed.sections,
+ navigationLinks: parsed.links,
+ headings: parsed.headings,
+ codeSamples: parsed.codeSamples,
+ isDocumentation: true,
+ };
+ }
+ /**
+ * Parse Markdown content into structured data
+ */
+ parseMarkdown(content, url) {
+ const lines = content.split('\n');
+ let title = '';
+ const headings = [];
+ const codeSamples = [];
+ const sections = [];
+ const links = [];
+ const contentLines = [];
+ // Extract title from first h1
+ for (const line of lines) {
+ if (line.startsWith('# ')) {
+ title = line.substring(2).trim();
+ break;
+ }
+ }
+ // Extract headings (h2-h6)
+ const headingRegex = /^(#{2,6})\s+(.+)$/;
+ for (const line of lines) {
+ const match = line.match(headingRegex);
+ if (match) {
+ const level = match[1].length;
+ const text = match[2].trim();
+ const id = text.toLowerCase().replace(/[^\w\s-]/g, '').replace(/\s+/g, '-');
+ headings.push({
+ level: `h${level}`,
+ text,
+ id,
+ });
+ }
+ }
+ // Extract code blocks
+ const codeBlockRegex = /```(\w+)?\n([\s\S]*?)```/g;
+ let match;
+ while ((match = codeBlockRegex.exec(content)) !== null) {
+ const language = match[1] || 'text';
+ const code = match[2].trim();
+ if (code.length > 10) {
+ codeSamples.push({
+ code,
+ language,
+ });
+ }
+ }
+ // Extract content (remove code blocks and headings)
+ let contentWithoutCode = content.replace(codeBlockRegex, '');
+ contentWithoutCode = contentWithoutCode.replace(/^#{1,6}\s+.+$/gm, '');
+ for (const para of contentWithoutCode.split('\n\n')) {
+ const trimmed = para.trim();
+ if (trimmed.length > 20) {
+ contentLines.push(trimmed);
+ }
+ }
+ // Extract links (markdown format)
+ const linkRegex = /\[([^\]]*)\]\(([^)]+)\)/g;
+ while ((match = linkRegex.exec(content)) !== null) {
+ const text = match[1];
+ const linkUrl = match[2].trim();
+ // Skip anchors
+ if (linkUrl.startsWith('#')) {
+ continue;
+ }
+ // Resolve relative URLs
+ let absoluteUrl;
+ try {
+ if (linkUrl.startsWith('http://') || linkUrl.startsWith('https://')) {
+ absoluteUrl = linkUrl;
+ }
+ else {
+ absoluteUrl = new URL(linkUrl, url).href;
+ }
+ // Remove fragment
+ absoluteUrl = absoluteUrl.split('#')[0];
+ // Only include .md URLs to avoid client-side rendered HTML pages
+ if (absoluteUrl.endsWith('.md') || absoluteUrl.includes('.md?')) {
+ const linkOrigin = new URL(absoluteUrl).origin;
+ const baseOrigin = this.baseUrl.origin;
+ links.push({
+ text,
+ url: absoluteUrl,
+ isInternal: linkOrigin === baseOrigin,
+ });
+ }
+ }
+ catch (error) {
+ // Invalid URL, skip
+ logger.debug('Invalid URL in markdown link', { url: linkUrl });
+ }
+ }
+ // Build sections from headings
+ let currentSection = null;
+ let currentContent = [];
+ for (const line of lines) {
+ const headerMatch = line.match(headingRegex);
+ if (headerMatch) {
+ // Save previous section
+ if (currentSection) {
+ currentSection.content = currentContent.join('\n').trim();
+ if (currentSection.content.length > 0) {
+ sections.push(currentSection);
+ }
+ }
+ // Start new section
+ const text = headerMatch[2].trim();
+ currentSection = {
+ title: text,
+ content: '',
+ anchor: text.toLowerCase().replace(/[^\w\s-]/g, '').replace(/\s+/g, '-'),
+ };
+ currentContent = [];
+ }
+ else if (currentSection) {
+ currentContent.push(line);
+ }
+ }
+ // Save last section
+ if (currentSection) {
+ currentSection.content = currentContent.join('\n').trim();
+ if (currentSection.content.length > 0) {
+ sections.push(currentSection);
+ }
+ }
+ return {
+ title: title || 'Untitled',
+ content: contentLines.join('\n\n'),
+ headings,
+ codeSamples,
+ sections,
+ links,
+ };
+ }
  /**
  * Classify error type for better error messages
  */
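The Markdown path above relies on three patterns: /^(#{2,6})\s+(.+)$/ for headings, a fenced-code-block regex for samples, and /\[([^\]]*)\]\(([^)]+)\)/g for links, keeping only links that resolve to .md URLs. Running the same patterns standalone on a small invented sample shows what gets extracted:

    // Running the same extraction rules as parseMarkdown() on an invented sample document.
    const sample = [
      '# Widget SDK',
      '## Install',
      'Run the installer, then import the client.',
      '```js',
      'import { Widget } from "widget-sdk";',
      '```',
      'See the [API reference](./api.md) and the [changelog](https://example.com/changelog.html).',
    ].join('\n');

    const headingRegex = /^(#{2,6})\s+(.+)$/;
    const codeBlockRegex = /```(\w+)?\n([\s\S]*?)```/g;
    const linkRegex = /\[([^\]]*)\]\(([^)]+)\)/g;

    console.log(sample.split('\n').filter(l => headingRegex.test(l)));  // ['## Install']
    console.log([...sample.matchAll(codeBlockRegex)].map(m => m[1]));   // ['js']
    console.log([...sample.matchAll(linkRegex)].map(m => m[2]));        // ['./api.md', 'https://example.com/changelog.html']
    // Only './api.md' would survive the crawler's ".md"-only filter; the .html link is dropped.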
@@ -625,6 +995,163 @@ export class DocumentationCrawler {
  }
  return breakdown;
  }
+ /**
+ * Try to detect and use llms.txt for optimized crawling
+ */
+ async tryLlmsTxt(rootUrl) {
+ logger.info('Checking for llms.txt files', { url: rootUrl });
+ try {
+ const detector = new LlmsTxtDetector(rootUrl);
+ const variants = await detector.detectAll();
+ if (variants.length === 0) {
+ logger.info('No llms.txt files found, proceeding with normal crawl');
+ return;
+ }
+ logger.info('Found llms.txt variants', {
+ count: variants.length,
+ variants: variants.map(v => v.variant),
+ });
+ // Download all variants
+ const downloader = new LlmsTxtDownloader();
+ const downloaded = await downloader.downloadAll(variants);
+ if (downloaded.length === 0) {
+ logger.warn('Failed to download any llms.txt variants');
+ return;
+ }
+ // Use the largest variant (most comprehensive)
+ const largest = downloaded.reduce((prev, current) => current.size > prev.size ? current : prev);
+ logger.info('Using llms.txt for URL extraction', {
+ variant: largest.variant,
+ size: largest.size,
+ });
+ // Parse URLs from llms.txt
+ const parser = new LlmsTxtParser(largest.content, rootUrl);
+ const extractedUrls = parser.extractUrls();
+ if (extractedUrls.length > 0) {
+ logger.info('Extracted URLs from llms.txt', {
+ count: extractedUrls.length,
+ });
+ // Add URLs to queue with depth 0
+ for (const url of extractedUrls) {
+ if (this.isValidUrl(url) && !this.visitedUrls.has(url)) {
+ this.urlQueue.push({ url, depth: 0 });
+ }
+ }
+ logger.info('Added llms.txt URLs to crawl queue', {
+ added: this.urlQueue.length,
+ });
+ }
+ else {
+ logger.info('No URLs extracted from llms.txt, using normal crawl');
+ }
+ }
+ catch (error) {
+ const errorMessage = error instanceof Error ? error.message : String(error);
+ logger.warn('llms.txt detection failed, continuing with normal crawl', {
+ error: errorMessage,
+ });
+ // Continue with normal crawling if llms.txt fails
+ }
+ }
+ /**
+ * Check if a URL is valid for crawling
+ */
+ isValidUrl(url) {
+ try {
+ const parsed = new URL(url);
+ // Must be same origin as base URL
+ if (parsed.origin !== this.baseUrl.origin) {
+ return false;
+ }
+ // Must be http or https
+ if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
+ return false;
+ }
+ return true;
+ }
+ catch {
+ return false;
+ }
+ }
+ /**
+ * Save checkpoint
+ */
+ async saveCheckpoint() {
+ if (!this.checkpointManager) {
+ return;
+ }
+ const checkpointData = {
+ config: this.options,
+ visitedUrls: Array.from(this.visitedUrls),
+ pendingUrls: this.urlQueue,
+ pagesCrawled: this.crawledPages.length,
+ lastUpdated: new Date().toISOString(),
+ baseUrl: this.baseUrl.href,
+ };
+ try {
+ await this.checkpointManager.saveCheckpoint(checkpointData);
+ }
+ catch (error) {
+ logger.warn('Failed to save checkpoint', {
+ error: error instanceof Error ? error.message : String(error),
+ });
+ }
+ }
+ /**
+ * Load checkpoint and restore state
+ */
+ async loadCheckpoint() {
+ if (!this.checkpointManager) {
+ return false;
+ }
+ try {
+ const data = await this.checkpointManager.loadCheckpoint();
+ if (!data) {
+ logger.info('No checkpoint found to resume from');
+ return false;
+ }
+ // Restore state
+ this.visitedUrls = new Set(data.visitedUrls);
+ this.urlQueue = data.pendingUrls;
+ // Note: crawledPages are not restored as they will be regenerated
+ logger.info('State restored from checkpoint', {
+ visitedUrls: this.visitedUrls.size,
+ pendingUrls: this.urlQueue.length,
+ lastUpdated: data.lastUpdated,
+ });
+ return true;
+ }
+ catch (error) {
+ logger.warn('Failed to load checkpoint', {
+ error: error instanceof Error ? error.message : String(error),
+ });
+ return false;
+ }
+ }
+ /**
+ * Clear checkpoint after successful crawl
+ */
+ async clearCheckpoint() {
+ if (this.checkpointManager) {
+ try {
+ await this.checkpointManager.clearCheckpoint();
+ }
+ catch (error) {
+ logger.debug('Failed to clear checkpoint', {
+ error: error instanceof Error ? error.message : String(error),
+ });
+ }
+ }
+ }
+ /**
+ * Sanitize filename for checkpoint
+ */
+ sanitizeFilename(url) {
+ return url
+ .replace(/[^a-z0-9]/gi, '-')
+ .replace(/-+/g, '-')
+ .substring(0, 64);
+ }
  /**
  * Delay helper for rate limiting
  */