@gulibs/safe-coder 0.0.22 → 0.0.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -25,6 +25,10 @@
 - Generates structured Agent Skill output (Markdown format)
 - Supports depth limits, page limits, and rate throttling
 - Automatically organizes content into a table of contents and sections
+- **SPA detection**: automatically detects single-page applications and offers suggestions
+- **Smart retry**: automatically retries transient errors to improve the success rate
+- **Error classification**: detailed error-type statistics and analysis
+- **Progress monitoring**: real-time crawl progress and performance metrics
 
 ### Web documentation browsing
 
@@ -376,6 +380,17 @@ pwd
 
 After configuration, you can use the following tools in Cursor's AI chat:
 
+**Available tools:**
+1. `get_documentation` - Fetch documentation
+2. `browse_documentation` - Browse web documentation
+3. `crawl_documentation` - Crawl documentation and generate a skill
+4. `detect_spa` - Detect single-page applications
+5. `detect_errors` - Detect code errors
+6. `validate_code` - Validate code
+7. `resolve_error` - Resolve errors
+8. `get_status` - Get server status
+9. `get_version` - Get the version number
+
 #### `get_documentation` - Fetch documentation
 
 Fetches documentation for a library or package.
@@ -431,9 +446,17 @@ pwd
 - `includePaths` (optional): additional path patterns to include
 - `excludePaths` (optional): path patterns to exclude
 - `rateLimit` (optional): interval between requests in milliseconds (default: 500)
+- `maxRetries` (optional): maximum number of retries for a failed request (default: 2)
+- `retryDelay` (optional): delay before retrying, in milliseconds (default: 1000)
 - `outputDir` (optional): directory to save the skill file to (if omitted, only the content is returned)
 - `filename` (optional): custom filename (without extension)
 
+**New features:**
+- ✅ **SPA detection**: automatically detects single-page applications (SPAs) and offers suggestions
+- ✅ **Smart retry**: automatically retries transient errors (timeouts, network errors, etc.)
+- ✅ **Error classification**: detailed error-type classification and statistics
+- ✅ **Progress logging**: real-time crawl progress and statistics
+
 **Usage example:**
 
 ```
@@ -507,6 +530,25 @@ function test() {
 I ran into an undefined-variable error with the message "Variable 'x' is used but not defined" on line 5
 ```
 
+#### `detect_spa` - Detect single-page applications
+
+Detects whether a website is a single-page application (SPA) that requires JavaScript rendering.
+
+**Parameters:**
+- `url` (required): the URL to check
+
+**Usage example:**
+
+```
+Check whether https://react-dnd.github.io/react-dnd/docs is a SPA
+```
+
+**Returns:**
+- `isSPA`: whether the site is a SPA
+- `confidence`: detection confidence (high/medium/low)
+- `indicators`: the SPA indicators that were detected
+- `suggestion`: a suggestion (if a SPA was detected)
+
 ### 4. Using MCP resources
 
 #### `safe-coder://documentation` - Cached documentation
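The documented return values above correspond to a small result object. A minimal sketch in TypeScript, where the type name `SpaDetectionResult` and the `string[]` type for `indicators` are our assumptions, not names from the package:

```ts
// Sketch of the detect_spa result shape documented above. The interface name
// and the exact element type of `indicators` are illustrative assumptions.
interface SpaDetectionResult {
  isSPA: boolean;
  confidence: 'high' | 'medium' | 'low';
  indicators: string[];  // assumed: list of detected SPA signals
  suggestion?: string;   // present when a SPA is detected
}
```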
@@ -5,6 +5,9 @@ export interface CrawlOptions {
     includePaths?: string[];
     excludePaths?: string[];
     rateLimit?: number;
+    maxRetries?: number;
+    retryDelay?: number;
+    useBrowserAutomation?: boolean;
 }
 export interface CrawledPage {
     url: string;
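A minimal sketch of populating the widened `CrawlOptions`, with default values taken from the README text above; the import path and the `excludePaths` value are illustrative:

```ts
import type { CrawlOptions } from './doc-crawler'; // path assumed for illustration

const options: CrawlOptions = {
    rateLimit: 500,              // ms between requests (documented default)
    maxRetries: 2,               // new in 0.0.24: retries for transient failures
    retryDelay: 1000,            // new in 0.0.24: base delay before a retry, in ms
    useBrowserAutomation: false, // new in 0.0.24: HTTP-only remains the default
    excludePaths: ['/blog'],     // hypothetical exclusion pattern
};
```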
@@ -34,7 +37,7 @@ export interface CrawledPage {
 export interface LinkDiscoveryStats {
     totalLinksFound: number;
     linksFiltered: {
-        notDocumentation: number;
+        notContent: number;
         externalDomain: number;
         alreadyVisited: number;
         excludedPattern: number;
@@ -44,6 +47,7 @@ export interface LinkDiscoveryStats {
     pagesDiscovered: number;
     pagesCrawled: number;
 }
+export type AbandonReason = 'insufficient_content' | 'media_only' | 'empty_pages' | 'no_structured_content';
 export interface CrawlResult {
     pages: CrawledPage[];
     totalPages: number;
@@ -53,6 +57,8 @@
         error: string;
     }>;
     linkDiscoveryStats: LinkDiscoveryStats;
+    abandoned?: boolean;
+    abandonReason?: AbandonReason;
 }
 export declare class DocumentationCrawler {
     private browser;
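How a caller might consume the new abandonment signal; a sketch that assumes a `crawl(rootUrl, options)` entry point on `DocumentationCrawler`, since the public method is not readable in this hunk:

```ts
// Sketch: reacting to the new abandoned/abandonReason fields on CrawlResult.
// The constructor call, crawl() signature, and URL are assumptions.
const crawler = new DocumentationCrawler();
const result = await crawler.crawl('https://example.com/docs', { maxRetries: 2 });
if (result.abandoned) {
    // abandonReason is one of the AbandonReason union members above
    console.warn(`Skill generation skipped: ${result.abandonReason}`);
}
```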
@@ -77,13 +83,34 @@ export declare class DocumentationCrawler {
      */
     private discoverDocumentationLinks;
     /**
-     * Check if a path is a documentation path
+     * Check if a path should be crawled (permissive - only exclude clearly non-content paths)
      */
     private isDocumentationPath;
     /**
      * Check if a path should be excluded
      */
     private shouldExclude;
+    /**
+     * Check if crawled content is sufficient for skill generation
+     * Similar logic to SkillGenerator but here for early validation
+     */
+    private canGenerateSkill;
+    /**
+     * Fetch a page with retry logic
+     */
+    private fetchPageWithRetry;
+    /**
+     * Classify error type for better error messages
+     */
+    private classifyError;
+    /**
+     * Check if an error is retryable
+     */
+    private isRetryableError;
+    /**
+     * Get error breakdown by type
+     */
+    private getErrorBreakdown;
     /**
      * Delay helper for rate limiting
      */
@@ -1 +1 @@
-{"version":3,"file":"doc-crawler.d.ts","sourceRoot":"","sources":["../../src/documentation/doc-crawler.ts"],"names":[],"mappings":"…"}
+{"version":3,"file":"doc-crawler.d.ts","sourceRoot":"","sources":["../../src/documentation/doc-crawler.ts"],"names":[],"mappings":"…"}
@@ -44,12 +44,15 @@ export class DocumentationCrawler {
             includePaths: [],
             excludePaths: [],
             rateLimit: 500, // 500ms default delay
+            maxRetries: 2, // Default 2 retries
+            retryDelay: 1000, // Default 1 second delay before retry
+            useBrowserAutomation: false, // Default to HTTP-only for backward compatibility
         };
         this.baseUrl = new URL('https://example.com');
         this.linkDiscoveryStats = {
             totalLinksFound: 0,
             linksFiltered: {
-                notDocumentation: 0,
+                notContent: 0,
                 externalDomain: 0,
                 alreadyVisited: 0,
                 excludedPattern: 0,
@@ -80,7 +83,7 @@
         this.linkDiscoveryStats = {
             totalLinksFound: 0,
             linksFiltered: {
-                notDocumentation: 0,
+                notContent: 0,
                 externalDomain: 0,
                 alreadyVisited: 0,
                 excludedPattern: 0,
@@ -102,22 +105,63 @@
         catch (error) {
             throw new Error(`Invalid root URL: ${rootUrl}`);
         }
-        // Check if root URL is documentation
-        logger.debug('Checking if URL is documentation page (HTTP request)', { url: rootUrl });
-        const isDoc = await this.browser.isDocumentationPage(rootUrl);
-        if (!isDoc) {
-            throw new Error(`The provided URL does not appear to be a documentation page: ${rootUrl}\n` +
-                `Note: For SPA sites that require JavaScript rendering, use Cursor/Claude's browser tools to get rendered HTML first, then process it.`);
+        // No longer require documentation-only pages - allow any website with extractable content
+        logger.debug('Starting crawl from URL (permissive mode)', { url: rootUrl });
+        // Detect SPA and provide warning
+        try {
+            const spaDetection = await this.browser.detectSPA(rootUrl);
+            if (spaDetection.isSPA && spaDetection.confidence !== 'low') {
+                logger.warn('SPA detected at root URL - crawling may be limited', {
+                    url: rootUrl,
+                    confidence: spaDetection.confidence,
+                    indicators: spaDetection.indicators,
+                    suggestion: spaDetection.suggestion,
+                });
+                // Add warning to first page if SPA detected
+                if (spaDetection.suggestion) {
+                    logger.info('SPA Detection Warning', {
+                        message: spaDetection.suggestion,
+                        recommendation: 'Consider using browser automation tools to get fully rendered content before crawling.',
+                    });
+                }
+            }
+        }
+        catch (error) {
+            // SPA detection failure is not critical, continue crawling
+            logger.debug('SPA detection failed, continuing with crawl', {
+                url: rootUrl,
+                error: error instanceof Error ? error.message : String(error),
+            });
         }
         // Start crawling from root
         this.urlQueue.push({ url: rootUrl, depth: 0 });
         let maxDepthReached = 0;
         // Process queue
+        const startTime = Date.now();
+        let lastProgressLog = Date.now();
+        const PROGRESS_LOG_INTERVAL = 5000; // Log progress every 5 seconds
         while (this.urlQueue.length > 0 && this.crawledPages.length < this.options.maxPages) {
             const queued = this.urlQueue.shift();
             if (!queued)
                 break;
             const { url, depth } = queued;
+            // Log progress periodically
+            const now = Date.now();
+            if (now - lastProgressLog >= PROGRESS_LOG_INTERVAL) {
+                const elapsed = ((now - startTime) / 1000).toFixed(1);
+                const pagesPerSecond = (this.crawledPages.length / elapsed).toFixed(2);
+                logger.info('Crawl progress', {
+                    pagesCrawled: this.crawledPages.length,
+                    pagesRemaining: this.urlQueue.length,
+                    maxPages: this.options.maxPages,
+                    errors: this.errors.length,
+                    elapsedSeconds: elapsed,
+                    pagesPerSecond,
+                    currentDepth: depth,
+                    maxDepth: this.options.maxDepth,
+                });
+                lastProgressLog = now;
+            }
             // Skip if already visited
             if (this.visitedUrls.has(url)) {
                 continue;
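The progress block added above follows a standard throttled-logging pattern; a self-contained restatement, with illustrative names:

```ts
// Throttled progress logging, as in the loop above: log at most once per
// interval from inside a hot loop, tracking the last log timestamp.
const PROGRESS_LOG_INTERVAL = 5000; // ms
let lastProgressLog = Date.now();

function maybeLogProgress(pagesCrawled: number, queueLength: number): void {
    const now = Date.now();
    if (now - lastProgressLog < PROGRESS_LOG_INTERVAL) return;
    console.info(`progress: crawled=${pagesCrawled}, queued=${queueLength}`);
    lastProgressLog = now;
}
```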
@@ -130,9 +174,20 @@ export class DocumentationCrawler {
             this.visitedUrls.add(url);
             maxDepthReached = Math.max(maxDepthReached, depth);
             try {
-                // Crawl the page using HTTP GET
+                // Crawl the page using HTTP GET with retry logic
                 logger.debug('Fetching page via HTTP GET', { url, depth, method: 'HTTP GET', client: 'axios' });
-                const page = await this.browser.browsePage(url);
+                const page = await this.fetchPageWithRetry(url);
+                // Check if page has minimal content (possible SPA issue)
+                const contentLength = page.content.length;
+                const linksCount = page.navigationLinks.length;
+                if (contentLength < 200 && linksCount < 3) {
+                    logger.warn('Page has minimal content - may be SPA', {
+                        url,
+                        contentLength,
+                        linksCount,
+                        suggestion: 'This page may require JavaScript rendering. Consider using browser automation tools.',
+                    });
+                }
                 // Convert to CrawledPage format
                 const crawledPage = {
                     url: page.url,
@@ -164,7 +219,7 @@ export class DocumentationCrawler {
                     discovered: newUrls.length,
                     filtered: discoveryResult.filtered,
                     alreadyVisited: discoveryResult.alreadyVisited,
-                    notDocumentation: discoveryResult.notDocumentation,
+                    notContent: discoveryResult.notContent,
                     externalDomain: discoveryResult.externalDomain,
                     excludedPattern: discoveryResult.excludedPattern,
                     queueLengthBefore: this.urlQueue.length,
@@ -204,20 +259,40 @@ export class DocumentationCrawler {
                 }
             }
             catch (error) {
+                const errorMessage = error instanceof Error ? error.message : String(error);
+                const errorType = this.classifyError(error);
                 this.errors.push({
                     url,
-                    error: error instanceof Error ? error.message : String(error),
+                    error: `${errorType}: ${errorMessage}`,
+                });
+                logger.warn('Page crawl failed', {
+                    url,
+                    error: errorMessage,
+                    errorType,
+                    depth,
+                    willContinue: true,
                 });
                 // Continue crawling other pages
             }
         }
         // Update final statistics
         this.linkDiscoveryStats.pagesDiscovered = this.visitedUrls.size;
-        // Log crawl completion with statistics
+        // Calculate final statistics
+        const totalTime = ((Date.now() - startTime) / 1000).toFixed(2);
+        const avgTimePerPage = this.crawledPages.length > 0
+            ? ((Date.now() - startTime) / this.crawledPages.length / 1000).toFixed(2)
+            : '0';
+        const successRate = this.linkDiscoveryStats.pagesDiscovered > 0
+            ? ((this.crawledPages.length / this.linkDiscoveryStats.pagesDiscovered) * 100).toFixed(1)
+            : '0';
+        // Log crawl completion with comprehensive statistics
         logger.info('Documentation crawl completed using HTTP client (axios)', {
             totalPages: this.crawledPages.length,
             maxDepthReached,
             errors: this.errors.length,
+            totalTimeSeconds: totalTime,
+            avgTimePerPageSeconds: avgTimePerPage,
+            successRate: `${successRate}%`,
             method: 'HTTP GET',
             client: 'axios/HttpClient',
             linkStats: {
@@ -227,13 +302,27 @@ export class DocumentationCrawler {
                 pagesDiscovered: this.linkDiscoveryStats.pagesDiscovered,
                 pagesCrawled: this.linkDiscoveryStats.pagesCrawled,
             },
+            errorBreakdown: this.getErrorBreakdown(),
         });
+        // Validate if content is sufficient for skill generation
+        const validation = this.canGenerateSkill(this.crawledPages);
+        const abandoned = !validation.canGenerate;
+        const abandonReason = validation.reason;
+        if (abandoned) {
+            logger.warn('Crawl completed but content is insufficient for skill generation', {
+                reason: abandonReason,
+                pagesCrawled: this.crawledPages.length,
+                suggestion: 'Consider crawling more pages or a different website',
+            });
+        }
         return {
             pages: this.crawledPages,
             totalPages: this.crawledPages.length,
             maxDepthReached,
             errors: this.errors,
             linkDiscoveryStats: this.linkDiscoveryStats,
+            abandoned,
+            abandonReason,
         };
     }
     /**
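A quick worked example of the completion metrics computed above, with hypothetical numbers:

```ts
// Hypothetical crawl: 30 pages crawled out of 36 discovered in 42.5 seconds.
const elapsedMs = 42500;
const pagesCrawled = 30;
const pagesDiscovered = 36;

const totalTime = (elapsedMs / 1000).toFixed(2);                         // "42.50"
const avgTimePerPage = (elapsedMs / pagesCrawled / 1000).toFixed(2);     // "1.42"
const successRate = ((pagesCrawled / pagesDiscovered) * 100).toFixed(1); // "83.3"
```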
@@ -242,7 +331,7 @@ export class DocumentationCrawler {
     discoverDocumentationLinks(page, nextDepth) {
         const discovered = [];
         const filtered = {
-            notDocumentation: 0,
+            notContent: 0, // Renamed from notDocumentation
             externalDomain: 0,
             alreadyVisited: 0,
             excludedPattern: 0,
@@ -273,11 +362,11 @@ export class DocumentationCrawler {
                 linkDetails.push({ url: link.url, reason: 'already_visited' });
                 continue;
             }
-            // Check if it's a documentation path
+            // Check if it's a valid content path (permissive - only exclude clearly non-content)
             if (!this.isDocumentationPath(linkUrl.pathname)) {
-                filtered.notDocumentation++;
-                this.linkDiscoveryStats.linksFiltered.notDocumentation++;
-                linkDetails.push({ url: link.url, reason: 'not_documentation_path', pathname: linkUrl.pathname });
+                filtered.notContent++;
+                this.linkDiscoveryStats.linksFiltered.notContent++;
+                linkDetails.push({ url: link.url, reason: 'not_content_path', pathname: linkUrl.pathname });
                 continue;
             }
             // Check exclude patterns
@@ -291,8 +380,8 @@ export class DocumentationCrawler {
             if (this.options.includePaths.length > 0) {
                 const matchesInclude = this.options.includePaths.some(pattern => linkUrl.pathname.includes(pattern));
                 if (!matchesInclude) {
-                    filtered.notDocumentation++;
-                    this.linkDiscoveryStats.linksFiltered.notDocumentation++;
+                    filtered.notContent++;
+                    this.linkDiscoveryStats.linksFiltered.notContent++;
                     linkDetails.push({ url: link.url, reason: 'not_in_include_paths', pathname: linkUrl.pathname });
                     continue;
                 }
@@ -333,7 +422,7 @@ export class DocumentationCrawler {
             totalLinks: page.navigationLinks.length,
             discovered: discovered.length,
             filtered: {
-                notDocumentation: filtered.notDocumentation,
+                notContent: filtered.notContent,
                 externalDomain: filtered.externalDomain,
                 alreadyVisited: filtered.alreadyVisited,
                 excludedPattern: filtered.excludedPattern,
@@ -348,52 +437,42 @@ export class DocumentationCrawler {
             discovered,
             filtered,
             alreadyVisited: filtered.alreadyVisited,
-            notDocumentation: filtered.notDocumentation,
+            notContent: filtered.notContent,
             externalDomain: filtered.externalDomain,
             excludedPattern: filtered.excludedPattern,
         };
     }
     /**
-     * Check if a path is a documentation path
+     * Check if a path should be crawled (permissive - only exclude clearly non-content paths)
      */
     isDocumentationPath(pathname) {
-        // Check against documentation patterns
-        const matchesPattern = this.DOCUMENTATION_PATTERNS.some(pattern => pattern.test(pathname));
-        // If it matches a pattern, it's definitely documentation
-        if (matchesPattern) {
+        // Exclude clearly non-content pages
+        if (this.shouldExclude(pathname)) {
+            return false;
+        }
+        // Exclude static resources
+        const looksLikeStaticResource = /\.(?:css|js|json|xml|png|jpg|jpeg|gif|svg|ico|woff|woff2|ttf|eot|pdf|zip|exe|dmg)$/i.test(pathname);
+        if (looksLikeStaticResource) {
+            return false;
+        }
+        // Exclude API endpoints that are clearly not content (unless they're documentation APIs)
+        // Keep API endpoints that might be documentation (e.g., /api/docs, /docs/api)
+        const looksLikeApiEndpoint = /^\/api\/[^/]+$/i.test(pathname);
+        if (looksLikeApiEndpoint && !pathname.includes('/docs') && !pathname.includes('/documentation')) {
+            return false;
+        }
+        // Allow root path
+        if (pathname === '/' || pathname === '') {
             return true;
         }
-        // Additional check: if the base URL is a documentation page (which we verified at start),
-        // then paths on the same domain are likely documentation too (unless they match excluded patterns)
-        // This helps with sites that have documentation at root level or non-standard paths
-        // Only apply this if the path doesn't match excluded patterns
-        if (!this.shouldExclude(pathname)) {
-            // If pathname is just "/" or empty, it's the root - check if base URL was documentation
-            if (pathname === '/' || pathname === '') {
-                return true; // Root of a documentation site is documentation
-            }
-            // For documentation sites, be more permissive:
-            // 1. If path contains common documentation keywords
-            // 2. If path looks like a documentation structure (no file extensions like .html, .php, etc.)
-            // 3. If path doesn't look like an API endpoint or static resource
-            const hasFileExtension = /\.[a-z]{2,4}$/i.test(pathname.split('?')[0]);
-            const looksLikeStaticResource = /\.(?:css|js|json|xml|png|jpg|jpeg|gif|svg|ico|woff|woff2|ttf|eot)$/i.test(pathname);
-            const looksLikeApiEndpoint = /^\/api\/[^/]+$/i.test(pathname) && !/\/docs\/api\//i.test(pathname);
-            // If it's a static resource or API endpoint (not docs), exclude it
-            if (looksLikeStaticResource || (looksLikeApiEndpoint && !pathname.includes('/docs'))) {
-                return false;
-            }
-            // If it has a file extension (but not a static resource), be conservative
-            if (hasFileExtension) {
-                return false;
-            }
-            // For paths without file extensions, check if they contain documentation keywords
-            // OR if they're under common documentation paths
-            const looksLikeDoc = /(?:doc|guide|tutorial|api|reference|manual|help|about|getting-started|examples?)/i.test(pathname);
-            const isUnderDocPath = /^\/(?:docs?|documentation|guides?|tutorials?|api|reference|manual|help|examples?)/i.test(pathname);
-            return looksLikeDoc || isUnderDocPath;
+        // Exclude paths with file extensions (unless they're HTML pages)
+        const hasFileExtension = /\.[a-z]{2,4}$/i.test(pathname.split('?')[0]);
+        if (hasFileExtension && !pathname.match(/\.(html?|htm)$/i)) {
+            return false;
         }
-        return false;
+        // Permissive: allow any path that doesn't match exclusion patterns
+        // This allows crawling any website, not just documentation
+        return true;
     }
     /**
      * Check if a path should be excluded
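To make the new permissive behavior concrete, here is a self-contained restatement of the filter above. The private `EXCLUDED_PATTERNS` check is dropped, and `looksCrawlable` is our name, not the package's:

```ts
// Mirrors the rewritten isDocumentationPath above, minus the private
// shouldExclude/EXCLUDED_PATTERNS branch. Expected results in comments.
function looksCrawlable(pathname: string): boolean {
    // Static resources are never content
    if (/\.(?:css|js|json|xml|png|jpg|jpeg|gif|svg|ico|woff|woff2|ttf|eot|pdf|zip|exe|dmg)$/i.test(pathname)) {
        return false;
    }
    // Bare API endpoints are excluded unless they look like docs APIs
    if (/^\/api\/[^/]+$/i.test(pathname) && !pathname.includes('/docs') && !pathname.includes('/documentation')) {
        return false;
    }
    if (pathname === '/' || pathname === '') return true;
    // Other file extensions are excluded unless they are HTML pages
    const hasFileExtension = /\.[a-z]{2,4}$/i.test(pathname.split('?')[0]);
    if (hasFileExtension && !/\.html?$/i.test(pathname)) return false;
    return true; // permissive default: anything else is crawlable
}

looksCrawlable('/guide/intro');    // true  - no extension, not excluded
looksCrawlable('/assets/app.png'); // false - static resource
looksCrawlable('/api/users');      // false - bare API endpoint
looksCrawlable('/api/docs');       // true  - docs-flavored API path kept
looksCrawlable('/page.html');      // true  - HTML page allowed
```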
@@ -401,6 +480,151 @@ export class DocumentationCrawler {
     shouldExclude(pathname) {
         return this.EXCLUDED_PATTERNS.some(pattern => pattern.test(pathname));
     }
+    /**
+     * Check if crawled content is sufficient for skill generation
+     * Similar logic to SkillGenerator but here for early validation
+     */
+    canGenerateSkill(pages) {
+        if (pages.length === 0) {
+            return { canGenerate: false, reason: 'empty_pages' };
+        }
+        const MIN_CONTENT_LENGTH = 100;
+        let hasSufficientContent = false;
+        let hasStructuredContent = false;
+        let hasTextContent = false;
+        let mediaOnlyCount = 0;
+        for (const page of pages) {
+            const contentLength = (page.content || '').trim().length;
+            const hasHeadings = page.headings && page.headings.length > 0;
+            const hasText = contentLength > 0;
+            // Check if page is media-only (has images but no text)
+            const hasImages = /<img[^>]*>/i.test(page.content || '');
+            const hasMedia = hasImages || (page.codeSamples && page.codeSamples.length > 0);
+            if (hasMedia && contentLength < MIN_CONTENT_LENGTH) {
+                mediaOnlyCount++;
+            }
+            if (contentLength >= MIN_CONTENT_LENGTH) {
+                hasSufficientContent = true;
+            }
+            if (hasHeadings) {
+                hasStructuredContent = true;
+            }
+            if (hasText) {
+                hasTextContent = true;
+            }
+        }
+        // All pages are media-only
+        if (mediaOnlyCount === pages.length && !hasTextContent) {
+            return { canGenerate: false, reason: 'media_only' };
+        }
+        // No pages have sufficient content
+        if (!hasSufficientContent) {
+            return { canGenerate: false, reason: 'insufficient_content' };
+        }
+        // No structured content (headings, sections)
+        if (!hasStructuredContent) {
+            return { canGenerate: false, reason: 'no_structured_content' };
+        }
+        return { canGenerate: true };
+    }
+    /**
+     * Fetch a page with retry logic
+     */
+    async fetchPageWithRetry(url, retryCount = 0) {
+        try {
+            return await this.browser.browsePage(url);
+        }
+        catch (error) {
+            const errorType = this.classifyError(error);
+            const isRetryable = this.isRetryableError(error);
+            if (isRetryable && retryCount < this.options.maxRetries) {
+                const delay = this.options.retryDelay * (retryCount + 1); // Exponential backoff
+                logger.info('Retrying page fetch', {
+                    url,
+                    retryCount: retryCount + 1,
+                    maxRetries: this.options.maxRetries,
+                    delay,
+                    errorType,
+                });
+                await this.delay(delay);
+                return this.fetchPageWithRetry(url, retryCount + 1);
+            }
+            // Not retryable or max retries reached
+            throw error;
+        }
+    }
+    /**
+     * Classify error type for better error messages
+     */
+    classifyError(error) {
+        if (!(error instanceof Error)) {
+            return 'UnknownError';
+        }
+        const message = error.message.toLowerCase();
+        const errorName = error.name.toLowerCase();
+        // Network errors
+        if (errorName.includes('timeout') || message.includes('timeout')) {
+            return 'TimeoutError';
+        }
+        if (errorName.includes('network') || message.includes('network') || message.includes('econnrefused')) {
+            return 'NetworkError';
+        }
+        if (message.includes('econnreset') || message.includes('socket')) {
+            return 'ConnectionError';
+        }
+        // HTTP errors
+        if (errorName.includes('http') || message.includes('status')) {
+            if (message.includes('404'))
+                return 'NotFoundError';
+            if (message.includes('403'))
+                return 'ForbiddenError';
+            if (message.includes('401'))
+                return 'UnauthorizedError';
+            if (message.includes('429'))
+                return 'RateLimitError';
+            if (message.includes('500') || message.includes('502') || message.includes('503')) {
+                return 'ServerError';
+            }
+            return 'HttpError';
+        }
+        // Content errors
+        if (message.includes('documentation') || message.includes('not appear to be')) {
+            return 'NotDocumentationError';
+        }
+        if (message.includes('spa') || message.includes('javascript')) {
+            return 'SPAError';
+        }
+        return 'UnknownError';
+    }
+    /**
+     * Check if an error is retryable
+     */
+    isRetryableError(error) {
+        if (!(error instanceof Error)) {
+            return false;
+        }
+        const errorType = this.classifyError(error);
+        // Retryable errors
+        const retryableTypes = [
+            'TimeoutError',
+            'NetworkError',
+            'ConnectionError',
+            'RateLimitError',
+            'ServerError', // 500, 502, 503
+        ];
+        return retryableTypes.includes(errorType);
+    }
+    /**
+     * Get error breakdown by type
+     */
+    getErrorBreakdown() {
+        const breakdown = {};
+        for (const error of this.errors) {
+            const errorType = error.error.split(':')[0] || 'UnknownError';
+            breakdown[errorType] = (breakdown[errorType] || 0) + 1;
+        }
+        return breakdown;
+    }
     /**
      * Delay helper for rate limiting
      */
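Putting the new private helpers together: the crawl loop retries only errors that classify as transient, with a linearly growing delay of `retryDelay * attemptNumber`. A self-contained sketch of that policy (the wrapper and its names are ours, not the package API):

```ts
// Retry policy sketch matching fetchPageWithRetry above: transient errors
// (timeout, network, connection, 429, 5xx) are retried up to maxRetries
// times, with delay = retryDelay * attemptNumber; anything else fails fast.
async function withRetry<T>(
    fn: () => Promise<T>,
    isRetryable: (error: unknown) => boolean, // stand-in for isRetryableError
    maxRetries = 2,
    retryDelay = 1000,
): Promise<T> {
    for (let attempt = 0; ; attempt++) {
        try {
            return await fn();
        }
        catch (error) {
            if (attempt >= maxRetries || !isRetryable(error)) {
                throw error;
            }
            // Delay grows linearly with the attempt number: 1000ms, 2000ms, ...
            await new Promise((resolve) => setTimeout(resolve, retryDelay * (attempt + 1)));
        }
    }
}
```

With the defaults above this makes at most three attempts per page (one initial fetch plus two retries), matching the documented `maxRetries: 2` and `retryDelay: 1000`.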