@gulibs/safe-coder 0.0.22 → 0.0.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +42 -0
- package/dist/documentation/doc-crawler.d.ts +29 -2
- package/dist/documentation/doc-crawler.d.ts.map +1 -1
- package/dist/documentation/doc-crawler.js +281 -57
- package/dist/documentation/doc-crawler.js.map +1 -1
- package/dist/documentation/skill-generator.d.ts +38 -2
- package/dist/documentation/skill-generator.d.ts.map +1 -1
- package/dist/documentation/skill-generator.js +331 -62
- package/dist/documentation/skill-generator.js.map +1 -1
- package/dist/documentation/web-doc-browser.d.ts +12 -0
- package/dist/documentation/web-doc-browser.d.ts.map +1 -1
- package/dist/documentation/web-doc-browser.js +103 -0
- package/dist/documentation/web-doc-browser.js.map +1 -1
- package/dist/server/mcp-server.d.ts.map +1 -1
- package/dist/server/mcp-server.js +167 -10
- package/dist/server/mcp-server.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -25,6 +25,10 @@
|
|
|
25
25
|
- 生成结构化的 Agent Skill 输出(Markdown 格式)
|
|
26
26
|
- 支持深度限制、页面限制和速率控制
|
|
27
27
|
- 自动组织内容,生成目录和章节
|
|
28
|
+
- **SPA 检测**:自动检测单页应用并提供建议
|
|
29
|
+
- **智能重试**:对临时错误自动重试,提高成功率
|
|
30
|
+
- **错误分类**:详细的错误类型统计和分析
|
|
31
|
+
- **进度监控**:实时显示爬取进度和性能指标
|
|
28
32
|
|
|
29
33
|
### 网页文档浏览
|
|
30
34
|
|
|
@@ -376,6 +380,17 @@ pwd
|
|
|
376
380
|
|
|
377
381
|
配置完成后,你可以在 Cursor 的 AI 对话中使用以下工具:
|
|
378
382
|
|
|
383
|
+
**可用工具列表:**
|
|
384
|
+
1. `get_documentation` - 获取文档
|
|
385
|
+
2. `browse_documentation` - 浏览网页文档
|
|
386
|
+
3. `crawl_documentation` - 爬取文档并生成技能
|
|
387
|
+
4. `detect_spa` - 检测单页应用
|
|
388
|
+
5. `detect_errors` - 检测代码错误
|
|
389
|
+
6. `validate_code` - 验证代码
|
|
390
|
+
7. `resolve_error` - 解决错误
|
|
391
|
+
8. `get_status` - 获取服务器状态
|
|
392
|
+
9. `get_version` - 获取版本号
|
|
393
|
+
|
|
379
394
|
#### `get_documentation` - 获取文档
|
|
380
395
|
|
|
381
396
|
获取库或包的文档。
|
|
@@ -431,9 +446,17 @@ pwd
|
|
|
431
446
|
- `includePaths`(可选):额外包含的路径模式
|
|
432
447
|
- `excludePaths`(可选):排除的路径模式
|
|
433
448
|
- `rateLimit`(可选):请求间隔(毫秒,默认:500)
|
|
449
|
+
- `maxRetries`(可选):失败请求的最大重试次数(默认:2)
|
|
450
|
+
- `retryDelay`(可选):重试前的延迟(毫秒,默认:1000)
|
|
434
451
|
- `outputDir`(可选):保存技能文件的目录(如果不提供,只返回内容)
|
|
435
452
|
- `filename`(可选):自定义文件名(不含扩展名)
|
|
436
453
|
|
|
454
|
+
**新功能:**
|
|
455
|
+
- ✅ **SPA 检测**:自动检测单页应用(SPA)并提供建议
|
|
456
|
+
- ✅ **智能重试**:对临时错误(超时、网络错误等)自动重试
|
|
457
|
+
- ✅ **错误分类**:详细的错误类型分类和统计
|
|
458
|
+
- ✅ **进度日志**:实时显示爬取进度和统计信息
|
|
459
|
+
|
|
437
460
|
**使用示例:**
|
|
438
461
|
|
|
439
462
|
```
|
|
@@ -507,6 +530,25 @@ function test() {
|
|
|
507
530
|
我遇到了一个 undefined-variable 错误,消息是 "Variable 'x' is used but not defined",在第 5 行
|
|
508
531
|
```
|
|
509
532
|
|
|
533
|
+
#### `detect_spa` - 检测单页应用
|
|
534
|
+
|
|
535
|
+
检测网站是否为需要 JavaScript 渲染的单页应用(SPA)。
|
|
536
|
+
|
|
537
|
+
**参数:**
|
|
538
|
+
- `url`(必需):要检测的 URL
|
|
539
|
+
|
|
540
|
+
**使用示例:**
|
|
541
|
+
|
|
542
|
+
```
|
|
543
|
+
检测 https://react-dnd.github.io/react-dnd/docs 是否是 SPA
|
|
544
|
+
```
|
|
545
|
+
|
|
546
|
+
**返回:**
|
|
547
|
+
- `isSPA`:是否为 SPA
|
|
548
|
+
- `confidence`:检测置信度(high/medium/low)
|
|
549
|
+
- `indicators`:检测到的 SPA 指标
|
|
550
|
+
- `suggestion`:建议(如果检测到 SPA)
|
|
551
|
+
|
|
510
552
|
### 4. 使用 MCP 资源
|
|
511
553
|
|
|
512
554
|
#### `safe-coder://documentation` - 缓存的文档
|
|
@@ -5,6 +5,9 @@ export interface CrawlOptions {
|
|
|
5
5
|
includePaths?: string[];
|
|
6
6
|
excludePaths?: string[];
|
|
7
7
|
rateLimit?: number;
|
|
8
|
+
maxRetries?: number;
|
|
9
|
+
retryDelay?: number;
|
|
10
|
+
useBrowserAutomation?: boolean;
|
|
8
11
|
}
|
|
9
12
|
export interface CrawledPage {
|
|
10
13
|
url: string;
|
|
@@ -34,7 +37,7 @@ export interface CrawledPage {
|
|
|
34
37
|
export interface LinkDiscoveryStats {
|
|
35
38
|
totalLinksFound: number;
|
|
36
39
|
linksFiltered: {
|
|
37
|
-
|
|
40
|
+
notContent: number;
|
|
38
41
|
externalDomain: number;
|
|
39
42
|
alreadyVisited: number;
|
|
40
43
|
excludedPattern: number;
|
|
@@ -44,6 +47,7 @@ export interface LinkDiscoveryStats {
|
|
|
44
47
|
pagesDiscovered: number;
|
|
45
48
|
pagesCrawled: number;
|
|
46
49
|
}
|
|
50
|
+
export type AbandonReason = 'insufficient_content' | 'media_only' | 'empty_pages' | 'no_structured_content';
|
|
47
51
|
export interface CrawlResult {
|
|
48
52
|
pages: CrawledPage[];
|
|
49
53
|
totalPages: number;
|
|
@@ -53,6 +57,8 @@ export interface CrawlResult {
|
|
|
53
57
|
error: string;
|
|
54
58
|
}>;
|
|
55
59
|
linkDiscoveryStats: LinkDiscoveryStats;
|
|
60
|
+
abandoned?: boolean;
|
|
61
|
+
abandonReason?: AbandonReason;
|
|
56
62
|
}
|
|
57
63
|
export declare class DocumentationCrawler {
|
|
58
64
|
private browser;
|
|
@@ -77,13 +83,34 @@ export declare class DocumentationCrawler {
|
|
|
77
83
|
*/
|
|
78
84
|
private discoverDocumentationLinks;
|
|
79
85
|
/**
|
|
80
|
-
* Check if a path
|
|
86
|
+
* Check if a path should be crawled (permissive - only exclude clearly non-content paths)
|
|
81
87
|
*/
|
|
82
88
|
private isDocumentationPath;
|
|
83
89
|
/**
|
|
84
90
|
* Check if a path should be excluded
|
|
85
91
|
*/
|
|
86
92
|
private shouldExclude;
|
|
93
|
+
/**
|
|
94
|
+
* Check if crawled content is sufficient for skill generation
|
|
95
|
+
* Similar logic to SkillGenerator but here for early validation
|
|
96
|
+
*/
|
|
97
|
+
private canGenerateSkill;
|
|
98
|
+
/**
|
|
99
|
+
* Fetch a page with retry logic
|
|
100
|
+
*/
|
|
101
|
+
private fetchPageWithRetry;
|
|
102
|
+
/**
|
|
103
|
+
* Classify error type for better error messages
|
|
104
|
+
*/
|
|
105
|
+
private classifyError;
|
|
106
|
+
/**
|
|
107
|
+
* Check if an error is retryable
|
|
108
|
+
*/
|
|
109
|
+
private isRetryableError;
|
|
110
|
+
/**
|
|
111
|
+
* Get error breakdown by type
|
|
112
|
+
*/
|
|
113
|
+
private getErrorBreakdown;
|
|
87
114
|
/**
|
|
88
115
|
* Delay helper for rate limiting
|
|
89
116
|
*/
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"doc-crawler.d.ts","sourceRoot":"","sources":["../../src/documentation/doc-crawler.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,yBAAyB,CAAC;AAIrD,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,YAAY,CAAC,EAAE,MAAM,EAAE,CAAC;IACxB,YAAY,CAAC,EAAE,MAAM,EAAE,CAAC;IACxB,SAAS,CAAC,EAAE,MAAM,CAAC;
|
|
1
|
+
{"version":3,"file":"doc-crawler.d.ts","sourceRoot":"","sources":["../../src/documentation/doc-crawler.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,yBAAyB,CAAC;AAIrD,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,YAAY,CAAC,EAAE,MAAM,EAAE,CAAC;IACxB,YAAY,CAAC,EAAE,MAAM,EAAE,CAAC;IACxB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,oBAAoB,CAAC,EAAE,OAAO,CAAC;CAChC;AAED,MAAM,WAAW,WAAW;IAC1B,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,QAAQ,EAAE,KAAK,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;QACd,OAAO,EAAE,MAAM,CAAC;QAChB,MAAM,CAAC,EAAE,MAAM,CAAC;KACjB,CAAC,CAAC;IACH,eAAe,EAAE,KAAK,CAAC;QACrB,IAAI,EAAE,MAAM,CAAC;QACb,GAAG,EAAE,MAAM,CAAC;QACZ,UAAU,EAAE,OAAO,CAAC;KACrB,CAAC,CAAC;IACH,QAAQ,EAAE,KAAK,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;QACb,EAAE,CAAC,EAAE,MAAM,CAAC;KACb,CAAC,CAAC;IACH,WAAW,EAAE,KAAK,CAAC;QACjB,IAAI,EAAE,MAAM,CAAC;QACb,QAAQ,EAAE,MAAM,CAAC;KAClB,CAAC,CAAC;CACJ;AAED,MAAM,WAAW,kBAAkB;IACjC,eAAe,EAAE,MAAM,CAAC;IACxB,aAAa,EAAE;QACb,UAAU,EAAE,MAAM,CAAC;QACnB,cAAc,EAAE,MAAM,CAAC;QACvB,cAAc,EAAE,MAAM,CAAC;QACvB,eAAe,EAAE,MAAM,CAAC;QACxB,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC;IACF,WAAW,EAAE,MAAM,CAAC;IACpB,eAAe,EAAE,MAAM,CAAC;IACxB,YAAY,EAAE,MAAM,CAAC;CACtB;AAED,MAAM,MAAM,aAAa,GACrB,sBAAsB,GACtB,YAAY,GACZ,aAAa,GACb,uBAAuB,CAAC;AAE5B,MAAM,WAAW,WAAW;IAC1B,KAAK,EAAE,WAAW,EAAE,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IACnB,eAAe,EAAE,MAAM,CAAC;IACxB,MAAM,EAAE,KAAK,CAAC;QACZ,GAAG,EAAE,MAAM,CAAC;QACZ,KAAK,EAAE,MAAM,CAAC;KACf,CAAC,CAAC;IACH,kBAAkB,EAAE,kBAAkB,CAAC;IACvC,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,aAAa,CAAC,EAAE,aAAa,CAAC;CAC/B;AAOD,qBAAa,oBAAoB;IAC/B,OAAO,CAAC,OAAO,CAA0B;IACzC,OAAO,CAAC,WAAW,CAAc;IACjC,OAAO,CAAC,QAAQ,CAAc;IAC9B,OAAO,CAAC,YAAY,CAAgB;IACpC,OAAO,CAAC,MAAM,CAAwC;IACtD,OAAO,CAAC,OAAO,CAAyB;IACxC,OAAO,CAAC,OAAO,CAAM;IACrB,OAAO,CAAC,kBAAkB,CAAqB;IAC/C,OAAO,CAAC,QAAQ,CAAC,sBAAsB,CAWrC;IACF,OAAO,CAAC,QAAQ,CAAC,iBAAiB,CAUhC;gB
AEU,UAAU,CAAC,EAAE,UAAU;IAiCnC;;;;OAIG;IACG,KAAK,CAAC,OAAO,EAAE,MAAM,EAAE,OAAO,GAAE,YAAiB,GAAG,OAAO,CAAC,WAAW,CAAC;IA6R9E;;OAEG;IACH,OAAO,CAAC,0BAA0B;IAiJlC;;OAEG;IACH,OAAO,CAAC,mBAAmB;IAmC3B;;OAEG;IACH,OAAO,CAAC,aAAa;IAIrB;;;OAGG;IACH,OAAO,CAAC,gBAAgB;IAoDxB;;OAEG;YACW,kBAAkB;IA0BhC;;OAEG;IACH,OAAO,CAAC,aAAa;IA0CrB;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAmBxB;;OAEG;IACH,OAAO,CAAC,iBAAiB;IAWzB;;OAEG;IACH,OAAO,CAAC,KAAK;CAGd"}
|
|
@@ -44,12 +44,15 @@ export class DocumentationCrawler {
|
|
|
44
44
|
includePaths: [],
|
|
45
45
|
excludePaths: [],
|
|
46
46
|
rateLimit: 500, // 500ms default delay
|
|
47
|
+
maxRetries: 2, // Default 2 retries
|
|
48
|
+
retryDelay: 1000, // Default 1 second delay before retry
|
|
49
|
+
useBrowserAutomation: false, // Default to HTTP-only for backward compatibility
|
|
47
50
|
};
|
|
48
51
|
this.baseUrl = new URL('https://example.com');
|
|
49
52
|
this.linkDiscoveryStats = {
|
|
50
53
|
totalLinksFound: 0,
|
|
51
54
|
linksFiltered: {
|
|
52
|
-
|
|
55
|
+
notContent: 0,
|
|
53
56
|
externalDomain: 0,
|
|
54
57
|
alreadyVisited: 0,
|
|
55
58
|
excludedPattern: 0,
|
|
@@ -80,7 +83,7 @@ export class DocumentationCrawler {
|
|
|
80
83
|
this.linkDiscoveryStats = {
|
|
81
84
|
totalLinksFound: 0,
|
|
82
85
|
linksFiltered: {
|
|
83
|
-
|
|
86
|
+
notContent: 0,
|
|
84
87
|
externalDomain: 0,
|
|
85
88
|
alreadyVisited: 0,
|
|
86
89
|
excludedPattern: 0,
|
|
@@ -102,22 +105,63 @@ export class DocumentationCrawler {
|
|
|
102
105
|
catch (error) {
|
|
103
106
|
throw new Error(`Invalid root URL: ${rootUrl}`);
|
|
104
107
|
}
|
|
105
|
-
//
|
|
106
|
-
logger.debug('
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
108
|
+
// No longer require documentation-only pages - allow any website with extractable content
|
|
109
|
+
logger.debug('Starting crawl from URL (permissive mode)', { url: rootUrl });
|
|
110
|
+
// Detect SPA and provide warning
|
|
111
|
+
try {
|
|
112
|
+
const spaDetection = await this.browser.detectSPA(rootUrl);
|
|
113
|
+
if (spaDetection.isSPA && spaDetection.confidence !== 'low') {
|
|
114
|
+
logger.warn('SPA detected at root URL - crawling may be limited', {
|
|
115
|
+
url: rootUrl,
|
|
116
|
+
confidence: spaDetection.confidence,
|
|
117
|
+
indicators: spaDetection.indicators,
|
|
118
|
+
suggestion: spaDetection.suggestion,
|
|
119
|
+
});
|
|
120
|
+
// Log the SPA suggestion as an informational message (nothing is attached to the crawled pages here)
|
|
121
|
+
if (spaDetection.suggestion) {
|
|
122
|
+
logger.info('SPA Detection Warning', {
|
|
123
|
+
message: spaDetection.suggestion,
|
|
124
|
+
recommendation: 'Consider using browser automation tools to get fully rendered content before crawling.',
|
|
125
|
+
});
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
catch (error) {
|
|
130
|
+
// SPA detection failure is not critical, continue crawling
|
|
131
|
+
logger.debug('SPA detection failed, continuing with crawl', {
|
|
132
|
+
url: rootUrl,
|
|
133
|
+
error: error instanceof Error ? error.message : String(error),
|
|
134
|
+
});
|
|
111
135
|
}
|
|
112
136
|
// Start crawling from root
|
|
113
137
|
this.urlQueue.push({ url: rootUrl, depth: 0 });
|
|
114
138
|
let maxDepthReached = 0;
|
|
115
139
|
// Process queue
|
|
140
|
+
const startTime = Date.now();
|
|
141
|
+
let lastProgressLog = Date.now();
|
|
142
|
+
const PROGRESS_LOG_INTERVAL = 5000; // Log progress every 5 seconds
|
|
116
143
|
while (this.urlQueue.length > 0 && this.crawledPages.length < this.options.maxPages) {
|
|
117
144
|
const queued = this.urlQueue.shift();
|
|
118
145
|
if (!queued)
|
|
119
146
|
break;
|
|
120
147
|
const { url, depth } = queued;
|
|
148
|
+
// Log progress periodically
|
|
149
|
+
const now = Date.now();
|
|
150
|
+
if (now - lastProgressLog >= PROGRESS_LOG_INTERVAL) {
|
|
151
|
+
const elapsed = ((now - startTime) / 1000).toFixed(1);
|
|
152
|
+
const pagesPerSecond = (this.crawledPages.length / elapsed).toFixed(2);
|
|
153
|
+
logger.info('Crawl progress', {
|
|
154
|
+
pagesCrawled: this.crawledPages.length,
|
|
155
|
+
pagesRemaining: this.urlQueue.length,
|
|
156
|
+
maxPages: this.options.maxPages,
|
|
157
|
+
errors: this.errors.length,
|
|
158
|
+
elapsedSeconds: elapsed,
|
|
159
|
+
pagesPerSecond,
|
|
160
|
+
currentDepth: depth,
|
|
161
|
+
maxDepth: this.options.maxDepth,
|
|
162
|
+
});
|
|
163
|
+
lastProgressLog = now;
|
|
164
|
+
}
|
|
121
165
|
// Skip if already visited
|
|
122
166
|
if (this.visitedUrls.has(url)) {
|
|
123
167
|
continue;
|
|
@@ -130,9 +174,20 @@ export class DocumentationCrawler {
|
|
|
130
174
|
this.visitedUrls.add(url);
|
|
131
175
|
maxDepthReached = Math.max(maxDepthReached, depth);
|
|
132
176
|
try {
|
|
133
|
-
// Crawl the page using HTTP GET
|
|
177
|
+
// Crawl the page using HTTP GET with retry logic
|
|
134
178
|
logger.debug('Fetching page via HTTP GET', { url, depth, method: 'HTTP GET', client: 'axios' });
|
|
135
|
-
const page = await this.
|
|
179
|
+
const page = await this.fetchPageWithRetry(url);
|
|
180
|
+
// Check if page has minimal content (possible SPA issue)
|
|
181
|
+
const contentLength = page.content.length;
|
|
182
|
+
const linksCount = page.navigationLinks.length;
|
|
183
|
+
if (contentLength < 200 && linksCount < 3) {
|
|
184
|
+
logger.warn('Page has minimal content - may be SPA', {
|
|
185
|
+
url,
|
|
186
|
+
contentLength,
|
|
187
|
+
linksCount,
|
|
188
|
+
suggestion: 'This page may require JavaScript rendering. Consider using browser automation tools.',
|
|
189
|
+
});
|
|
190
|
+
}
|
|
136
191
|
// Convert to CrawledPage format
|
|
137
192
|
const crawledPage = {
|
|
138
193
|
url: page.url,
|
|
@@ -164,7 +219,7 @@ export class DocumentationCrawler {
|
|
|
164
219
|
discovered: newUrls.length,
|
|
165
220
|
filtered: discoveryResult.filtered,
|
|
166
221
|
alreadyVisited: discoveryResult.alreadyVisited,
|
|
167
|
-
|
|
222
|
+
notContent: discoveryResult.notContent,
|
|
168
223
|
externalDomain: discoveryResult.externalDomain,
|
|
169
224
|
excludedPattern: discoveryResult.excludedPattern,
|
|
170
225
|
queueLengthBefore: this.urlQueue.length,
|
|
@@ -204,20 +259,40 @@ export class DocumentationCrawler {
|
|
|
204
259
|
}
|
|
205
260
|
}
|
|
206
261
|
catch (error) {
|
|
262
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
263
|
+
const errorType = this.classifyError(error);
|
|
207
264
|
this.errors.push({
|
|
208
265
|
url,
|
|
209
|
-
error:
|
|
266
|
+
error: `${errorType}: ${errorMessage}`,
|
|
267
|
+
});
|
|
268
|
+
logger.warn('Page crawl failed', {
|
|
269
|
+
url,
|
|
270
|
+
error: errorMessage,
|
|
271
|
+
errorType,
|
|
272
|
+
depth,
|
|
273
|
+
willContinue: true,
|
|
210
274
|
});
|
|
211
275
|
// Continue crawling other pages
|
|
212
276
|
}
|
|
213
277
|
}
|
|
214
278
|
// Update final statistics
|
|
215
279
|
this.linkDiscoveryStats.pagesDiscovered = this.visitedUrls.size;
|
|
216
|
-
//
|
|
280
|
+
// Calculate final statistics
|
|
281
|
+
const totalTime = ((Date.now() - startTime) / 1000).toFixed(2);
|
|
282
|
+
const avgTimePerPage = this.crawledPages.length > 0
|
|
283
|
+
? ((Date.now() - startTime) / this.crawledPages.length / 1000).toFixed(2)
|
|
284
|
+
: '0';
|
|
285
|
+
const successRate = this.linkDiscoveryStats.pagesDiscovered > 0
|
|
286
|
+
? ((this.crawledPages.length / this.linkDiscoveryStats.pagesDiscovered) * 100).toFixed(1)
|
|
287
|
+
: '0';
|
|
288
|
+
// Log crawl completion with comprehensive statistics
|
|
217
289
|
logger.info('Documentation crawl completed using HTTP client (axios)', {
|
|
218
290
|
totalPages: this.crawledPages.length,
|
|
219
291
|
maxDepthReached,
|
|
220
292
|
errors: this.errors.length,
|
|
293
|
+
totalTimeSeconds: totalTime,
|
|
294
|
+
avgTimePerPageSeconds: avgTimePerPage,
|
|
295
|
+
successRate: `${successRate}%`,
|
|
221
296
|
method: 'HTTP GET',
|
|
222
297
|
client: 'axios/HttpClient',
|
|
223
298
|
linkStats: {
|
|
@@ -227,13 +302,27 @@ export class DocumentationCrawler {
|
|
|
227
302
|
pagesDiscovered: this.linkDiscoveryStats.pagesDiscovered,
|
|
228
303
|
pagesCrawled: this.linkDiscoveryStats.pagesCrawled,
|
|
229
304
|
},
|
|
305
|
+
errorBreakdown: this.getErrorBreakdown(),
|
|
230
306
|
});
|
|
307
|
+
// Validate if content is sufficient for skill generation
|
|
308
|
+
const validation = this.canGenerateSkill(this.crawledPages);
|
|
309
|
+
const abandoned = !validation.canGenerate;
|
|
310
|
+
const abandonReason = validation.reason;
|
|
311
|
+
if (abandoned) {
|
|
312
|
+
logger.warn('Crawl completed but content is insufficient for skill generation', {
|
|
313
|
+
reason: abandonReason,
|
|
314
|
+
pagesCrawled: this.crawledPages.length,
|
|
315
|
+
suggestion: 'Consider crawling more pages or a different website',
|
|
316
|
+
});
|
|
317
|
+
}
|
|
231
318
|
return {
|
|
232
319
|
pages: this.crawledPages,
|
|
233
320
|
totalPages: this.crawledPages.length,
|
|
234
321
|
maxDepthReached,
|
|
235
322
|
errors: this.errors,
|
|
236
323
|
linkDiscoveryStats: this.linkDiscoveryStats,
|
|
324
|
+
abandoned,
|
|
325
|
+
abandonReason,
|
|
237
326
|
};
|
|
238
327
|
}
|
|
239
328
|
/**
|
|
@@ -242,7 +331,7 @@ export class DocumentationCrawler {
|
|
|
242
331
|
discoverDocumentationLinks(page, nextDepth) {
|
|
243
332
|
const discovered = [];
|
|
244
333
|
const filtered = {
|
|
245
|
-
|
|
334
|
+
notContent: 0, // Renamed from notDocumentation
|
|
246
335
|
externalDomain: 0,
|
|
247
336
|
alreadyVisited: 0,
|
|
248
337
|
excludedPattern: 0,
|
|
@@ -273,11 +362,11 @@ export class DocumentationCrawler {
|
|
|
273
362
|
linkDetails.push({ url: link.url, reason: 'already_visited' });
|
|
274
363
|
continue;
|
|
275
364
|
}
|
|
276
|
-
// Check if it's a
|
|
365
|
+
// Check if it's a valid content path (permissive - only exclude clearly non-content)
|
|
277
366
|
if (!this.isDocumentationPath(linkUrl.pathname)) {
|
|
278
|
-
filtered.
|
|
279
|
-
this.linkDiscoveryStats.linksFiltered.
|
|
280
|
-
linkDetails.push({ url: link.url, reason: '
|
|
367
|
+
filtered.notContent++;
|
|
368
|
+
this.linkDiscoveryStats.linksFiltered.notContent++;
|
|
369
|
+
linkDetails.push({ url: link.url, reason: 'not_content_path', pathname: linkUrl.pathname });
|
|
281
370
|
continue;
|
|
282
371
|
}
|
|
283
372
|
// Check exclude patterns
|
|
@@ -291,8 +380,8 @@ export class DocumentationCrawler {
|
|
|
291
380
|
if (this.options.includePaths.length > 0) {
|
|
292
381
|
const matchesInclude = this.options.includePaths.some(pattern => linkUrl.pathname.includes(pattern));
|
|
293
382
|
if (!matchesInclude) {
|
|
294
|
-
filtered.
|
|
295
|
-
this.linkDiscoveryStats.linksFiltered.
|
|
383
|
+
filtered.notContent++;
|
|
384
|
+
this.linkDiscoveryStats.linksFiltered.notContent++;
|
|
296
385
|
linkDetails.push({ url: link.url, reason: 'not_in_include_paths', pathname: linkUrl.pathname });
|
|
297
386
|
continue;
|
|
298
387
|
}
|
|
@@ -333,7 +422,7 @@ export class DocumentationCrawler {
|
|
|
333
422
|
totalLinks: page.navigationLinks.length,
|
|
334
423
|
discovered: discovered.length,
|
|
335
424
|
filtered: {
|
|
336
|
-
|
|
425
|
+
notContent: filtered.notContent,
|
|
337
426
|
externalDomain: filtered.externalDomain,
|
|
338
427
|
alreadyVisited: filtered.alreadyVisited,
|
|
339
428
|
excludedPattern: filtered.excludedPattern,
|
|
@@ -348,52 +437,42 @@ export class DocumentationCrawler {
|
|
|
348
437
|
discovered,
|
|
349
438
|
filtered,
|
|
350
439
|
alreadyVisited: filtered.alreadyVisited,
|
|
351
|
-
|
|
440
|
+
notContent: filtered.notContent,
|
|
352
441
|
externalDomain: filtered.externalDomain,
|
|
353
442
|
excludedPattern: filtered.excludedPattern,
|
|
354
443
|
};
|
|
355
444
|
}
|
|
356
445
|
/**
|
|
357
|
-
* Check if a path
|
|
446
|
+
* Check if a path should be crawled (permissive - only exclude clearly non-content paths)
|
|
358
447
|
*/
|
|
359
448
|
isDocumentationPath(pathname) {
|
|
360
|
-
//
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
449
|
+
// Exclude clearly non-content pages
|
|
450
|
+
if (this.shouldExclude(pathname)) {
|
|
451
|
+
return false;
|
|
452
|
+
}
|
|
453
|
+
// Exclude static resources
|
|
454
|
+
const looksLikeStaticResource = /\.(?:css|js|json|xml|png|jpg|jpeg|gif|svg|ico|woff|woff2|ttf|eot|pdf|zip|exe|dmg)$/i.test(pathname);
|
|
455
|
+
if (looksLikeStaticResource) {
|
|
456
|
+
return false;
|
|
457
|
+
}
|
|
458
|
+
// Exclude API endpoints that are clearly not content (unless they're documentation APIs)
|
|
459
|
+
// Keep API endpoints that might be documentation (e.g., /api/docs, /docs/api)
|
|
460
|
+
const looksLikeApiEndpoint = /^\/api\/[^/]+$/i.test(pathname);
|
|
461
|
+
if (looksLikeApiEndpoint && !pathname.includes('/docs') && !pathname.includes('/documentation')) {
|
|
462
|
+
return false;
|
|
463
|
+
}
|
|
464
|
+
// Allow root path
|
|
465
|
+
if (pathname === '/' || pathname === '') {
|
|
364
466
|
return true;
|
|
365
467
|
}
|
|
366
|
-
//
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
if (!this.shouldExclude(pathname)) {
|
|
371
|
-
// If pathname is just "/" or empty, it's the root - check if base URL was documentation
|
|
372
|
-
if (pathname === '/' || pathname === '') {
|
|
373
|
-
return true; // Root of a documentation site is documentation
|
|
374
|
-
}
|
|
375
|
-
// For documentation sites, be more permissive:
|
|
376
|
-
// 1. If path contains common documentation keywords
|
|
377
|
-
// 2. If path looks like a documentation structure (no file extensions like .html, .php, etc.)
|
|
378
|
-
// 3. If path doesn't look like an API endpoint or static resource
|
|
379
|
-
const hasFileExtension = /\.[a-z]{2,4}$/i.test(pathname.split('?')[0]);
|
|
380
|
-
const looksLikeStaticResource = /\.(?:css|js|json|xml|png|jpg|jpeg|gif|svg|ico|woff|woff2|ttf|eot)$/i.test(pathname);
|
|
381
|
-
const looksLikeApiEndpoint = /^\/api\/[^/]+$/i.test(pathname) && !/\/docs\/api\//i.test(pathname);
|
|
382
|
-
// If it's a static resource or API endpoint (not docs), exclude it
|
|
383
|
-
if (looksLikeStaticResource || (looksLikeApiEndpoint && !pathname.includes('/docs'))) {
|
|
384
|
-
return false;
|
|
385
|
-
}
|
|
386
|
-
// If it has a file extension (but not a static resource), be conservative
|
|
387
|
-
if (hasFileExtension) {
|
|
388
|
-
return false;
|
|
389
|
-
}
|
|
390
|
-
// For paths without file extensions, check if they contain documentation keywords
|
|
391
|
-
// OR if they're under common documentation paths
|
|
392
|
-
const looksLikeDoc = /(?:doc|guide|tutorial|api|reference|manual|help|about|getting-started|examples?)/i.test(pathname);
|
|
393
|
-
const isUnderDocPath = /^\/(?:docs?|documentation|guides?|tutorials?|api|reference|manual|help|examples?)/i.test(pathname);
|
|
394
|
-
return looksLikeDoc || isUnderDocPath;
|
|
468
|
+
// Exclude paths with file extensions (unless they're HTML pages)
|
|
469
|
+
const hasFileExtension = /\.[a-z]{2,4}$/i.test(pathname.split('?')[0]);
|
|
470
|
+
if (hasFileExtension && !pathname.match(/\.(html?|htm)$/i)) {
|
|
471
|
+
return false;
|
|
395
472
|
}
|
|
396
|
-
|
|
473
|
+
// Permissive: allow any path that doesn't match exclusion patterns
|
|
474
|
+
// This allows crawling any website, not just documentation
|
|
475
|
+
return true;
|
|
397
476
|
}
|
|
398
477
|
/**
|
|
399
478
|
* Check if a path should be excluded
|
|
@@ -401,6 +480,151 @@ export class DocumentationCrawler {
|
|
|
401
480
|
shouldExclude(pathname) {
|
|
402
481
|
return this.EXCLUDED_PATTERNS.some(pattern => pattern.test(pathname));
|
|
403
482
|
}
|
|
483
|
+
/**
|
|
484
|
+
* Check if crawled content is sufficient for skill generation
|
|
485
|
+
* Similar logic to SkillGenerator but here for early validation
|
|
486
|
+
*/
|
|
487
|
+
canGenerateSkill(pages) {
|
|
488
|
+
if (pages.length === 0) {
|
|
489
|
+
return { canGenerate: false, reason: 'empty_pages' };
|
|
490
|
+
}
|
|
491
|
+
const MIN_CONTENT_LENGTH = 100;
|
|
492
|
+
let hasSufficientContent = false;
|
|
493
|
+
let hasStructuredContent = false;
|
|
494
|
+
let hasTextContent = false;
|
|
495
|
+
let mediaOnlyCount = 0;
|
|
496
|
+
for (const page of pages) {
|
|
497
|
+
const contentLength = (page.content || '').trim().length;
|
|
498
|
+
const hasHeadings = page.headings && page.headings.length > 0;
|
|
499
|
+
const hasText = contentLength > 0;
|
|
500
|
+
// Check if page is media-only (has images but no text)
|
|
501
|
+
const hasImages = /<img[^>]*>/i.test(page.content || '');
|
|
502
|
+
const hasMedia = hasImages || (page.codeSamples && page.codeSamples.length > 0);
|
|
503
|
+
if (hasMedia && contentLength < MIN_CONTENT_LENGTH) {
|
|
504
|
+
mediaOnlyCount++;
|
|
505
|
+
}
|
|
506
|
+
if (contentLength >= MIN_CONTENT_LENGTH) {
|
|
507
|
+
hasSufficientContent = true;
|
|
508
|
+
}
|
|
509
|
+
if (hasHeadings) {
|
|
510
|
+
hasStructuredContent = true;
|
|
511
|
+
}
|
|
512
|
+
if (hasText) {
|
|
513
|
+
hasTextContent = true;
|
|
514
|
+
}
|
|
515
|
+
}
|
|
516
|
+
// All pages are media-only
|
|
517
|
+
if (mediaOnlyCount === pages.length && !hasTextContent) {
|
|
518
|
+
return { canGenerate: false, reason: 'media_only' };
|
|
519
|
+
}
|
|
520
|
+
// No pages have sufficient content
|
|
521
|
+
if (!hasSufficientContent) {
|
|
522
|
+
return { canGenerate: false, reason: 'insufficient_content' };
|
|
523
|
+
}
|
|
524
|
+
// No structured content (headings, sections)
|
|
525
|
+
if (!hasStructuredContent) {
|
|
526
|
+
return { canGenerate: false, reason: 'no_structured_content' };
|
|
527
|
+
}
|
|
528
|
+
return { canGenerate: true };
|
|
529
|
+
}
|
|
530
|
+
/**
|
|
531
|
+
* Fetch a page with retry logic
|
|
532
|
+
*/
|
|
533
|
+
async fetchPageWithRetry(url, retryCount = 0) {
|
|
534
|
+
try {
|
|
535
|
+
return await this.browser.browsePage(url);
|
|
536
|
+
}
|
|
537
|
+
catch (error) {
|
|
538
|
+
const errorType = this.classifyError(error);
|
|
539
|
+
const isRetryable = this.isRetryableError(error);
|
|
540
|
+
if (isRetryable && retryCount < this.options.maxRetries) {
|
|
541
|
+
const delay = this.options.retryDelay * (retryCount + 1); // Linear backoff: delay grows by retryDelay with each attempt
|
|
542
|
+
logger.info('Retrying page fetch', {
|
|
543
|
+
url,
|
|
544
|
+
retryCount: retryCount + 1,
|
|
545
|
+
maxRetries: this.options.maxRetries,
|
|
546
|
+
delay,
|
|
547
|
+
errorType,
|
|
548
|
+
});
|
|
549
|
+
await this.delay(delay);
|
|
550
|
+
return this.fetchPageWithRetry(url, retryCount + 1);
|
|
551
|
+
}
|
|
552
|
+
// Not retryable or max retries reached
|
|
553
|
+
throw error;
|
|
554
|
+
}
|
|
555
|
+
}
|
|
556
|
+
/**
|
|
557
|
+
* Classify error type for better error messages
|
|
558
|
+
*/
|
|
559
|
+
classifyError(error) {
|
|
560
|
+
if (!(error instanceof Error)) {
|
|
561
|
+
return 'UnknownError';
|
|
562
|
+
}
|
|
563
|
+
const message = error.message.toLowerCase();
|
|
564
|
+
const errorName = error.name.toLowerCase();
|
|
565
|
+
// Network errors
|
|
566
|
+
if (errorName.includes('timeout') || message.includes('timeout')) {
|
|
567
|
+
return 'TimeoutError';
|
|
568
|
+
}
|
|
569
|
+
if (errorName.includes('network') || message.includes('network') || message.includes('econnrefused')) {
|
|
570
|
+
return 'NetworkError';
|
|
571
|
+
}
|
|
572
|
+
if (message.includes('econnreset') || message.includes('socket')) {
|
|
573
|
+
return 'ConnectionError';
|
|
574
|
+
}
|
|
575
|
+
// HTTP errors
|
|
576
|
+
if (errorName.includes('http') || message.includes('status')) {
|
|
577
|
+
if (message.includes('404'))
|
|
578
|
+
return 'NotFoundError';
|
|
579
|
+
if (message.includes('403'))
|
|
580
|
+
return 'ForbiddenError';
|
|
581
|
+
if (message.includes('401'))
|
|
582
|
+
return 'UnauthorizedError';
|
|
583
|
+
if (message.includes('429'))
|
|
584
|
+
return 'RateLimitError';
|
|
585
|
+
if (message.includes('500') || message.includes('502') || message.includes('503')) {
|
|
586
|
+
return 'ServerError';
|
|
587
|
+
}
|
|
588
|
+
return 'HttpError';
|
|
589
|
+
}
|
|
590
|
+
// Content errors
|
|
591
|
+
if (message.includes('documentation') || message.includes('not appear to be')) {
|
|
592
|
+
return 'NotDocumentationError';
|
|
593
|
+
}
|
|
594
|
+
if (message.includes('spa') || message.includes('javascript')) {
|
|
595
|
+
return 'SPAError';
|
|
596
|
+
}
|
|
597
|
+
return 'UnknownError';
|
|
598
|
+
}
|
|
599
|
+
/**
|
|
600
|
+
* Check if an error is retryable
|
|
601
|
+
*/
|
|
602
|
+
isRetryableError(error) {
|
|
603
|
+
if (!(error instanceof Error)) {
|
|
604
|
+
return false;
|
|
605
|
+
}
|
|
606
|
+
const errorType = this.classifyError(error);
|
|
607
|
+
// Retryable errors
|
|
608
|
+
const retryableTypes = [
|
|
609
|
+
'TimeoutError',
|
|
610
|
+
'NetworkError',
|
|
611
|
+
'ConnectionError',
|
|
612
|
+
'RateLimitError',
|
|
613
|
+
'ServerError', // 500, 502, 503
|
|
614
|
+
];
|
|
615
|
+
return retryableTypes.includes(errorType);
|
|
616
|
+
}
|
|
617
|
+
/**
|
|
618
|
+
* Get error breakdown by type
|
|
619
|
+
*/
|
|
620
|
+
getErrorBreakdown() {
|
|
621
|
+
const breakdown = {};
|
|
622
|
+
for (const error of this.errors) {
|
|
623
|
+
const errorType = error.error.split(':')[0] || 'UnknownError';
|
|
624
|
+
breakdown[errorType] = (breakdown[errorType] || 0) + 1;
|
|
625
|
+
}
|
|
626
|
+
return breakdown;
|
|
627
|
+
}
|
|
404
628
|
/**
|
|
405
629
|
* Delay helper for rate limiting
|
|
406
630
|
*/
|