@gulibs/safe-coder 0.0.22 → 0.0.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -25,6 +25,10 @@
 - Generates structured Agent Skill output (Markdown format)
 - Supports depth limits, page limits, and rate throttling
 - Automatically organizes content into a table of contents and sections
+- **SPA detection**: automatically detects single-page applications and offers suggestions
+- **Smart retry**: automatically retries transient errors to improve the success rate
+- **Error classification**: detailed error-type statistics and analysis
+- **Progress monitoring**: real-time crawl progress and performance metrics
 
 ### Web documentation browsing
 
@@ -376,6 +380,17 @@ pwd
 
 After configuration, you can use the following tools in Cursor's AI chat:
 
+**Available tools:**
+1. `get_documentation` - Fetch documentation
+2. `browse_documentation` - Browse web documentation
+3. `crawl_documentation` - Crawl documentation and generate a skill
+4. `detect_spa` - Detect single-page applications
+5. `detect_errors` - Detect code errors
+6. `validate_code` - Validate code
+7. `resolve_error` - Resolve errors
+8. `get_status` - Get server status
+9. `get_version` - Get the version number
+
 #### `get_documentation` - Fetch documentation
 
 Fetches documentation for a library or package.
@@ -431,9 +446,17 @@ pwd
 - `includePaths` (optional): additional path patterns to include
 - `excludePaths` (optional): path patterns to exclude
 - `rateLimit` (optional): interval between requests in milliseconds (default: 500)
+- `maxRetries` (optional): maximum number of retries for a failed request (default: 2)
+- `retryDelay` (optional): delay before retrying, in milliseconds (default: 1000)
 - `outputDir` (optional): directory to save the skill file to (if omitted, only the content is returned)
 - `filename` (optional): custom filename (without extension)
 
+**New features:**
+- ✅ **SPA detection**: automatically detects single-page applications (SPAs) and offers suggestions
+- ✅ **Smart retry**: automatically retries transient errors (timeouts, network errors, etc.)
+- ✅ **Error classification**: detailed error-type classification and statistics
+- ✅ **Progress logging**: real-time crawl progress and statistics
+
 **Usage example:**
 
 ```
@@ -507,6 +530,25 @@ function test() {
 I ran into an undefined-variable error with the message "Variable 'x' is used but not defined" on line 5
 ```
 
+#### `detect_spa` - Detect single-page applications
+
+Detects whether a website is a single-page application (SPA) that requires JavaScript rendering.
+
+**Parameters:**
+- `url` (required): the URL to check
+
+**Usage example:**
+
+```
+Check whether https://react-dnd.github.io/react-dnd/docs is a SPA
+```
+
+**Returns:**
+- `isSPA`: whether the site is a SPA
+- `confidence`: detection confidence (high/medium/low)
+- `indicators`: the SPA indicators that were detected
+- `suggestion`: a suggestion (if a SPA was detected)
+
 ### 4. Using MCP resources
 
 #### `safe-coder://documentation` - Cached documentation
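The documented return values above correspond to a small result object. A minimal sketch in TypeScript, where the type name `SpaDetectionResult` and the `string[]` type for `indicators` are our assumptions, not names from the package:

```ts
// Sketch of the detect_spa result shape documented above. The interface name
// and the exact element type of `indicators` are illustrative assumptions.
interface SpaDetectionResult {
  isSPA: boolean;
  confidence: 'high' | 'medium' | 'low';
  indicators: string[];  // assumed: list of detected SPA signals
  suggestion?: string;   // present when a SPA is detected
}
```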
@@ -5,6 +5,9 @@ export interface CrawlOptions {
     includePaths?: string[];
     excludePaths?: string[];
     rateLimit?: number;
+    maxRetries?: number;
+    retryDelay?: number;
+    useBrowserAutomation?: boolean;
 }
 export interface CrawledPage {
     url: string;
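A minimal sketch of populating the widened `CrawlOptions`, with default values taken from the README text above; the import path and the `excludePaths` value are illustrative:

```ts
import type { CrawlOptions } from './doc-crawler'; // path assumed for illustration

const options: CrawlOptions = {
    rateLimit: 500,              // ms between requests (documented default)
    maxRetries: 2,               // new in 0.0.24: retries for transient failures
    retryDelay: 1000,            // new in 0.0.24: base delay before a retry, in ms
    useBrowserAutomation: false, // new in 0.0.24: HTTP-only remains the default
    excludePaths: ['/blog'],     // hypothetical exclusion pattern
};
```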
@@ -34,7 +37,7 @@ export interface CrawledPage {
 export interface LinkDiscoveryStats {
     totalLinksFound: number;
     linksFiltered: {
-        notDocumentation: number;
+        notContent: number;
         externalDomain: number;
         alreadyVisited: number;
         excludedPattern: number;
@@ -44,6 +47,7 @@ export interface LinkDiscoveryStats {
     pagesDiscovered: number;
     pagesCrawled: number;
 }
+export type AbandonReason = 'insufficient_content' | 'media_only' | 'empty_pages' | 'no_structured_content';
 export interface CrawlResult {
     pages: CrawledPage[];
     totalPages: number;
@@ -53,6 +57,8 @@
         error: string;
     }>;
     linkDiscoveryStats: LinkDiscoveryStats;
+    abandoned?: boolean;
+    abandonReason?: AbandonReason;
 }
 export declare class DocumentationCrawler {
     private browser;
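How a caller might consume the new abandonment signal; a sketch that assumes a `crawl(rootUrl, options)` entry point on `DocumentationCrawler`, since the public method is not readable in this hunk:

```ts
// Sketch: reacting to the new abandoned/abandonReason fields on CrawlResult.
// The constructor call, crawl() signature, and URL are assumptions.
const crawler = new DocumentationCrawler();
const result = await crawler.crawl('https://example.com/docs', { maxRetries: 2 });
if (result.abandoned) {
    // abandonReason is one of the AbandonReason union members above
    console.warn(`Skill generation skipped: ${result.abandonReason}`);
}
```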
@@ -77,13 +83,34 @@ export declare class DocumentationCrawler {
      */
     private discoverDocumentationLinks;
     /**
-     * Check if a path is a documentation path
+     * Check if a path should be crawled (permissive - only exclude clearly non-content paths)
      */
     private isDocumentationPath;
     /**
      * Check if a path should be excluded
      */
     private shouldExclude;
+    /**
+     * Check if crawled content is sufficient for skill generation
+     * Similar logic to SkillGenerator but here for early validation
+     */
+    private canGenerateSkill;
+    /**
+     * Fetch a page with retry logic
+     */
+    private fetchPageWithRetry;
+    /**
+     * Classify error type for better error messages
+     */
+    private classifyError;
+    /**
+     * Check if an error is retryable
+     */
+    private isRetryableError;
+    /**
+     * Get error breakdown by type
+     */
+    private getErrorBreakdown;
     /**
      * Delay helper for rate limiting
      */
@@ -1 +1 @@
-{"version":3,"file":"doc-crawler.d.ts","sourceRoot":"","sources":["../../src/documentation/doc-crawler.ts"],"names":[],"mappings":"…"}
+{"version":3,"file":"doc-crawler.d.ts","sourceRoot":"","sources":["../../src/documentation/doc-crawler.ts"],"names":[],"mappings":"…"}
@@ -44,12 +44,15 @@ export class DocumentationCrawler {
             includePaths: [],
             excludePaths: [],
             rateLimit: 500, // 500ms default delay
+            maxRetries: 2, // Default 2 retries
+            retryDelay: 1000, // Default 1 second delay before retry
+            useBrowserAutomation: false, // Default to HTTP-only for backward compatibility
         };
         this.baseUrl = new URL('https://example.com');
         this.linkDiscoveryStats = {
             totalLinksFound: 0,
             linksFiltered: {
-                notDocumentation: 0,
+                notContent: 0,
                 externalDomain: 0,
                 alreadyVisited: 0,
                 excludedPattern: 0,
@@ -80,7 +83,7 @@
         this.linkDiscoveryStats = {
             totalLinksFound: 0,
             linksFiltered: {
-                notDocumentation: 0,
+                notContent: 0,
                 externalDomain: 0,
                 alreadyVisited: 0,
                 excludedPattern: 0,
@@ -102,22 +105,63 @@
         catch (error) {
             throw new Error(`Invalid root URL: ${rootUrl}`);
         }
-        // Check if root URL is documentation
-        logger.debug('Checking if URL is documentation page (HTTP request)', { url: rootUrl });
-        const isDoc = await this.browser.isDocumentationPage(rootUrl);
-        if (!isDoc) {
-            throw new Error(`The provided URL does not appear to be a documentation page: ${rootUrl}\n` +
-                `Note: For SPA sites that require JavaScript rendering, use Cursor/Claude's browser tools to get rendered HTML first, then process it.`);
+        // No longer require documentation-only pages - allow any website with extractable content
+        logger.debug('Starting crawl from URL (permissive mode)', { url: rootUrl });
+        // Detect SPA and provide warning
+        try {
+            const spaDetection = await this.browser.detectSPA(rootUrl);
+            if (spaDetection.isSPA && spaDetection.confidence !== 'low') {
+                logger.warn('SPA detected at root URL - crawling may be limited', {
+                    url: rootUrl,
+                    confidence: spaDetection.confidence,
+                    indicators: spaDetection.indicators,
+                    suggestion: spaDetection.suggestion,
+                });
+                // Add warning to first page if SPA detected
+                if (spaDetection.suggestion) {
+                    logger.info('SPA Detection Warning', {
+                        message: spaDetection.suggestion,
+                        recommendation: 'Consider using browser automation tools to get fully rendered content before crawling.',
+                    });
+                }
+            }
+        }
+        catch (error) {
+            // SPA detection failure is not critical, continue crawling
+            logger.debug('SPA detection failed, continuing with crawl', {
+                url: rootUrl,
+                error: error instanceof Error ? error.message : String(error),
+            });
         }
         // Start crawling from root
         this.urlQueue.push({ url: rootUrl, depth: 0 });
         let maxDepthReached = 0;
         // Process queue
+        const startTime = Date.now();
+        let lastProgressLog = Date.now();
+        const PROGRESS_LOG_INTERVAL = 5000; // Log progress every 5 seconds
         while (this.urlQueue.length > 0 && this.crawledPages.length < this.options.maxPages) {
             const queued = this.urlQueue.shift();
             if (!queued)
                 break;
             const { url, depth } = queued;
+            // Log progress periodically
+            const now = Date.now();
+            if (now - lastProgressLog >= PROGRESS_LOG_INTERVAL) {
+                const elapsed = ((now - startTime) / 1000).toFixed(1);
+                const pagesPerSecond = (this.crawledPages.length / elapsed).toFixed(2);
+                logger.info('Crawl progress', {
+                    pagesCrawled: this.crawledPages.length,
+                    pagesRemaining: this.urlQueue.length,
+                    maxPages: this.options.maxPages,
+                    errors: this.errors.length,
+                    elapsedSeconds: elapsed,
+                    pagesPerSecond,
+                    currentDepth: depth,
+                    maxDepth: this.options.maxDepth,
+                });
+                lastProgressLog = now;
+            }
             // Skip if already visited
             if (this.visitedUrls.has(url)) {
                 continue;
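The progress block added above follows a standard throttled-logging pattern; a self-contained restatement, with illustrative names:

```ts
// Throttled progress logging, as in the loop above: log at most once per
// interval from inside a hot loop, tracking the last log timestamp.
const PROGRESS_LOG_INTERVAL = 5000; // ms
let lastProgressLog = Date.now();

function maybeLogProgress(pagesCrawled: number, queueLength: number): void {
    const now = Date.now();
    if (now - lastProgressLog < PROGRESS_LOG_INTERVAL) return;
    console.info(`progress: crawled=${pagesCrawled}, queued=${queueLength}`);
    lastProgressLog = now;
}
```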
@@ -130,9 +174,20 @@ export class DocumentationCrawler {
             this.visitedUrls.add(url);
             maxDepthReached = Math.max(maxDepthReached, depth);
             try {
-                // Crawl the page using HTTP GET
+                // Crawl the page using HTTP GET with retry logic
                 logger.debug('Fetching page via HTTP GET', { url, depth, method: 'HTTP GET', client: 'axios' });
-                const page = await this.browser.browsePage(url);
+                const page = await this.fetchPageWithRetry(url);
+                // Check if page has minimal content (possible SPA issue)
+                const contentLength = page.content.length;
+                const linksCount = page.navigationLinks.length;
+                if (contentLength < 200 && linksCount < 3) {
+                    logger.warn('Page has minimal content - may be SPA', {
+                        url,
+                        contentLength,
+                        linksCount,
+                        suggestion: 'This page may require JavaScript rendering. Consider using browser automation tools.',
+                    });
+                }
                 // Convert to CrawledPage format
                 const crawledPage = {
                     url: page.url,
@@ -164,7 +219,7 @@ export class DocumentationCrawler {
                     discovered: newUrls.length,
                     filtered: discoveryResult.filtered,
                     alreadyVisited: discoveryResult.alreadyVisited,
-                    notDocumentation: discoveryResult.notDocumentation,
+                    notContent: discoveryResult.notContent,
                     externalDomain: discoveryResult.externalDomain,
                     excludedPattern: discoveryResult.excludedPattern,
                     queueLengthBefore: this.urlQueue.length,
@@ -204,20 +259,40 @@ export class DocumentationCrawler {
                 }
             }
             catch (error) {
+                const errorMessage = error instanceof Error ? error.message : String(error);
+                const errorType = this.classifyError(error);
                 this.errors.push({
                     url,
-                    error: error instanceof Error ? error.message : String(error),
+                    error: `${errorType}: ${errorMessage}`,
+                });
+                logger.warn('Page crawl failed', {
+                    url,
+                    error: errorMessage,
+                    errorType,
+                    depth,
+                    willContinue: true,
                 });
                 // Continue crawling other pages
             }
         }
         // Update final statistics
         this.linkDiscoveryStats.pagesDiscovered = this.visitedUrls.size;
-        // Log crawl completion with statistics
+        // Calculate final statistics
+        const totalTime = ((Date.now() - startTime) / 1000).toFixed(2);
+        const avgTimePerPage = this.crawledPages.length > 0
+            ? ((Date.now() - startTime) / this.crawledPages.length / 1000).toFixed(2)
+            : '0';
+        const successRate = this.linkDiscoveryStats.pagesDiscovered > 0
+            ? ((this.crawledPages.length / this.linkDiscoveryStats.pagesDiscovered) * 100).toFixed(1)
+            : '0';
+        // Log crawl completion with comprehensive statistics
         logger.info('Documentation crawl completed using HTTP client (axios)', {
             totalPages: this.crawledPages.length,
             maxDepthReached,
             errors: this.errors.length,
+            totalTimeSeconds: totalTime,
+            avgTimePerPageSeconds: avgTimePerPage,
+            successRate: `${successRate}%`,
             method: 'HTTP GET',
             client: 'axios/HttpClient',
             linkStats: {
@@ -227,13 +302,27 @@ export class DocumentationCrawler {
                 pagesDiscovered: this.linkDiscoveryStats.pagesDiscovered,
                 pagesCrawled: this.linkDiscoveryStats.pagesCrawled,
             },
+            errorBreakdown: this.getErrorBreakdown(),
         });
+        // Validate if content is sufficient for skill generation
+        const validation = this.canGenerateSkill(this.crawledPages);
+        const abandoned = !validation.canGenerate;
+        const abandonReason = validation.reason;
+        if (abandoned) {
+            logger.warn('Crawl completed but content is insufficient for skill generation', {
+                reason: abandonReason,
+                pagesCrawled: this.crawledPages.length,
+                suggestion: 'Consider crawling more pages or a different website',
+            });
+        }
         return {
             pages: this.crawledPages,
             totalPages: this.crawledPages.length,
             maxDepthReached,
             errors: this.errors,
             linkDiscoveryStats: this.linkDiscoveryStats,
+            abandoned,
+            abandonReason,
         };
     }
     /**
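A quick worked example of the completion metrics computed above, with hypothetical numbers:

```ts
// Hypothetical crawl: 30 pages crawled out of 36 discovered in 42.5 seconds.
const elapsedMs = 42500;
const pagesCrawled = 30;
const pagesDiscovered = 36;

const totalTime = (elapsedMs / 1000).toFixed(2);                         // "42.50"
const avgTimePerPage = (elapsedMs / pagesCrawled / 1000).toFixed(2);     // "1.42"
const successRate = ((pagesCrawled / pagesDiscovered) * 100).toFixed(1); // "83.3"
```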
@@ -242,7 +331,7 @@ export class DocumentationCrawler {
     discoverDocumentationLinks(page, nextDepth) {
         const discovered = [];
         const filtered = {
-            notDocumentation: 0,
+            notContent: 0, // Renamed from notDocumentation
             externalDomain: 0,
             alreadyVisited: 0,
             excludedPattern: 0,
@@ -273,11 +362,11 @@ export class DocumentationCrawler {
                 linkDetails.push({ url: link.url, reason: 'already_visited' });
                 continue;
             }
-            // Check if it's a documentation path
+            // Check if it's a valid content path (permissive - only exclude clearly non-content)
             if (!this.isDocumentationPath(linkUrl.pathname)) {
-                filtered.notDocumentation++;
-                this.linkDiscoveryStats.linksFiltered.notDocumentation++;
-                linkDetails.push({ url: link.url, reason: 'not_documentation_path', pathname: linkUrl.pathname });
+                filtered.notContent++;
+                this.linkDiscoveryStats.linksFiltered.notContent++;
+                linkDetails.push({ url: link.url, reason: 'not_content_path', pathname: linkUrl.pathname });
                 continue;
             }
             // Check exclude patterns
@@ -291,8 +380,8 @@ export class DocumentationCrawler {
             if (this.options.includePaths.length > 0) {
                 const matchesInclude = this.options.includePaths.some(pattern => linkUrl.pathname.includes(pattern));
                 if (!matchesInclude) {
-                    filtered.notDocumentation++;
-                    this.linkDiscoveryStats.linksFiltered.notDocumentation++;
+                    filtered.notContent++;
+                    this.linkDiscoveryStats.linksFiltered.notContent++;
                     linkDetails.push({ url: link.url, reason: 'not_in_include_paths', pathname: linkUrl.pathname });
                     continue;
                 }
@@ -333,7 +422,7 @@ export class DocumentationCrawler {
             totalLinks: page.navigationLinks.length,
             discovered: discovered.length,
             filtered: {
-                notDocumentation: filtered.notDocumentation,
+                notContent: filtered.notContent,
                 externalDomain: filtered.externalDomain,
                 alreadyVisited: filtered.alreadyVisited,
                 excludedPattern: filtered.excludedPattern,
@@ -348,52 +437,42 @@ export class DocumentationCrawler {
             discovered,
             filtered,
             alreadyVisited: filtered.alreadyVisited,
-            notDocumentation: filtered.notDocumentation,
+            notContent: filtered.notContent,
             externalDomain: filtered.externalDomain,
             excludedPattern: filtered.excludedPattern,
         };
     }
     /**
-     * Check if a path is a documentation path
+     * Check if a path should be crawled (permissive - only exclude clearly non-content paths)
      */
     isDocumentationPath(pathname) {
-        // Check against documentation patterns
-        const matchesPattern = this.DOCUMENTATION_PATTERNS.some(pattern => pattern.test(pathname));
-        // If it matches a pattern, it's definitely documentation
-        if (matchesPattern) {
+        // Exclude clearly non-content pages
+        if (this.shouldExclude(pathname)) {
+            return false;
+        }
+        // Exclude static resources
+        const looksLikeStaticResource = /\.(?:css|js|json|xml|png|jpg|jpeg|gif|svg|ico|woff|woff2|ttf|eot|pdf|zip|exe|dmg)$/i.test(pathname);
+        if (looksLikeStaticResource) {
+            return false;
+        }
+        // Exclude API endpoints that are clearly not content (unless they're documentation APIs)
+        // Keep API endpoints that might be documentation (e.g., /api/docs, /docs/api)
+        const looksLikeApiEndpoint = /^\/api\/[^/]+$/i.test(pathname);
+        if (looksLikeApiEndpoint && !pathname.includes('/docs') && !pathname.includes('/documentation')) {
+            return false;
+        }
+        // Allow root path
+        if (pathname === '/' || pathname === '') {
             return true;
         }
-        // Additional check: if the base URL is a documentation page (which we verified at start),
-        // then paths on the same domain are likely documentation too (unless they match excluded patterns)
-        // This helps with sites that have documentation at root level or non-standard paths
-        // Only apply this if the path doesn't match excluded patterns
-        if (!this.shouldExclude(pathname)) {
-            // If pathname is just "/" or empty, it's the root - check if base URL was documentation
-            if (pathname === '/' || pathname === '') {
-                return true; // Root of a documentation site is documentation
-            }
-            // For documentation sites, be more permissive:
-            // 1. If path contains common documentation keywords
-            // 2. If path looks like a documentation structure (no file extensions like .html, .php, etc.)
-            // 3. If path doesn't look like an API endpoint or static resource
-            const hasFileExtension = /\.[a-z]{2,4}$/i.test(pathname.split('?')[0]);
-            const looksLikeStaticResource = /\.(?:css|js|json|xml|png|jpg|jpeg|gif|svg|ico|woff|woff2|ttf|eot)$/i.test(pathname);
-            const looksLikeApiEndpoint = /^\/api\/[^/]+$/i.test(pathname) && !/\/docs\/api\//i.test(pathname);
-            // If it's a static resource or API endpoint (not docs), exclude it
-            if (looksLikeStaticResource || (looksLikeApiEndpoint && !pathname.includes('/docs'))) {
-                return false;
-            }
-            // If it has a file extension (but not a static resource), be conservative
-            if (hasFileExtension) {
-                return false;
-            }
-            // For paths without file extensions, check if they contain documentation keywords
-            // OR if they're under common documentation paths
-            const looksLikeDoc = /(?:doc|guide|tutorial|api|reference|manual|help|about|getting-started|examples?)/i.test(pathname);
-            const isUnderDocPath = /^\/(?:docs?|documentation|guides?|tutorials?|api|reference|manual|help|examples?)/i.test(pathname);
-            return looksLikeDoc || isUnderDocPath;
+        // Exclude paths with file extensions (unless they're HTML pages)
+        const hasFileExtension = /\.[a-z]{2,4}$/i.test(pathname.split('?')[0]);
+        if (hasFileExtension && !pathname.match(/\.(html?|htm)$/i)) {
+            return false;
         }
-        return false;
+        // Permissive: allow any path that doesn't match exclusion patterns
+        // This allows crawling any website, not just documentation
+        return true;
     }
     /**
      * Check if a path should be excluded
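To make the new permissive behavior concrete, here is a self-contained restatement of the filter above. The private `EXCLUDED_PATTERNS` check is dropped, and `looksCrawlable` is our name, not the package's:

```ts
// Mirrors the rewritten isDocumentationPath above, minus the private
// shouldExclude/EXCLUDED_PATTERNS branch. Expected results in comments.
function looksCrawlable(pathname: string): boolean {
    // Static resources are never content
    if (/\.(?:css|js|json|xml|png|jpg|jpeg|gif|svg|ico|woff|woff2|ttf|eot|pdf|zip|exe|dmg)$/i.test(pathname)) {
        return false;
    }
    // Bare API endpoints are excluded unless they look like docs APIs
    if (/^\/api\/[^/]+$/i.test(pathname) && !pathname.includes('/docs') && !pathname.includes('/documentation')) {
        return false;
    }
    if (pathname === '/' || pathname === '') return true;
    // Other file extensions are excluded unless they are HTML pages
    const hasFileExtension = /\.[a-z]{2,4}$/i.test(pathname.split('?')[0]);
    if (hasFileExtension && !/\.html?$/i.test(pathname)) return false;
    return true; // permissive default: anything else is crawlable
}

looksCrawlable('/guide/intro');    // true  - no extension, not excluded
looksCrawlable('/assets/app.png'); // false - static resource
looksCrawlable('/api/users');      // false - bare API endpoint
looksCrawlable('/api/docs');       // true  - docs-flavored API path kept
looksCrawlable('/page.html');      // true  - HTML page allowed
```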
@@ -401,6 +480,151 @@ export class DocumentationCrawler {
     shouldExclude(pathname) {
         return this.EXCLUDED_PATTERNS.some(pattern => pattern.test(pathname));
     }
+    /**
+     * Check if crawled content is sufficient for skill generation
+     * Similar logic to SkillGenerator but here for early validation
+     */
+    canGenerateSkill(pages) {
+        if (pages.length === 0) {
+            return { canGenerate: false, reason: 'empty_pages' };
+        }
+        const MIN_CONTENT_LENGTH = 100;
+        let hasSufficientContent = false;
+        let hasStructuredContent = false;
+        let hasTextContent = false;
+        let mediaOnlyCount = 0;
+        for (const page of pages) {
+            const contentLength = (page.content || '').trim().length;
+            const hasHeadings = page.headings && page.headings.length > 0;
+            const hasText = contentLength > 0;
+            // Check if page is media-only (has images but no text)
+            const hasImages = /<img[^>]*>/i.test(page.content || '');
+            const hasMedia = hasImages || (page.codeSamples && page.codeSamples.length > 0);
+            if (hasMedia && contentLength < MIN_CONTENT_LENGTH) {
+                mediaOnlyCount++;
+            }
+            if (contentLength >= MIN_CONTENT_LENGTH) {
+                hasSufficientContent = true;
+            }
+            if (hasHeadings) {
+                hasStructuredContent = true;
+            }
+            if (hasText) {
+                hasTextContent = true;
+            }
+        }
+        // All pages are media-only
+        if (mediaOnlyCount === pages.length && !hasTextContent) {
+            return { canGenerate: false, reason: 'media_only' };
+        }
+        // No pages have sufficient content
+        if (!hasSufficientContent) {
+            return { canGenerate: false, reason: 'insufficient_content' };
+        }
+        // No structured content (headings, sections)
+        if (!hasStructuredContent) {
+            return { canGenerate: false, reason: 'no_structured_content' };
+        }
+        return { canGenerate: true };
+    }
+    /**
+     * Fetch a page with retry logic
+     */
+    async fetchPageWithRetry(url, retryCount = 0) {
+        try {
+            return await this.browser.browsePage(url);
+        }
+        catch (error) {
+            const errorType = this.classifyError(error);
+            const isRetryable = this.isRetryableError(error);
+            if (isRetryable && retryCount < this.options.maxRetries) {
+                const delay = this.options.retryDelay * (retryCount + 1); // Exponential backoff
+                logger.info('Retrying page fetch', {
+                    url,
+                    retryCount: retryCount + 1,
+                    maxRetries: this.options.maxRetries,
+                    delay,
+                    errorType,
+                });
+                await this.delay(delay);
+                return this.fetchPageWithRetry(url, retryCount + 1);
+            }
+            // Not retryable or max retries reached
+            throw error;
+        }
+    }
+    /**
+     * Classify error type for better error messages
+     */
+    classifyError(error) {
+        if (!(error instanceof Error)) {
+            return 'UnknownError';
+        }
+        const message = error.message.toLowerCase();
+        const errorName = error.name.toLowerCase();
+        // Network errors
+        if (errorName.includes('timeout') || message.includes('timeout')) {
+            return 'TimeoutError';
+        }
+        if (errorName.includes('network') || message.includes('network') || message.includes('econnrefused')) {
+            return 'NetworkError';
+        }
+        if (message.includes('econnreset') || message.includes('socket')) {
+            return 'ConnectionError';
+        }
+        // HTTP errors
+        if (errorName.includes('http') || message.includes('status')) {
+            if (message.includes('404'))
+                return 'NotFoundError';
+            if (message.includes('403'))
+                return 'ForbiddenError';
+            if (message.includes('401'))
+                return 'UnauthorizedError';
+            if (message.includes('429'))
+                return 'RateLimitError';
+            if (message.includes('500') || message.includes('502') || message.includes('503')) {
+                return 'ServerError';
+            }
+            return 'HttpError';
+        }
+        // Content errors
+        if (message.includes('documentation') || message.includes('not appear to be')) {
+            return 'NotDocumentationError';
+        }
+        if (message.includes('spa') || message.includes('javascript')) {
+            return 'SPAError';
+        }
+        return 'UnknownError';
+    }
+    /**
+     * Check if an error is retryable
+     */
+    isRetryableError(error) {
+        if (!(error instanceof Error)) {
+            return false;
+        }
+        const errorType = this.classifyError(error);
+        // Retryable errors
+        const retryableTypes = [
+            'TimeoutError',
+            'NetworkError',
+            'ConnectionError',
+            'RateLimitError',
+            'ServerError', // 500, 502, 503
+        ];
+        return retryableTypes.includes(errorType);
+    }
+    /**
+     * Get error breakdown by type
+     */
+    getErrorBreakdown() {
+        const breakdown = {};
+        for (const error of this.errors) {
+            const errorType = error.error.split(':')[0] || 'UnknownError';
+            breakdown[errorType] = (breakdown[errorType] || 0) + 1;
+        }
+        return breakdown;
+    }
     /**
      * Delay helper for rate limiting
      */
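Putting the new private helpers together: the crawl loop retries only errors that classify as transient, with a linearly growing delay of `retryDelay * attemptNumber`. A self-contained sketch of that policy (the wrapper and its names are ours, not the package API):

```ts
// Retry policy sketch matching fetchPageWithRetry above: transient errors
// (timeout, network, connection, 429, 5xx) are retried up to maxRetries
// times, with delay = retryDelay * attemptNumber; anything else fails fast.
async function withRetry<T>(
    fn: () => Promise<T>,
    isRetryable: (error: unknown) => boolean, // stand-in for isRetryableError
    maxRetries = 2,
    retryDelay = 1000,
): Promise<T> {
    for (let attempt = 0; ; attempt++) {
        try {
            return await fn();
        }
        catch (error) {
            if (attempt >= maxRetries || !isRetryable(error)) {
                throw error;
            }
            // Delay grows linearly with the attempt number: 1000ms, 2000ms, ...
            await new Promise((resolve) => setTimeout(resolve, retryDelay * (attempt + 1)));
        }
    }
}
```

With the defaults above this makes at most three attempts per page (one initial fetch plus two retries), matching the documented `maxRetries: 2` and `retryDelay: 1000`.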