npm - solo-doc - Versions diffs - 0.3.2 → 0.3.4 - Mend

solo-doc 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

package/README.md +36 -1
package/dist/bin/solo-doc.js +6 -46
package/dist/src/commands/VSCommand.js +2 -32
package/dist/src/strategies/OCPStrategy.js +47 -0
package/dist/src/utils/filename.js +17 -2
package/dist/src/utils/multiUrlCrawler.js +118 -0
package/package.json +1 -1

package/README.md CHANGED

@@ -44,6 +44,16 @@ solo-doc "https://docs.alauda.io/container_platform/4.2/developer/building_appli
 solo-doc "https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html-single/building_applications/index"
 ```
+### 🔗 多地址聚合 (Multi-URL Aggregation)
+支持一次性爬取多个 URL 并将其合并为一个 Markdown 文件。多个地址之间使用逗号分隔。
+```bash
+# 爬取多个 URL 并合并
+# 将依次爬取 url1 和 url2，合并输出到 combined.md
+solo-doc "https://url1/...,https://url2/..." -o combined.md
+```
 ### 📝 自定义输出文件名
 使用 `-o` 参数指定自定义输出路径。
@@ -112,8 +122,33 @@ solo-doc vs \
 solo-doc vs <url1> <url2>
 ```
+#### 📂 本地文件与多源聚合
+VS 模式支持直接使用本地 Markdown 文件进行对比，无需重复爬取。同时也支持将多个源（URL 或本地文件）聚合后再进行对比。
+```bash
+# 1. 对比多个在线文档 (聚合对比)
+# 场景：将 v1.0 的多个模块文档聚合，与 v2.0 的对应模块进行对比
+solo-doc vs \
+  "https://docs.site/v1/module-a,https://docs.site/v1/module-b" \
+  "https://docs.site/v2/module-a,https://docs.site/v2/module-b"
+# 2. 对比本地文件与在线文档
+solo-doc vs ./local-draft.md "https://docs.prod.com/..."
+# 3. 多文件聚合对比
+# 将 part1.md 和 part2.md 合并作为基准，与 target.md 对比
+solo-doc vs ./part1.md,./part2.md ./target.md
+# 4. 混合使用 (URL + 本地文件)
+solo-doc vs ./intro.md,"https://docs.site/chapter1" ./v2-draft.md
+```
 此命令将按顺序执行：
-1. **自动爬取**: 分别爬取两个 URL 并保存为 Markdown 文件（如果已存在则跳过）。
+1. **数据准备**:
+   - 如果是 URL：自动爬取并保存为 Markdown。
+   - 如果是本地文件：直接读取内容。
+   - 如果是多个源：按顺序合并为一个聚合文件。
 2. **提取目录**: 提取两个文档的目录树结构。
 3. **AI 分析**: 调用配置的 AI 模型，根据 `solo-doc-prompt.md` 定义的提示词进行两步分析：
    > **提示**: `solo-doc` 会优先使用当前执行目录下的 `solo-doc-prompt.md` 文件。你可以复制默认模板到当前目录进行自定义修改。若当前目录不存在该文件，则使用内置默认模板。

package/dist/bin/solo-doc.js CHANGED

@@ -5,12 +5,8 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
 };
 Object.defineProperty(exports, "__esModule", { value: true });
 const commander_1 = require("commander");
-const CrawlerContext_1 = require("../src/CrawlerContext");
-const OCPStrategy_1 = require("../src/strategies/OCPStrategy");
-const ACPStrategy_1 = require("../src/strategies/ACPStrategy");
-const StrategyDetector_1 = require("../src/utils/StrategyDetector");
-const filename_1 = require("../src/utils/filename");
 const VSCommand_1 = require("../src/commands/VSCommand");
+const multiUrlCrawler_1 = require("../src/utils/multiUrlCrawler");
 const chalk_1 = __importDefault(require("chalk"));
 const path_1 = __importDefault(require("path"));
 const fs_1 = __importDefault(require("fs"));
@@ -63,48 +59,12 @@ program
     .option('-f, --force', 'Force overwrite existing file')
     .action(async (url, options) => {
     try {
-        // 1. Determine Strategy
-        let type = options.type;
-        if (!type) {
-            const detected = StrategyDetector_1.StrategyDetector.detect(url);
-            if (detected !== StrategyDetector_1.StrategyType.UNKNOWN) {
-                type = detected;
-                console.log(chalk_1.default.blue(`[Solo-Doc] Auto-detected strategy: ${type.toUpperCase()}`));
-            }
-        }
-        if (!type || (type !== 'ocp' && type !== 'acp')) {
-            console.error(chalk_1.default.red('Error: Could not detect documentation type.'));
-            console.error(chalk_1.default.yellow('Please use --type <ocp|acp> to specify the documentation type manually.'));
-            process.exit(1);
-        }
-        // 2. Instantiate Strategy
-        let strategy;
-        let defaultOutput;
-        if (type === 'ocp' || type === StrategyDetector_1.StrategyType.OCP) {
-            strategy = new OCPStrategy_1.OCPStrategy();
-            defaultOutput = (0, filename_1.generateDefaultFilename)(url, 'ocp');
-        }
-        else {
-            strategy = new ACPStrategy_1.ACPStrategy();
-            defaultOutput = (0, filename_1.generateDefaultFilename)(url, 'acp');
-        }
-        // 3. Prepare Context
-        const context = new CrawlerContext_1.CrawlerContext(strategy);
-        const outputPath = path_1.default.resolve(process.cwd(), options.output || defaultOutput);
-        // Check if file exists
-        if (fs_1.default.existsSync(outputPath) && !options.force) {
-            console.log(chalk_1.default.yellow('--------------------------------------------------'));
-            console.log(chalk_1.default.yellow(`ℹ  File already exists: ${outputPath}`));
-            console.log(chalk_1.default.yellow('   Skipping crawl to save time.'));
-            console.log(chalk_1.default.gray('   Use --force or -f to overwrite.'));
-            console.log(chalk_1.default.yellow('--------------------------------------------------'));
-            return;
-        }
-        // 4. Run
-        await context.run(url, {
-            output: outputPath,
+        await (0, multiUrlCrawler_1.crawlAndAggregate)(url, {
+            output: options.output,
+            type: options.type,
             limit: options.limit,
-            headless: options.headless
+            headless: options.headless,
+            force: options.force
         });
     }
     catch (error) {

package/dist/src/commands/VSCommand.js CHANGED

@@ -8,14 +8,10 @@ const fs_1 = __importDefault(require("fs"));
 const path_1 = __importDefault(require("path"));
 const chalk_1 = __importDefault(require("chalk"));
 const ora_1 = __importDefault(require("ora"));
-const CrawlerContext_1 = require("../CrawlerContext");
-const OCPStrategy_1 = require("../strategies/OCPStrategy");
-const ACPStrategy_1 = require("../strategies/ACPStrategy");
-const StrategyDetector_1 = require("../utils/StrategyDetector");
-const filename_1 = require("../utils/filename");
 const TocExtractor_1 = require("../utils/TocExtractor");
 const AIClient_1 = require("../ai/AIClient");
 const config_1 = require("../utils/config");
+const multiUrlCrawler_1 = require("../utils/multiUrlCrawler");
 class VSCommand {
     static async run(baselineUrl, targetUrl, options) {
         console.log(chalk_1.default.blue(`[VS Mode] Starting comparison between:`));
@@ -124,33 +120,7 @@ ${result1}
         console.log(chalk_1.default.green(`[VS Mode] All tasks finished.`));
     }
     static async crawlUrl(url, prefix) {
-        // Try to detect type
-        let type = StrategyDetector_1.StrategyDetector.detect(url);
-        let strategy;
-        // Simple logic: if detects OCP, use OCP. Else ACP (more generic).
-        if (type === StrategyDetector_1.StrategyType.OCP) {
-            strategy = new OCPStrategy_1.OCPStrategy();
-        }
-        else {
-            // Default to ACP which uses Puppeteer
-            strategy = new ACPStrategy_1.ACPStrategy();
-        }
-        const filename = (0, filename_1.generateDefaultFilename)(url, prefix);
-        const outputPath = path_1.default.resolve(process.cwd(), filename);
-        // Check if file exists
-        if (fs_1.default.existsSync(outputPath)) {
-            console.log(chalk_1.default.yellow('--------------------------------------------------'));
-            console.log(chalk_1.default.yellow(`ℹ  File already exists: ${outputPath}`));
-            console.log(chalk_1.default.yellow('   Using cached version for comparison.'));
-            console.log(chalk_1.default.yellow('--------------------------------------------------'));
-            return outputPath;
-        }
-        console.log(chalk_1.default.blue(`[VS Mode] Crawling ${url} -> ${filename}...`));
-        const context = new CrawlerContext_1.CrawlerContext(strategy);
-        // Suppress console log from crawler to keep output clean?
-        // Or keep it to show progress. Keep it.
-        await context.run(url, { output: outputPath, headless: true });
-        return outputPath;
+        return await (0, multiUrlCrawler_1.crawlAndAggregate)(url, { prefix });
     }
     static parsePrompts(content) {
         // Match content inside ``` ... ``` blocks that follow "Prompt模板"

package/dist/src/strategies/OCPStrategy.js CHANGED

@@ -56,6 +56,53 @@ class OCPStrategy {
             console.log(chalk_1.default.cyan(`Fixed:    ${fixedUrl}`));
             url = fixedUrl;
         }
+        // Feature: Landing Page Detection (Product Index)
+        // If URL doesn't look like a specific book (no /html/ or /html-single/), treat as landing page
+        if (!url.includes('/html/') && !url.includes('/html-single/')) {
+            console.log(chalk_1.default.blue(`[OCP] Detected Landing Page (Product Index). Scanning for books...`));
+            try {
+                const { data } = await axios_1.default.get(url);
+                const $ = cheerio.load(data);
+                const baseUrl = new URL(url);
+                const orderedLinks = [];
+                $('a').each((_, el) => {
+                    const href = $(el).attr('href');
+                    if (href && (href.includes('/html/') || href.includes('/html-single/'))) {
+                        try {
+                            const absoluteUrl = new URL(href, baseUrl).href;
+                            // Basic filtering to avoid navigating away or to unrelated docs
+                            // We assume valid books share the same base path prefix usually, but let's be permissive for now
+                            if (!orderedLinks.includes(absoluteUrl)) {
+                                orderedLinks.push(absoluteUrl);
+                            }
+                        }
+                        catch (e) { }
+                    }
+                });
+                if (orderedLinks.length > 0) {
+                    console.log(chalk_1.default.green(`[OCP] Found ${orderedLinks.length} books/sections. Starting recursive crawl...`));
+                    let aggregatedMarkdown = `# ${$('title').text().trim()} (Aggregated)\n\n`;
+                    for (const [index, bookUrl] of orderedLinks.entries()) {
+                        console.log(chalk_1.default.blue(`\n[OCP] Processing Book ${index + 1}/${orderedLinks.length}: ${bookUrl}`));
+                        try {
+                            // Recursive call
+                            const bookMarkdown = await this.execute(bookUrl, options);
+                            aggregatedMarkdown += `\n\n---\n\n${bookMarkdown}`;
+                        }
+                        catch (e) {
+                            console.error(chalk_1.default.red(`[OCP] Failed to crawl book ${bookUrl}: ${e.message}`));
+                        }
+                    }
+                    return aggregatedMarkdown;
+                }
+                else {
+                    console.log(chalk_1.default.yellow(`[OCP] No books found on landing page. Treating as single page.`));
+                }
+            }
+            catch (e) {
+                console.warn(chalk_1.default.yellow(`[OCP] Failed to scan landing page: ${e.message}. Proceeding as standard page.`));
+            }
+        }
         // Optimisation: Try to convert multi-page URL (/html/) to single-page URL (/html-single/)
         // Example: .../html/building_applications/index -> .../html-single/building_applications/index
         if (url.includes('/html/') && !url.includes('/html-single/')) {

package/dist/src/utils/filename.js CHANGED

@@ -6,9 +6,24 @@ const generateDefaultFilename = (urlStr, typePrefix) => {
         const u = new URL(urlStr);
         // Get the last path segment that isn't 'index.html' or 'index' or empty
         const segments = u.pathname.split('/').filter(s => s && s !== 'index.html' && s !== 'index');
-        const lastSegment = segments.length > 0 ? segments[segments.length - 1] : 'docs';
+        let safeName = 'docs';
+        if (segments.length > 0) {
+            const lastSegment = segments[segments.length - 1];
+            // Check if last segment is a version number (e.g., "3.2", "4.14", "v1.0")
+            // simple check: starts with digit or v+digit
+            const isVersion = /^[vV]?\d+(\.\d+)*$/.test(lastSegment);
+            if (isVersion && segments.length > 1) {
+                // If it's a version, prepend the previous segment for context
+                // e.g., .../service_mesh/3.2 -> service_mesh_3.2
+                const productSegment = segments[segments.length - 2];
+                safeName = `${productSegment}_${lastSegment}`;
+            }
+            else {
+                safeName = lastSegment;
+            }
+        }
         // Sanitize filename
-        const safeName = lastSegment.replace(/[^a-zA-Z0-9-_]/g, '_');
+        safeName = safeName.replace(/[^a-zA-Z0-9-_]/g, '_');
         return `${typePrefix}-${safeName}.md`;
     }
     catch (e) {

package/dist/src/utils/multiUrlCrawler.js ADDED

@@ -0,0 +1,118 @@
+"use strict";
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.crawlAndAggregate = crawlAndAggregate;
+const fs_1 = __importDefault(require("fs"));
+const path_1 = __importDefault(require("path"));
+const chalk_1 = __importDefault(require("chalk"));
+const StrategyDetector_1 = require("./StrategyDetector");
+const OCPStrategy_1 = require("../strategies/OCPStrategy");
+const ACPStrategy_1 = require("../strategies/ACPStrategy");
+const filename_1 = require("./filename");
+async function crawlAndAggregate(urlInput, options) {
+    const inputs = urlInput.split(',').map(u => u.trim()).filter(u => u.length > 0);
+    if (inputs.length === 0)
+        throw new Error("No valid inputs provided");
+    // Determine output path
+    let outputPath;
+    if (options.output) {
+        outputPath = path_1.default.resolve(process.cwd(), options.output);
+    }
+    else {
+        const firstInput = inputs[0];
+        // Check if first input is a local file
+        const isFile = fs_1.default.existsSync(firstInput) && fs_1.default.statSync(firstInput).isFile();
+        let prefix = options.prefix;
+        if (!prefix) {
+            if (isFile) {
+                prefix = 'local';
+            }
+            else {
+                let type = options.type;
+                if (!type) {
+                    const detected = StrategyDetector_1.StrategyDetector.detect(firstInput);
+                    type = detected !== StrategyDetector_1.StrategyType.UNKNOWN ? detected : 'acp';
+                }
+                prefix = type;
+            }
+        }
+        if (isFile) {
+            const basename = path_1.default.basename(firstInput, path_1.default.extname(firstInput));
+            outputPath = path_1.default.resolve(process.cwd(), `${prefix}-${basename}.md`);
+        }
+        else {
+            const defaultName = (0, filename_1.generateDefaultFilename)(firstInput, prefix);
+            outputPath = path_1.default.resolve(process.cwd(), defaultName);
+        }
+    }
+    // Check cache
+    if (fs_1.default.existsSync(outputPath) && !options.force) {
+        console.log(chalk_1.default.yellow('--------------------------------------------------'));
+        console.log(chalk_1.default.yellow(`ℹ  File already exists: ${outputPath}`));
+        console.log(chalk_1.default.yellow('   Skipping crawl/read to save time. (Use --force to overwrite)'));
+        console.log(chalk_1.default.yellow('--------------------------------------------------'));
+        return outputPath;
+    }
+    let aggregatedMarkdown = '';
+    console.log(chalk_1.default.blue(`[Multi-URL] Found ${inputs.length} inputs to process.`));
+    for (const [index, input] of inputs.entries()) {
+        console.log(chalk_1.default.blue(`\n[Multi-URL] Processing ${index + 1}/${inputs.length}: ${input}`));
+        // 1. Check if local file
+        if (fs_1.default.existsSync(input) && fs_1.default.statSync(input).isFile()) {
+            console.log(chalk_1.default.green(`[Solo-Doc] Detected local file. Reading content...`));
+            const content = fs_1.default.readFileSync(input, 'utf-8');
+            if (index > 0) {
+                aggregatedMarkdown += '\n\n---\n\n';
+            }
+            aggregatedMarkdown += content;
+            continue;
+        }
+        // 2. Assume URL and Crawl
+        let strategy;
+        let strategyType = options.type;
+        if (!strategyType) {
+            const detected = StrategyDetector_1.StrategyDetector.detect(input);
+            if (detected !== StrategyDetector_1.StrategyType.UNKNOWN) {
+                strategyType = detected;
+                console.log(chalk_1.default.blue(`[Solo-Doc] Auto-detected strategy: ${strategyType.toUpperCase()}`));
+            }
+            else {
+                strategyType = 'acp'; // Default
+                console.log(chalk_1.default.yellow(`[Solo-Doc] Could not detect strategy, defaulting to ACP`));
+            }
+        }
+        else {
+            console.log(chalk_1.default.blue(`[Solo-Doc] Using forced strategy: ${strategyType.toUpperCase()}`));
+        }
+        if (strategyType === 'ocp' || strategyType === StrategyDetector_1.StrategyType.OCP) {
+            strategy = new OCPStrategy_1.OCPStrategy();
+        }
+        else {
+            strategy = new ACPStrategy_1.ACPStrategy();
+        }
+        try {
+            const markdown = await strategy.execute(input, {
+                output: outputPath,
+                limit: options.limit,
+                headless: options.headless
+            });
+            // Add separator if not first
+            if (index > 0) {
+                aggregatedMarkdown += '\n\n---\n\n';
+            }
+            aggregatedMarkdown += markdown;
+        }
+        catch (e) {
+            console.error(chalk_1.default.red(`[Solo-Doc] Error during crawl of ${input}: ${e.message}`));
+            // Continue with other URLs? Or fail hard?
+            // Requirement doesn't specify, but failing hard is safer to avoid incomplete docs.
+            throw e;
+        }
+    }
+    // Write to file
+    fs_1.default.writeFileSync(outputPath, aggregatedMarkdown);
+    console.log(chalk_1.default.green(`\n✔ Aggregated content written to: ${outputPath}`));
+    return outputPath;
+}

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "solo-doc",
-  "version": "0.3.2",
+  "version": "0.3.4",
   "main": "dist/bin/solo-doc.js",
   "bin": {
     "solo-doc": "dist/bin/solo-doc.js"