solo-doc 0.3.2 → 0.3.4

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
package/README.md CHANGED
@@ -44,6 +44,16 @@ solo-doc "https://docs.alauda.io/container_platform/4.2/developer/building_appli
44
44
  solo-doc "https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html-single/building_applications/index"
45
45
  ```
46
46
 
47
+ ### 🔗 多地址聚合 (Multi-URL Aggregation)
48
+
49
+ 支持一次性爬取多个 URL 并将其合并为一个 Markdown 文件。多个地址之间使用逗号分隔。
50
+
51
+ ```bash
52
+ # 爬取多个 URL 并合并
53
+ # 将依次爬取 url1 和 url2,合并输出到 combined.md
54
+ solo-doc "https://url1/...,https://url2/..." -o combined.md
55
+ ```
56
+
47
57
  ### 📝 自定义输出文件名
48
58
 
49
59
  使用 `-o` 参数指定自定义输出路径。
@@ -112,8 +122,33 @@ solo-doc vs \
112
122
  solo-doc vs <url1> <url2>
113
123
  ```
114
124
 
125
+ #### 📂 本地文件与多源聚合
126
+
127
+ VS 模式支持直接使用本地 Markdown 文件进行对比,无需重复爬取。同时也支持将多个源(URL 或本地文件)聚合后再进行对比。
128
+
129
+ ```bash
130
+ # 1. 对比多个在线文档 (聚合对比)
131
+ # 场景:将 v1.0 的多个模块文档聚合,与 v2.0 的对应模块进行对比
132
+ solo-doc vs \
133
+ "https://docs.site/v1/module-a,https://docs.site/v1/module-b" \
134
+ "https://docs.site/v2/module-a,https://docs.site/v2/module-b"
135
+
136
+ # 2. 对比本地文件与在线文档
137
+ solo-doc vs ./local-draft.md "https://docs.prod.com/..."
138
+
139
+ # 3. 多文件聚合对比
140
+ # 将 part1.md 和 part2.md 合并作为基准,与 target.md 对比
141
+ solo-doc vs ./part1.md,./part2.md ./target.md
142
+
143
+ # 4. 混合使用 (URL + 本地文件)
144
+ solo-doc vs ./intro.md,"https://docs.site/chapter1" ./v2-draft.md
145
+ ```
146
+
115
147
  此命令将按顺序执行:
116
- 1. **自动爬取**: 分别爬取两个 URL 并保存为 Markdown 文件(如果已存在则跳过)。
148
+ 1. **数据准备**(若对应的输出文件已存在,则直接使用该文件,跳过下列处理):
149
+ - 如果是 URL:自动爬取并保存为 Markdown。
150
+ - 如果是本地文件:直接读取内容。
151
+ - 如果是多个源:按顺序合并为一个聚合文件。
117
152
  2. **提取目录**: 提取两个文档的目录树结构。
118
153
  3. **AI 分析**: 调用配置的 AI 模型,根据 `solo-doc-prompt.md` 定义的提示词进行两步分析:
119
154
  > **提示**: `solo-doc` 会优先使用当前执行目录下的 `solo-doc-prompt.md` 文件。你可以复制默认模板到当前目录进行自定义修改。若当前目录不存在该文件,则使用内置默认模板。
@@ -5,12 +5,8 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
5
5
  };
6
6
  Object.defineProperty(exports, "__esModule", { value: true });
7
7
  const commander_1 = require("commander");
8
- const CrawlerContext_1 = require("../src/CrawlerContext");
9
- const OCPStrategy_1 = require("../src/strategies/OCPStrategy");
10
- const ACPStrategy_1 = require("../src/strategies/ACPStrategy");
11
- const StrategyDetector_1 = require("../src/utils/StrategyDetector");
12
- const filename_1 = require("../src/utils/filename");
13
8
  const VSCommand_1 = require("../src/commands/VSCommand");
9
+ const multiUrlCrawler_1 = require("../src/utils/multiUrlCrawler");
14
10
  const chalk_1 = __importDefault(require("chalk"));
15
11
  const path_1 = __importDefault(require("path"));
16
12
  const fs_1 = __importDefault(require("fs"));
@@ -63,48 +59,12 @@ program
63
59
  .option('-f, --force', 'Force overwrite existing file')
64
60
  .action(async (url, options) => {
65
61
  try {
66
- // 1. Determine Strategy
67
- let type = options.type;
68
- if (!type) {
69
- const detected = StrategyDetector_1.StrategyDetector.detect(url);
70
- if (detected !== StrategyDetector_1.StrategyType.UNKNOWN) {
71
- type = detected;
72
- console.log(chalk_1.default.blue(`[Solo-Doc] Auto-detected strategy: ${type.toUpperCase()}`));
73
- }
74
- }
75
- if (!type || (type !== 'ocp' && type !== 'acp')) {
76
- console.error(chalk_1.default.red('Error: Could not detect documentation type.'));
77
- console.error(chalk_1.default.yellow('Please use --type <ocp|acp> to specify the documentation type manually.'));
78
- process.exit(1);
79
- }
80
- // 2. Instantiate Strategy
81
- let strategy;
82
- let defaultOutput;
83
- if (type === 'ocp' || type === StrategyDetector_1.StrategyType.OCP) {
84
- strategy = new OCPStrategy_1.OCPStrategy();
85
- defaultOutput = (0, filename_1.generateDefaultFilename)(url, 'ocp');
86
- }
87
- else {
88
- strategy = new ACPStrategy_1.ACPStrategy();
89
- defaultOutput = (0, filename_1.generateDefaultFilename)(url, 'acp');
90
- }
91
- // 3. Prepare Context
92
- const context = new CrawlerContext_1.CrawlerContext(strategy);
93
- const outputPath = path_1.default.resolve(process.cwd(), options.output || defaultOutput);
94
- // Check if file exists
95
- if (fs_1.default.existsSync(outputPath) && !options.force) {
96
- console.log(chalk_1.default.yellow('--------------------------------------------------'));
97
- console.log(chalk_1.default.yellow(`ℹ File already exists: ${outputPath}`));
98
- console.log(chalk_1.default.yellow(' Skipping crawl to save time.'));
99
- console.log(chalk_1.default.gray(' Use --force or -f to overwrite.'));
100
- console.log(chalk_1.default.yellow('--------------------------------------------------'));
101
- return;
102
- }
103
- // 4. Run
104
- await context.run(url, {
105
- output: outputPath,
62
+ await (0, multiUrlCrawler_1.crawlAndAggregate)(url, {
63
+ output: options.output,
64
+ type: options.type,
106
65
  limit: options.limit,
107
- headless: options.headless
66
+ headless: options.headless,
67
+ force: options.force
108
68
  });
109
69
  }
110
70
  catch (error) {
@@ -8,14 +8,10 @@ const fs_1 = __importDefault(require("fs"));
8
8
  const path_1 = __importDefault(require("path"));
9
9
  const chalk_1 = __importDefault(require("chalk"));
10
10
  const ora_1 = __importDefault(require("ora"));
11
- const CrawlerContext_1 = require("../CrawlerContext");
12
- const OCPStrategy_1 = require("../strategies/OCPStrategy");
13
- const ACPStrategy_1 = require("../strategies/ACPStrategy");
14
- const StrategyDetector_1 = require("../utils/StrategyDetector");
15
- const filename_1 = require("../utils/filename");
16
11
  const TocExtractor_1 = require("../utils/TocExtractor");
17
12
  const AIClient_1 = require("../ai/AIClient");
18
13
  const config_1 = require("../utils/config");
14
+ const multiUrlCrawler_1 = require("../utils/multiUrlCrawler");
19
15
  class VSCommand {
20
16
  static async run(baselineUrl, targetUrl, options) {
21
17
  console.log(chalk_1.default.blue(`[VS Mode] Starting comparison between:`));
@@ -124,33 +120,7 @@ ${result1}
124
120
  console.log(chalk_1.default.green(`[VS Mode] All tasks finished.`));
125
121
  }
126
122
  static async crawlUrl(url, prefix) {
127
- // Try to detect type
128
- let type = StrategyDetector_1.StrategyDetector.detect(url);
129
- let strategy;
130
- // Simple logic: if detects OCP, use OCP. Else ACP (more generic).
131
- if (type === StrategyDetector_1.StrategyType.OCP) {
132
- strategy = new OCPStrategy_1.OCPStrategy();
133
- }
134
- else {
135
- // Default to ACP which uses Puppeteer
136
- strategy = new ACPStrategy_1.ACPStrategy();
137
- }
138
- const filename = (0, filename_1.generateDefaultFilename)(url, prefix);
139
- const outputPath = path_1.default.resolve(process.cwd(), filename);
140
- // Check if file exists
141
- if (fs_1.default.existsSync(outputPath)) {
142
- console.log(chalk_1.default.yellow('--------------------------------------------------'));
143
- console.log(chalk_1.default.yellow(`ℹ File already exists: ${outputPath}`));
144
- console.log(chalk_1.default.yellow(' Using cached version for comparison.'));
145
- console.log(chalk_1.default.yellow('--------------------------------------------------'));
146
- return outputPath;
147
- }
148
- console.log(chalk_1.default.blue(`[VS Mode] Crawling ${url} -> ${filename}...`));
149
- const context = new CrawlerContext_1.CrawlerContext(strategy);
150
- // Suppress console log from crawler to keep output clean?
151
- // Or keep it to show progress. Keep it.
152
- await context.run(url, { output: outputPath, headless: true });
153
- return outputPath;
123
+ return await (0, multiUrlCrawler_1.crawlAndAggregate)(url, { prefix });
154
124
  }
155
125
  static parsePrompts(content) {
156
126
  // Match content inside ``` ... ``` blocks that follow "Prompt模板"
@@ -56,6 +56,53 @@ class OCPStrategy {
56
56
  console.log(chalk_1.default.cyan(`Fixed: ${fixedUrl}`));
57
57
  url = fixedUrl;
58
58
  }
59
+ // Feature: Landing Page Detection (Product Index)
60
+ // If URL doesn't look like a specific book (no /html/ or /html-single/), treat as landing page
61
+ if (!url.includes('/html/') && !url.includes('/html-single/')) {
62
+ console.log(chalk_1.default.blue(`[OCP] Detected Landing Page (Product Index). Scanning for books...`));
63
+ try {
64
+ const { data } = await axios_1.default.get(url);
65
+ const $ = cheerio.load(data);
66
+ const baseUrl = new URL(url);
67
+ const orderedLinks = [];
68
+ $('a').each((_, el) => {
69
+ const href = $(el).attr('href');
70
+ if (href && (href.includes('/html/') || href.includes('/html-single/'))) {
71
+ try {
72
+ const absoluteUrl = new URL(href, baseUrl).href;
73
+ // Basic filtering to avoid navigating away or to unrelated docs
74
+ // We assume valid books share the same base path prefix usually, but let's be permissive for now
75
+ if (!orderedLinks.includes(absoluteUrl)) {
76
+ orderedLinks.push(absoluteUrl);
77
+ }
78
+ }
79
+ catch (e) { }
80
+ }
81
+ });
82
+ if (orderedLinks.length > 0) {
83
+ console.log(chalk_1.default.green(`[OCP] Found ${orderedLinks.length} books/sections. Starting recursive crawl...`));
84
+ let aggregatedMarkdown = `# ${$('title').text().trim()} (Aggregated)\n\n`;
85
+ for (const [index, bookUrl] of orderedLinks.entries()) {
86
+ console.log(chalk_1.default.blue(`\n[OCP] Processing Book ${index + 1}/${orderedLinks.length}: ${bookUrl}`));
87
+ try {
88
+ // Recursive call
89
+ const bookMarkdown = await this.execute(bookUrl, options);
90
+ aggregatedMarkdown += `\n\n---\n\n${bookMarkdown}`;
91
+ }
92
+ catch (e) {
93
+ console.error(chalk_1.default.red(`[OCP] Failed to crawl book ${bookUrl}: ${e.message}`));
94
+ }
95
+ }
96
+ return aggregatedMarkdown;
97
+ }
98
+ else {
99
+ console.log(chalk_1.default.yellow(`[OCP] No books found on landing page. Treating as single page.`));
100
+ }
101
+ }
102
+ catch (e) {
103
+ console.warn(chalk_1.default.yellow(`[OCP] Failed to scan landing page: ${e.message}. Proceeding as standard page.`));
104
+ }
105
+ }
59
106
  // Optimisation: Try to convert multi-page URL (/html/) to single-page URL (/html-single/)
60
107
  // Example: .../html/building_applications/index -> .../html-single/building_applications/index
61
108
  if (url.includes('/html/') && !url.includes('/html-single/')) {
@@ -6,9 +6,24 @@ const generateDefaultFilename = (urlStr, typePrefix) => {
6
6
  const u = new URL(urlStr);
7
7
  // Get the last path segment that isn't 'index.html' or 'index' or empty
8
8
  const segments = u.pathname.split('/').filter(s => s && s !== 'index.html' && s !== 'index');
9
- const lastSegment = segments.length > 0 ? segments[segments.length - 1] : 'docs';
9
+ let safeName = 'docs';
10
+ if (segments.length > 0) {
11
+ const lastSegment = segments[segments.length - 1];
12
+ // Check if last segment is a version number (e.g., "3.2", "4.14", "v1.0")
13
+ // simple check: the whole segment must be digits, optionally dot-separated, with an optional leading v/V
14
+ const isVersion = /^[vV]?\d+(\.\d+)*$/.test(lastSegment);
15
+ if (isVersion && segments.length > 1) {
16
+ // If it's a version, prepend the previous segment for context
17
+ // e.g., .../service_mesh/3.2 -> service_mesh_3.2
18
+ const productSegment = segments[segments.length - 2];
19
+ safeName = `${productSegment}_${lastSegment}`;
20
+ }
21
+ else {
22
+ safeName = lastSegment;
23
+ }
24
+ }
10
25
  // Sanitize filename
11
- const safeName = lastSegment.replace(/[^a-zA-Z0-9-_]/g, '_');
26
+ safeName = safeName.replace(/[^a-zA-Z0-9-_]/g, '_');
12
27
  return `${typePrefix}-${safeName}.md`;
13
28
  }
14
29
  catch (e) {
@@ -0,0 +1,118 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.crawlAndAggregate = crawlAndAggregate;
7
+ const fs_1 = __importDefault(require("fs"));
8
+ const path_1 = __importDefault(require("path"));
9
+ const chalk_1 = __importDefault(require("chalk"));
10
+ const StrategyDetector_1 = require("./StrategyDetector");
11
+ const OCPStrategy_1 = require("../strategies/OCPStrategy");
12
+ const ACPStrategy_1 = require("../strategies/ACPStrategy");
13
+ const filename_1 = require("./filename");
14
+ async function crawlAndAggregate(urlInput, options) {
15
+ const inputs = urlInput.split(',').map(u => u.trim()).filter(u => u.length > 0);
16
+ if (inputs.length === 0)
17
+ throw new Error("No valid inputs provided");
18
+ // Determine output path
19
+ let outputPath;
20
+ if (options.output) {
21
+ outputPath = path_1.default.resolve(process.cwd(), options.output);
22
+ }
23
+ else {
24
+ const firstInput = inputs[0];
25
+ // Check if first input is a local file
26
+ const isFile = fs_1.default.existsSync(firstInput) && fs_1.default.statSync(firstInput).isFile();
27
+ let prefix = options.prefix;
28
+ if (!prefix) {
29
+ if (isFile) {
30
+ prefix = 'local';
31
+ }
32
+ else {
33
+ let type = options.type;
34
+ if (!type) {
35
+ const detected = StrategyDetector_1.StrategyDetector.detect(firstInput);
36
+ type = detected !== StrategyDetector_1.StrategyType.UNKNOWN ? detected : 'acp';
37
+ }
38
+ prefix = type;
39
+ }
40
+ }
41
+ if (isFile) {
42
+ const basename = path_1.default.basename(firstInput, path_1.default.extname(firstInput));
43
+ outputPath = path_1.default.resolve(process.cwd(), `${prefix}-${basename}.md`);
44
+ }
45
+ else {
46
+ const defaultName = (0, filename_1.generateDefaultFilename)(firstInput, prefix);
47
+ outputPath = path_1.default.resolve(process.cwd(), defaultName);
48
+ }
49
+ }
50
+ // Check cache
51
+ if (fs_1.default.existsSync(outputPath) && !options.force) {
52
+ console.log(chalk_1.default.yellow('--------------------------------------------------'));
53
+ console.log(chalk_1.default.yellow(`ℹ File already exists: ${outputPath}`));
54
+ console.log(chalk_1.default.yellow(' Skipping crawl/read to save time. (Use --force to overwrite)'));
55
+ console.log(chalk_1.default.yellow('--------------------------------------------------'));
56
+ return outputPath;
57
+ }
58
+ let aggregatedMarkdown = '';
59
+ console.log(chalk_1.default.blue(`[Multi-URL] Found ${inputs.length} inputs to process.`));
60
+ for (const [index, input] of inputs.entries()) {
61
+ console.log(chalk_1.default.blue(`\n[Multi-URL] Processing ${index + 1}/${inputs.length}: ${input}`));
62
+ // 1. Check if local file
63
+ if (fs_1.default.existsSync(input) && fs_1.default.statSync(input).isFile()) {
64
+ console.log(chalk_1.default.green(`[Solo-Doc] Detected local file. Reading content...`));
65
+ const content = fs_1.default.readFileSync(input, 'utf-8');
66
+ if (index > 0) {
67
+ aggregatedMarkdown += '\n\n---\n\n';
68
+ }
69
+ aggregatedMarkdown += content;
70
+ continue;
71
+ }
72
+ // 2. Assume URL and Crawl
73
+ let strategy;
74
+ let strategyType = options.type;
75
+ if (!strategyType) {
76
+ const detected = StrategyDetector_1.StrategyDetector.detect(input);
77
+ if (detected !== StrategyDetector_1.StrategyType.UNKNOWN) {
78
+ strategyType = detected;
79
+ console.log(chalk_1.default.blue(`[Solo-Doc] Auto-detected strategy: ${strategyType.toUpperCase()}`));
80
+ }
81
+ else {
82
+ strategyType = 'acp'; // Default
83
+ console.log(chalk_1.default.yellow(`[Solo-Doc] Could not detect strategy, defaulting to ACP`));
84
+ }
85
+ }
86
+ else {
87
+ console.log(chalk_1.default.blue(`[Solo-Doc] Using forced strategy: ${strategyType.toUpperCase()}`));
88
+ }
89
+ if (strategyType === 'ocp' || strategyType === StrategyDetector_1.StrategyType.OCP) {
90
+ strategy = new OCPStrategy_1.OCPStrategy();
91
+ }
92
+ else {
93
+ strategy = new ACPStrategy_1.ACPStrategy();
94
+ }
95
+ try {
96
+ const markdown = await strategy.execute(input, {
97
+ output: outputPath,
98
+ limit: options.limit,
99
+ headless: options.headless
100
+ });
101
+ // Add separator if not first
102
+ if (index > 0) {
103
+ aggregatedMarkdown += '\n\n---\n\n';
104
+ }
105
+ aggregatedMarkdown += markdown;
106
+ }
107
+ catch (e) {
108
+ console.error(chalk_1.default.red(`[Solo-Doc] Error during crawl of ${input}: ${e.message}`));
109
+ // Continue with other URLs? Or fail hard?
110
+ // Requirement doesn't specify, but failing hard is safer to avoid incomplete docs.
111
+ throw e;
112
+ }
113
+ }
114
+ // Write to file
115
+ fs_1.default.writeFileSync(outputPath, aggregatedMarkdown);
116
+ console.log(chalk_1.default.green(`\n✔ Aggregated content written to: ${outputPath}`));
117
+ return outputPath;
118
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "solo-doc",
3
- "version": "0.3.2",
3
+ "version": "0.3.4",
4
4
  "main": "dist/bin/solo-doc.js",
5
5
  "bin": {
6
6
  "solo-doc": "dist/bin/solo-doc.js"