solo-doc 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,44 @@
1
+ #!/usr/bin/env node
2
+ "use strict";
3
+ var __importDefault = (this && this.__importDefault) || function (mod) {
4
+ return (mod && mod.__esModule) ? mod : { "default": mod };
5
+ };
6
+ Object.defineProperty(exports, "__esModule", { value: true });
7
+ const commander_1 = require("commander");
8
+ const CrawlerContext_1 = require("../src/CrawlerContext");
9
+ const OCPStrategy_1 = require("../src/strategies/OCPStrategy");
10
+ const ACPStrategy_1 = require("../src/strategies/ACPStrategy");
11
+ const path_1 = __importDefault(require("path"));
12
/**
 * Parse the `--limit` option value.
 *
 * Commander invokes option parsers as `(value, previous)`; handing it the raw
 * `parseInt` would treat `previous` as the radix, and a non-numeric value would
 * silently become `NaN`. This parser always uses base 10 and rejects anything
 * that is not a positive integer.
 *
 * @param {string} value - Raw command-line value.
 * @returns {number} The parsed page limit.
 * @throws {Error} When the value is not a positive integer.
 */
const parseLimit = (value) => {
    const limit = Number.parseInt(value, 10);
    if (!Number.isInteger(limit) || limit < 1) {
        throw new Error(`Invalid --limit value "${value}": expected a positive integer.`);
    }
    return limit;
};

/**
 * Run one crawl with the given strategy and write the result to the output
 * path, resolved against the current working directory.
 *
 * @param {object} strategy - Strategy instance handed to CrawlerContext.
 * @param {string} url - Entry URL given on the command line.
 * @param {object} options - Parsed commander options (`output`, `limit`, ...).
 * @param {object} [extra] - Additional strategy-specific run options.
 * @returns {Promise<void>} Settles when the crawl has finished.
 */
const runCrawl = (strategy, url, options, extra = {}) => {
    const context = new CrawlerContext_1.CrawlerContext(strategy);
    const outputPath = path_1.default.resolve(process.cwd(), options.output);
    return context.run(url, { output: outputPath, limit: options.limit, ...extra });
};

const program = new commander_1.Command();
program
    .name('solo-doc')
    .description('CLI to crawl documentation sites and convert to single Markdown file')
    // NOTE(review): this reports 1.0.0 while package.json declares 0.0.2 — confirm which is intended.
    .version('1.0.0');
program
    .command('ocp <url>')
    .description('Crawl Red Hat OpenShift documentation')
    .option('-o, --output <path>', 'Output file path', 'ocp-docs.md')
    .option('--limit <number>', 'Limit number of pages (for debug)', parseLimit)
    .action((url, options) => runCrawl(new OCPStrategy_1.OCPStrategy(), url, options));
program
    .command('acp <url>')
    .description('Crawl Alauda Container Platform documentation')
    .option('-o, --output <path>', 'Output file path', 'acp-docs.md')
    .option('--limit <number>', 'Limit number of pages (for debug)', parseLimit)
    .option('--no-headless', 'Run in headful mode (show browser)')
    .action((url, options) => runCrawl(new ACPStrategy_1.ACPStrategy(), url, options, { headless: options.headless }));
// The actions are async, so parseAsync is required for their rejections to be
// observed here instead of surfacing as unhandled promise rejections.
program.parseAsync(process.argv).catch((error) => {
    const reason = error instanceof Error ? error.message : String(error);
    console.error(`solo-doc: ${reason}`);
    process.exitCode = 1;
});
@@ -0,0 +1,28 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.CrawlerContext = void 0;
7
+ const chalk_1 = __importDefault(require("chalk"));
8
/**
 * Runs a crawl strategy and persists the Markdown it produces.
 *
 * The strategy object must expose a `name` string (used for logging) and an
 * async `execute(url, options)` method that resolves to the Markdown text.
 */
class CrawlerContext {
    /**
     * @param {object} strategy - Strategy providing `name` and `execute(url, options)`.
     */
    constructor(strategy) {
        this.strategy = strategy;
    }
    /**
     * Execute the strategy for `url` and write the result to `options.output`.
     *
     * @param {string} url - Entry URL passed through to the strategy.
     * @param {object} options - Run options; `options.output` is the target file
     *   path, the whole object is forwarded to the strategy.
     * @returns {Promise<void>} Resolves once the file has been written.
     * @throws Re-throws whatever the strategy or the file write throws, after
     *   logging it.
     */
    async run(url, options) {
        console.log(chalk_1.default.blue(`[Solo-Doc] Starting crawl using strategy: ${this.strategy.name}`));
        console.log(chalk_1.default.gray(`Target: ${url}`));
        try {
            const markdown = await this.strategy.execute(url, options);
            const fs = require('fs');
            const path = require('path');
            // Create missing parent directories first so a finished crawl is
            // not thrown away by an ENOENT on the final write.
            fs.mkdirSync(path.dirname(path.resolve(options.output)), { recursive: true });
            fs.writeFileSync(options.output, markdown, 'utf-8');
            console.log(chalk_1.default.green(`[Solo-Doc] Completed! Output saved to: ${options.output}`));
        }
        catch (error) {
            // Rejections are not guaranteed to be Error instances.
            const reason = error instanceof Error ? error.message : String(error);
            console.error(chalk_1.default.red(`[Solo-Doc] Error during crawl: ${reason}`));
            throw error;
        }
    }
}
28
+ exports.CrawlerContext = CrawlerContext;
@@ -0,0 +1,185 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.ACPStrategy = void 0;
7
+ const markdown_1 = require("../utils/markdown");
8
+ const puppeteer_core_1 = __importDefault(require("puppeteer-core"));
9
+ const ora_1 = __importDefault(require("ora"));
10
+ const chalk_1 = __importDefault(require("chalk"));
11
+ const fs_1 = __importDefault(require("fs"));
12
+ // Hardcoded for now, ideally configurable or auto-detected
13
+ const CHROME_PATH = '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome';
14
/**
 * Derive the URL prefix that bounds the crawl from the entry URL.
 *
 * Only the path is inspected: if its last segment contains a dot it is treated
 * as a file name and dropped, otherwise a trailing slash is ensured. Working on
 * the parsed path (rather than the raw string) keeps a host-only URL such as
 * `https://docs.example.com` from being mistaken for a file. Query string and
 * fragment are discarded.
 *
 * @param {string} url - Entry URL.
 * @returns {string} Origin plus directory path, always ending in `/`.
 * @throws {TypeError} When `url` is not a valid absolute URL.
 */
function resolveScopeUrl(url) {
    const parsed = new URL(url);
    let pathname = parsed.pathname;
    const lastSegmentStart = pathname.lastIndexOf('/') + 1;
    if (pathname.slice(lastSegmentStart).includes('.')) {
        pathname = pathname.slice(0, lastSegmentStart);
    }
    else if (!pathname.endsWith('/')) {
        pathname += '/';
    }
    return `${parsed.origin}${pathname}`;
}

/**
 * Remove duplicate links while keeping first-seen order.
 * URLs that differ only by one trailing slash count as the same page.
 *
 * @param {Array<{title: string, url: string}>} links - Links in document order.
 * @returns {Array<{title: string, url: string}>} De-duplicated links.
 */
function dedupeLinks(links) {
    const seen = new Set();
    const unique = [];
    for (const link of links) {
        const key = link.url.replace(/\/$/, '');
        if (!seen.has(key)) {
            seen.add(key);
            unique.push(link);
        }
    }
    return unique;
}

/**
 * Collect sidebar links in document order.
 *
 * Runs inside the page via `page.evaluate`, so it must be self-contained and
 * may only use browser globals plus its own argument.
 *
 * @param {string} scopeUrl - Only links whose absolute URL starts with this are kept.
 * @returns {Array<{title: string, url: string}>} Matching links.
 */
function collectSidebarLinks(scopeUrl) {
    // Try progressively more generic containers for the navigation sidebar.
    const sidebar = document.querySelector('.rspress-nav-menu') ||
        document.querySelector('nav.sidebar') ||
        document.querySelector('aside') ||
        document.querySelector('nav');
    if (!sidebar) {
        return [];
    }
    return Array.from(sidebar.querySelectorAll('a'))
        // Skip empty hrefs, in-page anchors, and anything outside the scope.
        .filter((a) => Boolean(a.href) && !a.href.includes('#') && a.href.startsWith(scopeUrl))
        .map((a) => ({
            title: a.innerText.trim() || 'Untitled',
            url: a.href
        }));
}

/**
 * Extract the main content HTML of the current page with navigation noise
 * removed and link/image URLs made absolute.
 *
 * Runs inside the page via `page.evaluate`, so it must be self-contained.
 *
 * @returns {string} Inner HTML of the cleaned content container.
 */
function extractContentHtml() {
    const main = document.querySelector('.rspress-doc-container') ||
        document.querySelector('main') ||
        document.querySelector('article') ||
        document.body;
    if (!main) {
        return '';
    }
    // Work on a clone so the live page is not modified.
    const clone = main.cloneNode(true);
    const noiseSelectors = [
        'nav',
        '.sidebar',
        'header',
        'footer',
        '.rspress-doc-footer',
        '.rspress-doc-header',
        '.table-of-contents',
        '.toc'
    ];
    for (const selector of noiseSelectors) {
        clone.querySelectorAll(selector).forEach((node) => node.remove());
    }
    // The `href`/`src` DOM properties are already absolute; write them back
    // into the attributes so the serialized HTML carries absolute URLs.
    clone.querySelectorAll('a').forEach((a) => {
        if (a.href) {
            a.setAttribute('href', a.href);
        }
    });
    clone.querySelectorAll('img').forEach((img) => {
        if (img.src) {
            img.setAttribute('src', img.src);
        }
    });
    return clone.innerHTML;
}

/**
 * Crawl strategy for Alauda Container Platform documentation: reads the
 * sidebar of the entry page in a headless Chrome, visits each in-scope page in
 * sidebar order, and concatenates the converted Markdown.
 */
class ACPStrategy {
    constructor() {
        this.name = 'ACP (Alauda Container Platform)';
    }
    /**
     * Crawl the documentation reachable from the sidebar of `url`.
     *
     * @param {string} url - Entry page; also determines the crawl scope.
     * @param {object} [options] - Optional settings.
     * @param {number} [options.limit] - Maximum number of pages; values that are
     *   not positive integers mean "all pages", larger values are clamped.
     * @param {boolean} [options.headless=true] - Whether to run Chrome headless.
     * @param {string} [options.executablePath] - Chrome binary to use; falls back
     *   to the `CHROME_PATH` environment variable, then the built-in default.
     * @returns {Promise<string>} Combined Markdown for all fetched pages.
     * @throws {Error} When Chrome is not found, or when launching, the initial
     *   navigation, or link extraction fails. Failures of individual pages are
     *   reported on the spinner and skipped.
     */
    async execute(url, options) {
        const spinner = (0, ora_1.default)('Launching Browser for ACP...').start();
        // `||` on purpose: an empty string is never a usable path.
        const executablePath = options?.executablePath || process.env.CHROME_PATH || CHROME_PATH;
        if (!fs_1.default.existsSync(executablePath)) {
            spinner.fail(`Chrome not found at ${executablePath}`);
            throw new Error('Chrome executable not found. Please install Google Chrome or configure the path.');
        }
        let browser;
        try {
            browser = await puppeteer_core_1.default.launch({
                executablePath,
                headless: options?.headless ?? true,
                args: ['--no-sandbox', '--disable-setuid-sandbox']
            });
            const page = await browser.newPage();
            // A desktop-sized viewport keeps responsive layouts from hiding the sidebar.
            await page.setViewport({ width: 1280, height: 800 });
            const scopeUrl = resolveScopeUrl(url);
            spinner.info(`Scope limited to: ${scopeUrl}`);
            // 1. Load the entry page to read the menu.
            spinner.text = `Navigating to ${url} to extract menu...`;
            await page.goto(url, { waitUntil: 'networkidle0', timeout: 60000 });
            try {
                await page.waitForSelector('nav', { timeout: 15000 });
            }
            catch (e) {
                // Best effort: link extraction also tries other containers.
                console.log(chalk_1.default.yellow('  Warning: Timeout waiting for "nav", trying to find links anyway...'));
            }
            // 2. Extract the menu structure (document order is preserved).
            spinner.text = 'Extracting menu structure from sidebar...';
            const links = await page.evaluate(collectSidebarLinks, scopeUrl);
            const uniqueLinks = dedupeLinks(links);
            if (uniqueLinks.length === 0) {
                spinner.warn(chalk_1.default.yellow(`Warning: No links found in sidebar under ${scopeUrl}. Output will contain only the title header.`));
            }
            else {
                spinner.succeed(`Found ${uniqueLinks.length} pages in sidebar navigation.`);
            }
            let combinedMarkdown = `# Alauda Container Platform Docs\n\n`;
            // 3. Crawl each page in order. The limit is clamped so it can never
            // index past the end of the link list.
            const requested = options?.limit;
            const limit = Number.isInteger(requested) && requested > 0
                ? Math.min(requested, uniqueLinks.length)
                : uniqueLinks.length;
            let failures = 0;
            for (const [index, link] of uniqueLinks.slice(0, limit).entries()) {
                const progress = `[${index + 1}/${limit}]`;
                spinner.start(`${progress} Fetching: ${link.title} (${link.url})`);
                try {
                    await page.goto(link.url, { waitUntil: 'networkidle2', timeout: 30000 });
                    // Best effort: extraction falls back to document.body if no
                    // content container shows up in time.
                    await page.waitForSelector('main, article, .rspress-doc-container', { timeout: 5000 }).catch(() => { });
                    const contentHtml = await page.evaluate(extractContentHtml);
                    const markdown = markdown_1.converter.convert(contentHtml);
                    combinedMarkdown += `\n\n---\n\n# ${link.title}\n\n${markdown}`;
                }
                catch (err) {
                    failures += 1;
                    spinner.fail(`${progress} Failed: ${link.title} - ${err.message}`);
                    // Continue with the next page.
                }
            }
            if (failures === 0) {
                spinner.succeed(`All ${limit} pages processed.`);
            }
            else {
                spinner.warn(`Processed ${limit - failures}/${limit} pages (${failures} failed).`);
            }
            return combinedMarkdown;
        }
        catch (error) {
            spinner.fail('ACP Crawl failed');
            throw error;
        }
        finally {
            if (browser) {
                // Cleanup is best effort; a close failure must not mask the
                // crawl result or the original error.
                await browser.close().catch(() => { });
            }
        }
    }
}
185
+ exports.ACPStrategy = ACPStrategy;
@@ -0,0 +1,99 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ var __importDefault = (this && this.__importDefault) || function (mod) {
36
+ return (mod && mod.__esModule) ? mod : { "default": mod };
37
+ };
38
+ Object.defineProperty(exports, "__esModule", { value: true });
39
+ exports.OCPStrategy = void 0;
40
+ const axios_1 = __importDefault(require("axios"));
41
+ const cheerio = __importStar(require("cheerio"));
42
+ const markdown_1 = require("../utils/markdown");
43
+ const ora_1 = __importDefault(require("ora"));
44
+ const chalk_1 = __importDefault(require("chalk"));
45
/**
 * Crawl strategy for Red Hat OpenShift documentation: downloads one HTML page,
 * isolates its main content, and converts that to Markdown.
 */
class OCPStrategy {
    constructor() {
        this.name = 'OCP (Red Hat OpenShift)';
    }
    /**
     * Fetch `url` and convert its main content to Markdown.
     *
     * @param {string} url - Page to download; also the base for resolving
     *   relative link and image URLs.
     * @param {object} [options] - Accepted for interface parity with other
     *   strategies; not read by this implementation.
     * @returns {Promise<string>} Markdown starting with a `# <title>` header.
     * @throws Re-throws any download or parsing error after marking the spinner failed.
     */
    async execute(url, options) {
        const spinner = (0, ora_1.default)('Fetching OCP content...').start();
        try {
            // 1. Fetch the single page HTML
            const { data } = await axios_1.default.get(url);
            const $ = cheerio.load(data);
            spinner.succeed('Content fetched successfully');
            spinner.start('Parsing and cleaning content...');
            // 2. Identify the main content area
            let mainContent = $('#main-content, .pf-c-page__main, article, main').first();
            if (mainContent.length === 0) {
                // Fall back to the whole body; this may pull in page chrome.
                mainContent = $('body');
                console.warn(chalk_1.default.yellow('Warning: specific main content container not found, using body.'));
            }
            // 3. Drop embedded TOCs, print controls, and non-content elements
            for (const selector of ['.toc', '.hidden-print', 'script', 'style']) {
                mainContent.find(selector).remove();
            }
            // Resolve relative link and image URLs against the page URL so they
            // keep working once the content lives in a standalone file.
            const baseUrl = new URL(url);
            const absolutize = (selector, attribute) => {
                mainContent.find(selector).each((_, element) => {
                    const node = $(element);
                    const value = node.attr(attribute);
                    if (!value) {
                        return;
                    }
                    try {
                        node.attr(attribute, new URL(value, baseUrl).href);
                    }
                    catch (_err) {
                        // Unparseable URL: leave the attribute untouched.
                    }
                });
            };
            absolutize('a', 'href');
            absolutize('img', 'src');
            // 4. Convert to Markdown
            const html = mainContent.html() || '';
            const markdown = markdown_1.converter.convert(html);
            spinner.succeed('Conversion complete');
            // Use only the first document title: `.text()` on every <title>
            // match would also concatenate titles of inline SVGs. Fall back to
            // the URL so the header is never empty.
            const title = $('head > title').first().text().trim() ||
                $('title').first().text().trim() ||
                url;
            return `# ${title}\n\n${markdown}`;
        }
        catch (error) {
            spinner.fail('Failed to process OCP document');
            throw error;
        }
    }
}
+ }
99
+ exports.OCPStrategy = OCPStrategy;
@@ -0,0 +1,2 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
@@ -0,0 +1,22 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.converter = exports.MarkdownConverter = void 0;
7
+ const turndown_1 = __importDefault(require("turndown"));
8
/**
 * Thin wrapper around a Turndown service configured for ATX headings and
 * fenced code blocks, with non-content elements stripped.
 */
class MarkdownConverter {
    constructor() {
        this.service = new turndown_1.default({
            headingStyle: 'atx',
            codeBlockStyle: 'fenced'
        });
        // Drop elements that never carry document content.
        this.service.remove(['script', 'style', 'nav', 'footer', 'header', 'iframe', 'noscript']);
    }
    /**
     * Convert an HTML fragment to Markdown.
     *
     * @param {string} html - HTML to convert; `null`/`undefined` are treated as
     *   empty input instead of being passed on to the Turndown service.
     * @returns {string} The Markdown text (empty string for missing input).
     */
    convert(html) {
        if (html == null) {
            return '';
        }
        return this.service.turndown(html);
    }
}
21
+ exports.MarkdownConverter = MarkdownConverter;
22
+ exports.converter = new MarkdownConverter();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "solo-doc",
3
- "version": "0.0.1",
3
+ "version": "0.0.2",
4
4
  "main": "dist/bin/solo-doc.js",
5
5
  "bin": {
6
6
  "solo-doc": "dist/bin/solo-doc.js"
@@ -32,6 +32,7 @@
32
32
  "devDependencies": {
33
33
  "@types/node": "^22.10.5",
34
34
  "@types/turndown": "^5.0.5",
35
+ "solo-doc": "^0.0.1",
35
36
  "ts-node": "^10.9.2",
36
37
  "typescript": "^5.7.3"
37
38
  },