npm - solo-doc - Versions diffs - 0.0.1 → 0.0.2 - Mend

solo-doc 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/bin/solo-doc.js +44 -0
package/dist/src/CrawlerContext.js +28 -0
package/dist/src/strategies/ACPStrategy.js +185 -0
package/dist/src/strategies/OCPStrategy.js +99 -0
package/dist/src/types/index.js +2 -0
package/dist/src/utils/markdown.js +22 -0
package/package.json +2 -1

package/dist/bin/solo-doc.js ADDED Viewed

@@ -0,0 +1,44 @@
+#!/usr/bin/env node
+"use strict";
+// Compiler-emitted interop helper. Reuses `this.__importDefault` when one is
+// already defined; otherwise returns `mod` unchanged if it is flagged with
+// `__esModule`, and wraps it as `{ default: mod }` if it is not, so the
+// required module can always be read through `.default`.
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+const commander_1 = require("commander");
+const CrawlerContext_1 = require("../src/CrawlerContext");
+const OCPStrategy_1 = require("../src/strategies/OCPStrategy");
+const ACPStrategy_1 = require("../src/strategies/ACPStrategy");
+const path_1 = __importDefault(require("path"));
+const program = new commander_1.Command();
+program
+    .name('solo-doc')
+    .description('CLI to crawl documentation sites and convert to single Markdown file')
+    .version('1.0.0');
+program
+    .command('ocp <url>')
+    .description('Crawl Red Hat OpenShift documentation')
+    .option('-o, --output <path>', 'Output file path', 'ocp-docs.md')
+    .option('--limit <number>', 'Limit number of pages (for debug)', parseInt)
+    .action(async (url, options) => {
+    const strategy = new OCPStrategy_1.OCPStrategy();
+    const context = new CrawlerContext_1.CrawlerContext(strategy);
+    const outputPath = path_1.default.resolve(process.cwd(), options.output);
+    await context.run(url, { output: outputPath, limit: options.limit });
+});
+program
+    .command('acp <url>')
+    .description('Crawl Alauda Container Platform documentation')
+    .option('-o, --output <path>', 'Output file path', 'acp-docs.md')
+    .option('--limit <number>', 'Limit number of pages (for debug)', parseInt)
+    .option('--no-headless', 'Run in headful mode (show browser)')
+    .action(async (url, options) => {
+    const strategy = new ACPStrategy_1.ACPStrategy();
+    const context = new CrawlerContext_1.CrawlerContext(strategy);
+    const outputPath = path_1.default.resolve(process.cwd(), options.output);
+    await context.run(url, {
+        output: outputPath,
+        limit: options.limit,
+        headless: options.headless
+    });
+});
+program.parse(process.argv);

package/dist/src/CrawlerContext.js ADDED Viewed

@@ -0,0 +1,28 @@
+"use strict";
+// Compiler-emitted interop helper: a module flagged with `__esModule` is
+// returned as-is, anything else is wrapped as `{ default: mod }`. An existing
+// `this.__importDefault` takes precedence over this definition.
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.CrawlerContext = void 0;
+const chalk_1 = __importDefault(require("chalk"));
+class CrawlerContext {
+    constructor(strategy) {
+        this.strategy = strategy;
+    }
+    async run(url, options) {
+        console.log(chalk_1.default.blue(`[Solo-Doc] Starting crawl using strategy: ${this.strategy.name}`));
+        console.log(chalk_1.default.gray(`Target: ${url}`));
+        try {
+            const markdown = await this.strategy.execute(url, options);
+            // Write to file
+            const fs = require('fs');
+            fs.writeFileSync(options.output, markdown, 'utf-8');
+            console.log(chalk_1.default.green(`[Solo-Doc] Completed! Output saved to: ${options.output}`));
+        }
+        catch (error) {
+            console.error(chalk_1.default.red(`[Solo-Doc] Error during crawl: ${error.message}`));
+            throw error;
+        }
+    }
+}
+exports.CrawlerContext = CrawlerContext;

package/dist/src/strategies/ACPStrategy.js ADDED Viewed

@@ -0,0 +1,185 @@
+"use strict";
+// Compiler-emitted interop helper. Falls back to this definition only when
+// `this.__importDefault` is not already set. Modules carrying `__esModule` pass
+// through untouched; others are wrapped so they are reachable via `.default`.
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.ACPStrategy = void 0;
+const markdown_1 = require("../utils/markdown");
+const puppeteer_core_1 = __importDefault(require("puppeteer-core"));
+const ora_1 = __importDefault(require("ora"));
+const chalk_1 = __importDefault(require("chalk"));
+const fs_1 = __importDefault(require("fs"));
+// Hardcoded for now, ideally configurable or auto-detected
+const CHROME_PATH = '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome';
+class ACPStrategy {
+    constructor() {
+        this.name = 'ACP (Alauda Container Platform)';
+    }
+    async execute(url, options) {
+        const spinner = (0, ora_1.default)('Launching Browser for ACP...').start();
+        if (!fs_1.default.existsSync(CHROME_PATH)) {
+            spinner.fail(`Chrome not found at ${CHROME_PATH}`);
+            throw new Error('Chrome executable not found. Please install Google Chrome or configure the path.');
+        }
+        let browser;
+        try {
+            browser = await puppeteer_core_1.default.launch({
+                executablePath: CHROME_PATH,
+                headless: options?.headless ?? true,
+                args: ['--no-sandbox', '--disable-setuid-sandbox']
+            });
+            const page = await browser.newPage();
+            // Set viewport to ensure sidebar is visible (responsive design might hide it on small screens)
+            await page.setViewport({ width: 1280, height: 800 });
+            // Determine the base path for filtering
+            // If url ends in .html or similar, strip it to get the folder.
+            let baseUrl = url;
+            // Simple check: if last segment has a dot, assume file.
+            // If no trailing slash and no dot, assume directory? Browsers might add slash.
+            // Let's rely on string manipulation.
+            if (baseUrl.split('/').pop()?.includes('.')) {
+                baseUrl = baseUrl.substring(0, baseUrl.lastIndexOf('/') + 1);
+            }
+            else if (!baseUrl.endsWith('/')) {
+                baseUrl += '/';
+            }
+            spinner.info(`Scope limited to: ${baseUrl}`);
+            // 1. Navigate to home to get the menu
+            spinner.text = `Navigating to ${url} to extract menu...`;
+            // Increase timeout for initial load
+            await page.goto(url, { waitUntil: 'networkidle0', timeout: 60000 });
+            // Wait for menu to appear.
+            // Rspress usually has a sidebar in a <nav> or aside.
+            // We'll look for any 'nav' or specific classes if known.
+            try {
+                await page.waitForSelector('nav', { timeout: 15000 });
+            }
+            catch (e) {
+                console.log(chalk_1.default.yellow('  Warning: Timeout waiting for "nav", trying to find links anyway...'));
+            }
+            // 2. Extract Menu Structure from Sidebar
+            // This preserves the hierarchy order (Document Order)
+            spinner.text = 'Extracting menu structure from sidebar...';
+            const links = await page.evaluate((scopeUrl) => {
+                // Try to find the main sidebar
+                // Common selectors for sidebars in doc frameworks
+                const sidebar = document.querySelector('.rspress-nav-menu') ||
+                    document.querySelector('nav.sidebar') ||
+                    document.querySelector('aside') ||
+                    document.querySelector('nav');
+                if (!sidebar)
+                    return [];
+                const anchors = Array.from(sidebar.querySelectorAll('a'));
+                return anchors
+                    .filter(a => {
+                    const href = a.href;
+                    // const text = a.innerText.trim(); // Text might be empty if it's an icon, but usually not in sidebar
+                    if (!href)
+                        return false;
+                    if (href.includes('#'))
+                        return false; // Skip anchors
+                    // Strict filter based on the scopeUrl
+                    if (!href.startsWith(scopeUrl))
+                        return false;
+                    return true;
+                })
+                    .map(a => ({
+                    title: a.innerText.trim() || 'Untitled',
+                    url: a.href
+                }));
+            }, baseUrl);
+            // Dedup links but preserve order
+            const uniqueLinks = [];
+            const seenUrls = new Set();
+            // Normalize URL function (strip trailing slash for consistency)
+            const normalize = (u) => u.replace(/\/$/, '');
+            for (const link of links) {
+                const normUrl = normalize(link.url);
+                if (!seenUrls.has(normUrl)) {
+                    seenUrls.add(normUrl);
+                    uniqueLinks.push(link);
+                }
+            }
+            spinner.succeed(`Found ${uniqueLinks.length} pages in sidebar navigation.`);
+            if (uniqueLinks.length === 0) {
+                spinner.warn(chalk_1.default.yellow('Warning: No links found in sidebar. Falling back to body links...'));
+                // Fallback logic could go here
+            }
+            let combinedMarkdown = `# Alauda Container Platform Docs\n\n`;
+            // 3. Crawl each page in order
+            const limit = options?.limit ? options.limit : uniqueLinks.length;
+            for (let i = 0; i < limit; i++) {
+                const link = uniqueLinks[i];
+                const progress = `[${i + 1}/${limit}]`;
+                spinner.start(`${progress} Fetching: ${link.title} (${link.url})`);
+                try {
+                    // Navigate to the page
+                    // Use 'domcontentloaded' for speed if possible, but 'networkidle0' is safer for dynamic content
+                    await page.goto(link.url, { waitUntil: 'networkidle2', timeout: 30000 });
+                    // Wait for main content
+                    await page.waitForSelector('main, article, .rspress-doc-container', { timeout: 5000 }).catch(() => { });
+                    // Extract content
+                    const contentHtml = await page.evaluate(() => {
+                        // Select the main content container
+                        // Rspress: .rspress-doc-container or main
+                        const main = document.querySelector('.rspress-doc-container') ||
+                            document.querySelector('main') ||
+                            document.querySelector('article') ||
+                            document.body;
+                        if (!main)
+                            return '';
+                        // Clone to modify
+                        const clones = main.cloneNode(true);
+                        // Remove noise: sidebar, header, footer, pagination, toc
+                        const noiseSelectors = [
+                            'nav',
+                            '.sidebar',
+                            'header',
+                            'footer',
+                            '.rspress-doc-footer',
+                            '.rspress-doc-header',
+                            '.table-of-contents',
+                            '.toc'
+                        ];
+                        noiseSelectors.forEach(sel => {
+                            clones.querySelectorAll(sel).forEach(n => n.remove());
+                        });
+                        // Resolve relative URLs to absolute URLs
+                        const anchors = clones.querySelectorAll('a');
+                        anchors.forEach(a => {
+                            if (a.href) {
+                                // a.href property returns the absolute URL in modern browsers
+                                a.setAttribute('href', a.href);
+                            }
+                        });
+                        // Also resolve images src
+                        const images = clones.querySelectorAll('img');
+                        images.forEach(img => {
+                            if (img.src) {
+                                img.setAttribute('src', img.src);
+                            }
+                        });
+                        return clones.innerHTML;
+                    });
+                    const markdown = markdown_1.converter.convert(contentHtml);
+                    // Add Title and Content
+                    combinedMarkdown += `\n\n---\n\n# ${link.title}\n\n${markdown}`;
+                }
+                catch (err) {
+                    spinner.fail(`${progress} Failed: ${link.title} - ${err.message}`);
+                    // Continue to next
+                }
+            }
+            spinner.succeed(`All ${limit} pages processed.`);
+            await browser.close();
+            return combinedMarkdown;
+        }
+        catch (error) {
+            if (browser)
+                await browser.close();
+            spinner.fail('ACP Crawl failed');
+            throw error;
+        }
+    }
+}
+exports.ACPStrategy = ACPStrategy;

package/dist/src/strategies/OCPStrategy.js ADDED Viewed

@@ -0,0 +1,99 @@
+"use strict";
+// Compiler-emitted module interop helpers. Each one reuses an implementation
+// already present on `this` and otherwise falls back to the definition below.
+//
+// __createBinding(o, m, k, k2): exposes m[k] on o under the name k2 (k when k2
+// is omitted). When Object.create exists, the property is defined from m's own
+// descriptor; a missing descriptor, or one that is an accessor on a
+// non-__esModule object, or a writable/configurable data property, is replaced
+// by an enumerable getter that reads m[k] at access time. Without
+// Object.create the value is copied with a plain assignment.
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+      desc = { enumerable: true, get: function() { return m[k]; } };
+    }
+    Object.defineProperty(o, k2, desc);
+}) : (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k];
+}));
+// __setModuleDefault(o, v): stores v as the enumerable "default" property of o.
+var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
+    Object.defineProperty(o, "default", { enumerable: true, value: v });
+}) : function(o, v) {
+    o["default"] = v;
+});
+// __importStar(mod): returns mod itself when it is flagged with __esModule.
+// Otherwise builds a new object that re-exposes every own property of mod
+// except "default" via __createBinding, and sets its "default" to mod.
+var __importStar = (this && this.__importStar) || (function () {
+    // Replaces itself on first call with Object.getOwnPropertyNames, or with a
+    // for-in/hasOwnProperty fallback when that function is unavailable.
+    var ownKeys = function(o) {
+        ownKeys = Object.getOwnPropertyNames || function (o) {
+            var ar = [];
+            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+            return ar;
+        };
+        return ownKeys(o);
+    };
+    return function (mod) {
+        if (mod && mod.__esModule) return mod;
+        var result = {};
+        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+        __setModuleDefault(result, mod);
+        return result;
+    };
+})();
+// __importDefault(mod): returns mod when it is flagged with __esModule,
+// otherwise wraps it as { default: mod }.
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.OCPStrategy = void 0;
+const axios_1 = __importDefault(require("axios"));
+const cheerio = __importStar(require("cheerio"));
+const markdown_1 = require("../utils/markdown");
+const ora_1 = __importDefault(require("ora"));
+const chalk_1 = __importDefault(require("chalk"));
+class OCPStrategy {
+    constructor() {
+        this.name = 'OCP (Red Hat OpenShift)';
+    }
+    async execute(url, options) {
+        const spinner = (0, ora_1.default)('Fetching OCP content...').start();
+        try {
+            // 1. Fetch the single page HTML
+            const { data } = await axios_1.default.get(url);
+            const $ = cheerio.load(data);
+            spinner.succeed('Content fetched successfully');
+            spinner.start('Parsing and cleaning content...');
+            // 2. Identify the main content area
+            // OCP html-single usually has a main container
+            let mainContent = $('#main-content, .pf-c-page__main, article, main').first();
+            if (mainContent.length === 0) {
+                // Fallback to body if specific container not found, but this is risky
+                mainContent = $('body');
+                console.warn(chalk_1.default.yellow('Warning: specific main content container not found, using body.'));
+            }
+            // 3. Clean up unwanted elements before conversion
+            // Remove TOCs that are often embedded in the text as "Table of Contents" blocks
+            mainContent.find('.toc').remove();
+            mainContent.find('.hidden-print').remove(); // Often used for print controls
+            mainContent.find('script').remove();
+            mainContent.find('style').remove();
+            // Resolve relative URLs to absolute URLs
+            const baseUrl = new URL(url);
+            mainContent.find('a').each((_, element) => {
+                const href = $(element).attr('href');
+                if (href) {
+                    try {
+                        const absoluteUrl = new URL(href, baseUrl).href;
+                        $(element).attr('href', absoluteUrl);
+                    }
+                    catch (e) {
+                        // Ignore invalid URLs
+                    }
+                }
+            });
+            // 4. Convert to Markdown
+            const html = mainContent.html() || '';
+            const markdown = markdown_1.converter.convert(html);
+            spinner.succeed('Conversion complete');
+            // Add a title header
+            const title = $('title').text().trim();
+            return `# ${title}\n\n${markdown}`;
+        }
+        catch (error) {
+            spinner.fail('Failed to process OCP document');
+            throw error;
+        }
+    }
+}
+exports.OCPStrategy = OCPStrategy;

package/dist/src/types/index.js ADDED Viewed

@@ -0,0 +1,2 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });

package/dist/src/utils/markdown.js ADDED Viewed

@@ -0,0 +1,22 @@
+"use strict";
+// Compiler-emitted interop helper: keeps an existing `this.__importDefault`
+// if there is one; otherwise passes `__esModule` modules through and wraps any
+// other value as `{ default: mod }`.
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.converter = exports.MarkdownConverter = void 0;
+const turndown_1 = __importDefault(require("turndown"));
+class MarkdownConverter {
+    constructor() {
+        this.service = new turndown_1.default({
+            headingStyle: 'atx',
+            codeBlockStyle: 'fenced'
+        });
+        // Remove scripts, styles, etc.
+        this.service.remove(['script', 'style', 'nav', 'footer', 'header', 'iframe', 'noscript']);
+    }
+    convert(html) {
+        return this.service.turndown(html);
+    }
+}
+exports.MarkdownConverter = MarkdownConverter;
+exports.converter = new MarkdownConverter();

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "solo-doc",
-  "version": "0.0.1",
+  "version": "0.0.2",
   "main": "dist/bin/solo-doc.js",
   "bin": {
     "solo-doc": "dist/bin/solo-doc.js"
@@ -32,6 +32,7 @@
   "devDependencies": {
     "@types/node": "^22.10.5",
     "@types/turndown": "^5.0.5",
+    "solo-doc": "^0.0.1",
     "ts-node": "^10.9.2",
     "typescript": "^5.7.3"
   },