portapack 0.2.1 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,14 +7,21 @@
 import * as puppeteer from 'puppeteer';
 import * as fs from 'fs/promises';
 import { Logger } from '../utils/logger'; // Assuming logger is in ../utils
-import { BuildResult, PageEntry } from '../types'; // Assuming types are defined here
+import { BuildResult, PageEntry, BundleMetadata } from '../types'; // Assuming types are defined here
 import { bundleMultiPageHTML } from './bundler'; // Assuming bundler is here
 
-/**
- * @typedef {object} CrawlResult
- * @property {string} url - The URL of the crawled page.
- * @property {string} html - The HTML content of the crawled page.
- */
+// Puppeteer Launch Options (Consider making configurable)
+const PUPPETEER_LAUNCH_OPTIONS: puppeteer.LaunchOptions = {
+  headless: true,
+  args: [
+    '--no-sandbox', // Often required in containerized environments
+    '--disable-setuid-sandbox',
+    '--disable-dev-shm-usage', // Recommended for Docker/CI
+  ],
+};
+
+// Default Page Navigation Options (Consider making configurable)
+const DEFAULT_PAGE_TIMEOUT = 30000; // 30 seconds
 
 /**
  * Fetches the rendered HTML content and basic metadata for a single web page URL.
@@ -22,7 +29,8 @@ import { bundleMultiPageHTML } from './bundler'; // Assuming bundler is here
  *
  * @param {string} url - The fully qualified URL to fetch.
  * @param {Logger} [logger] - Optional logger instance for debug/info messages.
- * @param {number} [timeout=30000] - Navigation timeout in milliseconds.
+ * @param {number} [timeout=DEFAULT_PAGE_TIMEOUT] - Navigation timeout in milliseconds.
+ * @param {string} [userAgent] - Optional custom User-Agent string.
  * @returns {Promise<BuildResult>} A promise that resolves with the fetched HTML
  * and metadata, or rejects on critical errors.
  * @throws {Error} Throws errors from Puppeteer launch, page creation, or navigation failures.
@@ -30,36 +38,44 @@ import { bundleMultiPageHTML } from './bundler'; // Assuming bundler is here
 export async function fetchAndPackWebPage(
   url: string,
   logger?: Logger,
-  timeout: number = 30000
+  timeout: number = DEFAULT_PAGE_TIMEOUT,
+  userAgent?: string,
 ): Promise<BuildResult> {
-  let browser: puppeteer.Browser | null = null; // Initialize browser to null
+  let browser: puppeteer.Browser | null = null;
   const start = Date.now();
-  logger?.debug(`Initiating fetch for single page: ${url}`);
+  logger?.info(`Initiating fetch for single page: ${url}`);
 
   try {
-    browser = await puppeteer.launch({ headless: true });
-    logger?.debug(`Browser launched for ${url}`);
+    logger?.debug('Launching browser...');
+    browser = await puppeteer.launch(PUPPETEER_LAUNCH_OPTIONS);
+    logger?.debug(`Browser launched successfully (PID: ${browser.process()?.pid}).`);
     const page = await browser.newPage();
-    logger?.debug(`Page created for ${url}`);
+    logger?.debug(`New page created for ${url}`);
+
+    // Set User-Agent if provided
+    if (userAgent) {
+      await page.setUserAgent(userAgent);
+      logger?.debug(`User-Agent set to: "${userAgent}"`);
+    }
 
     try {
       logger?.debug(`Navigating to ${url} with timeout ${timeout}ms`);
       await page.goto(url, { waitUntil: 'networkidle2', timeout: timeout });
       logger?.debug(`Navigation successful for ${url}`);
       const html = await page.content();
-      logger?.debug(`Content retrieved for ${url}`);
+      logger?.debug(`Content retrieved for ${url} (${Buffer.byteLength(html, 'utf-8')} bytes)`);
 
-      const metadata: BuildResult['metadata'] = {
+      const metadata: BundleMetadata = {
         input: url,
         outputSize: Buffer.byteLength(html, 'utf-8'),
-        assetCount: 0, // Basic fetch doesn't track assets
+        assetCount: 0, // Basic fetch doesn't track assets processed by *this* tool
         buildTimeMs: Date.now() - start,
         errors: [], // No errors if we reached this point
       };
 
-      await page.close(); // Close the page specifically
+      await page.close();
       logger?.debug(`Page closed for ${url}`);
-      // await browser.close(); // Close the browser instance
+      await browser.close();
      logger?.debug(`Browser closed for ${url}`);
       browser = null; // Ensure browser is marked as closed
 
@@ -67,25 +83,33 @@ export async function fetchAndPackWebPage(
 
     } catch (pageError: any) {
       logger?.error(`Error during page processing for ${url}: ${pageError.message}`);
-      // Ensure page is closed even if an error occurred during processing
-      try { await page.close();
-
-      } catch (closeErr) {
-        throw closeErr;
+      // Attempt to close the page even if processing failed
+      if (page && !page.isClosed()) {
+        try {
+          await page.close();
+          logger?.debug(`Page closed after error for ${url}`);
+        } catch (closeErr: any) {
+          logger?.error(`Failed to close page after error for ${url}: ${closeErr.message}`);
+          // Decide if this secondary error should be thrown or just logged
+        }
       }
       throw pageError; // Re-throw the original page processing error
     }
   } catch (launchError: any) {
-    logger?.error(`Critical error during browser launch or page creation for ${url}: ${launchError.message}`);
-    // Ensure browser is closed if launch succeeded but newPage failed, etc.
-    // Although if launch fails, browser might be null.
+    logger?.error(`Critical error during browser launch or page setup for ${url}: ${launchError.message}`);
+    // Ensure browser is closed if launch succeeded partially but later failed
     if (browser) {
-      try { await browser.close(); } catch (closeErr) { /* Ignore browser close error */ }
+      try {
+        await browser.close();
+        logger?.debug('Browser closed after launch/setup error.');
+      } catch (closeErr: any) {
+        logger?.warn(`Failed to close browser after launch/setup error: ${closeErr.message}`);
+      }
+      browser = null;
     }
     throw launchError; // Re-throw the original launch/setup error
   } finally {
-    // Final check: If browser somehow wasn't closed and isn't null, attempt closure.
-    // This handles edge cases where errors might bypass earlier closes.
+    // Final safety net: If browser somehow wasn't closed and isn't null, attempt closure.
    if (browser) {
      logger?.warn(`Closing browser in final cleanup for ${url}. This might indicate an unusual error path.`);
      try { await browser.close(); } catch (closeErr) { /* Ignore final browser close error */ }
@@ -93,156 +117,194 @@ export async function fetchAndPackWebPage(
   }
 }
 
+
+/**
+ * @typedef {object} CrawlOptions
+ * @property {number} [maxDepth=1] - Maximum crawl depth.
+ * @property {number} [timeout=DEFAULT_PAGE_TIMEOUT] - Navigation timeout per page.
+ * @property {string[]} [include=[]] - Glob patterns for URLs to include.
+ * @property {string[]} [exclude=[]] - Glob patterns for URLs to exclude.
+ * @property {string} [userAgent] - Custom User-Agent string.
+ * @property {Logger} [logger] - Optional logger instance.
+ */
+
 /**
  * Internal function to recursively crawl a website starting from a given URL.
  * Uses a single browser instance and manages pages for efficiency during crawl.
  * Implements Breadth-First Search (BFS) using a queue.
+ * Respects same-origin policy and visited URLs.
  *
  * @private
  * @param {string} startUrl - The initial URL to start crawling from.
- * @param {number} maxDepth - The maximum depth of links to follow (1 means only the start URL).
- * @param {Logger} [logger] - Optional logger instance.
+ * @param {CrawlOptions} options - Crawling configuration options.
  * @returns {Promise<PageEntry[]>} A promise resolving to an array of PageEntry objects
  * containing the URL and HTML for each successfully crawled page.
  */
 async function crawlWebsite(
   startUrl: string,
-  maxDepth: number,
-  logger?: Logger
+  options: {
+    maxDepth?: number;
+    timeout?: number;
+    include?: string[]; // Add include/exclude/userAgent later if needed
+    exclude?: string[];
+    userAgent?: string;
+    logger?: Logger;
+  }
 ): Promise<PageEntry[]> {
+  const {
+    maxDepth = 1,
+    timeout = DEFAULT_PAGE_TIMEOUT,
+    // include = ['**'], // TODO: Implement glob filtering
+    // exclude = [],
+    userAgent,
+    logger,
+  } = options;
+
   logger?.info(`Starting crawl for ${startUrl} with maxDepth ${maxDepth}`);
-
-  // Don't even start a browser if maxDepth is 0
+
   if (maxDepth <= 0) {
-    logger?.info('maxDepth is 0 or negative, no pages will be crawled.');
+    logger?.warn('maxDepth is 0 or negative, no pages will be crawled.');
     return [];
   }
-
-  const browser = await puppeteer.launch({ headless: true });
+
+  let browser: puppeteer.Browser | null = null;
   const visited = new Set<string>();
   const results: PageEntry[] = [];
-  // Queue stores URLs to visit and their corresponding depth
   const queue: { url: string; depth: number }[] = [];
-
-  // Initialize startOrigin for same-origin check
   let startOrigin: string;
-  try {
-    startOrigin = new URL(startUrl).origin;
-  } catch (e: any) {
-    logger?.error(`Invalid start URL: ${startUrl}. ${e.message}`);
-    await browser.close();
-    return []; // Cannot start crawl with invalid URL
-  }
 
-  // Normalize start URL (remove fragment) and add to queue/visited if depth allows
-  let normalizedStartUrl: string;
   try {
-    const parsedStartUrl = new URL(startUrl);
-    parsedStartUrl.hash = ''; // Remove fragment for consistent visited checks
-    normalizedStartUrl = parsedStartUrl.href;
-  } catch (e: any) {
-    logger?.error(`Invalid start URL: ${startUrl}. ${e.message}`);
-    await browser.close();
-    return []; // Cannot start crawl with invalid URL
-  }
+    // Validate start URL and get origin
+    try {
+      startOrigin = new URL(startUrl).origin;
+    } catch (e: any) {
+      logger?.error(`Invalid start URL: ${startUrl}. ${e.message}`);
+      throw new Error(`Invalid start URL: ${startUrl}`); // Propagate error
+    }
 
-  visited.add(normalizedStartUrl);
-  queue.push({ url: normalizedStartUrl, depth: 1 });
-  logger?.debug(`Queued initial URL: ${normalizedStartUrl} (depth 1)`);
+    // Normalize start URL (remove fragment)
+    let normalizedStartUrl: string;
+    try {
+      const parsedStartUrl = new URL(startUrl);
+      parsedStartUrl.hash = '';
+      normalizedStartUrl = parsedStartUrl.href;
+    } catch (e: any) {
+      logger?.error(`Invalid start URL: ${startUrl}. ${e.message}`);
+      throw new Error(`Invalid start URL: ${startUrl}`); // Propagate error
+    }
 
-  while (queue.length > 0) {
-    const { url, depth } = queue.shift()!; // Non-null assertion ok due to queue.length check
-    logger?.info(`Processing: ${url} (depth ${depth})`);
-    let page: puppeteer.Page | null = null;
+    // Launch browser *after* validating URL
+    logger?.debug('Launching browser for crawl...');
+    browser = await puppeteer.launch(PUPPETEER_LAUNCH_OPTIONS);
+    logger?.debug(`Browser launched for crawl (PID: ${browser.process()?.pid}).`);
 
-    try {
-      page = await browser.newPage();
-      // Set a reasonable viewport, sometimes helps with rendering/layout dependent scripts
-      await page.setViewport({ width: 1280, height: 800 });
-      await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
-      const html = await page.content();
+    // Initial queue setup
+    visited.add(normalizedStartUrl);
+    queue.push({ url: normalizedStartUrl, depth: 1 });
+    logger?.debug(`Queued initial URL: ${normalizedStartUrl} (depth 1)`);
 
-      // Add successfully fetched page to results
-      // Ensure the object structure matches your PageEntry type definition
-      results.push({ url, html });
-      logger?.debug(`Successfully fetched content for ${url}`);
-
-      // --- Link Discovery ---
-      // Only look for more links if we haven't reached the maximum depth
-      if (depth < maxDepth) {
-        logger?.debug(`Discovering links on ${url} (current depth ${depth}, maxDepth ${maxDepth})`);
-        // Use page.evaluate to get all href attributes directly from the DOM
-        const hrefs = await page.evaluate(() =>
-          Array.from(document.querySelectorAll('a[href]'), a => a.getAttribute('href'))
-        );
-        logger?.debug(`Found ${hrefs.length} potential hrefs on ${url}`);
-
-        let linksAdded = 0;
-        for (const href of hrefs) {
-          if (!href) continue; // Skip empty hrefs like href=""
-
-          let absoluteUrl: string;
-          try {
-            // Resolve the href relative to the current page's URL
-            const resolved = new URL(href, url);
-            // Remove fragment (#) for visited checks and queueing consistency
-            resolved.hash = '';
-            absoluteUrl = resolved.href;
-          } catch (e) {
-            // Ignore URLs that fail to parse (e.g., "javascript:void(0)")
-            logger?.debug(`Ignoring invalid URL syntax: "${href}" on page ${url}`);
-            continue;
-          }
+    while (queue.length > 0) {
+      const { url, depth } = queue.shift()!;
+      logger?.info(`Processing: ${url} (depth ${depth})`);
+      let page: puppeteer.Page | null = null;
+
+      try {
+        page = await browser.newPage();
+
+        if (userAgent) {
+          await page.setUserAgent(userAgent);
+        }
+        // Consider adding viewport setting if needed: await page.setViewport({ width: 1280, height: 800 });
+
+        await page.goto(url, { waitUntil: 'networkidle2', timeout: timeout });
+        const html = await page.content();
+
+        results.push({ url, html }); // Matches PageEntry type
+        logger?.debug(`Successfully fetched content for ${url}`);
+
+        // Link Discovery (only if not at max depth)
+        if (depth < maxDepth) {
+          logger?.debug(`Discovering links on ${url} (depth ${depth}/${maxDepth})`);
+          const hrefs = await page.evaluate(() =>
+            Array.from(document.querySelectorAll('a[href]'), a => a.getAttribute('href'))
+          );
+          logger?.debug(`Found ${hrefs.length} potential hrefs on ${url}`);
+
+          let linksAdded = 0;
+          for (const href of hrefs) {
+            if (!href) continue;
+
+            let absoluteUrl: string;
+            try {
+              const resolved = new URL(href, url);
+              resolved.hash = ''; // Normalize
+              absoluteUrl = resolved.href;
+            } catch (e) {
+              logger?.debug(`Ignoring invalid URL syntax: "${href}" on page ${url}`);
+              continue;
+            }
+
+            // TODO: Implement include/exclude filtering here using micromatch or similar
+            // if (!matchesInclude(absoluteUrl, include) || matchesExclude(absoluteUrl, exclude)) {
+            //   logger?.debug(`Skipping due to include/exclude rules: ${absoluteUrl}`);
+            //   continue;
+            // }
 
-          // --- Filtering and Queueing ---
-          // 1. Check if it belongs to the same origin as the start URL
-          // 2. Check if it has already been visited (or is in the queue)
-          if (absoluteUrl.startsWith(startOrigin) && !visited.has(absoluteUrl)) {
-            visited.add(absoluteUrl); // Mark as visited *before* adding to queue
-            queue.push({ url: absoluteUrl, depth: depth + 1 });
-            linksAdded++;
-            // logger?.debug(`Queueing: ${absoluteUrl} (depth ${depth + 1})`); // Verbose
-          } else {
-            // logger?.debug(`Skipping (external, visited, or invalid): ${absoluteUrl}`); // Verbose
+            // Filter: same origin and not visited
+            if (absoluteUrl.startsWith(startOrigin) && !visited.has(absoluteUrl)) {
+              visited.add(absoluteUrl);
+              queue.push({ url: absoluteUrl, depth: depth + 1 });
+              linksAdded++;
+            }
           }
-        logger?.debug(`Added ${linksAdded} new unique internal links to queue from ${url}`);
-      } else {
-        logger?.debug(`Max depth (${maxDepth}) reached, not discovering links on ${url}`);
-      }
+          logger?.debug(`Added ${linksAdded} new unique internal links to queue from ${url}`);
+        } else {
+          logger?.debug(`Max depth (${maxDepth}) reached, not discovering links on ${url}`);
         }
 
-    } catch (err: any) {
-      // Log errors encountered during page processing (goto, content, evaluate)
-      logger?.warn(`❌ Failed to process ${url}: ${err.message}`);
-      // Optionally add error details to results or a separate error list if needed
-    } finally {
-      // Ensure the page is closed reliably after processing or error
-      if (page) {
-        try {
-          await page.close();
-        } catch (pageCloseError: any) {
-          // Log if closing the page fails, but don't let it stop the crawl
-          logger?.error(`Failed to close page for ${url}: ${pageCloseError.message}`);
+      } catch (err: any) {
+        logger?.warn(`❌ Failed to process ${url}: ${err.message}`);
+        // Continue crawl even if one page fails
+      } finally {
+        if (page && !page.isClosed()) {
+          try {
+            await page.close();
+          } catch (pageCloseError: any) {
+            logger?.error(`Failed to close page for ${url}: ${pageCloseError.message}`);
+          }
         }
       }
+    } // End while loop
+
+  } catch (error) {
+    // Catch critical errors like invalid start URL or browser launch failure
+    logger?.error(`Critical crawl error: ${error instanceof Error ? error.message : error}`);
+    // Rethrow or handle appropriately
+    throw error;
+  } finally {
+    // Ensure browser is closed after crawl finishes or critical error occurs
+    if (browser) {
+      logger?.info(`Crawl finished or errored. Closing browser.`);
+      await browser.close();
+      logger?.debug(`Browser closed after crawl.`);
     }
-  } // End while loop
+  }
 
-  logger?.info(`Crawl finished. Closing browser.`);
-  await browser.close();
-  logger?.info(`Found ${results.length} pages.`);
+  logger?.info(`Crawl found ${results.length} pages.`);
   return results;
 }
 
+
 /**
  * Fetches all internal pages of a website recursively starting from a given URL,
  * bundles them into a single HTML string using the bundler module, and writes
- * the result to a file.
+ * the result to a file. Creates its own logger unless `loggerInstance` is provided.
  *
  * @export
 * @param {string} startUrl - The fully qualified URL to begin crawling from.
 * @param {string} outputFile - The path where the bundled HTML file should be saved.
 * @param {number} [maxDepth=1] - The maximum depth to crawl links (default: 1, only the start page).
+ * @param {Logger} [loggerInstance] - Optional external logger instance to use.
 * @returns {Promise<{ pages: number; html: string }>} A promise resolving to an object containing
 * the number of pages successfully crawled and the final bundled HTML string.
 * @throws {Error} Throws errors if the crawl initiation fails, bundling fails, or file writing fails.
@@ -250,15 +312,18 @@ async function crawlWebsite(
 export async function recursivelyBundleSite(
   startUrl: string,
   outputFile: string,
-  maxDepth = 1
+  maxDepth = 1,
+  loggerInstance?: Logger // Added optional logger parameter
 ): Promise<{ pages: number; html: string }> {
-  // Create a logger instance specifically for this operation
-  const logger = new Logger();
+  // Use provided logger OR create a new default one
+  const logger = loggerInstance || new Logger();
   logger.info(`Starting recursive site bundle for ${startUrl} to ${outputFile} (maxDepth: ${maxDepth})`);
 
   try {
     // Step 1: Crawl the website
-    const pages: PageEntry[] = await crawlWebsite(startUrl, maxDepth, logger);
+    // Pass necessary options down to crawlWebsite
+    const crawlOptions = { maxDepth, logger /* Add other options like timeout, userAgent if needed */ };
+    const pages: PageEntry[] = await crawlWebsite(startUrl, crawlOptions);
 
     if (pages.length === 0) {
       logger.warn("Crawl completed but found 0 pages. Output file may be empty or reflect an empty bundle.");
@@ -267,7 +332,8 @@ export async function recursivelyBundleSite(
     }
 
     // Step 2: Bundle the HTML content
-    const bundledHtml = bundleMultiPageHTML(pages, logger); // Passing logger for consistency
+    // Pass the same logger instance for consistent logging
+    const bundledHtml = bundleMultiPageHTML(pages, logger);
     logger.info(`Bundling complete. Output size: ${Buffer.byteLength(bundledHtml, 'utf-8')} bytes.`);
 
     // Step 3: Write the bundled HTML to the output file
@@ -282,11 +348,9 @@ export async function recursivelyBundleSite(
     };
   } catch (error: any) {
     logger.error(`Error during recursive site bundle: ${error.message}`);
-    // Log the stack trace for better debugging if available
     if (error.stack) {
       logger.error(`Stack trace: ${error.stack}`);
     }
-    // Re-throw the error to signal failure to the caller
-    throw error;
+    throw error; // Re-throw the error
   }
 }
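
For reference, a minimal usage sketch of the 0.3.1 signatures shown in this diff. Only the function signatures come from the diff; the 'portapack' import path, the example URL, output path, timeout, and User-Agent string are assumptions for illustration.

// Usage sketch (assumptions: import path and all example values are illustrative).
import { fetchAndPackWebPage, recursivelyBundleSite } from 'portapack';

async function example(): Promise<void> {
  // Single page: timeout still defaults to DEFAULT_PAGE_TIMEOUT (30000 ms),
  // and 0.3.1 adds an optional userAgent parameter.
  const single = await fetchAndPackWebPage(
    'https://example.com',   // url
    undefined,               // logger (optional)
    45000,                   // timeout in ms
    'example-crawler/1.0'    // userAgent (new in 0.3.1)
  );
  console.log(`Fetched ${single.metadata.outputSize} bytes in ${single.metadata.buildTimeMs} ms`);

  // Recursive crawl + bundle: 0.3.1 adds an optional loggerInstance parameter,
  // so a caller can inject its own Logger instead of one being created internally.
  const { pages, html } = await recursivelyBundleSite(
    'https://example.com',
    './site-bundle.html',
    2                        // maxDepth
    // , myLogger            // loggerInstance (new in 0.3.1)
  );
  console.log(`Bundled ${pages} pages (${html.length} characters)`);
}

example().catch((err) => {
  console.error('Example failed:', err);
  process.exit(1);
});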