portapack 0.2.1 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,14 +7,21 @@
 import * as puppeteer from 'puppeteer';
 import * as fs from 'fs/promises';
 import { Logger } from '../utils/logger'; // Assuming logger is in ../utils
-import { BuildResult, PageEntry } from '../types'; // Assuming types are defined here
+import { BuildResult, PageEntry, BundleMetadata } from '../types'; // Assuming types are defined here
 import { bundleMultiPageHTML } from './bundler'; // Assuming bundler is here
 
-/**
- * @typedef {object} CrawlResult
- * @property {string} url - The URL of the crawled page.
- * @property {string} html - The HTML content of the crawled page.
- */
+// Puppeteer Launch Options (Consider making configurable)
+const PUPPETEER_LAUNCH_OPTIONS: puppeteer.LaunchOptions = {
+  headless: true,
+  args: [
+    '--no-sandbox', // Often required in containerized environments
+    '--disable-setuid-sandbox',
+    '--disable-dev-shm-usage', // Recommended for Docker/CI
+  ],
+};
+
+// Default Page Navigation Options (Consider making configurable)
+const DEFAULT_PAGE_TIMEOUT = 30000; // 30 seconds
 
 /**
  * Fetches the rendered HTML content and basic metadata for a single web page URL.
@@ -22,7 +29,8 @@ import { bundleMultiPageHTML } from './bundler'; // Assuming bundler is here
  *
  * @param {string} url - The fully qualified URL to fetch.
  * @param {Logger} [logger] - Optional logger instance for debug/info messages.
- * @param {number} [timeout=30000] - Navigation timeout in milliseconds.
+ * @param {number} [timeout=DEFAULT_PAGE_TIMEOUT] - Navigation timeout in milliseconds.
+ * @param {string} [userAgent] - Optional custom User-Agent string.
  * @returns {Promise<BuildResult>} A promise that resolves with the fetched HTML
  * and metadata, or rejects on critical errors.
  * @throws {Error} Throws errors from Puppeteer launch, page creation, or navigation failures.
@@ -30,36 +38,44 @@ import { bundleMultiPageHTML } from './bundler'; // Assuming bundler is here
 export async function fetchAndPackWebPage(
   url: string,
   logger?: Logger,
-  timeout: number = 30000
+  timeout: number = DEFAULT_PAGE_TIMEOUT,
+  userAgent?: string,
 ): Promise<BuildResult> {
-  let browser: puppeteer.Browser | null = null; // Initialize browser to null
+  let browser: puppeteer.Browser | null = null;
   const start = Date.now();
-  logger?.debug(`Initiating fetch for single page: ${url}`);
+  logger?.info(`Initiating fetch for single page: ${url}`);
 
   try {
-    browser = await puppeteer.launch({ headless: true });
-    logger?.debug(`Browser launched for ${url}`);
+    logger?.debug('Launching browser...');
+    browser = await puppeteer.launch(PUPPETEER_LAUNCH_OPTIONS);
+    logger?.debug(`Browser launched successfully (PID: ${browser.process()?.pid}).`);
     const page = await browser.newPage();
-    logger?.debug(`Page created for ${url}`);
+    logger?.debug(`New page created for ${url}`);
+
+    // Set User-Agent if provided
+    if (userAgent) {
+      await page.setUserAgent(userAgent);
+      logger?.debug(`User-Agent set to: "${userAgent}"`);
+    }
 
     try {
       logger?.debug(`Navigating to ${url} with timeout ${timeout}ms`);
       await page.goto(url, { waitUntil: 'networkidle2', timeout: timeout });
       logger?.debug(`Navigation successful for ${url}`);
       const html = await page.content();
-      logger?.debug(`Content retrieved for ${url}`);
+      logger?.debug(`Content retrieved for ${url} (${Buffer.byteLength(html, 'utf-8')} bytes)`);
 
-      const metadata: BuildResult['metadata'] = {
+      const metadata: BundleMetadata = {
         input: url,
         outputSize: Buffer.byteLength(html, 'utf-8'),
-        assetCount: 0, // Basic fetch doesn't track assets
+        assetCount: 0, // Basic fetch doesn't track assets processed by *this* tool
         buildTimeMs: Date.now() - start,
         errors: [], // No errors if we reached this point
       };
 
-      await page.close(); // Close the page specifically
+      await page.close();
       logger?.debug(`Page closed for ${url}`);
-      // await browser.close(); // Close the browser instance
+      await browser.close();
      logger?.debug(`Browser closed for ${url}`);
       browser = null; // Ensure browser is marked as closed
 
@@ -67,25 +83,33 @@ export async function fetchAndPackWebPage(
 
     } catch (pageError: any) {
       logger?.error(`Error during page processing for ${url}: ${pageError.message}`);
-      // Ensure page is closed even if an error occurred during processing
-      try { await page.close();
-
-      } catch (closeErr) {
-        throw closeErr;
+      // Attempt to close the page even if processing failed
+      if (page && !page.isClosed()) {
+        try {
+          await page.close();
+          logger?.debug(`Page closed after error for ${url}`);
+        } catch (closeErr: any) {
+          logger?.error(`Failed to close page after error for ${url}: ${closeErr.message}`);
+          // Decide if this secondary error should be thrown or just logged
+        }
       }
       throw pageError; // Re-throw the original page processing error
     }
   } catch (launchError: any) {
-    logger?.error(`Critical error during browser launch or page creation for ${url}: ${launchError.message}`);
-    // Ensure browser is closed if launch succeeded but newPage failed, etc.
-    // Although if launch fails, browser might be null.
+    logger?.error(`Critical error during browser launch or page setup for ${url}: ${launchError.message}`);
+    // Ensure browser is closed if launch succeeded partially but later failed
     if (browser) {
-      try { await browser.close(); } catch (closeErr) { /* Ignore browser close error */ }
+      try {
+        await browser.close();
+        logger?.debug('Browser closed after launch/setup error.');
+      } catch (closeErr: any) {
+        logger?.warn(`Failed to close browser after launch/setup error: ${closeErr.message}`);
+      }
+      browser = null;
     }
     throw launchError; // Re-throw the original launch/setup error
   } finally {
-    // Final check: If browser somehow wasn't closed and isn't null, attempt closure.
-    // This handles edge cases where errors might bypass earlier closes.
+    // Final safety net: If browser somehow wasn't closed and isn't null, attempt closure.
    if (browser) {
      logger?.warn(`Closing browser in final cleanup for ${url}. This might indicate an unusual error path.`);
      try { await browser.close(); } catch (closeErr) { /* Ignore final browser close error */ }
@@ -93,156 +117,194 @@ export async function fetchAndPackWebPage(
   }
 }
 
+
+/**
+ * @typedef {object} CrawlOptions
+ * @property {number} [maxDepth=1] - Maximum crawl depth.
+ * @property {number} [timeout=DEFAULT_PAGE_TIMEOUT] - Navigation timeout per page.
+ * @property {string[]} [include=[]] - Glob patterns for URLs to include.
+ * @property {string[]} [exclude=[]] - Glob patterns for URLs to exclude.
+ * @property {string} [userAgent] - Custom User-Agent string.
+ * @property {Logger} [logger] - Optional logger instance.
+ */
+
 /**
  * Internal function to recursively crawl a website starting from a given URL.
  * Uses a single browser instance and manages pages for efficiency during crawl.
  * Implements Breadth-First Search (BFS) using a queue.
+ * Respects same-origin policy and visited URLs.
  *
  * @private
  * @param {string} startUrl - The initial URL to start crawling from.
- * @param {number} maxDepth - The maximum depth of links to follow (1 means only the start URL).
- * @param {Logger} [logger] - Optional logger instance.
+ * @param {CrawlOptions} options - Crawling configuration options.
  * @returns {Promise<PageEntry[]>} A promise resolving to an array of PageEntry objects
  * containing the URL and HTML for each successfully crawled page.
  */
 async function crawlWebsite(
   startUrl: string,
-  maxDepth: number,
-  logger?: Logger
+  options: {
+    maxDepth?: number;
+    timeout?: number;
+    include?: string[]; // Add include/exclude/userAgent later if needed
+    exclude?: string[];
+    userAgent?: string;
+    logger?: Logger;
+  }
 ): Promise<PageEntry[]> {
+  const {
+    maxDepth = 1,
+    timeout = DEFAULT_PAGE_TIMEOUT,
+    // include = ['**'], // TODO: Implement glob filtering
+    // exclude = [],
+    userAgent,
+    logger,
+  } = options;
+
   logger?.info(`Starting crawl for ${startUrl} with maxDepth ${maxDepth}`);
-
-  // Don't even start a browser if maxDepth is 0
+
   if (maxDepth <= 0) {
-    logger?.info('maxDepth is 0 or negative, no pages will be crawled.');
+    logger?.warn('maxDepth is 0 or negative, no pages will be crawled.');
     return [];
   }
-
-  const browser = await puppeteer.launch({ headless: true });
+
+  let browser: puppeteer.Browser | null = null;
   const visited = new Set<string>();
   const results: PageEntry[] = [];
-  // Queue stores URLs to visit and their corresponding depth
   const queue: { url: string; depth: number }[] = [];
-
-  // Initialize startOrigin for same-origin check
   let startOrigin: string;
-  try {
-    startOrigin = new URL(startUrl).origin;
-  } catch (e: any) {
-    logger?.error(`Invalid start URL: ${startUrl}. ${e.message}`);
-    await browser.close();
-    return []; // Cannot start crawl with invalid URL
-  }
 
-  // Normalize start URL (remove fragment) and add to queue/visited if depth allows
-  let normalizedStartUrl: string;
   try {
-    const parsedStartUrl = new URL(startUrl);
-    parsedStartUrl.hash = ''; // Remove fragment for consistent visited checks
-    normalizedStartUrl = parsedStartUrl.href;
-  } catch (e: any) {
-    logger?.error(`Invalid start URL: ${startUrl}. ${e.message}`);
-    await browser.close();
-    return []; // Cannot start crawl with invalid URL
-  }
+    // Validate start URL and get origin
+    try {
+      startOrigin = new URL(startUrl).origin;
+    } catch (e: any) {
+      logger?.error(`Invalid start URL: ${startUrl}. ${e.message}`);
+      throw new Error(`Invalid start URL: ${startUrl}`); // Propagate error
+    }
 
-  visited.add(normalizedStartUrl);
-  queue.push({ url: normalizedStartUrl, depth: 1 });
-  logger?.debug(`Queued initial URL: ${normalizedStartUrl} (depth 1)`);
+    // Normalize start URL (remove fragment)
+    let normalizedStartUrl: string;
+    try {
+      const parsedStartUrl = new URL(startUrl);
+      parsedStartUrl.hash = '';
+      normalizedStartUrl = parsedStartUrl.href;
+    } catch (e: any) {
+      logger?.error(`Invalid start URL: ${startUrl}. ${e.message}`);
+      throw new Error(`Invalid start URL: ${startUrl}`); // Propagate error
+    }
 
-  while (queue.length > 0) {
-    const { url, depth } = queue.shift()!; // Non-null assertion ok due to queue.length check
-    logger?.info(`Processing: ${url} (depth ${depth})`);
-    let page: puppeteer.Page | null = null;
+    // Launch browser *after* validating URL
+    logger?.debug('Launching browser for crawl...');
+    browser = await puppeteer.launch(PUPPETEER_LAUNCH_OPTIONS);
+    logger?.debug(`Browser launched for crawl (PID: ${browser.process()?.pid}).`);
 
-    try {
-      page = await browser.newPage();
-      // Set a reasonable viewport, sometimes helps with rendering/layout dependent scripts
-      await page.setViewport({ width: 1280, height: 800 });
-      await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
-      const html = await page.content();
+    // Initial queue setup
+    visited.add(normalizedStartUrl);
+    queue.push({ url: normalizedStartUrl, depth: 1 });
+    logger?.debug(`Queued initial URL: ${normalizedStartUrl} (depth 1)`);
 
-      // Add successfully fetched page to results
-      // Ensure the object structure matches your PageEntry type definition
-      results.push({ url, html });
-      logger?.debug(`Successfully fetched content for ${url}`);
-
-      // --- Link Discovery ---
-      // Only look for more links if we haven't reached the maximum depth
-      if (depth < maxDepth) {
-        logger?.debug(`Discovering links on ${url} (current depth ${depth}, maxDepth ${maxDepth})`);
-        // Use page.evaluate to get all href attributes directly from the DOM
-        const hrefs = await page.evaluate(() =>
-          Array.from(document.querySelectorAll('a[href]'), a => a.getAttribute('href'))
-        );
-        logger?.debug(`Found ${hrefs.length} potential hrefs on ${url}`);
-
-        let linksAdded = 0;
-        for (const href of hrefs) {
-          if (!href) continue; // Skip empty hrefs like href=""
-
-          let absoluteUrl: string;
-          try {
-            // Resolve the href relative to the current page's URL
-            const resolved = new URL(href, url);
-            // Remove fragment (#) for visited checks and queueing consistency
-            resolved.hash = '';
-            absoluteUrl = resolved.href;
-          } catch (e) {
-            // Ignore URLs that fail to parse (e.g., "javascript:void(0)")
-            logger?.debug(`Ignoring invalid URL syntax: "${href}" on page ${url}`);
-            continue;
-          }
+    while (queue.length > 0) {
+      const { url, depth } = queue.shift()!;
+      logger?.info(`Processing: ${url} (depth ${depth})`);
+      let page: puppeteer.Page | null = null;
+
+      try {
+        page = await browser.newPage();
+
+        if (userAgent) {
+          await page.setUserAgent(userAgent);
+        }
+        // Consider adding viewport setting if needed: await page.setViewport({ width: 1280, height: 800 });
+
+        await page.goto(url, { waitUntil: 'networkidle2', timeout: timeout });
+        const html = await page.content();
+
+        results.push({ url, html }); // Matches PageEntry type
+        logger?.debug(`Successfully fetched content for ${url}`);
+
+        // Link Discovery (only if not at max depth)
+        if (depth < maxDepth) {
+          logger?.debug(`Discovering links on ${url} (depth ${depth}/${maxDepth})`);
+          const hrefs = await page.evaluate(() =>
+            Array.from(document.querySelectorAll('a[href]'), a => a.getAttribute('href'))
+          );
+          logger?.debug(`Found ${hrefs.length} potential hrefs on ${url}`);
+
+          let linksAdded = 0;
+          for (const href of hrefs) {
+            if (!href) continue;
+
+            let absoluteUrl: string;
+            try {
+              const resolved = new URL(href, url);
+              resolved.hash = ''; // Normalize
+              absoluteUrl = resolved.href;
+            } catch (e) {
+              logger?.debug(`Ignoring invalid URL syntax: "${href}" on page ${url}`);
+              continue;
+            }
+
+            // TODO: Implement include/exclude filtering here using micromatch or similar
+            // if (!matchesInclude(absoluteUrl, include) || matchesExclude(absoluteUrl, exclude)) {
+            //   logger?.debug(`Skipping due to include/exclude rules: ${absoluteUrl}`);
+            //   continue;
+            // }
 
-          // --- Filtering and Queueing ---
-          // 1. Check if it belongs to the same origin as the start URL
-          // 2. Check if it has already been visited (or is in the queue)
-          if (absoluteUrl.startsWith(startOrigin) && !visited.has(absoluteUrl)) {
-            visited.add(absoluteUrl); // Mark as visited *before* adding to queue
-            queue.push({ url: absoluteUrl, depth: depth + 1 });
-            linksAdded++;
-            // logger?.debug(`Queueing: ${absoluteUrl} (depth ${depth + 1})`); // Verbose
-          } else {
-            // logger?.debug(`Skipping (external, visited, or invalid): ${absoluteUrl}`); // Verbose
+            // Filter: same origin and not visited
+            if (absoluteUrl.startsWith(startOrigin) && !visited.has(absoluteUrl)) {
+              visited.add(absoluteUrl);
+              queue.push({ url: absoluteUrl, depth: depth + 1 });
+              linksAdded++;
+            }
           }
-        logger?.debug(`Added ${linksAdded} new unique internal links to queue from ${url}`);
-      } else {
-        logger?.debug(`Max depth (${maxDepth}) reached, not discovering links on ${url}`);
-      }
+          logger?.debug(`Added ${linksAdded} new unique internal links to queue from ${url}`);
+        } else {
+          logger?.debug(`Max depth (${maxDepth}) reached, not discovering links on ${url}`);
         }
 
-    } catch (err: any) {
-      // Log errors encountered during page processing (goto, content, evaluate)
-      logger?.warn(`❌ Failed to process ${url}: ${err.message}`);
-      // Optionally add error details to results or a separate error list if needed
-    } finally {
-      // Ensure the page is closed reliably after processing or error
-      if (page) {
-        try {
-          await page.close();
-        } catch (pageCloseError: any) {
-          // Log if closing the page fails, but don't let it stop the crawl
-          logger?.error(`Failed to close page for ${url}: ${pageCloseError.message}`);
+      } catch (err: any) {
+        logger?.warn(`❌ Failed to process ${url}: ${err.message}`);
+        // Continue crawl even if one page fails
+      } finally {
+        if (page && !page.isClosed()) {
+          try {
+            await page.close();
+          } catch (pageCloseError: any) {
+            logger?.error(`Failed to close page for ${url}: ${pageCloseError.message}`);
+          }
         }
       }
+    } // End while loop
+
+  } catch (error) {
+    // Catch critical errors like invalid start URL or browser launch failure
+    logger?.error(`Critical crawl error: ${error instanceof Error ? error.message : error}`);
+    // Rethrow or handle appropriately
+    throw error;
+  } finally {
+    // Ensure browser is closed after crawl finishes or critical error occurs
+    if (browser) {
+      logger?.info(`Crawl finished or errored. Closing browser.`);
+      await browser.close();
+      logger?.debug(`Browser closed after crawl.`);
     }
-  } // End while loop
+  }
 
-  logger?.info(`Crawl finished. Closing browser.`);
-  await browser.close();
-  logger?.info(`Found ${results.length} pages.`);
+  logger?.info(`Crawl found ${results.length} pages.`);
   return results;
 }
 
+
 /**
  * Fetches all internal pages of a website recursively starting from a given URL,
  * bundles them into a single HTML string using the bundler module, and writes
- * the result to a file.
+ * the result to a file. Creates its own logger unless `loggerInstance` is provided.
  *
  * @export
 * @param {string} startUrl - The fully qualified URL to begin crawling from.
 * @param {string} outputFile - The path where the bundled HTML file should be saved.
 * @param {number} [maxDepth=1] - The maximum depth to crawl links (default: 1, only the start page).
+ * @param {Logger} [loggerInstance] - Optional external logger instance to use.
 * @returns {Promise<{ pages: number; html: string }>} A promise resolving to an object containing
 * the number of pages successfully crawled and the final bundled HTML string.
 * @throws {Error} Throws errors if the crawl initiation fails, bundling fails, or file writing fails.
@@ -250,15 +312,18 @@ async function crawlWebsite(
 export async function recursivelyBundleSite(
   startUrl: string,
   outputFile: string,
-  maxDepth = 1
+  maxDepth = 1,
+  loggerInstance?: Logger // Added optional logger parameter
 ): Promise<{ pages: number; html: string }> {
-  // Create a logger instance specifically for this operation
-  const logger = new Logger();
+  // Use provided logger OR create a new default one
+  const logger = loggerInstance || new Logger();
   logger.info(`Starting recursive site bundle for ${startUrl} to ${outputFile} (maxDepth: ${maxDepth})`);
 
   try {
     // Step 1: Crawl the website
-    const pages: PageEntry[] = await crawlWebsite(startUrl, maxDepth, logger);
+    // Pass necessary options down to crawlWebsite
+    const crawlOptions = { maxDepth, logger /* Add other options like timeout, userAgent if needed */ };
+    const pages: PageEntry[] = await crawlWebsite(startUrl, crawlOptions);
 
     if (pages.length === 0) {
       logger.warn("Crawl completed but found 0 pages. Output file may be empty or reflect an empty bundle.");
@@ -267,7 +332,8 @@ export async function recursivelyBundleSite(
     }
 
     // Step 2: Bundle the HTML content
-    const bundledHtml = bundleMultiPageHTML(pages, logger); // Passing logger for consistency
+    // Pass the same logger instance for consistent logging
+    const bundledHtml = bundleMultiPageHTML(pages, logger);
     logger.info(`Bundling complete. Output size: ${Buffer.byteLength(bundledHtml, 'utf-8')} bytes.`);
 
     // Step 3: Write the bundled HTML to the output file
@@ -282,11 +348,9 @@ export async function recursivelyBundleSite(
     };
   } catch (error: any) {
     logger.error(`Error during recursive site bundle: ${error.message}`);
-    // Log the stack trace for better debugging if available
     if (error.stack) {
       logger.error(`Stack trace: ${error.stack}`);
     }
-    // Re-throw the error to signal failure to the caller
-    throw error;
+    throw error; // Re-throw the error
   }
 }
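
For reference, a minimal usage sketch of the 0.3.1 signatures shown in this diff. Only the function signatures come from the diff; the 'portapack' import path, the example URL, output path, timeout, and User-Agent string are assumptions for illustration.

// Usage sketch (assumptions: import path and all example values are illustrative).
import { fetchAndPackWebPage, recursivelyBundleSite } from 'portapack';

async function example(): Promise<void> {
  // Single page: timeout still defaults to DEFAULT_PAGE_TIMEOUT (30000 ms),
  // and 0.3.1 adds an optional userAgent parameter.
  const single = await fetchAndPackWebPage(
    'https://example.com',   // url
    undefined,               // logger (optional)
    45000,                   // timeout in ms
    'example-crawler/1.0'    // userAgent (new in 0.3.1)
  );
  console.log(`Fetched ${single.metadata.outputSize} bytes in ${single.metadata.buildTimeMs} ms`);

  // Recursive crawl + bundle: 0.3.1 adds an optional loggerInstance parameter,
  // so a caller can inject its own Logger instead of one being created internally.
  const { pages, html } = await recursivelyBundleSite(
    'https://example.com',
    './site-bundle.html',
    2                        // maxDepth
    // , myLogger            // loggerInstance (new in 0.3.1)
  );
  console.log(`Bundled ${pages} pages (${html.length} characters)`);
}

example().catch((err) => {
  console.error('Example failed:', err);
  process.exit(1);
});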