portapack 0.2.1 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,6 +1,129 @@
1
- // src/core/parser.ts
1
+ // src/core/web-fetcher.ts
2
+ import * as puppeteer from "puppeteer";
3
+ import * as fs2 from "fs/promises";
4
+
5
+ // src/types.ts
6
+ var LogLevel = /* @__PURE__ */ ((LogLevel2) => {
7
+ LogLevel2[LogLevel2["NONE"] = 0] = "NONE";
8
+ LogLevel2[LogLevel2["ERROR"] = 1] = "ERROR";
9
+ LogLevel2[LogLevel2["WARN"] = 2] = "WARN";
10
+ LogLevel2[LogLevel2["INFO"] = 3] = "INFO";
11
+ LogLevel2[LogLevel2["DEBUG"] = 4] = "DEBUG";
12
+ return LogLevel2;
13
+ })(LogLevel || {});
14
+
15
+ // src/utils/logger.ts
16
+ var Logger = class _Logger {
17
+ /** The current minimum log level required for a message to be output. */
18
+ level;
19
+ /**
20
+ * Creates a new Logger instance.
21
+ * Defaults to LogLevel.INFO if no level is provided.
22
+ *
23
+ * @param {LogLevel} [level=LogLevel.INFO] - The initial log level for this logger instance.
24
+ * Must be one of the values from the LogLevel enum.
25
+ */
26
+ constructor(level = 3 /* INFO */) {
27
+ this.level = level !== void 0 && LogLevel[level] !== void 0 ? level : 3 /* INFO */;
28
+ }
29
+ /**
30
+ * Updates the logger's current level. Messages below this level will be suppressed.
31
+ *
32
+ * @param {LogLevel} level - The new log level to set. Must be a LogLevel enum member.
33
+ */
34
+ setLevel(level) {
35
+ this.level = level;
36
+ }
37
+ /**
38
+ * Logs a debug message if the current log level is DEBUG or higher.
39
+ *
40
+ * @param {string} message - The debug message string.
41
+ */
42
+ debug(message) {
43
+ if (this.level >= 4 /* DEBUG */) {
44
+ console.debug(`[DEBUG] ${message}`);
45
+ }
46
+ }
47
+ /**
48
+ * Logs an informational message if the current log level is INFO or higher.
49
+ *
50
+ * @param {string} message - The informational message string.
51
+ */
52
+ info(message) {
53
+ if (this.level >= 3 /* INFO */) {
54
+ console.info(`[INFO] ${message}`);
55
+ }
56
+ }
57
+ /**
58
+ * Logs a warning message if the current log level is WARN or higher.
59
+ *
60
+ * @param {string} message - The warning message string.
61
+ */
62
+ warn(message) {
63
+ if (this.level >= 2 /* WARN */) {
64
+ console.warn(`[WARN] ${message}`);
65
+ }
66
+ }
67
+ /**
68
+ * Logs an error message if the current log level is ERROR or higher.
69
+ *
70
+ * @param {string} message - The error message string.
71
+ */
72
+ error(message) {
73
+ if (this.level >= 1 /* ERROR */) {
74
+ console.error(`[ERROR] ${message}`);
75
+ }
76
+ }
77
+ /**
78
+ * Static factory method to create a Logger instance based on a simple boolean `verbose` flag.
79
+ *
80
+ * @static
81
+ * @param {{ verbose?: boolean }} [options={}] - An object potentially containing a `verbose` flag.
82
+ * @returns {Logger} A new Logger instance set to LogLevel.DEBUG if options.verbose is true,
83
+ * otherwise set to LogLevel.INFO.
84
+ */
85
+ static fromVerboseFlag(options = {}) {
86
+ return new _Logger(options.verbose ? 4 /* DEBUG */ : 3 /* INFO */);
87
+ }
88
+ /**
89
+ * Static factory method to create a Logger instance based on a LogLevel string name.
90
+ * Useful for creating a logger from config files or environments variables.
91
+ *
92
+ * @static
93
+ * @param {string | undefined} levelName - The name of the log level (e.g., 'debug', 'info', 'warn', 'error', 'silent'/'none'). Case-insensitive.
94
+ * @param {LogLevel} [defaultLevel=LogLevel.INFO] - The level to use if levelName is invalid or undefined.
95
+ * @returns {Logger} A new Logger instance set to the corresponding LogLevel.
96
+ */
97
+ static fromLevelName(levelName, defaultLevel = 3 /* INFO */) {
98
+ if (!levelName) {
99
+ return new _Logger(defaultLevel);
100
+ }
101
+ switch (levelName.toLowerCase()) {
102
+ // Return enum members
103
+ case "debug":
104
+ return new _Logger(4 /* DEBUG */);
105
+ case "info":
106
+ return new _Logger(3 /* INFO */);
107
+ case "warn":
108
+ return new _Logger(2 /* WARN */);
109
+ case "error":
110
+ return new _Logger(1 /* ERROR */);
111
+ case "silent":
112
+ case "none":
113
+ return new _Logger(0 /* NONE */);
114
+ default:
115
+ console.warn(`[Logger] Invalid log level name "${levelName}". Defaulting to ${LogLevel[defaultLevel]}.`);
116
+ return new _Logger(defaultLevel);
117
+ }
118
+ }
119
+ };
120
+
121
+ // src/core/extractor.ts
2
122
  import { readFile } from "fs/promises";
3
- import * as cheerio from "cheerio";
123
+ import * as fs from "fs";
124
+ import path2 from "path";
125
+ import { fileURLToPath, URL as URL2 } from "url";
126
+ import * as axiosNs from "axios";
4
127
 
5
128
  // src/utils/mime.ts
6
129
  import path from "path";
@@ -58,76 +181,7 @@ function guessMimeType(urlOrPath) {
58
181
  return MIME_MAP[ext] || DEFAULT_MIME_TYPE;
59
182
  }
60
183
 
61
- // src/core/parser.ts
62
- async function parseHTML(entryFilePath, logger) {
63
- logger?.debug(`Parsing HTML file: ${entryFilePath}`);
64
- let htmlContent;
65
- try {
66
- htmlContent = await readFile(entryFilePath, "utf-8");
67
- logger?.debug(`Successfully read HTML file (${Buffer.byteLength(htmlContent)} bytes).`);
68
- } catch (err) {
69
- logger?.error(`Failed to read HTML file "${entryFilePath}": ${err.message}`);
70
- throw new Error(`Could not read input HTML file: ${entryFilePath}`, { cause: err });
71
- }
72
- const $ = cheerio.load(htmlContent);
73
- const assets = [];
74
- const addedUrls = /* @__PURE__ */ new Set();
75
- const addAsset = (url, forcedType) => {
76
- if (!url || url.trim() === "" || url.startsWith("data:")) {
77
- return;
78
- }
79
- if (!addedUrls.has(url)) {
80
- addedUrls.add(url);
81
- const mimeInfo = guessMimeType(url);
82
- const type = forcedType ?? mimeInfo.assetType;
83
- assets.push({ type, url });
84
- logger?.debug(`Discovered asset: Type='${type}', URL='${url}'`);
85
- } else {
86
- logger?.debug(`Skipping duplicate asset URL: ${url}`);
87
- }
88
- };
89
- logger?.debug("Extracting assets from HTML tags...");
90
- $('link[rel="stylesheet"][href]').each((_, el) => {
91
- addAsset($(el).attr("href"), "css");
92
- });
93
- $("script[src]").each((_, el) => {
94
- addAsset($(el).attr("src"), "js");
95
- });
96
- $("img[src]").each((_, el) => addAsset($(el).attr("src"), "image"));
97
- $('input[type="image"][src]').each((_, el) => addAsset($(el).attr("src"), "image"));
98
- $("img[srcset], picture source[srcset]").each((_, el) => {
99
- const srcset = $(el).attr("srcset");
100
- srcset?.split(",").forEach((entry) => {
101
- const [url] = entry.trim().split(/\s+/);
102
- addAsset(url, "image");
103
- });
104
- });
105
- $("video[src]").each((_, el) => addAsset($(el).attr("src"), "video"));
106
- $("video[poster]").each((_, el) => addAsset($(el).attr("poster"), "image"));
107
- $("audio[src]").each((_, el) => addAsset($(el).attr("src"), "audio"));
108
- $("video > source[src]").each((_, el) => addAsset($(el).attr("src"), "video"));
109
- $("audio > source[src]").each((_, el) => addAsset($(el).attr("src"), "audio"));
110
- $("link[href]").filter((_, el) => {
111
- const rel = $(el).attr("rel")?.toLowerCase() ?? "";
112
- return ["icon", "shortcut icon", "apple-touch-icon", "manifest"].includes(rel);
113
- }).each((_, el) => {
114
- const rel = $(el).attr("rel")?.toLowerCase() ?? "";
115
- const isIcon = ["icon", "shortcut icon", "apple-touch-icon"].includes(rel);
116
- addAsset($(el).attr("href"), isIcon ? "image" : void 0);
117
- });
118
- $('link[rel="preload"][as="font"][href]').each((_, el) => {
119
- addAsset($(el).attr("href"), "font");
120
- });
121
- logger?.info(`HTML parsing complete. Discovered ${assets.length} unique asset links.`);
122
- return { htmlContent, assets };
123
- }
124
-
125
184
  // src/core/extractor.ts
126
- import { readFile as readFile2 } from "fs/promises";
127
- import * as fs from "fs";
128
- import path2 from "path";
129
- import { fileURLToPath, URL as URL2 } from "url";
130
- import * as axios from "axios";
131
185
  var TEXT_ASSET_TYPES = /* @__PURE__ */ new Set(["css", "js"]);
132
186
  var BINARY_ASSET_TYPES = /* @__PURE__ */ new Set(["image", "font", "video", "audio"]);
133
187
  var MAX_ASSET_EXTRACTION_ITERATIONS = 1e3;
@@ -158,44 +212,35 @@ function determineBaseUrl(inputPathOrUrl, logger) {
158
212
  logger?.warn(`Input "${inputPathOrUrl}" looks like a URL but uses an unsupported protocol. Cannot determine base URL.`);
159
213
  return void 0;
160
214
  } else {
161
- let absolutePath;
215
+ let resourcePath;
216
+ let isInputLikelyDirectory = false;
162
217
  if (inputPathOrUrl.startsWith("file:")) {
163
- try {
164
- absolutePath = fileURLToPath(inputPathOrUrl);
165
- } catch (e) {
166
- logger?.error(`\u{1F480} Failed to convert file URL "${inputPathOrUrl}" to path: ${e.message}`);
167
- return void 0;
168
- }
218
+ resourcePath = fileURLToPath(inputPathOrUrl);
219
+ isInputLikelyDirectory = inputPathOrUrl.endsWith("/");
169
220
  } else {
170
- absolutePath = path2.resolve(inputPathOrUrl);
171
- }
172
- let isDirectory = false;
173
- try {
174
- isDirectory = fs.statSync(absolutePath).isDirectory();
175
- } catch (statError) {
176
- if (statError instanceof Error && statError.code === "ENOENT") {
177
- logger?.debug(`Path "${absolutePath}" not found. Assuming input represents a file, using its parent directory as base.`);
178
- } else {
179
- logger?.warn(`Could not stat local path "${absolutePath}" during base URL determination: ${statError instanceof Error ? statError.message : String(statError)}. Assuming input represents a file.`);
221
+ resourcePath = path2.resolve(inputPathOrUrl);
222
+ try {
223
+ isInputLikelyDirectory = fs.statSync(resourcePath).isDirectory();
224
+ } catch {
225
+ isInputLikelyDirectory = false;
180
226
  }
181
- isDirectory = false;
182
227
  }
183
- const dirPath = isDirectory ? absolutePath : path2.dirname(absolutePath);
184
- let normalizedPathForURL = dirPath.replace(/\\/g, "/");
228
+ const baseDirPath = isInputLikelyDirectory ? resourcePath : path2.dirname(resourcePath);
229
+ let normalizedPathForURL = baseDirPath.replace(/\\/g, "/");
185
230
  if (/^[A-Z]:\//i.test(normalizedPathForURL) && !normalizedPathForURL.startsWith("/")) {
186
231
  normalizedPathForURL = "/" + normalizedPathForURL;
187
232
  }
188
- const fileUrl = new URL2("file://" + normalizedPathForURL);
189
- let fileUrlString = fileUrl.href;
190
- if (!fileUrlString.endsWith("/")) {
191
- fileUrlString += "/";
233
+ if (!normalizedPathForURL.endsWith("/")) {
234
+ normalizedPathForURL += "/";
192
235
  }
193
- logger?.debug(`Determined local base URL: ${fileUrlString} (from: ${inputPathOrUrl}, resolved dir: ${dirPath}, isDir: ${isDirectory})`);
236
+ const fileUrl = new URL2("file://" + normalizedPathForURL);
237
+ const fileUrlString = fileUrl.href;
238
+ logger?.debug(`Determined base URL: ${fileUrlString} (from: ${inputPathOrUrl}, resolved base dir: ${baseDirPath})`);
194
239
  return fileUrlString;
195
240
  }
196
241
  } catch (error) {
197
242
  const message = error instanceof Error ? error.message : String(error);
198
- logger?.error(`\u{1F480} Failed to determine base URL for "${inputPathOrUrl}": ${message}${error instanceof Error ? ` - Stack: ${error.stack}` : ""}`);
243
+ logger?.error(`\u{1F480} Failed to determine base URL for "${inputPathOrUrl}": ${message}${error instanceof Error && error.stack ? ` - Stack: ${error.stack}` : ""}`);
199
244
  return void 0;
200
245
  }
201
246
  }
@@ -216,6 +261,10 @@ function resolveAssetUrl(assetUrl, baseContextUrl, logger) {
216
261
  }
217
262
  try {
218
263
  const resolved = new URL2(resolvableUrl, baseContextUrl);
264
+ if (!["http:", "https:", "file:"].includes(resolved.protocol)) {
265
+ logger?.debug(`Skipping asset with unsupported protocol: ${resolved.href}`);
266
+ return null;
267
+ }
219
268
  return resolved;
220
269
  } catch (error) {
221
270
  const message = error instanceof Error ? error.message : String(error);
@@ -228,35 +277,15 @@ function resolveAssetUrl(assetUrl, baseContextUrl, logger) {
228
277
  }
229
278
  }
230
279
  function resolveCssRelativeUrl(relativeUrl, cssBaseContextUrl, logger) {
231
- if (!relativeUrl || relativeUrl.startsWith("data:")) {
280
+ if (!relativeUrl || relativeUrl.startsWith("data:") || relativeUrl.startsWith("#")) {
232
281
  return null;
233
282
  }
234
283
  try {
235
- if (cssBaseContextUrl.startsWith("file:")) {
236
- const basePath = fileURLToPath(cssBaseContextUrl);
237
- let cssDir;
238
- try {
239
- const stat = fs.statSync(basePath);
240
- if (stat.isDirectory()) {
241
- cssDir = basePath;
242
- } else {
243
- cssDir = path2.dirname(basePath);
244
- }
245
- } catch {
246
- cssDir = path2.dirname(basePath);
247
- }
248
- let resolvedPath = path2.resolve(cssDir, relativeUrl);
249
- resolvedPath = resolvedPath.replace(/\\/g, "/");
250
- if (/^[A-Z]:/i.test(resolvedPath) && !resolvedPath.startsWith("/")) {
251
- resolvedPath = "/" + resolvedPath;
252
- }
253
- return `file://${resolvedPath}`;
254
- } else {
255
- return new URL2(relativeUrl, cssBaseContextUrl).href;
256
- }
284
+ const resolvedUrl = new URL2(relativeUrl, cssBaseContextUrl);
285
+ return resolvedUrl.href;
257
286
  } catch (error) {
258
287
  logger?.warn(
259
- `Failed to resolve CSS URL: "${relativeUrl}" against "${cssBaseContextUrl}": ${String(error)}`
288
+ `Failed to resolve CSS URL: "${relativeUrl}" relative to "${cssBaseContextUrl}": ${String(error)}`
260
289
  );
261
290
  return null;
262
291
  }
@@ -266,11 +295,13 @@ async function fetchAsset(resolvedUrl, logger, timeout = 1e4) {
266
295
  const protocol = resolvedUrl.protocol;
267
296
  try {
268
297
  if (protocol === "http:" || protocol === "https:") {
269
- const response = await axios.default.get(resolvedUrl.href, {
298
+ const response = await axiosNs.default.get(resolvedUrl.href, {
270
299
  responseType: "arraybuffer",
300
+ // Fetch as binary data
271
301
  timeout
302
+ // Apply network timeout
272
303
  });
273
- logger?.debug(`Workspaceed remote asset ${resolvedUrl.href} (Status: ${response.status}, Type: ${response.headers["content-type"] || "N/A"}, Size: ${response.data.byteLength} bytes)`);
304
+ logger?.debug(`Workspaceed remote asset ${resolvedUrl.href} (Status: ${response.status}, Type: ${response.headers["content-type"] || "N/A"}, Size: ${response.data?.byteLength ?? 0} bytes)`);
274
305
  return Buffer.from(response.data);
275
306
  } else if (protocol === "file:") {
276
307
  let filePath;
@@ -280,7 +311,8 @@ async function fetchAsset(resolvedUrl, logger, timeout = 1e4) {
280
311
  logger?.error(`Could not convert file URL to path: ${resolvedUrl.href}. Error: ${e.message}`);
281
312
  return null;
282
313
  }
283
- const data = await readFile2(filePath);
314
+ const normalizedForLog = path2.normalize(filePath);
315
+ const data = await readFile(filePath);
284
316
  logger?.debug(`Read local file ${filePath} (${data.byteLength} bytes)`);
285
317
  return data;
286
318
  } else {
@@ -288,27 +320,26 @@ async function fetchAsset(resolvedUrl, logger, timeout = 1e4) {
288
320
  return null;
289
321
  }
290
322
  } catch (error) {
291
- if ((protocol === "http:" || protocol === "https:") && axios.default.isAxiosError(error)) {
292
- const status = error.response?.status ?? "N/A";
293
- const statusText = error.response?.statusText ?? "Error";
294
- const code = error.code ?? "N/A";
295
- const message = error.message;
296
- const logMessage = `\u26A0\uFE0F Failed to fetch remote asset ${resolvedUrl.href}: Status ${status} - ${statusText}. Code: ${code}, Message: ${message}`;
323
+ const failedId = protocol === "file:" ? path2.normalize(fileURLToPath(resolvedUrl)) : resolvedUrl.href;
324
+ if ((protocol === "http:" || protocol === "https:") && error?.isAxiosError === true) {
325
+ const axiosError = error;
326
+ const status = axiosError.response?.status ?? "N/A";
327
+ const code = axiosError.code ?? "N/A";
328
+ const logMessage = `\u26A0\uFE0F Failed to fetch remote asset ${resolvedUrl.href}: ${axiosError.message} (Code: ${code})`;
297
329
  logger?.warn(logMessage);
298
- } else if (protocol === "file:") {
330
+ } else if (protocol === "file:" && error instanceof Error) {
299
331
  let failedPath = resolvedUrl.href;
300
332
  try {
301
333
  failedPath = fileURLToPath(resolvedUrl);
302
334
  } catch {
303
335
  }
304
- if (error instanceof Error && error.code === "ENOENT") {
336
+ failedPath = path2.normalize(failedPath);
337
+ if (error.code === "ENOENT") {
305
338
  logger?.warn(`\u26A0\uFE0F File not found (ENOENT) for asset: ${failedPath}.`);
306
- } else if (error instanceof Error && error.code === "EACCES") {
339
+ } else if (error.code === "EACCES") {
307
340
  logger?.warn(`\u26A0\uFE0F Permission denied (EACCES) reading asset: ${failedPath}.`);
308
- } else if (error instanceof Error) {
309
- logger?.warn(`\u26A0\uFE0F Failed to read local asset ${failedPath}: ${error.message}`);
310
341
  } else {
311
- logger?.warn(`\u26A0\uFE0F An unknown error occurred while reading local asset ${failedPath}: ${String(error)}`);
342
+ logger?.warn(`\u26A0\uFE0F Failed to read local asset ${failedPath}: ${error.message}`);
312
343
  }
313
344
  } else if (error instanceof Error) {
314
345
  logger?.warn(`\u26A0\uFE0F An unexpected error occurred processing asset ${resolvedUrl.href}: ${error.message}`);
@@ -324,7 +355,7 @@ function extractUrlsFromCSS(cssContent, cssBaseContextUrl, logger) {
324
355
  const urlRegex = /url\(\s*(['"]?)(.*?)\1\s*\)/gi;
325
356
  const importRegex = /@import\s+(?:url\(\s*(['"]?)(.*?)\1\s*\)|(['"])(.*?)\3)\s*;/gi;
326
357
  const processFoundUrl = (rawUrl, ruleType) => {
327
- if (!rawUrl || rawUrl.trim() === "" || rawUrl.startsWith("data:")) return;
358
+ if (!rawUrl || rawUrl.trim() === "" || rawUrl.startsWith("data:") || rawUrl.startsWith("#")) return;
328
359
  const resolvedUrl = resolveCssRelativeUrl(rawUrl, cssBaseContextUrl, logger);
329
360
  if (resolvedUrl && !processedInThisParse.has(resolvedUrl)) {
330
361
  processedInThisParse.add(resolvedUrl);
@@ -332,14 +363,13 @@ function extractUrlsFromCSS(cssContent, cssBaseContextUrl, logger) {
332
363
  newlyDiscovered.push({
333
364
  type: assetType,
334
365
  url: resolvedUrl,
335
- // The resolved URL string
366
+ // Store the resolved absolute URL string
336
367
  content: void 0
368
+ // Content will be fetched later if needed
337
369
  });
338
370
  logger?.debug(`Discovered nested ${assetType} asset (${ruleType}) in CSS ${cssBaseContextUrl}: ${resolvedUrl}`);
339
371
  }
340
372
  };
341
- urlRegex.lastIndex = 0;
342
- importRegex.lastIndex = 0;
343
373
  let match;
344
374
  while ((match = urlRegex.exec(cssContent)) !== null) {
345
375
  processFoundUrl(match[2], "url()");
@@ -355,31 +385,35 @@ async function extractAssets(parsed, embedAssets = true, inputPathOrUrl, logger)
355
385
  const initialAssets = parsed.assets || [];
356
386
  const finalAssetsMap = /* @__PURE__ */ new Map();
357
387
  let assetsToProcess = [];
388
+ const processedOrQueuedUrls = /* @__PURE__ */ new Set();
358
389
  const htmlBaseContextUrl = determineBaseUrl(inputPathOrUrl || "", logger);
359
390
  if (!htmlBaseContextUrl && initialAssets.some((a) => !/^[a-z]+:/i.test(a.url) && !a.url.startsWith("data:") && !a.url.startsWith("#") && !a.url.startsWith("/"))) {
360
391
  logger?.warn("\u{1F6A8} No valid base path/URL determined for the HTML source! Resolution of relative asset paths from HTML may fail.");
361
392
  } else if (htmlBaseContextUrl) {
362
393
  logger?.debug(`Using HTML base context URL: ${htmlBaseContextUrl}`);
363
394
  }
364
- const processedOrQueuedUrls = /* @__PURE__ */ new Set();
365
395
  logger?.debug(`Queueing ${initialAssets.length} initial assets parsed from HTML...`);
366
396
  for (const asset of initialAssets) {
367
397
  const resolvedUrlObj = resolveAssetUrl(asset.url, htmlBaseContextUrl, logger);
368
- const urlToQueue = resolvedUrlObj ? resolvedUrlObj.href : asset.url;
369
- if (!urlToQueue.startsWith("data:") && !processedOrQueuedUrls.has(urlToQueue)) {
398
+ if (!resolvedUrlObj) {
399
+ logger?.debug(` -> Skipping initial asset with unresolvable/ignorable URL: ${asset.url}`);
400
+ continue;
401
+ }
402
+ const urlToQueue = resolvedUrlObj.href;
403
+ if (!processedOrQueuedUrls.has(urlToQueue)) {
370
404
  processedOrQueuedUrls.add(urlToQueue);
371
405
  const { assetType: guessedType } = guessMimeType(urlToQueue);
372
406
  const initialType = asset.type ?? guessedType;
373
407
  assetsToProcess.push({
374
408
  url: urlToQueue,
409
+ // Use the resolved URL
375
410
  type: initialType,
376
411
  content: void 0
412
+ // Content is initially undefined
377
413
  });
378
414
  logger?.debug(` -> Queued initial asset: ${urlToQueue} (Original raw: ${asset.url})`);
379
- } else if (urlToQueue.startsWith("data:")) {
380
- logger?.debug(` -> Skipping data URI: ${urlToQueue.substring(0, 50)}...`);
381
415
  } else {
382
- logger?.debug(` -> Skipping already queued initial asset: ${urlToQueue}`);
416
+ logger?.debug(` -> Skipping already processed/queued initial asset: ${urlToQueue}`);
383
417
  }
384
418
  }
385
419
  let iterationCount = 0;
@@ -450,7 +484,7 @@ async function extractAssets(parsed, embedAssets = true, inputPathOrUrl, logger)
450
484
  cssContentForParsing = textContent;
451
485
  }
452
486
  } else {
453
- logger?.warn(`Could not decode ${asset.type} ${asset.url} as valid UTF-8 text.${embedAssets ? " Falling back to base64 data URI." : ""}`);
487
+ logger?.warn(`Could not decode ${asset.type} asset ${asset.url} as valid UTF-8 text.${embedAssets ? " Falling back to base64 data URI." : ""}`);
454
488
  cssContentForParsing = void 0;
455
489
  if (embedAssets) {
456
490
  finalContent = `data:${effectiveMime};base64,${assetContentBuffer.toString("base64")}`;
@@ -497,6 +531,7 @@ async function extractAssets(parsed, embedAssets = true, inputPathOrUrl, logger)
497
531
  const newlyDiscoveredAssets = extractUrlsFromCSS(
498
532
  cssContentForParsing,
499
533
  cssBaseContextUrl,
534
+ // Use the CSS file's own URL as the base
500
535
  logger
501
536
  );
502
537
  if (newlyDiscoveredAssets.length > 0) {
@@ -517,7 +552,7 @@ async function extractAssets(parsed, embedAssets = true, inputPathOrUrl, logger)
517
552
  }
518
553
  }
519
554
  }
520
- const finalIterationCount = iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS ? "MAX+" : iterationCount;
555
+ const finalIterationCount = iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS ? `${MAX_ASSET_EXTRACTION_ITERATIONS}+ (limit hit)` : iterationCount;
521
556
  logger?.info(`\u2705 Asset extraction COMPLETE! Found ${finalAssetsMap.size} unique assets in ${finalIterationCount} iterations.`);
522
557
  return {
523
558
  htmlContent: parsed.htmlContent,
@@ -671,7 +706,7 @@ async function minifyAssets(parsed, options = {}, logger) {
671
706
  }
672
707
 
673
708
  // src/core/packer.ts
674
- import * as cheerio2 from "cheerio";
709
+ import * as cheerio from "cheerio";
675
710
  function escapeScriptContent(code) {
676
711
  return code.replace(/<\/(script)/gi, "<\\/$1");
677
712
  }
@@ -784,7 +819,7 @@ function packHTML(parsed, logger) {
784
819
  return '<!DOCTYPE html><html><head><base href="./"></head><body></body></html>';
785
820
  }
786
821
  logger?.debug("Loading HTML content into Cheerio for packing...");
787
- const $ = cheerio2.load(htmlContent);
822
+ const $ = cheerio.load(htmlContent);
788
823
  logger?.debug("Ensuring <base> tag exists...");
789
824
  ensureBaseTag($, logger);
790
825
  logger?.debug("Starting asset inlining...");
@@ -795,126 +830,6 @@ function packHTML(parsed, logger) {
795
830
  return finalHtml;
796
831
  }
797
832
 
798
- // src/core/web-fetcher.ts
799
- import * as puppeteer from "puppeteer";
800
- import * as fs2 from "fs/promises";
801
-
802
- // src/types.ts
803
- var LogLevel = /* @__PURE__ */ ((LogLevel2) => {
804
- LogLevel2[LogLevel2["NONE"] = 0] = "NONE";
805
- LogLevel2[LogLevel2["ERROR"] = 1] = "ERROR";
806
- LogLevel2[LogLevel2["WARN"] = 2] = "WARN";
807
- LogLevel2[LogLevel2["INFO"] = 3] = "INFO";
808
- LogLevel2[LogLevel2["DEBUG"] = 4] = "DEBUG";
809
- return LogLevel2;
810
- })(LogLevel || {});
811
-
812
- // src/utils/logger.ts
813
- var Logger = class _Logger {
814
- /** The current minimum log level required for a message to be output. */
815
- level;
816
- /**
817
- * Creates a new Logger instance.
818
- * Defaults to LogLevel.INFO if no level is provided.
819
- *
820
- * @param {LogLevel} [level=LogLevel.INFO] - The initial log level for this logger instance.
821
- * Must be one of the values from the LogLevel enum.
822
- */
823
- constructor(level = 3 /* INFO */) {
824
- this.level = level !== void 0 && LogLevel[level] !== void 0 ? level : 3 /* INFO */;
825
- }
826
- /**
827
- * Updates the logger's current level. Messages below this level will be suppressed.
828
- *
829
- * @param {LogLevel} level - The new log level to set. Must be a LogLevel enum member.
830
- */
831
- setLevel(level) {
832
- this.level = level;
833
- }
834
- /**
835
- * Logs a debug message if the current log level is DEBUG or higher.
836
- *
837
- * @param {string} message - The debug message string.
838
- */
839
- debug(message) {
840
- if (this.level >= 4 /* DEBUG */) {
841
- console.debug(`[DEBUG] ${message}`);
842
- }
843
- }
844
- /**
845
- * Logs an informational message if the current log level is INFO or higher.
846
- *
847
- * @param {string} message - The informational message string.
848
- */
849
- info(message) {
850
- if (this.level >= 3 /* INFO */) {
851
- console.info(`[INFO] ${message}`);
852
- }
853
- }
854
- /**
855
- * Logs a warning message if the current log level is WARN or higher.
856
- *
857
- * @param {string} message - The warning message string.
858
- */
859
- warn(message) {
860
- if (this.level >= 2 /* WARN */) {
861
- console.warn(`[WARN] ${message}`);
862
- }
863
- }
864
- /**
865
- * Logs an error message if the current log level is ERROR or higher.
866
- *
867
- * @param {string} message - The error message string.
868
- */
869
- error(message) {
870
- if (this.level >= 1 /* ERROR */) {
871
- console.error(`[ERROR] ${message}`);
872
- }
873
- }
874
- /**
875
- * Static factory method to create a Logger instance based on a simple boolean `verbose` flag.
876
- *
877
- * @static
878
- * @param {{ verbose?: boolean }} [options={}] - An object potentially containing a `verbose` flag.
879
- * @returns {Logger} A new Logger instance set to LogLevel.DEBUG if options.verbose is true,
880
- * otherwise set to LogLevel.INFO.
881
- */
882
- static fromVerboseFlag(options = {}) {
883
- return new _Logger(options.verbose ? 4 /* DEBUG */ : 3 /* INFO */);
884
- }
885
- /**
886
- * Static factory method to create a Logger instance based on a LogLevel string name.
887
- * Useful for creating a logger from config files or environments variables.
888
- *
889
- * @static
890
- * @param {string | undefined} levelName - The name of the log level (e.g., 'debug', 'info', 'warn', 'error', 'silent'/'none'). Case-insensitive.
891
- * @param {LogLevel} [defaultLevel=LogLevel.INFO] - The level to use if levelName is invalid or undefined.
892
- * @returns {Logger} A new Logger instance set to the corresponding LogLevel.
893
- */
894
- static fromLevelName(levelName, defaultLevel = 3 /* INFO */) {
895
- if (!levelName) {
896
- return new _Logger(defaultLevel);
897
- }
898
- switch (levelName.toLowerCase()) {
899
- // Return enum members
900
- case "debug":
901
- return new _Logger(4 /* DEBUG */);
902
- case "info":
903
- return new _Logger(3 /* INFO */);
904
- case "warn":
905
- return new _Logger(2 /* WARN */);
906
- case "error":
907
- return new _Logger(1 /* ERROR */);
908
- case "silent":
909
- case "none":
910
- return new _Logger(0 /* NONE */);
911
- default:
912
- console.warn(`[Logger] Invalid log level name "${levelName}". Defaulting to ${LogLevel[defaultLevel]}.`);
913
- return new _Logger(defaultLevel);
914
- }
915
- }
916
- };
917
-
918
833
  // src/utils/slugify.ts
919
834
  function slugify(url) {
920
835
  if (!url || typeof url !== "string") return "index";
@@ -946,9 +861,11 @@ function bundleMultiPageHTML(pages, logger) {
946
861
  throw new Error(errorMsg);
947
862
  }
948
863
  logger?.info(`Bundling ${pages.length} pages into a multi-page HTML document.`);
864
+ let pageIndex = 0;
949
865
  const validPages = pages.filter((page) => {
950
866
  const isValid = page && typeof page === "object" && typeof page.url === "string" && typeof page.html === "string";
951
- if (!isValid) logger?.warn("Skipping invalid page entry");
867
+ if (!isValid) logger?.warn(`Skipping invalid page entry at index ${pageIndex}`);
868
+ pageIndex++;
952
869
  return isValid;
953
870
  });
954
871
  if (validPages.length === 0) {
@@ -958,70 +875,137 @@ function bundleMultiPageHTML(pages, logger) {
958
875
  }
959
876
  const slugMap = /* @__PURE__ */ new Map();
960
877
  const usedSlugs = /* @__PURE__ */ new Set();
878
+ let firstValidSlug = void 0;
879
+ let pageCounterForFallback = 1;
961
880
  for (const page of validPages) {
962
- const baseSlug = sanitizeSlug(page.url);
881
+ let baseSlug = sanitizeSlug(page.url);
882
+ const isRootIndex = page.url === "/" || page.url === "index.html" || page.url.endsWith("/index.html");
883
+ if (baseSlug === "index" && !isRootIndex) {
884
+ logger?.debug(`URL "${page.url}" sanitized to "index", attempting to find alternative slug.`);
885
+ const pathParts = page.url.replace(/\/$/, "").split("/").filter((p) => p && p.toLowerCase() !== "index.html" && p.toLowerCase() !== "index");
886
+ if (pathParts.length > 0) {
887
+ const lastPartSlug = sanitizeSlug(pathParts[pathParts.length - 1]);
888
+ if (lastPartSlug && lastPartSlug !== "index") {
889
+ baseSlug = lastPartSlug;
890
+ logger?.debug(`Using last path part slug "${baseSlug}" instead.`);
891
+ } else {
892
+ baseSlug = "page";
893
+ logger?.debug(`Last path part invalid ("${lastPartSlug}"), using fallback slug "page".`);
894
+ }
895
+ } else {
896
+ baseSlug = "page";
897
+ logger?.debug(`No valid path parts found, using fallback slug "page".`);
898
+ }
899
+ } else if (!baseSlug) {
900
+ if (isRootIndex) {
901
+ baseSlug = "index";
902
+ logger?.debug(`URL "${page.url}" sanitized to empty string, using "index" as it is a root index.`);
903
+ } else {
904
+ baseSlug = "page";
905
+ logger?.debug(`URL "${page.url}" sanitized to empty string, using fallback slug "page".`);
906
+ }
907
+ }
908
+ if (!baseSlug) {
909
+ baseSlug = `page-${pageCounterForFallback++}`;
910
+ logger?.warn(`Could not determine a valid base slug for "${page.url}", using generated fallback "${baseSlug}".`);
911
+ }
963
912
  let slug = baseSlug;
964
- let counter = 1;
913
+ let collisionCounter = 1;
914
+ const originalBaseSlugForLog = baseSlug;
965
915
  while (usedSlugs.has(slug)) {
966
- slug = `${baseSlug}-${counter++}`;
967
- logger?.warn(`Slug collision detected for "${page.url}". Using "${slug}" instead.`);
916
+ const newSlug = `${originalBaseSlugForLog}-${collisionCounter++}`;
917
+ logger?.warn(`Slug collision detected for "${page.url}" (intended slug: '${originalBaseSlugForLog}'). Using "${newSlug}" instead.`);
918
+ slug = newSlug;
968
919
  }
969
920
  usedSlugs.add(slug);
970
921
  slugMap.set(page.url, slug);
922
+ if (firstValidSlug === void 0) {
923
+ firstValidSlug = slug;
924
+ }
971
925
  }
972
- const defaultPageSlug = slugMap.get(validPages[0].url);
926
+ const defaultPageSlug = usedSlugs.has("index") ? "index" : firstValidSlug || "page";
973
927
  let output = `<!DOCTYPE html>
974
928
  <html lang="en">
975
929
  <head>
976
930
  <meta charset="UTF-8">
977
931
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
978
932
  <title>Multi-Page Bundle</title>
933
+ <style>
934
+ body { font-family: sans-serif; margin: 0; }
935
+ #main-nav { background-color: #f0f0f0; padding: 10px; border-bottom: 1px solid #ccc; }
936
+ #main-nav a { margin-right: 15px; text-decoration: none; color: #007bff; }
937
+ #main-nav a.active { font-weight: bold; text-decoration: underline; }
938
+ #page-container { padding: 20px; }
939
+ template { display: none; }
940
+ </style>
979
941
  </head>
980
942
  <body>
981
943
  <nav id="main-nav">
982
944
  ${validPages.map((p) => {
983
945
  const slug = slugMap.get(p.url);
984
- const label = p.url.split("/").pop()?.split(".")[0] || "Page";
946
+ const label = slug;
985
947
  return `<a href="#${slug}" data-page="${slug}">${label}</a>`;
986
- }).join("\n")}
948
+ }).join("\n ")}
987
949
  </nav>
988
950
  <div id="page-container"></div>
989
951
  ${validPages.map((p) => {
990
952
  const slug = slugMap.get(p.url);
991
953
  return `<template id="page-${slug}">${p.html}</template>`;
992
- }).join("\n")}
954
+ }).join("\n ")}
993
955
  <script id="router-script">
994
956
  document.addEventListener('DOMContentLoaded', function() {
957
+ const pageContainer = document.getElementById('page-container');
958
+ const navLinks = document.querySelectorAll('#main-nav a');
959
+
995
960
  function navigateTo(slug) {
996
961
  const template = document.getElementById('page-' + slug);
997
- const container = document.getElementById('page-container');
998
- if (!template || !container) return;
999
- container.innerHTML = '';
1000
- container.appendChild(template.content.cloneNode(true));
1001
- document.querySelectorAll('#main-nav a').forEach(link => {
1002
- if (link.getAttribute('data-page') === slug) link.classList.add('active');
1003
- else link.classList.remove('active');
962
+ if (!template || !pageContainer) {
963
+ console.warn('Navigation failed: Template or container not found for slug:', slug);
964
+ // Maybe try navigating to default page? Or just clear container?
965
+ if (pageContainer) pageContainer.innerHTML = '<p>Page not found.</p>';
966
+ return;
967
+ }
968
+ // Clear previous content and append new content
969
+ pageContainer.innerHTML = ''; // Clear reliably
970
+ pageContainer.appendChild(template.content.cloneNode(true));
971
+
972
+ // Update active link styling
973
+ navLinks.forEach(link => {
974
+ link.classList.toggle('active', link.getAttribute('data-page') === slug);
1004
975
  });
976
+
977
+ // Update URL hash without triggering hashchange if already correct
1005
978
  if (window.location.hash.substring(1) !== slug) {
1006
- history.pushState(null, '', '#' + slug);
979
+ // Use pushState for cleaner history
980
+ history.pushState({ slug: slug }, '', '#' + slug);
1007
981
  }
1008
982
  }
1009
983
 
1010
- window.addEventListener('hashchange', () => {
1011
- const slug = window.location.hash.substring(1);
1012
- if (document.getElementById('page-' + slug)) navigateTo(slug);
984
+ // Handle back/forward navigation
985
+ window.addEventListener('popstate', (event) => {
986
+ let slug = window.location.hash.substring(1);
987
+ // If popstate event has state use it, otherwise fallback to hash or default
988
+ if (event && event.state && event.state.slug) { // Check event exists
989
+ slug = event.state.slug;
990
+ }
991
+ // Ensure the target page exists before navigating, fallback to default slug
992
+ const targetSlug = document.getElementById('page-' + slug) ? slug : '${defaultPageSlug}';
993
+ navigateTo(targetSlug);
1013
994
  });
1014
995
 
1015
- document.querySelectorAll('#main-nav a').forEach(link => {
996
+ // Handle direct link clicks
997
+ navLinks.forEach(link => {
1016
998
  link.addEventListener('click', function(e) {
1017
999
  e.preventDefault();
1018
1000
  const slug = this.getAttribute('data-page');
1019
- navigateTo(slug);
1001
+ if (slug) navigateTo(slug);
1020
1002
  });
1021
1003
  });
1022
1004
 
1023
- const initial = window.location.hash.substring(1);
1024
- navigateTo(document.getElementById('page-' + initial) ? initial : '${defaultPageSlug}');
1005
+ // Initial page load
1006
+ const initialHash = window.location.hash.substring(1);
1007
+ const initialSlug = document.getElementById('page-' + initialHash) ? initialHash : '${defaultPageSlug}';
1008
+ navigateTo(initialSlug);
1025
1009
  });
1026
1010
  </script>
1027
1011
  </body>
@@ -1031,51 +1015,74 @@ function bundleMultiPageHTML(pages, logger) {
1031
1015
  }
1032
1016
 
1033
1017
  // src/core/web-fetcher.ts
1034
- async function fetchAndPackWebPage(url, logger, timeout = 3e4) {
1018
+ var PUPPETEER_LAUNCH_OPTIONS = {
1019
+ headless: true,
1020
+ args: [
1021
+ "--no-sandbox",
1022
+ // Often required in containerized environments
1023
+ "--disable-setuid-sandbox",
1024
+ "--disable-dev-shm-usage"
1025
+ // Recommended for Docker/CI
1026
+ ]
1027
+ };
1028
+ var DEFAULT_PAGE_TIMEOUT = 3e4;
1029
+ async function fetchAndPackWebPage(url, logger, timeout = DEFAULT_PAGE_TIMEOUT, userAgent) {
1035
1030
  let browser = null;
1036
1031
  const start = Date.now();
1037
- logger?.debug(`Initiating fetch for single page: ${url}`);
1032
+ logger?.info(`Initiating fetch for single page: ${url}`);
1038
1033
  try {
1039
- browser = await puppeteer.launch({ headless: true });
1040
- logger?.debug(`Browser launched for ${url}`);
1034
+ logger?.debug("Launching browser...");
1035
+ browser = await puppeteer.launch(PUPPETEER_LAUNCH_OPTIONS);
1036
+ logger?.debug(`Browser launched successfully (PID: ${browser.process()?.pid}).`);
1041
1037
  const page = await browser.newPage();
1042
- logger?.debug(`Page created for ${url}`);
1038
+ logger?.debug(`New page created for ${url}`);
1039
+ if (userAgent) {
1040
+ await page.setUserAgent(userAgent);
1041
+ logger?.debug(`User-Agent set to: "${userAgent}"`);
1042
+ }
1043
1043
  try {
1044
1044
  logger?.debug(`Navigating to ${url} with timeout ${timeout}ms`);
1045
1045
  await page.goto(url, { waitUntil: "networkidle2", timeout });
1046
1046
  logger?.debug(`Navigation successful for ${url}`);
1047
1047
  const html = await page.content();
1048
- logger?.debug(`Content retrieved for ${url}`);
1048
+ logger?.debug(`Content retrieved for ${url} (${Buffer.byteLength(html, "utf-8")} bytes)`);
1049
1049
  const metadata = {
1050
1050
  input: url,
1051
1051
  outputSize: Buffer.byteLength(html, "utf-8"),
1052
1052
  assetCount: 0,
1053
- // Basic fetch doesn't track assets
1053
+ // Basic fetch doesn't track assets processed by *this* tool
1054
1054
  buildTimeMs: Date.now() - start,
1055
1055
  errors: []
1056
1056
  // No errors if we reached this point
1057
1057
  };
1058
1058
  await page.close();
1059
1059
  logger?.debug(`Page closed for ${url}`);
1060
+ await browser.close();
1060
1061
  logger?.debug(`Browser closed for ${url}`);
1061
1062
  browser = null;
1062
1063
  return { html, metadata };
1063
1064
  } catch (pageError) {
1064
1065
  logger?.error(`Error during page processing for ${url}: ${pageError.message}`);
1065
- try {
1066
- await page.close();
1067
- } catch (closeErr) {
1068
- throw closeErr;
1066
+ if (page && !page.isClosed()) {
1067
+ try {
1068
+ await page.close();
1069
+ logger?.debug(`Page closed after error for ${url}`);
1070
+ } catch (closeErr) {
1071
+ logger?.error(`Failed to close page after error for ${url}: ${closeErr.message}`);
1072
+ }
1069
1073
  }
1070
1074
  throw pageError;
1071
1075
  }
1072
1076
  } catch (launchError) {
1073
- logger?.error(`Critical error during browser launch or page creation for ${url}: ${launchError.message}`);
1077
+ logger?.error(`Critical error during browser launch or page setup for ${url}: ${launchError.message}`);
1074
1078
  if (browser) {
1075
1079
  try {
1076
1080
  await browser.close();
1081
+ logger?.debug("Browser closed after launch/setup error.");
1077
1082
  } catch (closeErr) {
1083
+ logger?.warn(`Failed to close browser after launch/setup error: ${closeErr.message}`);
1078
1084
  }
1085
+ browser = null;
1079
1086
  }
1080
1087
  throw launchError;
1081
1088
  } finally {
@@ -1088,99 +1095,123 @@ async function fetchAndPackWebPage(url, logger, timeout = 3e4) {
1088
1095
  }
1089
1096
  }
1090
1097
  }
1091
- async function crawlWebsite(startUrl, maxDepth, logger) {
1098
+ async function crawlWebsite(startUrl, options) {
1099
+ const {
1100
+ maxDepth = 1,
1101
+ timeout = DEFAULT_PAGE_TIMEOUT,
1102
+ // include = ['**'], // TODO: Implement glob filtering
1103
+ // exclude = [],
1104
+ userAgent,
1105
+ logger
1106
+ } = options;
1092
1107
  logger?.info(`Starting crawl for ${startUrl} with maxDepth ${maxDepth}`);
1093
1108
  if (maxDepth <= 0) {
1094
- logger?.info("maxDepth is 0 or negative, no pages will be crawled.");
1109
+ logger?.warn("maxDepth is 0 or negative, no pages will be crawled.");
1095
1110
  return [];
1096
1111
  }
1097
- const browser = await puppeteer.launch({ headless: true });
1112
+ let browser = null;
1098
1113
  const visited = /* @__PURE__ */ new Set();
1099
1114
  const results = [];
1100
1115
  const queue = [];
1101
1116
  let startOrigin;
1102
1117
  try {
1103
- startOrigin = new URL(startUrl).origin;
1104
- } catch (e) {
1105
- logger?.error(`Invalid start URL: ${startUrl}. ${e.message}`);
1106
- await browser.close();
1107
- return [];
1108
- }
1109
- let normalizedStartUrl;
1110
- try {
1111
- const parsedStartUrl = new URL(startUrl);
1112
- parsedStartUrl.hash = "";
1113
- normalizedStartUrl = parsedStartUrl.href;
1114
- } catch (e) {
1115
- logger?.error(`Invalid start URL: ${startUrl}. ${e.message}`);
1116
- await browser.close();
1117
- return [];
1118
- }
1119
- visited.add(normalizedStartUrl);
1120
- queue.push({ url: normalizedStartUrl, depth: 1 });
1121
- logger?.debug(`Queued initial URL: ${normalizedStartUrl} (depth 1)`);
1122
- while (queue.length > 0) {
1123
- const { url, depth } = queue.shift();
1124
- logger?.info(`Processing: ${url} (depth ${depth})`);
1125
- let page = null;
1126
1118
  try {
1127
- page = await browser.newPage();
1128
- await page.setViewport({ width: 1280, height: 800 });
1129
- await page.goto(url, { waitUntil: "networkidle2", timeout: 3e4 });
1130
- const html = await page.content();
1131
- results.push({ url, html });
1132
- logger?.debug(`Successfully fetched content for ${url}`);
1133
- if (depth < maxDepth) {
1134
- logger?.debug(`Discovering links on ${url} (current depth ${depth}, maxDepth ${maxDepth})`);
1135
- const hrefs = await page.evaluate(
1136
- () => Array.from(document.querySelectorAll("a[href]"), (a) => a.getAttribute("href"))
1137
- );
1138
- logger?.debug(`Found ${hrefs.length} potential hrefs on ${url}`);
1139
- let linksAdded = 0;
1140
- for (const href of hrefs) {
1141
- if (!href) continue;
1142
- let absoluteUrl;
1143
- try {
1144
- const resolved = new URL(href, url);
1145
- resolved.hash = "";
1146
- absoluteUrl = resolved.href;
1147
- } catch (e) {
1148
- logger?.debug(`Ignoring invalid URL syntax: "${href}" on page ${url}`);
1149
- continue;
1150
- }
1151
- if (absoluteUrl.startsWith(startOrigin) && !visited.has(absoluteUrl)) {
1152
- visited.add(absoluteUrl);
1153
- queue.push({ url: absoluteUrl, depth: depth + 1 });
1154
- linksAdded++;
1155
- } else {
1119
+ startOrigin = new URL(startUrl).origin;
1120
+ } catch (e) {
1121
+ logger?.error(`Invalid start URL: ${startUrl}. ${e.message}`);
1122
+ throw new Error(`Invalid start URL: ${startUrl}`);
1123
+ }
1124
+ let normalizedStartUrl;
1125
+ try {
1126
+ const parsedStartUrl = new URL(startUrl);
1127
+ parsedStartUrl.hash = "";
1128
+ normalizedStartUrl = parsedStartUrl.href;
1129
+ } catch (e) {
1130
+ logger?.error(`Invalid start URL: ${startUrl}. ${e.message}`);
1131
+ throw new Error(`Invalid start URL: ${startUrl}`);
1132
+ }
1133
+ logger?.debug("Launching browser for crawl...");
1134
+ browser = await puppeteer.launch(PUPPETEER_LAUNCH_OPTIONS);
1135
+ logger?.debug(`Browser launched for crawl (PID: ${browser.process()?.pid}).`);
1136
+ visited.add(normalizedStartUrl);
1137
+ queue.push({ url: normalizedStartUrl, depth: 1 });
1138
+ logger?.debug(`Queued initial URL: ${normalizedStartUrl} (depth 1)`);
1139
+ while (queue.length > 0) {
1140
+ const { url, depth } = queue.shift();
1141
+ logger?.info(`Processing: ${url} (depth ${depth})`);
1142
+ let page = null;
1143
+ try {
1144
+ page = await browser.newPage();
1145
+ if (userAgent) {
1146
+ await page.setUserAgent(userAgent);
1147
+ }
1148
+ await page.goto(url, { waitUntil: "networkidle2", timeout });
1149
+ const html = await page.content();
1150
+ results.push({ url, html });
1151
+ logger?.debug(`Successfully fetched content for ${url}`);
1152
+ if (depth < maxDepth) {
1153
+ logger?.debug(`Discovering links on ${url} (depth ${depth}/${maxDepth})`);
1154
+ const hrefs = await page.evaluate(
1155
+ () => Array.from(document.querySelectorAll("a[href]"), (a) => a.getAttribute("href"))
1156
+ );
1157
+ logger?.debug(`Found ${hrefs.length} potential hrefs on ${url}`);
1158
+ let linksAdded = 0;
1159
+ for (const href of hrefs) {
1160
+ if (!href) continue;
1161
+ let absoluteUrl;
1162
+ try {
1163
+ const resolved = new URL(href, url);
1164
+ resolved.hash = "";
1165
+ absoluteUrl = resolved.href;
1166
+ } catch (e) {
1167
+ logger?.debug(`Ignoring invalid URL syntax: "${href}" on page ${url}`);
1168
+ continue;
1169
+ }
1170
+ if (absoluteUrl.startsWith(startOrigin) && !visited.has(absoluteUrl)) {
1171
+ visited.add(absoluteUrl);
1172
+ queue.push({ url: absoluteUrl, depth: depth + 1 });
1173
+ linksAdded++;
1174
+ }
1156
1175
  }
1176
+ logger?.debug(`Added ${linksAdded} new unique internal links to queue from ${url}`);
1177
+ } else {
1178
+ logger?.debug(`Max depth (${maxDepth}) reached, not discovering links on ${url}`);
1157
1179
  }
1158
- logger?.debug(`Added ${linksAdded} new unique internal links to queue from ${url}`);
1159
- } else {
1160
- logger?.debug(`Max depth (${maxDepth}) reached, not discovering links on ${url}`);
1161
- }
1162
- } catch (err) {
1163
- logger?.warn(`\u274C Failed to process ${url}: ${err.message}`);
1164
- } finally {
1165
- if (page) {
1166
- try {
1167
- await page.close();
1168
- } catch (pageCloseError) {
1169
- logger?.error(`Failed to close page for ${url}: ${pageCloseError.message}`);
1180
+ } catch (err) {
1181
+ logger?.warn(`\u274C Failed to process ${url}: ${err.message}`);
1182
+ } finally {
1183
+ if (page && !page.isClosed()) {
1184
+ try {
1185
+ await page.close();
1186
+ } catch (pageCloseError) {
1187
+ logger?.error(`Failed to close page for ${url}: ${pageCloseError.message}`);
1188
+ }
1170
1189
  }
1171
1190
  }
1172
1191
  }
1192
+ } catch (error) {
1193
+ logger?.error(`Critical crawl error: ${error instanceof Error ? error.message : error}`);
1194
+ throw error;
1195
+ } finally {
1196
+ if (browser) {
1197
+ logger?.info(`Crawl finished or errored. Closing browser.`);
1198
+ await browser.close();
1199
+ logger?.debug(`Browser closed after crawl.`);
1200
+ }
1173
1201
  }
1174
- logger?.info(`Crawl finished. Closing browser.`);
1175
- await browser.close();
1176
- logger?.info(`Found ${results.length} pages.`);
1202
+ logger?.info(`Crawl found ${results.length} pages.`);
1177
1203
  return results;
1178
1204
  }
1179
- async function recursivelyBundleSite(startUrl, outputFile, maxDepth = 1) {
1180
- const logger = new Logger();
1205
+ async function recursivelyBundleSite(startUrl, outputFile, maxDepth = 1, loggerInstance) {
1206
+ const logger = loggerInstance || new Logger();
1181
1207
  logger.info(`Starting recursive site bundle for ${startUrl} to ${outputFile} (maxDepth: ${maxDepth})`);
1182
1208
  try {
1183
- const pages = await crawlWebsite(startUrl, maxDepth, logger);
1209
+ const crawlOptions = {
1210
+ maxDepth,
1211
+ logger
1212
+ /* Add other options like timeout, userAgent if needed */
1213
+ };
1214
+ const pages = await crawlWebsite(startUrl, crawlOptions);
1184
1215
  if (pages.length === 0) {
1185
1216
  logger.warn("Crawl completed but found 0 pages. Output file may be empty or reflect an empty bundle.");
1186
1217
  } else {
@@ -1204,6 +1235,72 @@ async function recursivelyBundleSite(startUrl, outputFile, maxDepth = 1) {
1204
1235
  }
1205
1236
  }
1206
1237
 
1238
+ // src/core/parser.ts
1239
+ import { readFile as readFile2 } from "fs/promises";
1240
+ import * as cheerio2 from "cheerio";
1241
+ async function parseHTML(entryFilePath, logger) {
1242
+ logger?.debug(`Parsing HTML file: ${entryFilePath}`);
1243
+ let htmlContent;
1244
+ try {
1245
+ htmlContent = await readFile2(entryFilePath, "utf-8");
1246
+ logger?.debug(`Successfully read HTML file (${Buffer.byteLength(htmlContent)} bytes).`);
1247
+ } catch (err) {
1248
+ logger?.error(`Failed to read HTML file "${entryFilePath}": ${err.message}`);
1249
+ throw new Error(`Could not read input HTML file: ${entryFilePath}`, { cause: err });
1250
+ }
1251
+ const $ = cheerio2.load(htmlContent);
1252
+ const assets = [];
1253
+ const addedUrls = /* @__PURE__ */ new Set();
1254
+ const addAsset = (url, forcedType) => {
1255
+ if (!url || url.trim() === "" || url.startsWith("data:")) {
1256
+ return;
1257
+ }
1258
+ if (!addedUrls.has(url)) {
1259
+ addedUrls.add(url);
1260
+ const mimeInfo = guessMimeType(url);
1261
+ const type = forcedType ?? mimeInfo.assetType;
1262
+ assets.push({ type, url });
1263
+ logger?.debug(`Discovered asset: Type='${type}', URL='${url}'`);
1264
+ } else {
1265
+ logger?.debug(`Skipping duplicate asset URL: ${url}`);
1266
+ }
1267
+ };
1268
+ logger?.debug("Extracting assets from HTML tags...");
1269
+ $('link[rel="stylesheet"][href]').each((_, el) => {
1270
+ addAsset($(el).attr("href"), "css");
1271
+ });
1272
+ $("script[src]").each((_, el) => {
1273
+ addAsset($(el).attr("src"), "js");
1274
+ });
1275
+ $("img[src]").each((_, el) => addAsset($(el).attr("src"), "image"));
1276
+ $('input[type="image"][src]').each((_, el) => addAsset($(el).attr("src"), "image"));
1277
+ $("img[srcset], picture source[srcset]").each((_, el) => {
1278
+ const srcset = $(el).attr("srcset");
1279
+ srcset?.split(",").forEach((entry) => {
1280
+ const [url] = entry.trim().split(/\s+/);
1281
+ addAsset(url, "image");
1282
+ });
1283
+ });
1284
+ $("video[src]").each((_, el) => addAsset($(el).attr("src"), "video"));
1285
+ $("video[poster]").each((_, el) => addAsset($(el).attr("poster"), "image"));
1286
+ $("audio[src]").each((_, el) => addAsset($(el).attr("src"), "audio"));
1287
+ $("video > source[src]").each((_, el) => addAsset($(el).attr("src"), "video"));
1288
+ $("audio > source[src]").each((_, el) => addAsset($(el).attr("src"), "audio"));
1289
+ $("link[href]").filter((_, el) => {
1290
+ const rel = $(el).attr("rel")?.toLowerCase() ?? "";
1291
+ return ["icon", "shortcut icon", "apple-touch-icon", "manifest"].includes(rel);
1292
+ }).each((_, el) => {
1293
+ const rel = $(el).attr("rel")?.toLowerCase() ?? "";
1294
+ const isIcon = ["icon", "shortcut icon", "apple-touch-icon"].includes(rel);
1295
+ addAsset($(el).attr("href"), isIcon ? "image" : void 0);
1296
+ });
1297
+ $('link[rel="preload"][as="font"][href]').each((_, el) => {
1298
+ addAsset($(el).attr("href"), "font");
1299
+ });
1300
+ logger?.info(`HTML parsing complete. Discovered ${assets.length} unique asset links.`);
1301
+ return { htmlContent, assets };
1302
+ }
1303
+
1207
1304
  // src/utils/meta.ts
1208
1305
  var BuildTimer = class {
1209
1306
  startTime;
@@ -1284,122 +1381,84 @@ var BuildTimer = class {
1284
1381
  };
1285
1382
 
1286
1383
  // src/index.ts
1384
+ async function pack(input, options = {}) {
1385
+ const logger = options.loggerInstance || new Logger(options.logLevel);
1386
+ const isHttp = /^https?:\/\//i.test(input);
1387
+ if (!isHttp && /:\/\//.test(input) && !input.startsWith("file://")) {
1388
+ const errorMsg = `Unsupported protocol or input type: ${input}`;
1389
+ logger.error(errorMsg);
1390
+ throw new Error(errorMsg);
1391
+ }
1392
+ const isRemote = /^https?:\/\//i.test(input);
1393
+ const recursive = options.recursive === true || typeof options.recursive === "number";
1394
+ if (isRemote && recursive) {
1395
+ const depth = typeof options.recursive === "number" ? options.recursive : 1;
1396
+ logger.info(`Starting recursive fetch for ${input} up to depth ${depth}`);
1397
+ return generateRecursivePortableHTML(input, depth, options, logger);
1398
+ }
1399
+ logger.info(`Starting single page processing for: ${input}`);
1400
+ return generatePortableHTML(input, options, logger);
1401
+ }
1287
1402
  async function generatePortableHTML(input, options = {}, loggerInstance) {
1288
1403
  const logger = loggerInstance || new Logger(options.logLevel);
1289
- logger.info(`Generating portable HTML for: ${input}`);
1290
1404
  const timer = new BuildTimer(input);
1291
- const isRemote = /^https?:\/\//i.test(input);
1292
- if (isRemote) {
1293
- logger.info(`Input is a remote URL. Fetching page content directly...`);
1405
+ if (/^https?:\/\//i.test(input)) {
1406
+ logger.info(`Workspaceing remote page: ${input}`);
1294
1407
  try {
1295
- const result = await fetchAndPackWebPage2(input, options, logger);
1296
- logger.info(`Remote fetch complete. Input: ${input}, Size: ${result.metadata.outputSize} bytes, Time: ${result.metadata.buildTimeMs}ms`);
1297
- return result;
1408
+ const result = await fetchAndPackWebPage(input, logger);
1409
+ const metadata = timer.finish(result.html, result.metadata);
1410
+ logger.info(`Finished fetching and packing remote page: ${input}`);
1411
+ return { html: result.html, metadata };
1298
1412
  } catch (error) {
1299
- logger.error(`Failed to fetch remote URL ${input}: ${error.message}`);
1413
+ logger.error(`Error fetching remote page ${input}: ${error.message}`);
1300
1414
  throw error;
1301
1415
  }
1302
1416
  }
1303
- logger.info(`Input is a local file path. Starting local processing pipeline...`);
1304
- const basePath = options.baseUrl || input;
1305
- logger.debug(`Using base path for asset resolution: ${basePath}`);
1417
+ logger.info(`Processing local file: ${input}`);
1306
1418
  try {
1419
+ const baseUrl = options.baseUrl || input;
1307
1420
  const parsed = await parseHTML(input, logger);
1308
- const enriched = await extractAssets(parsed, options.embedAssets ?? true, basePath, logger);
1421
+ const enriched = await extractAssets(parsed, options.embedAssets ?? true, baseUrl, logger);
1309
1422
  const minified = await minifyAssets(enriched, options, logger);
1310
1423
  const finalHtml = packHTML(minified, logger);
1311
1424
  const metadata = timer.finish(finalHtml, {
1312
1425
  assetCount: minified.assets.length
1313
- // FIX: Removed incorrect attempt to get errors from logger
1314
- // Errors collected by the timer itself (via timer.addError) will be included automatically.
1315
1426
  });
1316
- logger.info(`Local processing complete. Input: ${input}, Size: ${metadata.outputSize} bytes, Assets: ${metadata.assetCount}, Time: ${metadata.buildTimeMs}ms`);
1317
- if (metadata.errors && metadata.errors.length > 0) {
1318
- logger.warn(`Completed with ${metadata.errors.length} warning(s) logged in metadata.`);
1319
- }
1427
+ logger.info(`Finished processing local file: ${input}`);
1320
1428
  return { html: finalHtml, metadata };
1321
1429
  } catch (error) {
1322
- logger.error(`Error during local processing for ${input}: ${error.message}`);
1430
+ logger.error(`Error processing local file ${input}: ${error.message}`);
1323
1431
  throw error;
1324
1432
  }
1325
1433
  }
1326
1434
  async function generateRecursivePortableHTML(url, depth = 1, options = {}, loggerInstance) {
1327
1435
  const logger = loggerInstance || new Logger(options.logLevel);
1328
- logger.info(`Generating recursive portable HTML for: ${url}, Max Depth: ${depth}`);
1329
1436
  const timer = new BuildTimer(url);
1330
1437
  if (!/^https?:\/\//i.test(url)) {
1331
- const errMsg = `Invalid input URL for recursive bundling: ${url}. Must start with http(s)://`;
1332
- logger.error(errMsg);
1333
- throw new Error(errMsg);
1438
+ const errorMsg = `Invalid URL for recursive bundling. Must start with http:// or https://. Received: ${url}`;
1439
+ logger.error(errorMsg);
1440
+ throw new Error(errorMsg);
1334
1441
  }
1335
- const internalOutputPathPlaceholder = `${new URL(url).hostname}_recursive.html`;
1442
+ logger.info(`Starting recursive bundle for ${url} up to depth ${depth}`);
1336
1443
  try {
1337
- const { html, pages } = await recursivelyBundleSite(url, internalOutputPathPlaceholder, depth);
1338
- logger.info(`Recursive crawl complete. Discovered and bundled ${pages} pages.`);
1444
+ const { html, pages } = await recursivelyBundleSite(url, "output.html", depth, logger);
1339
1445
  timer.setPageCount(pages);
1340
1446
  const metadata = timer.finish(html, {
1341
1447
  assetCount: 0,
1342
- // NOTE: Asset count across multiple pages is not currently aggregated.
1343
1448
  pagesBundled: pages
1344
- // TODO: Potentially collect errors from the core function if it returns them
1345
1449
  });
1346
- logger.info(`Recursive bundling complete. Input: ${url}, Size: ${metadata.outputSize} bytes, Pages: ${metadata.pagesBundled}, Time: ${metadata.buildTimeMs}ms`);
1347
- if (metadata.errors && metadata.errors.length > 0) {
1348
- logger.warn(`Completed with ${metadata.errors.length} warning(s) logged in metadata.`);
1349
- }
1450
+ logger.info(`Finished recursive bundle for ${url}. Bundled ${pages} pages.`);
1350
1451
  return { html, metadata };
1351
1452
  } catch (error) {
1352
- logger.error(`Error during recursive generation for ${url}: ${error.message}`);
1353
- if (error.cause instanceof Error) {
1354
- logger.error(`Cause: ${error.cause.message}`);
1355
- }
1356
- throw error;
1357
- }
1358
- }
1359
- async function fetchAndPackWebPage2(url, options = {}, loggerInstance) {
1360
- const logger = loggerInstance || new Logger(options.logLevel);
1361
- logger.info(`Workspaceing single remote page: ${url}`);
1362
- const timer = new BuildTimer(url);
1363
- if (!/^https?:\/\//i.test(url)) {
1364
- const errMsg = `Invalid input URL for fetchAndPackWebPage: ${url}. Must start with http(s)://`;
1365
- logger.error(errMsg);
1366
- throw new Error(errMsg);
1367
- }
1368
- try {
1369
- const result = await fetchAndPackWebPage(url, logger);
1370
- const metadata = timer.finish(result.html, {
1371
- // Use assetCount and errors from core metadata if available
1372
- assetCount: result.metadata?.assetCount ?? 0,
1373
- errors: result.metadata?.errors ?? []
1374
- // Ensure errors array exists
1375
- });
1376
- logger.info(`Single page fetch complete. Input: ${url}, Size: ${metadata.outputSize} bytes, Assets: ${metadata.assetCount}, Time: ${metadata.buildTimeMs}ms`);
1377
- if (metadata.errors && metadata.errors.length > 0) {
1378
- logger.warn(`Completed with ${metadata.errors.length} warning(s) logged in metadata.`);
1379
- }
1380
- return { html: result.html, metadata };
1381
- } catch (error) {
1382
- logger.error(`Error during single page fetch for ${url}: ${error.message}`);
1383
- throw error;
1384
- }
1385
- }
1386
- function bundleMultiPageHTML2(pages, options = {}, loggerInstance) {
1387
- const logger = loggerInstance || new Logger(options.logLevel);
1388
- logger.info(`Bundling ${pages.length} provided pages into multi-page HTML...`);
1389
- try {
1390
- const bundledHtml = bundleMultiPageHTML(pages, logger);
1391
- logger.info(`Multi-page bundling complete.`);
1392
- return bundledHtml;
1393
- } catch (error) {
1394
- logger.error(`Error during multi-page bundling: ${error.message}`);
1453
+ logger.error(`Error during recursive bundle for ${url}: ${error.message}`);
1395
1454
  throw error;
1396
1455
  }
1397
1456
  }
1398
1457
  export {
1399
- LogLevel,
1400
- bundleMultiPageHTML2 as bundleMultiPageHTML,
1401
- fetchAndPackWebPage2 as fetchAndPackWebPage,
1458
+ Logger,
1459
+ bundleMultiPageHTML,
1402
1460
  generatePortableHTML,
1403
- generateRecursivePortableHTML
1461
+ generateRecursivePortableHTML,
1462
+ pack
1404
1463
  };
1405
1464
  //# sourceMappingURL=index.js.map