portapack 0.2.1 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,24 +2,24 @@
2
2
  * @file src/core/extractor.ts
3
3
  * @description Handles discovery, resolution, fetching, and optional embedding of assets
4
4
  * linked from HTML and recursively within CSS (@import, url()). This is the heart of finding EVERYTHING.
5
- * @version 1.1.3 - Fixed CSS path resolution and handling of 'other' asset types.
5
+ * @version 1.1.6 - Revised fetchAsset error handling logic for Axios errors.
6
6
  */
7
7
 
8
8
  // === Node.js Core Imports ===
9
9
  import { readFile } from 'fs/promises';
10
10
  import * as fs from 'fs'; // Required for statSync for sync directory check
11
- import type { FileHandle } from 'fs/promises';
11
+ import type { FileHandle } from 'fs/promises'; // Import specific type if needed elsewhere
12
12
  import path from 'path';
13
13
  import { fileURLToPath, URL } from 'url'; // Crucial for file path/URL conversion
14
14
 
15
15
  // === External Dependencies ===
16
- import * as axios from 'axios'; // Using namespace import for clarity
17
- import type { AxiosError, AxiosRequestConfig, AxiosResponse, InternalAxiosRequestConfig } from 'axios';
16
+ import * as axiosNs from 'axios'; // Using namespace import for clarity
17
+ import type { AxiosError, AxiosRequestConfig, AxiosResponse, InternalAxiosRequestConfig } from 'axios'; // Import necessary types
18
18
 
19
19
  // === Project Imports ===
20
- import type { Asset, ParsedHTML } from '../types';
21
- import { guessMimeType } from '../utils/mime';
22
- import { Logger } from '../utils/logger';
20
+ import type { Asset, ParsedHTML } from '../types'; // Adjust path if needed
21
+ import { guessMimeType } from '../utils/mime'; // Adjust path if needed
22
+ import { Logger } from '../utils/logger'; // Adjust path if needed
23
23
 
24
24
  // === Constants ===
25
25
  /** Set of asset types defined in Asset['type'] generally considered text-based */
@@ -31,6 +31,11 @@ const MAX_ASSET_EXTRACTION_ITERATIONS = 1000;
31
31
 
32
32
  // === Helper Functions ===
33
33
 
34
+ /**
35
+ * Custom type for Node.js error objects with a `code` property.
36
+ */
37
+ type NodeJSErrnoException = Error & { code?: string };
38
+
34
39
  /**
35
40
  * Checks if decoding a buffer as UTF-8 and re-encoding is lossy.
36
41
  * @param {Buffer} originalBuffer The original binary buffer.
@@ -39,333 +44,380 @@ const MAX_ASSET_EXTRACTION_ITERATIONS = 1000;
39
44
  */
40
45
  function isUtf8DecodingLossy(originalBuffer: Buffer, decodedString: string): boolean {
41
46
  try {
47
+ // Re-encode the decoded string back to a buffer using UTF-8
42
48
  const reEncodedBuffer = Buffer.from(decodedString, 'utf-8');
49
+ // Compare the re-encoded buffer with the original buffer
43
50
  return !originalBuffer.equals(reEncodedBuffer);
44
51
  } catch (e) {
52
+ // If an error occurs during re-encoding, it implies the original wasn't valid UTF-8
45
53
  return true;
46
54
  }
47
55
  }
48
56
 
49
57
  /**
50
58
  * Determines the absolute base directory URL (http://, https://, or file:///) ending in '/'.
59
+ * This is crucial for resolving relative links found in the source document.
51
60
  * @param {string} inputPathOrUrl - The original source HTML file path or a full HTTP/HTTPS URL.
52
61
  * @param {Logger} [logger] - Optional logger instance.
53
62
  * @returns {string | undefined} The absolute base URL string ending in '/', or undefined if determination fails.
54
63
  */
55
64
  function determineBaseUrl(inputPathOrUrl: string, logger?: Logger): string | undefined {
65
+ // Log the input for debugging purposes
66
+ // console.log(`[DEBUG determineBaseUrl] Input: "${inputPathOrUrl}"`); // Keep debug log commented unless needed
56
67
  logger?.debug(`Determining base URL for input: ${inputPathOrUrl}`);
68
+
69
+ // Handle invalid or empty input
57
70
  if (!inputPathOrUrl) {
58
71
  logger?.warn('Cannot determine base URL: inputPathOrUrl is empty or invalid.');
59
72
  return undefined;
60
73
  }
61
74
 
62
75
  try {
76
+ // Handle non-file URLs (HTTP, HTTPS)
63
77
  if (/^https?:\/\//i.test(inputPathOrUrl)) {
64
78
  const url = new URL(inputPathOrUrl);
79
+ // Construct the base URL by taking the path up to the last '/'
65
80
  url.pathname = url.pathname.substring(0, url.pathname.lastIndexOf('/') + 1);
66
- url.search = ''; url.hash = '';
81
+ url.search = ''; // Remove query parameters
82
+ url.hash = ''; // Remove fragments
67
83
  const baseUrl = url.href;
68
84
  logger?.debug(`Determined remote base URL: ${baseUrl}`);
85
+ // console.log(`[DEBUG determineBaseUrl] Determined Remote URL: "${baseUrl}"`); // Keep debug log commented unless needed
86
+ // Return the constructed base URL (usually ends in '/')
69
87
  return baseUrl;
70
88
  }
89
+ // Handle other protocols (warn and return undefined)
71
90
  else if (inputPathOrUrl.includes('://') && !inputPathOrUrl.startsWith('file:')) {
72
91
  logger?.warn(`Input "${inputPathOrUrl}" looks like a URL but uses an unsupported protocol. Cannot determine base URL.`);
92
+ // console.log(`[DEBUG determineBaseUrl] Unsupported protocol.`); // Keep debug log commented unless needed
73
93
  return undefined;
74
94
  }
95
+ // Handle file paths and file: URLs
75
96
  else {
76
- let absolutePath: string;
97
+ let resourcePath: string; // Path to the actual file or dir input
98
+ let isInputLikelyDirectory = false;
99
+
100
+ // Convert input to an absolute path
77
101
  if (inputPathOrUrl.startsWith('file:')) {
78
- try { absolutePath = fileURLToPath(inputPathOrUrl); }
79
- catch (e: any) { logger?.error(`💀 Failed to convert file URL "${inputPathOrUrl}" to path: ${e.message}`); return undefined; }
102
+ // Convert file URL to path
103
+ resourcePath = fileURLToPath(inputPathOrUrl);
104
+ // file: URLs ending in / strongly suggest a directory
105
+ isInputLikelyDirectory = inputPathOrUrl.endsWith('/');
80
106
  } else {
81
- absolutePath = path.resolve(inputPathOrUrl);
82
- }
83
- let isDirectory = false;
84
- try { isDirectory = fs.statSync(absolutePath).isDirectory(); }
85
- catch (statError: unknown) {
86
- if (statError instanceof Error && (statError as NodeJS.ErrnoException).code === 'ENOENT') {
87
- logger?.debug(`Path "${absolutePath}" not found. Assuming input represents a file, using its parent directory as base.`);
88
- } else {
89
- logger?.warn(`Could not stat local path "${absolutePath}" during base URL determination: ${statError instanceof Error ? statError.message : String(statError)}. Assuming input represents a file.`);
107
+ // Resolve relative/absolute file paths
108
+ resourcePath = path.resolve(inputPathOrUrl);
109
+ // Check if the resolved path *actually* exists and is a directory
110
+ try {
111
+ // Use statSync carefully - assumes it's available and works (or mocked)
112
+ isInputLikelyDirectory = fs.statSync(resourcePath).isDirectory();
113
+ } catch {
114
+ // If stat fails (ENOENT, EACCES), assume it refers to a file path
115
+ isInputLikelyDirectory = false;
90
116
  }
91
- isDirectory = false;
92
117
  }
93
- const dirPath = isDirectory ? absolutePath : path.dirname(absolutePath);
94
- let normalizedPathForURL = dirPath.replace(/\\/g, '/');
118
+ // console.log(`[DEBUG determineBaseUrl] resourcePath: "${resourcePath}", isInputLikelyDirectory: ${isInputLikelyDirectory}`); // Keep debug log commented unless needed
119
+
120
+ // The base directory is the directory containing the resourcePath,
121
+ // OR resourcePath itself if it was identified as a directory.
122
+ const baseDirPath = isInputLikelyDirectory ? resourcePath : path.dirname(resourcePath);
123
+ // console.log(`[DEBUG determineBaseUrl] Calculated baseDirPath: "${baseDirPath}"`); // Keep debug log commented unless needed
124
+
125
+ // Convert base directory path back to a file URL ending in '/'
126
+ let normalizedPathForURL = baseDirPath.replace(/\\/g, '/'); // Use forward slashes for URL consistency
127
+ // Ensure leading slash for Windows file URLs (e.g., /C:/...)
95
128
  if (/^[A-Z]:\//i.test(normalizedPathForURL) && !normalizedPathForURL.startsWith('/')) {
96
129
  normalizedPathForURL = '/' + normalizedPathForURL;
97
130
  }
131
+ // Ensure trailing slash for the directory URL
132
+ if (!normalizedPathForURL.endsWith('/')) {
133
+ normalizedPathForURL += '/';
134
+ }
135
+
136
+ // Create the final file URL object and get its string representation
98
137
  const fileUrl = new URL('file://' + normalizedPathForURL);
99
- let fileUrlString = fileUrl.href;
100
- if (!fileUrlString.endsWith('/')) { fileUrlString += '/'; }
101
- logger?.debug(`Determined local base URL: ${fileUrlString} (from: ${inputPathOrUrl}, resolved dir: ${dirPath}, isDir: ${isDirectory})`);
138
+ const fileUrlString = fileUrl.href;
139
+
140
+ logger?.debug(`Determined base URL: ${fileUrlString} (from: ${inputPathOrUrl}, resolved base dir: ${baseDirPath})`);
141
+ // console.log(`[DEBUG determineBaseUrl] Determined File URL: "${fileUrlString}"`); // Keep debug log commented unless needed
102
142
  return fileUrlString;
103
143
  }
104
144
  } catch (error: unknown) {
145
+ // Handle any errors during base URL determination
105
146
  const message = error instanceof Error ? error.message : String(error);
106
- logger?.error(`💀 Failed to determine base URL for "${inputPathOrUrl}": ${message}${error instanceof Error ? ` - Stack: ${error.stack}` : ''}`);
147
+ // console.error(`[DEBUG determineBaseUrl] Error determining base URL: ${message}`); // Keep debug log commented unless needed
148
+ logger?.error(`💀 Failed to determine base URL for "${inputPathOrUrl}": ${message}${error instanceof Error && error.stack ? ` - Stack: ${error.stack}` : ''}`);
107
149
  return undefined;
108
150
  }
109
151
  }
110
152
 
111
153
  /**
112
154
  * Resolves an asset URL relative to a base URL context.
113
- * @param {string} assetUrl - The raw URL string found in the source.
114
- * @param {string} [baseContextUrl] - The absolute base URL of the containing document.
155
+ * Handles data URIs, fragments, protocol-relative URLs.
156
+ * @param {string} assetUrl - The raw URL string found in the source (e.g., href, src).
157
+ * @param {string} [baseContextUrl] - The absolute base URL of the containing document (HTML or CSS).
115
158
  * @param {Logger} [logger] - Optional logger instance.
116
- * @returns {URL | null} A validated, absolute URL object or null.
159
+ * @returns {URL | null} A validated, absolute URL object, or null if invalid/ignorable.
117
160
  */
118
161
  function resolveAssetUrl(assetUrl: string, baseContextUrl?: string, logger?: Logger): URL | null {
162
+ // Trim whitespace from the URL
119
163
  const trimmedUrl = assetUrl?.trim();
164
+
165
+ // Ignore empty URLs, data URIs, or fragment-only URLs
120
166
  if (!trimmedUrl || trimmedUrl.startsWith('data:') || trimmedUrl.startsWith('#')) {
121
167
  return null;
122
168
  }
169
+
123
170
  let resolvableUrl = trimmedUrl;
171
+
172
+ // Handle protocol-relative URLs (e.g., //example.com/image.png)
124
173
  if (resolvableUrl.startsWith('//') && baseContextUrl) {
125
174
  try {
175
+ // Prepend the protocol from the base context URL
126
176
  const base = new URL(baseContextUrl);
127
177
  resolvableUrl = base.protocol + resolvableUrl;
128
178
  } catch (e) {
179
+ // Log a warning if the base protocol cannot be determined
129
180
  logger?.warn(`Could not extract protocol from base "${baseContextUrl}" for protocol-relative URL "${trimmedUrl}". Skipping.`);
130
181
  return null;
131
182
  }
132
183
  }
184
+
133
185
  try {
186
+ // Use URL constructor for resolution. Handles absolute paths, relative paths, ../ etc.
134
187
  const resolved = new URL(resolvableUrl, baseContextUrl);
188
+
189
+ // Skip assets with unsupported protocols (e.g., mailto:, ws:)
190
+ if (!['http:', 'https:', 'file:'].includes(resolved.protocol)) {
191
+ logger?.debug(`Skipping asset with unsupported protocol: ${resolved.href}`);
192
+ return null;
193
+ }
194
+ // Return the resolved URL object
135
195
  return resolved;
136
196
  } catch (error: unknown) {
197
+ // Log errors during URL parsing/resolution
137
198
  const message = error instanceof Error ? error.message : String(error);
199
+ // Avoid redundant warnings for relative paths when no base context was provided (expected failure)
138
200
  if (!/^[a-z]+:/i.test(resolvableUrl) && !resolvableUrl.startsWith('/') && !baseContextUrl) {
139
201
  logger?.warn(`Cannot resolve relative URL "${resolvableUrl}" - Base context URL was not provided or determined.`);
140
202
  } else {
203
+ // Log other resolution failures
141
204
  logger?.warn(`⚠️ Failed to parse/resolve URL "${resolvableUrl}" ${baseContextUrl ? 'against base "' + baseContextUrl + '"' : '(no base provided)'}: ${message}`);
142
205
  }
206
+ // Return null if resolution fails
143
207
  return null;
144
208
  }
145
209
  }
146
210
 
147
211
  /**
148
- * Properly resolves CSS relative paths, handling "../" correctly.
149
- * This is critical for properly resolving paths in CSS like "../images/bg.png".
150
- *
151
- * @param {string} relativeUrl - The relative URL from CSS (e.g., "../images/bg.png")
152
- * @param {string} cssBaseUrl - The base URL of the CSS file
153
- * @param {Logger} [logger] - Optional logger instance
154
- * @returns {string | null} The resolved absolute URL or null if resolution fails
212
+ * Properly resolves CSS relative paths (like url("../images/bg.png")), handling "../" correctly.
213
+ * Uses the CSS file's own location as the base for resolution.
214
+ * @param {string} relativeUrl - The relative URL string from CSS (e.g., "../images/bg.png").
215
+ * @param {string} cssBaseContextUrl - The absolute URL of the CSS file containing the relative URL.
216
+ * @param {Logger} [logger] - Optional logger instance.
217
+ * @returns {string | null} The resolved absolute URL string, or null if resolution fails/invalid.
155
218
  */
156
219
  function resolveCssRelativeUrl(
157
220
  relativeUrl: string,
158
- cssBaseContextUrl: string,
221
+ cssBaseContextUrl: string, // e.g., file:///C:/mock/base/dir/css/deep.css or https://.../style.css
159
222
  logger?: Logger
160
223
  ): string | null {
161
- // Skip empty or data URLs
162
- if (!relativeUrl || relativeUrl.startsWith('data:')) {
224
+ // console.log(`[DEBUG resolveCssRelativeUrl] Input: relative="${relativeUrl}", base="${cssBaseContextUrl}"`); // Keep debug log commented unless needed
225
+
226
+ // Ignore empty, data URIs, or fragments
227
+ if (!relativeUrl || relativeUrl.startsWith('data:') || relativeUrl.startsWith('#')) {
163
228
  return null;
164
229
  }
165
230
 
166
231
  try {
167
- if (cssBaseContextUrl.startsWith('file:')) {
168
- // Turn the CSS base URL into a filesystem path
169
- const basePath = fileURLToPath(cssBaseContextUrl);
170
-
171
- // If that base path is actually a directory, use it directly;
172
- // otherwise, use its dirname. This prevents us from dropping
173
- // the final directory name when we already have a trailing slash.
174
- let cssDir: string;
175
- try {
176
- const stat = fs.statSync(basePath);
177
- if (stat.isDirectory()) {
178
- cssDir = basePath;
179
- } else {
180
- cssDir = path.dirname(basePath);
181
- }
182
- } catch {
183
- // If stat fails, assume it's a file path
184
- cssDir = path.dirname(basePath);
185
- }
186
-
187
- // Resolve relativeUrl against this directory
188
- let resolvedPath = path.resolve(cssDir, relativeUrl);
189
- resolvedPath = resolvedPath.replace(/\\/g, '/'); // Normalize to forward slashes
232
+ // Use the URL constructor which correctly handles relative paths including ../
233
+ // relative to the base URL provided (the CSS file's URL).
234
+ const resolvedUrl = new URL(relativeUrl, cssBaseContextUrl);
235
+ // console.log(`[DEBUG resolveCssRelativeUrl] Resolved URL object href: "${resolvedUrl.href}"`); // Keep debug log commented unless needed
236
+ // Return the resolved absolute URL string
237
+ return resolvedUrl.href;
190
238
 
191
- // On Windows, ensure file:///C:/something
192
- if (/^[A-Z]:/i.test(resolvedPath) && !resolvedPath.startsWith('/')) {
193
- resolvedPath = '/' + resolvedPath;
194
- }
195
- return `file://${resolvedPath}`;
196
- } else {
197
- // For http/https etc., do standard resolution
198
- return new URL(relativeUrl, cssBaseContextUrl).href;
199
- }
200
239
  } catch (error) {
240
+ // Log warning if URL resolution fails
201
241
  logger?.warn(
202
- `Failed to resolve CSS URL: "${relativeUrl}" against "${cssBaseContextUrl}": ${String(error)}`
242
+ `Failed to resolve CSS URL: "${relativeUrl}" relative to "${cssBaseContextUrl}": ${String(error)}`
203
243
  );
244
+ // console.error(`[DEBUG resolveCssRelativeUrl] Error resolving: ${String(error)}`); // Keep debug log commented unless needed
204
245
  return null;
205
246
  }
206
247
  }
207
248
 
208
249
 
209
250
  /**
210
- * Asynchronously fetches the content of a resolved asset URL.
251
+ * Asynchronously fetches the content of a resolved asset URL (http, https, file).
211
252
  * @async
212
253
  * @param {URL} resolvedUrl - The absolute URL object of the asset to fetch.
213
254
  * @param {Logger} [logger] - Optional logger instance.
214
- * @param {number} [timeout=10000] - Network timeout in milliseconds.
215
- * @returns {Promise<Buffer | null>} Asset content as a Buffer, or null on failure.
216
- */
217
- /**
218
- * Asynchronously fetches the content of a resolved asset URL.
219
- * @async
220
- * @param {URL} resolvedUrl - The absolute URL object of the asset to fetch.
221
- * @param {Logger} [logger] - Optional logger instance.
222
- * @param {number} [timeout=10000] - Network timeout in milliseconds.
255
+ * @param {number} [timeout=10000] - Network timeout in milliseconds for HTTP(S) requests.
223
256
  * @returns {Promise<Buffer | null>} Asset content as a Buffer, or null on failure.
224
257
  */
225
258
  async function fetchAsset(resolvedUrl: URL, logger?: Logger, timeout: number = 10000): Promise<Buffer | null> {
259
+ // console.log(`[DEBUG fetchAsset] Attempting fetch for URL: ${resolvedUrl.href}`); // Keep debug log commented unless needed
226
260
  logger?.debug(`Attempting to fetch asset: ${resolvedUrl.href}`);
227
261
  const protocol = resolvedUrl.protocol;
228
262
 
229
263
  try {
264
+ // Handle HTTP and HTTPS protocols
230
265
  if (protocol === 'http:' || protocol === 'https:') {
231
- const response: AxiosResponse<ArrayBuffer> = await axios.default.get(resolvedUrl.href, {
232
- responseType: 'arraybuffer', timeout: timeout,
266
+ // Use axios to fetch remote content as an ArrayBuffer
267
+ const response: AxiosResponse<ArrayBuffer> = await axiosNs.default.get(resolvedUrl.href, {
268
+ responseType: 'arraybuffer', // Fetch as binary data
269
+ timeout: timeout, // Apply network timeout
233
270
  });
234
- logger?.debug(`Workspaceed remote asset ${resolvedUrl.href} (Status: ${response.status}, Type: ${response.headers['content-type'] || 'N/A'}, Size: ${response.data.byteLength} bytes)`);
271
+ logger?.debug(`Workspaceed remote asset ${resolvedUrl.href} (Status: ${response.status}, Type: ${response.headers['content-type'] || 'N/A'}, Size: ${response.data?.byteLength ?? 0} bytes)`);
272
+ // console.log(`[DEBUG fetchAsset] HTTP fetch SUCCESS for: ${resolvedUrl.href}, Status: ${response.status}`); // Keep debug log commented unless needed
273
+ // Return the fetched data as a Node.js Buffer
235
274
  return Buffer.from(response.data);
236
- } else if (protocol === 'file:') {
275
+ }
276
+ // Handle file protocol
277
+ else if (protocol === 'file:') {
237
278
  let filePath: string;
238
279
  try {
239
- filePath = fileURLToPath(resolvedUrl);
240
- } catch (e: any) {
241
- // Log error specifically for path conversion failure
280
+ // Convert file URL to a system file path
281
+ // IMPORTANT: This strips query params and fragments from the URL
282
+ filePath = fileURLToPath(resolvedUrl);
283
+ } catch (e: any) {
284
+ // console.error(`[DEBUG fetchAsset] fileURLToPath FAILED for: ${resolvedUrl.href}`, e); // Keep debug log commented unless needed
242
285
  logger?.error(`Could not convert file URL to path: ${resolvedUrl.href}. Error: ${e.message}`);
243
- return null; // Cannot proceed without a valid path
244
- }
245
- // This section will now only be reached if fileURLToPath succeeded
246
- const data = await readFile(filePath); // This might throw ENOENT, EACCES etc.
286
+ return null; // Return null if conversion fails
287
+ }
288
+
289
+ const normalizedForLog = path.normalize(filePath);
290
+ // console.log(`[DEBUG fetchAsset] Attempting readFile with path: "${normalizedForLog}" (Original from URL: "${filePath}")`); // Keep debug log commented unless needed
291
+
292
+ // Read file content using fs/promises
293
+ const data = await readFile(filePath); // This call uses the mock in tests
294
+
295
+ // console.log(`[DEBUG fetchAsset] readFile call SUCCEEDED for path: "${normalizedForLog}". Data length: ${data?.byteLength}`); // Keep debug log commented unless needed
247
296
  logger?.debug(`Read local file ${filePath} (${data.byteLength} bytes)`);
297
+ // Return the file content as a Buffer
248
298
  return data;
249
- } else {
250
- logger?.warn(`Unsupported protocol "${protocol}" in URL: ${resolvedUrl.href}`);
251
- return null;
299
+ }
300
+ // Handle unsupported protocols
301
+ else {
302
+ // console.log(`[DEBUG fetchAsset] Unsupported protocol: ${protocol}`); // Keep debug log commented unless needed
303
+ logger?.warn(`Unsupported protocol "${protocol}" in URL: ${resolvedUrl.href}`);
304
+ return null;
252
305
  }
253
306
  } catch (error: unknown) {
254
- // --- Handle Errors Based on Protocol/Context ---
255
-
256
- // Check for AxiosError FIRST (only relevant if protocol was http/https)
257
- if ((protocol === 'http:' || protocol === 'https:') && axios.default.isAxiosError(error)) {
258
- const status = error.response?.status ?? 'N/A';
259
- const statusText = error.response?.statusText ?? 'Error';
260
- const code = error.code ?? 'N/A';
261
- const message = error.message;
262
- // Construct the message matching test expectation
263
- const logMessage = `⚠️ Failed to fetch remote asset ${resolvedUrl.href}: Status ${status} - ${statusText}. Code: ${code}, Message: ${message}`;
307
+ // --- Handle Errors During Fetch/Read ---
308
+ const failedId = protocol === 'file:' ? path.normalize(fileURLToPath(resolvedUrl)) : resolvedUrl.href;
309
+ // console.error(`[DEBUG fetchAsset] CAUGHT Error for ${failedId}. Type: ${Object.prototype.toString.call(error)}, Constructor: ${error?.constructor?.name}, isAxiosError property: ${(error as any)?.isAxiosError}, Code: ${(error as any)?.code}`); // Keep for debugging if needed
310
+
311
+ // *** FIXED LOGIC: Check for AxiosError using its property *before* generic instanceof Error ***
312
+ if ((protocol === 'http:' || protocol === 'https:') && (error as any)?.isAxiosError === true) {
313
+ const axiosError = error as AxiosError; // Cast for easier property access
314
+ const status = axiosError.response?.status ?? 'N/A';
315
+ const code = axiosError.code ?? 'N/A'; // e.g., ECONNABORTED for timeout
316
+ // Use the specific log format
317
+ const logMessage = `⚠️ Failed to fetch remote asset ${resolvedUrl.href}: ${axiosError.message} (Code: ${code})`;
264
318
  logger?.warn(logMessage);
265
319
  }
266
- // Check for specific FS errors (only relevant if protocol was file:)
267
- else if (protocol === 'file:') {
268
- // Determine the file path again for logging, handling potential errors
320
+ // Check for file system errors *next*
321
+ else if (protocol === 'file:' && error instanceof Error) {
269
322
  let failedPath = resolvedUrl.href;
270
- try { failedPath = fileURLToPath(resolvedUrl); } catch { /* ignore if conversion fails here, use original href */ }
323
+ try { failedPath = fileURLToPath(resolvedUrl); } catch { /* ignore */ }
324
+ failedPath = path.normalize(failedPath);
271
325
 
272
- if (error instanceof Error && (error as NodeJS.ErrnoException).code === 'ENOENT') {
326
+ if ((error as NodeJSErrnoException).code === 'ENOENT') {
273
327
  logger?.warn(`⚠️ File not found (ENOENT) for asset: ${failedPath}.`);
274
- } else if (error instanceof Error && (error as NodeJS.ErrnoException).code === 'EACCES') {
328
+ } else if ((error as NodeJSErrnoException).code === 'EACCES') {
329
+ // Log ONLY the specific EACCES message
275
330
  logger?.warn(`⚠️ Permission denied (EACCES) reading asset: ${failedPath}.`);
276
- } else if (error instanceof Error) { // Catch other errors during file reading (but not path conversion which is handled above)
277
- logger?.warn(`⚠️ Failed to read local asset ${failedPath}: ${error.message}`);
278
331
  } else {
279
- logger?.warn(`⚠️ An unknown error occurred while reading local asset ${failedPath}: ${String(error)}`);
332
+ logger?.warn(`⚠️ Failed to read local asset ${failedPath}: ${error.message}`);
280
333
  }
281
334
  }
282
- // Check for other specific errors like invalid URL types if necessary (ERR_INVALID_URL handled above mostly)
283
- // else if (error instanceof TypeError && error.message.includes('ERR_INVALID_URL')) { ... }
284
-
285
- // Generic fallback for truly unexpected errors during fetch/read
335
+ // Generic fallback for *other* types of Errors (that are not Axios or known FS errors)
286
336
  else if (error instanceof Error) {
287
337
  logger?.warn(`⚠️ An unexpected error occurred processing asset ${resolvedUrl.href}: ${error.message}`);
288
- } else {
338
+ }
339
+ // Fallback for non-Error throws (e.g., strings, numbers)
340
+ else {
289
341
  logger?.warn(`⚠️ An unknown and unexpected error occurred processing asset ${resolvedUrl.href}: ${String(error)}`);
290
342
  }
291
- return null; // Return null on ANY fetch/read error caught here
343
+ // Return null on ANY error
344
+ return null;
292
345
  }
293
346
  }
294
347
 
295
348
  /**
296
- * Extracts URLs from CSS content and resolves them against the CSS base URL.
297
- * @param {string} cssContent - The CSS content to parse
298
- * @param {string} cssBaseContextUrl - The base URL of the CSS file
299
- * @param {Asset[]} discoveredAssets - Array to push newly discovered assets to
300
- * @param {Set<string>} visitedUrls - Set of already visited URLs to avoid duplicates
301
- * @param {Logger} [logger] - Optional logger instance
302
- */
303
- /**
304
- * Extracts URLs from CSS content and resolves them against the CSS base URL.
305
- * Returns an array of *potentially* new Asset objects with resolved URLs.
349
+ * Extracts URLs from CSS content using regex and resolves them.
350
+ * Finds `url(...)` and `@import` rules.
351
+ * @param {string} cssContent - The CSS content string to parse.
352
+ * @param {string} cssBaseContextUrl - The absolute URL of the CSS file (used for resolving relative paths).
353
+ * @param {Logger} [logger] - Optional logger instance.
354
+ * @returns {Asset[]} An array of newly discovered Asset objects (type, resolved URL, content initially undefined).
306
355
  */
307
356
  function extractUrlsFromCSS(
308
357
  cssContent: string,
309
358
  cssBaseContextUrl: string,
310
- // discoveredAssets: Asset[], // REMOVE: This function will now RETURN the assets
311
- // visitedUrls: Set<string>, // REMOVE
312
359
  logger?: Logger
313
- ): Asset[] { // RETURN the discovered assets
314
- const newlyDiscovered: Asset[] = []; // Internal list for this parse
315
- const processedInThisParse = new Set<string>(); // Track URLs found in *this specific* CSS file to avoid duplicates from the same file
360
+ ): Asset[] {
361
+ // Array to hold assets discovered within this CSS content
362
+ const newlyDiscovered: Asset[] = [];
363
+ // Set to track URLs processed within this specific CSS file to avoid adding duplicates from the same file
364
+ const processedInThisParse = new Set<string>();
316
365
 
366
+ // Regex for url(...) patterns, handling optional quotes (non-greedy match for URL)
317
367
  const urlRegex = /url\(\s*(['"]?)(.*?)\1\s*\)/gi;
368
+ // Regex for @import rules, handling url() or bare string, optional quotes (non-greedy match for URL)
318
369
  const importRegex = /@import\s+(?:url\(\s*(['"]?)(.*?)\1\s*\)|(['"])(.*?)\3)\s*;/gi;
319
370
 
371
+ /** Internal helper to process a found URL string */
320
372
  const processFoundUrl = (rawUrl: string | undefined, ruleType: '@import' | 'url()') => {
321
- if (!rawUrl || rawUrl.trim() === '' || rawUrl.startsWith('data:')) return;
373
+ // Skip if URL is empty, undefined, a data URI, or only a fragment
374
+ if (!rawUrl || rawUrl.trim() === '' || rawUrl.startsWith('data:') || rawUrl.startsWith('#')) return;
322
375
 
376
+ // Resolve the potentially relative URL against the CSS file's base URL
323
377
  const resolvedUrl = resolveCssRelativeUrl(rawUrl, cssBaseContextUrl, logger);
324
378
 
325
- // Check if resolved AND not already processed within *this* CSS file
379
+ // If successfully resolved and not already found *in this specific CSS file*
326
380
  if (resolvedUrl && !processedInThisParse.has(resolvedUrl)) {
327
- processedInThisParse.add(resolvedUrl); // Mark as found in this file
381
+ // Mark this resolved URL as processed for this CSS file
382
+ processedInThisParse.add(resolvedUrl);
383
+ // Guess the asset type (css, image, font, etc.) based on the resolved URL
328
384
  const { assetType } = guessMimeType(resolvedUrl);
329
385
 
330
- // Add to the list to be returned
386
+ // Add the discovered asset to the list for this CSS file
331
387
  newlyDiscovered.push({
332
388
  type: assetType,
333
- url: resolvedUrl, // The resolved URL string
334
- content: undefined
389
+ url: resolvedUrl, // Store the resolved absolute URL string
390
+ content: undefined // Content will be fetched later if needed
335
391
  });
336
392
  logger?.debug(`Discovered nested ${assetType} asset (${ruleType}) in CSS ${cssBaseContextUrl}: ${resolvedUrl}`);
337
393
  }
338
394
  };
339
395
 
340
- // ... (run regex loops calling processFoundUrl) ...
341
- urlRegex.lastIndex = 0;
342
- importRegex.lastIndex = 0;
396
+ // Find all url(...) matches in the CSS content
343
397
  let match;
344
398
  while ((match = urlRegex.exec(cssContent)) !== null) {
399
+ // Group 2 captures the URL part inside url()
345
400
  processFoundUrl(match[2], 'url()');
346
401
  }
402
+
403
+ // Find all @import matches in the CSS content
404
+ // Reset lastIndex as we're reusing the regex object implicitly
347
405
  importRegex.lastIndex = 0;
348
406
  while ((match = importRegex.exec(cssContent)) !== null) {
407
+ // Group 2 captures url('...'), Group 4 captures bare "..."
349
408
  processFoundUrl(match[2] || match[4], '@import');
350
409
  }
351
410
 
352
- return newlyDiscovered; // Return the list
411
+ // Return the list of assets discovered within this CSS content
412
+ return newlyDiscovered;
353
413
  }
354
414
 
355
- /**
356
- * Extracts all discoverable assets recursively from HTML and CSS.
357
- * @async
358
- * @export
359
- * @param {ParsedHTML} parsed - Initial parsed HTML data.
360
- * @param {boolean} [embedAssets=true] - Whether to embed content.
361
- * @param {string} [inputPathOrUrl] - Original HTML source location.
362
- * @param {Logger} [logger] - Optional logger instance.
363
- * @returns {Promise<ParsedHTML>} Processed data with all assets.
364
- */
365
415
  /**
366
416
  * Extracts all discoverable assets recursively from HTML and CSS.
367
417
  * Fetches assets if embedAssets is true or if the asset is CSS (to parse for more assets).
368
418
  * Resolves URLs relative to their context (HTML base or CSS file location).
419
+ * Handles potential infinite loops with an iteration limit.
420
+ *
369
421
  * @async
370
422
  * @export
371
423
  * @param {ParsedHTML} parsed - Initial parsed HTML data containing `htmlContent` and an initial `assets` array.
@@ -382,59 +434,65 @@ export async function extractAssets(
382
434
  ): Promise<ParsedHTML> {
383
435
  logger?.info(`🚀 Starting asset extraction! Embed: ${embedAssets}. Input: ${inputPathOrUrl || '(HTML content only)'}`);
384
436
 
437
+ // Get the initial list of assets found directly in the HTML
385
438
  const initialAssets: Asset[] = parsed.assets || [];
386
- // Stores the final result: Map<resolved URL string, Asset object>
439
+ // Stores the final result: Map<resolved URL string, Asset object> to ensure uniqueness
387
440
  const finalAssetsMap = new Map<string, Asset>();
388
- // Queue holds assets to be processed: { url: string (resolved), type: ..., content?: ... }
441
+ // Queue holds assets whose content needs to be processed (fetched/analyzed)
389
442
  let assetsToProcess: Asset[] = [];
443
+ // Set to track URLs that are either already fully processed (in finalAssetsMap)
444
+ // OR currently in the processing queue (assetsToProcess) to prevent reprocessing/loops.
445
+ const processedOrQueuedUrls = new Set<string>();
390
446
 
391
- // Determine the base URL context for resolving relative paths FROM THE HTML
447
+ // --- Determine Base URL Context for the HTML ---
392
448
  const htmlBaseContextUrl = determineBaseUrl(inputPathOrUrl || '', logger);
449
+ // Warn if no base URL could be found and there are relative paths in the initial assets
393
450
  if (!htmlBaseContextUrl && initialAssets.some(a => !/^[a-z]+:/i.test(a.url) && !a.url.startsWith('data:') && !a.url.startsWith('#') && !a.url.startsWith('/'))) {
394
451
  logger?.warn("🚨 No valid base path/URL determined for the HTML source! Resolution of relative asset paths from HTML may fail.");
395
452
  } else if (htmlBaseContextUrl) {
396
453
  logger?.debug(`Using HTML base context URL: ${htmlBaseContextUrl}`);
397
454
  }
398
455
 
399
- // --- CORRECTED: Define processedOrQueuedUrls HERE in the main function scope ---
400
- // Set to track URLs that are already processed (in finalAssetsMap) OR currently in the queue (assetsToProcess)
401
- // This prevents adding the same asset to the queue multiple times.
402
- const processedOrQueuedUrls = new Set<string>();
403
-
404
- // --- Initial Queue Population ---
456
+ // --- Initial Queue Population from HTML assets ---
405
457
  logger?.debug(`Queueing ${initialAssets.length} initial assets parsed from HTML...`);
406
458
  for (const asset of initialAssets) {
407
459
  // Resolve the initial asset URL against the HTML base context
408
460
  const resolvedUrlObj = resolveAssetUrl(asset.url, htmlBaseContextUrl, logger);
409
- // Use the resolved URL string if resolution succeeded, otherwise use the original
410
- const urlToQueue = resolvedUrlObj ? resolvedUrlObj.href : asset.url;
411
461
 
412
- // Skip data URIs and check if this URL is already tracked
413
- if (!urlToQueue.startsWith('data:') && !processedOrQueuedUrls.has(urlToQueue)) {
414
- processedOrQueuedUrls.add(urlToQueue); // Mark as queued
462
+ // Skip if URL is invalid, data URI, fragment, or unsupported protocol
463
+ if (!resolvedUrlObj) {
464
+ logger?.debug(` -> Skipping initial asset with unresolvable/ignorable URL: ${asset.url}`);
465
+ continue;
466
+ }
467
+ // Get the resolved absolute URL string
468
+ const urlToQueue = resolvedUrlObj.href;
469
+
470
+ // Check if this URL is already tracked (processed or queued)
471
+ if (!processedOrQueuedUrls.has(urlToQueue)) {
472
+ // Mark as queued (add to set *before* adding to array)
473
+ processedOrQueuedUrls.add(urlToQueue);
415
474
 
416
475
  // Guess the asset type from the URL being queued; the guess is only used when the asset has no type of its own
417
476
  const { assetType: guessedType } = guessMimeType(urlToQueue);
418
- const initialType = asset.type ?? guessedType;
477
+ const initialType = asset.type ?? guessedType; // Use provided type or fallback to guessed type
419
478
 
420
- // Add to the processing queue
479
+ // Add the resolved asset to the processing queue
421
480
  assetsToProcess.push({
422
- url: urlToQueue,
481
+ url: urlToQueue, // Use the resolved URL
423
482
  type: initialType,
424
- content: undefined
483
+ content: undefined // Content is initially undefined
425
484
  });
426
485
  logger?.debug(` -> Queued initial asset: ${urlToQueue} (Original raw: ${asset.url})`);
427
- } else if (urlToQueue.startsWith('data:')) {
428
- logger?.debug(` -> Skipping data URI: ${urlToQueue.substring(0, 50)}...`);
429
486
  } else {
430
- logger?.debug(` -> Skipping already queued initial asset: ${urlToQueue}`);
487
+ logger?.debug(` -> Skipping already processed/queued initial asset: ${urlToQueue}`);
431
488
  }
432
489
  }
433
490
 
434
- // --- Main processing loop ---
491
+ // --- Main processing loop (continues as long as there are assets to process) ---
435
492
  let iterationCount = 0;
436
493
  while (assetsToProcess.length > 0) {
437
494
  iterationCount++;
495
+ // Prevent potential infinite loops
438
496
  if (iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS) {
439
497
  logger?.error(`🛑 Asset extraction loop limit hit (${MAX_ASSET_EXTRACTION_ITERATIONS})! Aborting.`);
440
498
  const remainingUrls = assetsToProcess.map(a => a.url).slice(0, 10).join(', ');
@@ -442,175 +500,204 @@ export async function extractAssets(
442
500
  // Add assets remaining in queue to final map without content before breaking
443
501
  assetsToProcess.forEach(asset => {
444
502
  if (!finalAssetsMap.has(asset.url)) {
445
- finalAssetsMap.set(asset.url, { ...asset, content: undefined });
503
+ finalAssetsMap.set(asset.url, { ...asset, content: undefined });
446
504
  }
447
505
  });
448
- assetsToProcess = []; // Clear queue
506
+ assetsToProcess = []; // Clear queue to stop the loop
449
507
  break; // Exit loop
450
508
  }
451
509
 
452
- // Process assets in batches
510
+ // Take a snapshot of the current queue to process in this iteration
453
511
  const currentBatch = [...assetsToProcess];
454
- assetsToProcess = []; // Clear queue for the next batch discovered in this iteration
512
+ // Clear the main queue; new assets found in this batch will be added here for the *next* iteration
513
+ assetsToProcess = [];
455
514
 
456
515
  logger?.debug(`--- Processing batch ${iterationCount}: ${currentBatch.length} asset(s) ---`);
457
516
 
517
+ // Process each asset in the current batch
458
518
  for (const asset of currentBatch) {
459
- // Skip if already fully processed
519
+ // Double-check: Skip if this asset somehow got fully processed in a previous iteration (shouldn't happen with current logic, but safe check)
460
520
  if (finalAssetsMap.has(asset.url)) {
461
521
  logger?.debug(`Skipping asset already in final map: ${asset.url}`);
462
522
  continue;
463
523
  }
464
524
 
465
- let assetContentBuffer: Buffer | null = null;
466
- let finalContent: string | undefined = undefined; // For embedding
467
- let cssContentForParsing: string | undefined = undefined; // For CSS parsing
525
+ let assetContentBuffer: Buffer | null = null; // To store fetched binary content
526
+ let finalContent: string | undefined = undefined; // Final content (text or data URI) for the Asset object
527
+ let cssContentForParsing: string | undefined = undefined; // Text content specifically for parsing CSS
468
528
 
469
529
  // --- Determine if fetching is needed ---
530
+ // Fetch if we need to embed all assets OR if it's CSS (we need content to parse for nested assets)
470
531
  const needsFetching = embedAssets || asset.type === 'css';
471
- let assetUrlObj: URL | null = null;
532
+ let assetUrlObj: URL | null = null; // URL object needed for fetchAsset
472
533
 
473
534
  if (needsFetching) {
474
535
  // --- Create URL object for fetching ---
475
536
  try {
537
+ // Asset URL should be absolute at this point
476
538
  assetUrlObj = new URL(asset.url);
477
539
  } catch (urlError) {
540
+ // Log error if creating URL object fails
478
541
  logger?.warn(`Cannot create URL object for "${asset.url}", skipping fetch. Error: ${urlError instanceof Error ? urlError.message : String(urlError)}`);
542
+ // Store asset without content in the final map
479
543
  finalAssetsMap.set(asset.url, { ...asset, content: undefined });
480
- continue; // Skip to next asset in batch
544
+ // Skip to next asset in the current batch
545
+ continue;
481
546
  }
482
547
 
483
548
  // --- Fetch Asset ---
484
549
  if (assetUrlObj) {
550
+ // Call fetchAsset (which handles http/https/file and errors)
485
551
  assetContentBuffer = await fetchAsset(assetUrlObj, logger);
552
+ // fetchAsset returns null on failure
486
553
  }
487
554
  } // End if(needsFetching)
488
555
 
489
- // --- If fetching was needed but failed, add to map without content and skip ---
556
+ // --- If fetching was required but failed, store asset without content and continue ---
490
557
  if (needsFetching && assetContentBuffer === null) {
491
558
  logger?.debug(`Storing asset ${asset.url} without content due to fetch failure.`);
559
+ // Add to final map with undefined content
492
560
  finalAssetsMap.set(asset.url, { ...asset, content: undefined });
493
- continue; // Skip to next asset in batch
561
+ // Skip to the next asset in the current batch
562
+ continue;
494
563
  }
495
564
 
496
565
  // --- Prepare Content for Storing/Embedding (if fetched successfully) ---
497
566
  if (assetContentBuffer) { // Only proceed if content was fetched
498
- const mimeInfo = guessMimeType(asset.url);
499
- const effectiveMime = mimeInfo.mime || 'application/octet-stream';
500
-
501
- // Try to decode TEXT types as UTF-8
502
- if (TEXT_ASSET_TYPES.has(asset.type)) {
503
- let textContent: string | undefined;
504
- let wasLossy = false;
505
- try {
506
- textContent = assetContentBuffer.toString('utf-8');
507
- wasLossy = isUtf8DecodingLossy(assetContentBuffer, textContent);
508
- } catch (e) { textContent = undefined; wasLossy = true; }
509
-
510
- if (!wasLossy && textContent !== undefined) {
511
- // Store the decoded text content if embedding or it's CSS (for parsing)
512
- if (embedAssets) {
513
- finalContent = textContent;
514
- } else {
515
- finalContent = undefined; // Not embedding text
516
- }
517
- // If it's CSS, store it for parsing later regardless of embedding
518
- if (asset.type === 'css') {
519
- cssContentForParsing = textContent;
520
- }
521
- } else {
522
- // Decoding failed or was lossy
523
- logger?.warn(`Could not decode ${asset.type} ${asset.url} as valid UTF-8 text.${embedAssets ? ' Falling back to base64 data URI.' : ''}`);
524
- cssContentForParsing = undefined; // Cannot parse if decoding failed
525
- // Embed as base64 if requested
526
- if (embedAssets) {
527
- finalContent = `data:${effectiveMime};base64,${assetContentBuffer.toString('base64')}`;
528
- } else {
529
- finalContent = undefined; // Not embedding, content remains undefined
530
- }
531
- }
532
- }
533
- // Embed BINARY types as base64 if requested
534
- else if (BINARY_ASSET_TYPES.has(asset.type)) {
535
- if (embedAssets) {
536
- finalContent = `data:${effectiveMime};base64,${assetContentBuffer.toString('base64')}`;
537
- } else {
538
- finalContent = undefined; // Not embedding
539
- }
540
- cssContentForParsing = undefined; // Not CSS
541
- }
542
- // Handle 'other' types: try text, fallback to base64 if embedding
543
- else { // asset.type === 'other' or unknown
544
- cssContentForParsing = undefined; // Not CSS
545
- if (embedAssets) {
546
- try {
547
- const attemptedTextContent = assetContentBuffer.toString('utf-8');
548
- if (isUtf8DecodingLossy(assetContentBuffer, attemptedTextContent)) {
549
- logger?.warn(`Couldn't embed unclassified asset ${asset.url} as text due to invalid UTF-8 sequences. Falling back to base64 (octet-stream).`);
550
- finalContent = `data:application/octet-stream;base64,${assetContentBuffer.toString('base64')}`;
551
- } else {
552
- finalContent = attemptedTextContent;
553
- logger?.debug(`Successfully embedded unclassified asset ${asset.url} as text.`);
554
- }
555
- } catch (decodeError) {
556
- logger?.warn(`Error during text decoding for unclassified asset ${asset.url}: ${decodeError instanceof Error ? decodeError.message : String(decodeError)}. Falling back to base64.`);
557
- finalContent = `data:application/octet-stream;base64,${assetContentBuffer.toString('base64')}`;
558
- }
559
- } else {
560
- finalContent = undefined; // Not embedding
561
- }
562
- }
563
- } else {
564
- // Content was not fetched
565
- finalContent = undefined;
566
- cssContentForParsing = undefined;
567
+ // Guess MIME type based on the asset's URL extension
568
+ const mimeInfo = guessMimeType(asset.url);
569
+ // Use the guessed MIME type or fallback to a generic binary type
570
+ const effectiveMime = mimeInfo.mime || 'application/octet-stream';
571
+
572
+ // Handle TEXT types (CSS, JS)
573
+ if (TEXT_ASSET_TYPES.has(asset.type)) {
574
+ let textContent: string | undefined;
575
+ let wasLossy = false;
576
+ try {
577
+ // Try decoding the buffer as UTF-8
578
+ textContent = assetContentBuffer.toString('utf-8');
579
+ // Check if the decoding process lost information (e.g., invalid sequences replaced)
580
+ wasLossy = isUtf8DecodingLossy(assetContentBuffer, textContent);
581
+ } catch (e) {
582
+ // Decoding itself failed
583
+ textContent = undefined;
584
+ wasLossy = true;
585
+ }
586
+
587
+ // If decoding was successful and not lossy
588
+ if (!wasLossy && textContent !== undefined) {
589
+ // If embedding, store the text content
590
+ if (embedAssets) {
591
+ finalContent = textContent;
592
+ } else {
593
+ finalContent = undefined; // Not embedding text, store undefined
594
+ }
595
+ // If it's CSS, store its text content for parsing regardless of embedding option
596
+ if (asset.type === 'css') {
597
+ cssContentForParsing = textContent;
598
+ }
599
+ } else {
600
+ // Decoding failed or was lossy
601
+ // Warn about the failed decode (message wording: "<type> asset <url>"); the base64 fallback is mentioned only when embedding
602
+ logger?.warn(`Could not decode ${asset.type} asset ${asset.url} as valid UTF-8 text.${embedAssets ? ' Falling back to base64 data URI.' : ''}`);
603
+ cssContentForParsing = undefined; // Cannot parse CSS if decoding failed
604
+ // Embed as base64 data URI if requested, using the effective MIME type
605
+ if (embedAssets) {
606
+ finalContent = `data:${effectiveMime};base64,${assetContentBuffer.toString('base64')}`;
607
+ } else {
608
+ finalContent = undefined; // Not embedding
609
+ }
610
+ }
611
+ }
612
+ // Handle BINARY types (image, font, video, audio)
613
+ else if (BINARY_ASSET_TYPES.has(asset.type)) {
614
+ // Embed as base64 data URI if requested
615
+ if (embedAssets) {
616
+ finalContent = `data:${effectiveMime};base64,${assetContentBuffer.toString('base64')}`;
617
+ } else {
618
+ finalContent = undefined; // Not embedding
619
+ }
620
+ cssContentForParsing = undefined; // Not CSS, so no parsing needed
621
+ }
622
+ // Handle 'other' or unknown types
623
+ else {
624
+ cssContentForParsing = undefined; // Assume not parseable as CSS
625
+ // If embedding, attempt to store as text, fallback to base64 if invalid UTF-8
626
+ if (embedAssets) {
627
+ try {
628
+ const attemptedTextContent = assetContentBuffer.toString('utf-8');
629
+ if (isUtf8DecodingLossy(assetContentBuffer, attemptedTextContent)) {
630
+ // If text decoding is lossy, warn and use base64
631
+ logger?.warn(`Couldn't embed unclassified asset ${asset.url} as text due to invalid UTF-8 sequences. Falling back to base64 (octet-stream).`);
632
+ finalContent = `data:application/octet-stream;base64,${assetContentBuffer.toString('base64')}`;
633
+ } else {
634
+ // Store as text if decoding worked
635
+ finalContent = attemptedTextContent;
636
+ logger?.debug(`Successfully embedded unclassified asset ${asset.url} as text.`);
637
+ }
638
+ } catch (decodeError) {
639
+ // If toString fails, warn and use base64
640
+ logger?.warn(`Error during text decoding for unclassified asset ${asset.url}: ${decodeError instanceof Error ? decodeError.message : String(decodeError)}. Falling back to base64.`);
641
+ finalContent = `data:application/octet-stream;base64,${assetContentBuffer.toString('base64')}`;
642
+ }
643
+ } else {
644
+ finalContent = undefined; // Not embedding
645
+ }
646
+ }
647
+ } else { // Content was not fetched (e.g., embedAssets=false and not CSS)
648
+ finalContent = undefined;
649
+ cssContentForParsing = undefined;
567
650
  }
568
651
 
569
- // --- Store the final asset ---
570
- // Use the resolved URL as the key and in the asset object itself
652
+ // --- Store the final processed asset in the map ---
653
+ // Use the resolved URL as the key and ensure the asset object also uses the resolved URL
571
654
  finalAssetsMap.set(asset.url, { ...asset, url: asset.url, content: finalContent });
572
- // Note: URL is already marked in processedOrQueuedUrls
655
+ // Note: URL was already added to processedOrQueuedUrls when initially queued or discovered in CSS
573
656
 
574
657
  // --- Process CSS for nested assets ---
575
658
  // Only if it's CSS and we successfully decoded its content for parsing
576
659
  if (asset.type === 'css' && cssContentForParsing) {
577
- // Determine the base URL *for this specific CSS file*
578
- const cssBaseContextUrl = determineBaseUrl(asset.url, logger);
579
- logger?.debug(`CSS base context for resolving nested assets within ${asset.url}: ${cssBaseContextUrl}`);
660
+ // Determine the base URL *for this specific CSS file* to resolve its relative links
661
+ const cssBaseContextUrl = determineBaseUrl(asset.url, logger); // CSS URL is absolute here
662
+ logger?.debug(`CSS base context for resolving nested assets within ${asset.url}: ${cssBaseContextUrl}`);
580
663
 
581
664
  if (cssBaseContextUrl) {
582
- // Get the list of *potentially* new assets discovered in this CSS
665
+ // Extract URLs found within this CSS content
583
666
  const newlyDiscoveredAssets = extractUrlsFromCSS(
584
667
  cssContentForParsing,
585
- cssBaseContextUrl,
668
+ cssBaseContextUrl, // Use the CSS file's own URL as the base
586
669
  logger
587
670
  );
588
671
 
672
+ // If new assets were found in the CSS
589
673
  if (newlyDiscoveredAssets.length > 0) {
590
674
  logger?.debug(`Discovered ${newlyDiscoveredAssets.length} nested assets in CSS ${asset.url}. Checking against queue...`);
675
+ // Process each newly discovered asset
591
676
  for (const newAsset of newlyDiscoveredAssets) {
592
- // CHECK: Add to queue only if this resolved URL hasn't been processed OR queued before.
593
- // Use the 'processedOrQueuedUrls' Set which tracks both.
677
+ // CHECK: Add to the main processing queue only if this resolved URL hasn't been processed OR queued before.
594
678
  if (!processedOrQueuedUrls.has(newAsset.url)) {
595
679
  processedOrQueuedUrls.add(newAsset.url); // Mark as queued now
596
- assetsToProcess.push(newAsset); // Add to the main queue for the *next* iteration
597
- logger?.debug(` -> Queued new nested asset: ${newAsset.url}`);
680
+ assetsToProcess.push(newAsset); // Add to the queue for the *next* iteration
681
+ logger?.debug(` -> Queued new nested asset: ${newAsset.url}`);
598
682
  } else {
599
- logger?.debug(` -> Skipping already processed/queued nested asset: ${newAsset.url}`);
683
+ // Skip if already handled
684
+ logger?.debug(` -> Skipping already processed/queued nested asset: ${newAsset.url}`);
600
685
  }
601
686
  }
602
687
  }
603
688
  } else {
604
- logger?.warn(`Could not determine base URL context for CSS file ${asset.url}. Cannot resolve nested relative paths within it.`);
689
+ // Warn if the base URL for the CSS file couldn't be determined (shouldn't happen if asset.url was valid)
690
+ logger?.warn(`Could not determine base URL context for CSS file ${asset.url}. Cannot resolve nested relative paths within it.`);
605
691
  }
606
692
  } // End if(asset.type === 'css' && cssContentForParsing)
607
693
  } // End for loop over currentBatch
608
- } // End while loop
694
+ } // End while loop (assetsToProcess.length > 0)
609
695
 
610
- const finalIterationCount = iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS ? 'MAX+' : iterationCount;
696
+ // Log completion summary
697
+ const finalIterationCount = iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS ? `${MAX_ASSET_EXTRACTION_ITERATIONS}+ (limit hit)` : iterationCount;
611
698
  logger?.info(`✅ Asset extraction COMPLETE! Found ${finalAssetsMap.size} unique assets in ${finalIterationCount} iterations.`);
612
699
 
613
- // Return the original HTML content and the final list of processed assets
700
+ // Return the original HTML content and the final list of processed assets from the map
614
701
  return {
615
702
  htmlContent: parsed.htmlContent,
616
703
  assets: Array.from(finalAssetsMap.values())