portapack 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,21 +2,19 @@
2
2
  * @file src/core/extractor.ts
3
3
  * @description Handles discovery, resolution, fetching, and optional embedding of assets
4
4
  * linked from HTML and recursively within CSS (@import, url()). This is the heart of finding EVERYTHING.
5
- * @version 1.1.4 - Added console logs for debugging path/URL resolution. Refined determineBaseUrl.
5
+ * @version 1.1.6 - Revised fetchAsset error handling logic for Axios errors.
6
6
  */
7
7
 
8
8
  // === Node.js Core Imports ===
9
9
  import { readFile } from 'fs/promises';
10
10
  import * as fs from 'fs'; // Required for statSync for sync directory check
11
- import type { FileHandle } from 'fs/promises';
11
+ import type { FileHandle } from 'fs/promises'; // Import specific type if needed elsewhere
12
12
  import path from 'path';
13
13
  import { fileURLToPath, URL } from 'url'; // Crucial for file path/URL conversion
14
14
 
15
15
  // === External Dependencies ===
16
- // Using requireNamespace avoids potential ESM/CJS interop issues with mocks if they arise
17
- // const axios = require('axios'); // Alternative if import * causes issues with mocks
18
16
  import * as axiosNs from 'axios'; // Using namespace import for clarity
19
- import type { AxiosError, AxiosRequestConfig, AxiosResponse, InternalAxiosRequestConfig } from 'axios';
17
+ import type { AxiosError, AxiosRequestConfig, AxiosResponse, InternalAxiosRequestConfig } from 'axios'; // Import necessary types
20
18
 
21
19
  // === Project Imports ===
22
20
  import type { Asset, ParsedHTML } from '../types'; // Adjust path if needed
@@ -46,10 +44,12 @@ type NodeJSErrnoException = Error & { code?: string };
46
44
  */
47
45
  function isUtf8DecodingLossy(originalBuffer: Buffer, decodedString: string): boolean {
48
46
  try {
47
+ // Re-encode the decoded string back to a buffer using UTF-8
49
48
  const reEncodedBuffer = Buffer.from(decodedString, 'utf-8');
49
+ // Compare the re-encoded buffer with the original buffer
50
50
  return !originalBuffer.equals(reEncodedBuffer);
51
51
  } catch (e) {
52
- // Error during re-encoding likely means original wasn't valid UTF-8
52
+ // If an error occurs during re-encoding, it implies the original wasn't valid UTF-8
53
53
  return true;
54
54
  }
55
55
  }
@@ -62,9 +62,11 @@ function isUtf8DecodingLossy(originalBuffer: Buffer, decodedString: string): boo
62
62
  * @returns {string | undefined} The absolute base URL string ending in '/', or undefined if determination fails.
63
63
  */
64
64
  function determineBaseUrl(inputPathOrUrl: string, logger?: Logger): string | undefined {
65
- // [DEBUG LOG] Added for diagnostics
66
- console.log(`[DEBUG determineBaseUrl] Input: "${inputPathOrUrl}"`);
65
+ // Log the input for debugging purposes
66
+ // console.log(`[DEBUG determineBaseUrl] Input: "${inputPathOrUrl}"`); // Keep debug log commented unless needed
67
67
  logger?.debug(`Determining base URL for input: ${inputPathOrUrl}`);
68
+
69
+ // Handle invalid or empty input
68
70
  if (!inputPathOrUrl) {
69
71
  logger?.warn('Cannot determine base URL: inputPathOrUrl is empty or invalid.');
70
72
  return undefined;
@@ -74,20 +76,20 @@ function determineBaseUrl(inputPathOrUrl: string, logger?: Logger): string | und
74
76
  // Handle non-file URLs (HTTP, HTTPS)
75
77
  if (/^https?:\/\//i.test(inputPathOrUrl)) {
76
78
  const url = new URL(inputPathOrUrl);
77
- // Get URL up to the last slash in the path
79
+ // Construct the base URL by taking the path up to the last '/'
78
80
  url.pathname = url.pathname.substring(0, url.pathname.lastIndexOf('/') + 1);
79
- url.search = ''; url.hash = ''; // Clear query params/fragments
81
+ url.search = ''; // Remove query parameters
82
+ url.hash = ''; // Remove fragments
80
83
  const baseUrl = url.href;
81
84
  logger?.debug(`Determined remote base URL: ${baseUrl}`);
82
- // [DEBUG LOG] Added for diagnostics
83
- console.log(`[DEBUG determineBaseUrl] Determined Remote URL: "${baseUrl}"`);
84
- return baseUrl; // URLs from constructor usually end in '/' if path ends in '/'
85
+ // console.log(`[DEBUG determineBaseUrl] Determined Remote URL: "${baseUrl}"`); // Keep debug log commented unless needed
86
+ // Return the constructed base URL (usually ends in '/')
87
+ return baseUrl;
85
88
  }
86
89
  // Handle other protocols (warn and return undefined)
87
90
  else if (inputPathOrUrl.includes('://') && !inputPathOrUrl.startsWith('file:')) {
88
91
  logger?.warn(`Input "${inputPathOrUrl}" looks like a URL but uses an unsupported protocol. Cannot determine base URL.`);
89
- // [DEBUG LOG] Added for diagnostics
90
- console.log(`[DEBUG determineBaseUrl] Unsupported protocol.`);
92
+ // console.log(`[DEBUG determineBaseUrl] Unsupported protocol.`); // Keep debug log commented unless needed
91
93
  return undefined;
92
94
  }
93
95
  // Handle file paths and file: URLs
@@ -97,32 +99,31 @@ function determineBaseUrl(inputPathOrUrl: string, logger?: Logger): string | und
97
99
 
98
100
  // Convert input to an absolute path
99
101
  if (inputPathOrUrl.startsWith('file:')) {
102
+ // Convert file URL to path
100
103
  resourcePath = fileURLToPath(inputPathOrUrl);
101
104
  // file: URLs ending in / strongly suggest a directory
102
105
  isInputLikelyDirectory = inputPathOrUrl.endsWith('/');
103
106
  } else {
104
- resourcePath = path.resolve(inputPathOrUrl); // Resolve relative/absolute file paths
105
- // Check if the resolved path *actually* exists and is a directory
106
- // This distinguishes 'C:\path\to\dir' from 'C:\path\to\file.html'
107
- try {
108
- // Use statSync carefully - assumes it's available and works (or mocked)
109
- isInputLikelyDirectory = fs.statSync(resourcePath).isDirectory();
110
- } catch {
111
- // If stat fails (ENOENT, EACCES), assume it refers to a file path
112
- isInputLikelyDirectory = false;
113
- }
107
+ // Resolve relative/absolute file paths
108
+ resourcePath = path.resolve(inputPathOrUrl);
109
+ // Check if the resolved path *actually* exists and is a directory
110
+ try {
111
+ // Use statSync carefully - assumes it's available and works (or mocked)
112
+ isInputLikelyDirectory = fs.statSync(resourcePath).isDirectory();
113
+ } catch {
114
+ // If stat fails (ENOENT, EACCES), assume it refers to a file path
115
+ isInputLikelyDirectory = false;
116
+ }
114
117
  }
115
- // [DEBUG LOG] Added for diagnostics
116
- console.log(`[DEBUG determineBaseUrl] resourcePath: "${resourcePath}", isInputLikelyDirectory: ${isInputLikelyDirectory}`);
118
+ // console.log(`[DEBUG determineBaseUrl] resourcePath: "${resourcePath}", isInputLikelyDirectory: ${isInputLikelyDirectory}`); // Keep debug log commented unless needed
117
119
 
118
- // The base directory is the directory containing the resourcePath,
119
- // OR resourcePath itself if it was identified as a directory.
120
+ // The base directory is the directory containing the resourcePath,
121
+ // OR resourcePath itself if it was identified as a directory.
120
122
  const baseDirPath = isInputLikelyDirectory ? resourcePath : path.dirname(resourcePath);
121
- // [DEBUG LOG] Added for diagnostics
122
- console.log(`[DEBUG determineBaseUrl] Calculated baseDirPath: "${baseDirPath}"`);
123
+ // console.log(`[DEBUG determineBaseUrl] Calculated baseDirPath: "${baseDirPath}"`); // Keep debug log commented unless needed
123
124
 
124
125
  // Convert base directory path back to a file URL ending in '/'
125
- let normalizedPathForURL = baseDirPath.replace(/\\/g, '/'); // Use forward slashes
126
+ let normalizedPathForURL = baseDirPath.replace(/\\/g, '/'); // Use forward slashes for URL consistency
126
127
  // Ensure leading slash for Windows file URLs (e.g., /C:/...)
127
128
  if (/^[A-Z]:\//i.test(normalizedPathForURL) && !normalizedPathForURL.startsWith('/')) {
128
129
  normalizedPathForURL = '/' + normalizedPathForURL;
@@ -132,19 +133,18 @@ function determineBaseUrl(inputPathOrUrl: string, logger?: Logger): string | und
132
133
  normalizedPathForURL += '/';
133
134
  }
134
135
 
136
+ // Create the final file URL object and get its string representation
135
137
  const fileUrl = new URL('file://' + normalizedPathForURL);
136
138
  const fileUrlString = fileUrl.href;
137
139
 
138
140
  logger?.debug(`Determined base URL: ${fileUrlString} (from: ${inputPathOrUrl}, resolved base dir: ${baseDirPath})`);
139
- // [DEBUG LOG] Added for diagnostics
140
- console.log(`[DEBUG determineBaseUrl] Determined File URL: "${fileUrlString}"`);
141
+ // console.log(`[DEBUG determineBaseUrl] Determined File URL: "${fileUrlString}"`); // Keep debug log commented unless needed
141
142
  return fileUrlString;
142
-
143
143
  }
144
144
  } catch (error: unknown) {
145
+ // Handle any errors during base URL determination
145
146
  const message = error instanceof Error ? error.message : String(error);
146
- // [DEBUG LOG] Added for diagnostics
147
- console.error(`[DEBUG determineBaseUrl] Error determining base URL: ${message}`);
147
+ // console.error(`[DEBUG determineBaseUrl] Error determining base URL: ${message}`); // Keep debug log commented unless needed
148
148
  logger?.error(`💀 Failed to determine base URL for "${inputPathOrUrl}": ${message}${error instanceof Error && error.stack ? ` - Stack: ${error.stack}` : ''}`);
149
149
  return undefined;
150
150
  }
@@ -159,8 +159,10 @@ function determineBaseUrl(inputPathOrUrl: string, logger?: Logger): string | und
159
159
  * @returns {URL | null} A validated, absolute URL object, or null if invalid/ignorable.
160
160
  */
161
161
  function resolveAssetUrl(assetUrl: string, baseContextUrl?: string, logger?: Logger): URL | null {
162
+ // Trim whitespace from the URL
162
163
  const trimmedUrl = assetUrl?.trim();
163
- // Ignore empty, data URIs, or fragment-only URLs
164
+
165
+ // Ignore empty URLs, data URIs, or fragment-only URLs
164
166
  if (!trimmedUrl || trimmedUrl.startsWith('data:') || trimmedUrl.startsWith('#')) {
165
167
  return null;
166
168
  }
@@ -170,34 +172,39 @@ function resolveAssetUrl(assetUrl: string, baseContextUrl?: string, logger?: Log
170
172
  // Handle protocol-relative URLs (e.g., //example.com/image.png)
171
173
  if (resolvableUrl.startsWith('//') && baseContextUrl) {
172
174
  try {
175
+ // Prepend the protocol from the base context URL
173
176
  const base = new URL(baseContextUrl);
174
- resolvableUrl = base.protocol + resolvableUrl; // Prepend the base protocol (http: or https:)
177
+ resolvableUrl = base.protocol + resolvableUrl;
175
178
  } catch (e) {
179
+ // Log a warning if the base protocol cannot be determined
176
180
  logger?.warn(`Could not extract protocol from base "${baseContextUrl}" for protocol-relative URL "${trimmedUrl}". Skipping.`);
177
181
  return null;
178
182
  }
179
183
  }
180
184
 
181
185
  try {
182
- // Use URL constructor for resolution. Handles absolute, relative paths, ../ etc.
183
- // baseContextUrl provides the context for resolving relative URLs.
186
+ // Use URL constructor for resolution. Handles absolute paths, relative paths, ../ etc.
184
187
  const resolved = new URL(resolvableUrl, baseContextUrl);
185
- // Don't attempt to fetch ws://, mailto:, etc. Add protocols as needed.
188
+
189
+ // Skip assets with unsupported protocols (e.g., mailto:, ws:)
186
190
  if (!['http:', 'https:', 'file:'].includes(resolved.protocol)) {
187
- logger?.debug(`Skipping asset with unsupported protocol: ${resolved.href}`);
188
- return null;
191
+ logger?.debug(`Skipping asset with unsupported protocol: ${resolved.href}`);
192
+ return null;
189
193
  }
194
+ // Return the resolved URL object
190
195
  return resolved;
191
196
  } catch (error: unknown) {
192
- // Log errors during URL parsing/resolution but don't halt the process
197
+ // Log errors during URL parsing/resolution
193
198
  const message = error instanceof Error ? error.message : String(error);
194
- // Avoid warning for relative paths when no base was provided (e.g., direct HTML string input)
199
+ // Avoid redundant warnings for relative paths when no base context was provided (expected failure)
195
200
  if (!/^[a-z]+:/i.test(resolvableUrl) && !resolvableUrl.startsWith('/') && !baseContextUrl) {
196
201
  logger?.warn(`Cannot resolve relative URL "${resolvableUrl}" - Base context URL was not provided or determined.`);
197
202
  } else {
203
+ // Log other resolution failures
198
204
  logger?.warn(`⚠️ Failed to parse/resolve URL "${resolvableUrl}" ${baseContextUrl ? 'against base "' + baseContextUrl + '"' : '(no base provided)'}: ${message}`);
199
205
  }
200
- return null; // Return null if resolution fails
206
+ // Return null if resolution fails
207
+ return null;
201
208
  }
202
209
  }
203
210
 
@@ -214,29 +221,27 @@ function resolveCssRelativeUrl(
214
221
  cssBaseContextUrl: string, // e.g., file:///C:/mock/base/dir/css/deep.css or https://.../style.css
215
222
  logger?: Logger
216
223
  ): string | null {
217
- // [DEBUG LOG] Added for diagnostics
218
- console.log(`[DEBUG resolveCssRelativeUrl] Input: relative="${relativeUrl}", base="${cssBaseContextUrl}"`);
224
+ // console.log(`[DEBUG resolveCssRelativeUrl] Input: relative="${relativeUrl}", base="${cssBaseContextUrl}"`); // Keep debug log commented unless needed
219
225
 
226
+ // Ignore empty, data URIs, or fragments
220
227
  if (!relativeUrl || relativeUrl.startsWith('data:') || relativeUrl.startsWith('#')) {
221
- return null; // Ignore empty, data URIs, or fragments
228
+ return null;
222
229
  }
223
230
 
224
231
  try {
225
232
  // Use the URL constructor which correctly handles relative paths including ../
226
- // relative to the base URL provided.
233
+ // relative to the base URL provided (the CSS file's URL).
227
234
  const resolvedUrl = new URL(relativeUrl, cssBaseContextUrl);
228
-
229
- // [DEBUG LOG] Added for diagnostics
230
- console.log(`[DEBUG resolveCssRelativeUrl] Resolved URL object href: "${resolvedUrl.href}"`);
231
- return resolvedUrl.href; // Return the resolved absolute URL string
235
+ // console.log(`[DEBUG resolveCssRelativeUrl] Resolved URL object href: "${resolvedUrl.href}"`); // Keep debug log commented unless needed
236
+ // Return the resolved absolute URL string
237
+ return resolvedUrl.href;
232
238
 
233
239
  } catch (error) {
234
- // Log warning if URL resolution fails for some reason
240
+ // Log warning if URL resolution fails
235
241
  logger?.warn(
236
242
  `Failed to resolve CSS URL: "${relativeUrl}" relative to "${cssBaseContextUrl}": ${String(error)}`
237
243
  );
238
- // [DEBUG LOG] Added for diagnostics
239
- console.error(`[DEBUG resolveCssRelativeUrl] Error resolving: ${String(error)}`);
244
+ // console.error(`[DEBUG resolveCssRelativeUrl] Error resolving: ${String(error)}`); // Keep debug log commented unless needed
240
245
  return null;
241
246
  }
242
247
  }
@@ -251,92 +256,92 @@ function resolveCssRelativeUrl(
251
256
  * @returns {Promise<Buffer | null>} Asset content as a Buffer, or null on failure.
252
257
  */
253
258
  async function fetchAsset(resolvedUrl: URL, logger?: Logger, timeout: number = 10000): Promise<Buffer | null> {
254
- // [DEBUG LOG] Added for diagnostics
255
- console.log(`[DEBUG fetchAsset] Attempting fetch for URL: ${resolvedUrl.href}`);
259
+ // console.log(`[DEBUG fetchAsset] Attempting fetch for URL: ${resolvedUrl.href}`); // Keep debug log commented unless needed
256
260
  logger?.debug(`Attempting to fetch asset: ${resolvedUrl.href}`);
257
261
  const protocol = resolvedUrl.protocol;
258
262
 
259
263
  try {
264
+ // Handle HTTP and HTTPS protocols
260
265
  if (protocol === 'http:' || protocol === 'https:') {
261
- // Use axios namespace import's default property
266
+ // Use axios to fetch remote content as an ArrayBuffer
262
267
  const response: AxiosResponse<ArrayBuffer> = await axiosNs.default.get(resolvedUrl.href, {
263
- responseType: 'arraybuffer', timeout: timeout,
268
+ responseType: 'arraybuffer', // Fetch as binary data
269
+ timeout: timeout, // Apply network timeout
264
270
  });
265
- logger?.debug(`Workspaceed remote asset ${resolvedUrl.href} (Status: ${response.status}, Type: ${response.headers['content-type'] || 'N/A'}, Size: ${response.data?.byteLength ?? 0} bytes)`);
266
- // [DEBUG LOG] Added for diagnostics
267
- console.log(`[DEBUG fetchAsset] HTTP fetch SUCCESS for: ${resolvedUrl.href}, Status: ${response.status}`);
268
- return Buffer.from(response.data);
269
- } else if (protocol === 'file:') {
271
+ logger?.debug(`Workspaceed remote asset ${resolvedUrl.href} (Status: ${response.status}, Type: ${response.headers['content-type'] || 'N/A'}, Size: ${response.data?.byteLength ?? 0} bytes)`);
272
+ // console.log(`[DEBUG fetchAsset] HTTP fetch SUCCESS for: ${resolvedUrl.href}, Status: ${response.status}`); // Keep debug log commented unless needed
273
+ // Return the fetched data as a Node.js Buffer
274
+ return Buffer.from(response.data);
275
+ }
276
+ // Handle file protocol
277
+ else if (protocol === 'file:') {
270
278
  let filePath: string;
271
279
  try {
272
- // Convert file URL to path. IMPORTANT: This strips query params and fragments.
280
+ // Convert file URL to a system file path
281
+ // IMPORTANT: This strips query params and fragments from the URL
273
282
  filePath = fileURLToPath(resolvedUrl);
274
283
  } catch (e: any) {
275
- // [DEBUG LOG] Added for diagnostics
276
- console.error(`[DEBUG fetchAsset] fileURLToPath FAILED for: ${resolvedUrl.href}`, e);
277
- logger?.error(`Could not convert file URL to path: ${resolvedUrl.href}. Error: ${e.message}`);
278
- return null;
284
+ // console.error(`[DEBUG fetchAsset] fileURLToPath FAILED for: ${resolvedUrl.href}`, e); // Keep debug log commented unless needed
285
+ logger?.error(`Could not convert file URL to path: ${resolvedUrl.href}. Error: ${e.message}`);
286
+ return null; // Return null if conversion fails
279
287
  }
280
288
 
281
289
  const normalizedForLog = path.normalize(filePath);
282
- // [DEBUG LOG] Added for diagnostics
283
- console.log(`[DEBUG fetchAsset] Attempting readFile with path: "${normalizedForLog}" (Original from URL: "${filePath}")`);
290
+ // console.log(`[DEBUG fetchAsset] Attempting readFile with path: "${normalizedForLog}" (Original from URL: "${filePath}")`); // Keep debug log commented unless needed
284
291
 
285
- // Read file using fs/promises
292
+ // Read file content using fs/promises
286
293
  const data = await readFile(filePath); // This call uses the mock in tests
287
294
 
288
- // [DEBUG LOG] Added for diagnostics
289
- console.log(`[DEBUG fetchAsset] readFile call SUCCEEDED for path: "${normalizedForLog}". Data length: ${data?.byteLength}`);
290
- logger?.debug(`Read local file ${filePath} (${data.byteLength} bytes)`);
295
+ // console.log(`[DEBUG fetchAsset] readFile call SUCCEEDED for path: "${normalizedForLog}". Data length: ${data?.byteLength}`); // Keep debug log commented unless needed
296
+ logger?.debug(`Read local file ${filePath} (${data.byteLength} bytes)`);
297
+ // Return the file content as a Buffer
291
298
  return data;
292
- } else {
293
- // [DEBUG LOG] Added for diagnostics
294
- console.log(`[DEBUG fetchAsset] Unsupported protocol: ${protocol}`);
299
+ }
300
+ // Handle unsupported protocols
301
+ else {
302
+ // console.log(`[DEBUG fetchAsset] Unsupported protocol: ${protocol}`); // Keep debug log commented unless needed
295
303
  logger?.warn(`Unsupported protocol "${protocol}" in URL: ${resolvedUrl.href}`);
296
- return null;
304
+ return null;
297
305
  }
298
306
  } catch (error: unknown) {
299
- // [DEBUG LOG] Added for diagnostics
300
- const failedId = protocol === 'file:' ? path.normalize(fileURLToPath(resolvedUrl)) : resolvedUrl.href;
301
- console.error(`[DEBUG fetchAsset] fetch/read FAILED for: "${failedId}". Error:`, error);
302
-
303
- // --- Handle Errors Based on Protocol/Context ---
304
- // Use the imported namespace directly for isAxiosError check
305
- if ((protocol === 'http:' || protocol === 'https:') && axiosNs.isAxiosError(error)) {
306
- const status = error.response?.status ?? 'N/A';
307
- const statusText = error.response?.statusText ?? 'Error';
308
- const code = error.code ?? 'N/A';
309
- const message = error.message;
310
- // Format consistent with test expectations
311
- const logMessage = `⚠️ Failed to fetch remote asset ${resolvedUrl.href}: Status ${status} - ${statusText}. Code: ${code}, Message: ${message}`;
307
+ // --- Handle Errors During Fetch/Read ---
308
+ const failedId = protocol === 'file:' ? path.normalize(fileURLToPath(resolvedUrl)) : resolvedUrl.href;
309
+ // console.error(`[DEBUG fetchAsset] CAUGHT Error for ${failedId}. Type: ${Object.prototype.toString.call(error)}, Constructor: ${error?.constructor?.name}, isAxiosError property: ${(error as any)?.isAxiosError}, Code: ${(error as any)?.code}`); // Keep for debugging if needed
310
+
311
+ // *** FIXED LOGIC: Check for AxiosError using its property *before* generic instanceof Error ***
312
+ if ((protocol === 'http:' || protocol === 'https:') && (error as any)?.isAxiosError === true) {
313
+ const axiosError = error as AxiosError; // Cast for easier property access
314
+ const status = axiosError.response?.status ?? 'N/A';
315
+ const code = axiosError.code ?? 'N/A'; // e.g., ECONNABORTED for timeout
316
+ // Use the specific log format
317
+ const logMessage = `⚠️ Failed to fetch remote asset ${resolvedUrl.href}: ${axiosError.message} (Code: ${code})`;
312
318
  logger?.warn(logMessage);
313
319
  }
314
- // Check for specific FS errors (only relevant if protocol was file:)
315
- if (error instanceof Error && (error as { code?: string }).code === 'ENOENT') {
316
- let failedPath = resolvedUrl.href; // Fallback path for logging if conversion fails
320
+ // Check for file system errors *next*
321
+ else if (protocol === 'file:' && error instanceof Error) {
322
+ let failedPath = resolvedUrl.href;
317
323
  try { failedPath = fileURLToPath(resolvedUrl); } catch { /* ignore */ }
318
- failedPath = path.normalize(failedPath); // Normalize for consistent logging
324
+ failedPath = path.normalize(failedPath);
319
325
 
320
- if (error instanceof Error && (error as NodeJSErrnoException).code === 'ENOENT') {
326
+ if ((error as NodeJSErrnoException).code === 'ENOENT') {
321
327
  logger?.warn(`⚠️ File not found (ENOENT) for asset: ${failedPath}.`);
322
- } else if (error instanceof Error && (error as NodeJSErrnoException).code === 'EACCES') {
323
- // Log EACCES specifically for tests to catch if needed
324
- logger?.warn(`⚠️ Permission denied (EACCES) reading asset: ${failedPath}.`);
325
- // Also log the more generic message that the test currently expects
326
- logger?.warn(`⚠️ Failed to read local asset ${failedPath}: ${error.message}`);
327
- } else if (error instanceof Error) {
328
- logger?.warn(`⚠️ Failed to read local asset ${failedPath}: ${error.message}`);
328
+ } else if ((error as NodeJSErrnoException).code === 'EACCES') {
329
+ // Log ONLY the specific EACCES message
330
+ logger?.warn(`⚠️ Permission denied (EACCES) reading asset: ${failedPath}.`);
329
331
  } else {
330
- logger?.warn(`⚠️ An unknown error occurred while reading local asset ${failedPath}: ${String(error)}`);
332
+ logger?.warn(`⚠️ Failed to read local asset ${failedPath}: ${error.message}`);
331
333
  }
332
334
  }
333
- // Generic fallback for truly unexpected errors during fetch/read
335
+ // Generic fallback for *other* types of Errors (that are not Axios or known FS errors)
334
336
  else if (error instanceof Error) {
335
337
  logger?.warn(`⚠️ An unexpected error occurred processing asset ${resolvedUrl.href}: ${error.message}`);
336
- } else {
338
+ }
339
+ // Fallback for non-Error throws (e.g., strings, numbers)
340
+ else {
337
341
  logger?.warn(`⚠️ An unknown and unexpected error occurred processing asset ${resolvedUrl.href}: ${String(error)}`);
338
342
  }
339
- return null; // Return null on ANY fetch/read error caught here
343
+ // Return null on ANY error
344
+ return null;
340
345
  }
341
346
  }
342
347
 
@@ -353,50 +358,57 @@ function extractUrlsFromCSS(
353
358
  cssBaseContextUrl: string,
354
359
  logger?: Logger
355
360
  ): Asset[] {
361
+ // Array to hold assets discovered within this CSS content
356
362
  const newlyDiscovered: Asset[] = [];
357
- // Track URLs processed within this specific CSS file to avoid adding duplicates from the same file
363
+ // Set to track URLs processed within this specific CSS file to avoid adding duplicates from the same file
358
364
  const processedInThisParse = new Set<string>();
359
365
 
360
- // Regex for url(...) patterns, handling optional quotes
366
+ // Regex for url(...) patterns, handling optional quotes (non-greedy match for URL)
361
367
  const urlRegex = /url\(\s*(['"]?)(.*?)\1\s*\)/gi;
362
- // Regex for @import rules, handling url() or bare string, optional quotes
368
+ // Regex for @import rules, handling url() or bare string, optional quotes (non-greedy match for URL)
363
369
  const importRegex = /@import\s+(?:url\(\s*(['"]?)(.*?)\1\s*\)|(['"])(.*?)\3)\s*;/gi;
364
370
 
365
371
  /** Internal helper to process a found URL string */
366
372
  const processFoundUrl = (rawUrl: string | undefined, ruleType: '@import' | 'url()') => {
367
- if (!rawUrl || rawUrl.trim() === '' || rawUrl.startsWith('data:')) return;
373
+ // Skip if URL is empty, undefined, a data URI, or only a fragment
374
+ if (!rawUrl || rawUrl.trim() === '' || rawUrl.startsWith('data:') || rawUrl.startsWith('#')) return;
368
375
 
376
+ // Resolve the potentially relative URL against the CSS file's base URL
369
377
  const resolvedUrl = resolveCssRelativeUrl(rawUrl, cssBaseContextUrl, logger);
370
378
 
371
- // If successfully resolved and not already found in *this* CSS file
379
+ // If successfully resolved and not already found *in this specific CSS file*
372
380
  if (resolvedUrl && !processedInThisParse.has(resolvedUrl)) {
381
+ // Mark this resolved URL as processed for this CSS file
373
382
  processedInThisParse.add(resolvedUrl);
374
- const { assetType } = guessMimeType(resolvedUrl); // Guess type based on resolved URL
383
+ // Guess the asset type (css, image, font, etc.) based on the resolved URL
384
+ const { assetType } = guessMimeType(resolvedUrl);
375
385
 
376
- // Add to the list of assets discovered in this pass
386
+ // Add the discovered asset to the list for this CSS file
377
387
  newlyDiscovered.push({
378
388
  type: assetType,
379
- url: resolvedUrl, // The resolved absolute URL string
389
+ url: resolvedUrl, // Store the resolved absolute URL string
380
390
  content: undefined // Content will be fetched later if needed
381
391
  });
382
- logger?.debug(`Discovered nested ${assetType} asset (${ruleType}) in CSS ${cssBaseContextUrl}: ${resolvedUrl}`);
392
+ logger?.debug(`Discovered nested ${assetType} asset (${ruleType}) in CSS ${cssBaseContextUrl}: ${resolvedUrl}`);
383
393
  }
384
394
  };
385
395
 
386
- // Execute regex for url(...)
396
+ // Find all url(...) matches in the CSS content
387
397
  let match;
388
398
  while ((match = urlRegex.exec(cssContent)) !== null) {
389
- processFoundUrl(match[2], 'url()'); // Group 2 captures the URL part
399
+ // Group 2 captures the URL part inside url()
400
+ processFoundUrl(match[2], 'url()');
390
401
  }
391
402
 
392
- // Execute regex for @import
393
- // Reset lastIndex as we're using the same regex instance implicitly if defined outside loop
394
- importRegex.lastIndex = 0; // Explicitly reset
403
+ // Find all @import matches in the CSS content
404
+ // Reset lastIndex as we're reusing the regex object implicitly
405
+ importRegex.lastIndex = 0;
395
406
  while ((match = importRegex.exec(cssContent)) !== null) {
396
407
  // Group 2 captures url('...'), Group 4 captures bare "..."
397
408
  processFoundUrl(match[2] || match[4], '@import');
398
409
  }
399
410
 
411
+ // Return the list of assets discovered within this CSS content
400
412
  return newlyDiscovered;
401
413
  }
402
414
 
@@ -422,59 +434,65 @@ export async function extractAssets(
422
434
  ): Promise<ParsedHTML> {
423
435
  logger?.info(`🚀 Starting asset extraction! Embed: ${embedAssets}. Input: ${inputPathOrUrl || '(HTML content only)'}`);
424
436
 
437
+ // Get the initial list of assets found directly in the HTML
425
438
  const initialAssets: Asset[] = parsed.assets || [];
426
- // Stores the final result: Map<resolved URL string, Asset object>
439
+ // Stores the final result: Map<resolved URL string, Asset object> to ensure uniqueness
427
440
  const finalAssetsMap = new Map<string, Asset>();
428
- // Queue holds assets to be processed: { url: string (resolved), type: ..., content?: ... }
441
+ // Queue holds assets whose content needs to be processed (fetched/analyzed)
429
442
  let assetsToProcess: Asset[] = [];
430
- // Set to track URLs that are already processed (in finalAssetsMap) OR currently in the queue (assetsToProcess)
443
+ // Set to track URLs that are either already fully processed (in finalAssetsMap)
444
+ // OR currently in the processing queue (assetsToProcess) to prevent reprocessing/loops.
431
445
  const processedOrQueuedUrls = new Set<string>();
432
446
 
433
- // --- Determine Base URL Context ---
447
+ // --- Determine Base URL Context for the HTML ---
434
448
  const htmlBaseContextUrl = determineBaseUrl(inputPathOrUrl || '', logger);
449
+ // Warn if no base URL could be found and there are relative paths in the initial assets
435
450
  if (!htmlBaseContextUrl && initialAssets.some(a => !/^[a-z]+:/i.test(a.url) && !a.url.startsWith('data:') && !a.url.startsWith('#') && !a.url.startsWith('/'))) {
436
451
  logger?.warn("🚨 No valid base path/URL determined for the HTML source! Resolution of relative asset paths from HTML may fail.");
437
452
  } else if (htmlBaseContextUrl) {
438
453
  logger?.debug(`Using HTML base context URL: ${htmlBaseContextUrl}`);
439
454
  }
440
455
 
441
- // --- Initial Queue Population ---
456
+ // --- Initial Queue Population from HTML assets ---
442
457
  logger?.debug(`Queueing ${initialAssets.length} initial assets parsed from HTML...`);
443
458
  for (const asset of initialAssets) {
444
459
  // Resolve the initial asset URL against the HTML base context
445
460
  const resolvedUrlObj = resolveAssetUrl(asset.url, htmlBaseContextUrl, logger);
461
+
462
+ // Skip if URL is invalid, data URI, fragment, or unsupported protocol
446
463
  if (!resolvedUrlObj) {
447
- logger?.debug(` -> Skipping initial asset with unresolvable/ignorable URL: ${asset.url}`);
448
- continue; // Skip if URL is invalid or data URI etc.
464
+ logger?.debug(` -> Skipping initial asset with unresolvable/ignorable URL: ${asset.url}`);
465
+ continue;
449
466
  }
450
- const urlToQueue = resolvedUrlObj.href; // Use the resolved absolute URL string
467
+ // Get the resolved absolute URL string
468
+ const urlToQueue = resolvedUrlObj.href;
451
469
 
452
- // Skip data URIs and check if this URL is already tracked
453
- if (!urlToQueue.startsWith('data:') && !processedOrQueuedUrls.has(urlToQueue)) {
454
- processedOrQueuedUrls.add(urlToQueue); // Mark as queued
470
+ // Check if this URL is already tracked (processed or queued)
471
+ if (!processedOrQueuedUrls.has(urlToQueue)) {
472
+ // Mark as queued (add to set *before* adding to array)
473
+ processedOrQueuedUrls.add(urlToQueue);
455
474
 
456
475
  // Guess type from the resolved/original URL if not provided initially
457
476
  const { assetType: guessedType } = guessMimeType(urlToQueue);
458
- const initialType = asset.type ?? guessedType;
477
+ const initialType = asset.type ?? guessedType; // Use provided type or fallback to guessed type
459
478
 
460
- // Add to the processing queue
479
+ // Add the resolved asset to the processing queue
461
480
  assetsToProcess.push({
462
481
  url: urlToQueue, // Use the resolved URL
463
482
  type: initialType,
464
- content: undefined
483
+ content: undefined // Content is initially undefined
465
484
  });
466
- logger?.debug(` -> Queued initial asset: ${urlToQueue} (Original raw: ${asset.url})`);
467
- } else if (urlToQueue.startsWith('data:')) {
468
- logger?.debug(` -> Skipping data URI: ${urlToQueue.substring(0, 50)}...`);
485
+ logger?.debug(` -> Queued initial asset: ${urlToQueue} (Original raw: ${asset.url})`);
469
486
  } else {
470
- logger?.debug(` -> Skipping already processed/queued initial asset: ${urlToQueue}`);
487
+ logger?.debug(` -> Skipping already processed/queued initial asset: ${urlToQueue}`);
471
488
  }
472
489
  }
473
490
 
474
- // --- Main processing loop ---
491
+ // --- Main processing loop (continues as long as there are assets to process) ---
475
492
  let iterationCount = 0;
476
493
  while (assetsToProcess.length > 0) {
477
494
  iterationCount++;
495
+ // Prevent potential infinite loops
478
496
  if (iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS) {
479
497
  logger?.error(`🛑 Asset extraction loop limit hit (${MAX_ASSET_EXTRACTION_ITERATIONS})! Aborting.`);
480
498
  const remainingUrls = assetsToProcess.map(a => a.url).slice(0, 10).join(', ');
@@ -482,175 +500,204 @@ export async function extractAssets(
482
500
  // Add assets remaining in queue to final map without content before breaking
483
501
  assetsToProcess.forEach(asset => {
484
502
  if (!finalAssetsMap.has(asset.url)) {
485
- finalAssetsMap.set(asset.url, { ...asset, content: undefined });
503
+ finalAssetsMap.set(asset.url, { ...asset, content: undefined });
486
504
  }
487
505
  });
488
- assetsToProcess = []; // Clear queue
506
+ assetsToProcess = []; // Clear queue to stop the loop
489
507
  break; // Exit loop
490
508
  }
491
509
 
492
- // Process assets in batches for clarity in logs
510
+ // Take a snapshot of the current queue to process in this iteration
493
511
  const currentBatch = [...assetsToProcess];
494
- assetsToProcess = []; // Clear queue for the next batch discovered in this iteration
512
+ // Clear the main queue; new assets found in this batch will be added here for the *next* iteration
513
+ assetsToProcess = [];
495
514
 
496
515
  logger?.debug(`--- Processing batch ${iterationCount}: ${currentBatch.length} asset(s) ---`);
497
516
 
517
+ // Process each asset in the current batch
498
518
  for (const asset of currentBatch) {
499
- // Skip if already fully processed (e.g., added in a previous batch)
519
+ // Double-check: Skip if this asset somehow got fully processed in a previous iteration (shouldn't happen with current logic, but safe check)
500
520
  if (finalAssetsMap.has(asset.url)) {
501
- logger?.debug(`Skipping asset already in final map: ${asset.url}`);
521
+ logger?.debug(`Skipping asset already in final map: ${asset.url}`);
502
522
  continue;
503
523
  }
504
524
 
505
- let assetContentBuffer: Buffer | null = null;
506
- let finalContent: string | undefined = undefined; // For embedding
507
- let cssContentForParsing: string | undefined = undefined; // For CSS parsing
525
+ let assetContentBuffer: Buffer | null = null; // To store fetched binary content
526
+ let finalContent: string | undefined = undefined; // Final content (text or data URI) for the Asset object
527
+ let cssContentForParsing: string | undefined = undefined; // Text content specifically for parsing CSS
508
528
 
509
529
  // --- Determine if fetching is needed ---
510
- // Fetch if embedding everything OR if it's CSS (need content for parsing)
530
+ // Fetch if we need to embed all assets OR if it's CSS (we need content to parse for nested assets)
511
531
  const needsFetching = embedAssets || asset.type === 'css';
512
532
  let assetUrlObj: URL | null = null; // URL object needed for fetchAsset
513
533
 
514
534
  if (needsFetching) {
515
535
  // --- Create URL object for fetching ---
516
536
  try {
517
- assetUrlObj = new URL(asset.url); // Asset URL should be absolute here
537
+ // Asset URL should be absolute at this point
538
+ assetUrlObj = new URL(asset.url);
518
539
  } catch (urlError) {
519
- logger?.warn(`Cannot create URL object for "${asset.url}", skipping fetch. Error: ${urlError instanceof Error ? urlError.message : String(urlError)}`);
520
- finalAssetsMap.set(asset.url, { ...asset, content: undefined }); // Store asset without content
521
- continue; // Skip to next asset in batch
540
+ // Log error if creating URL object fails
541
+ logger?.warn(`Cannot create URL object for "${asset.url}", skipping fetch. Error: ${urlError instanceof Error ? urlError.message : String(urlError)}`);
542
+ // Store asset without content in the final map
543
+ finalAssetsMap.set(asset.url, { ...asset, content: undefined });
544
+ // Skip to next asset in the current batch
545
+ continue;
522
546
  }
523
547
 
524
548
  // --- Fetch Asset ---
525
549
  if (assetUrlObj) {
550
+ // Call fetchAsset (which handles http/https/file and errors)
526
551
  assetContentBuffer = await fetchAsset(assetUrlObj, logger);
527
552
  // fetchAsset returns null on failure
528
553
  }
529
554
  } // End if(needsFetching)
530
555
 
531
- // --- If fetching was needed but failed, store asset without content and skip ---
556
+ // --- If fetching was required but failed, store asset without content and continue ---
532
557
  if (needsFetching && assetContentBuffer === null) {
533
- logger?.debug(`Storing asset ${asset.url} without content due to fetch failure.`);
558
+ logger?.debug(`Storing asset ${asset.url} without content due to fetch failure.`);
559
+ // Add to final map with undefined content
534
560
  finalAssetsMap.set(asset.url, { ...asset, content: undefined });
535
- continue; // Skip to next asset in batch
561
+ // Skip to the next asset in the current batch
562
+ continue;
536
563
  }
537
564
 
538
565
  // --- Prepare Content for Storing/Embedding (if fetched successfully) ---
539
566
  if (assetContentBuffer) { // Only proceed if content was fetched
540
- const mimeInfo = guessMimeType(asset.url); // Guess MIME based on URL extension
541
- const effectiveMime = mimeInfo.mime || 'application/octet-stream'; // Fallback MIME
542
-
543
- // Try to decode TEXT types as UTF-8
544
- if (TEXT_ASSET_TYPES.has(asset.type)) {
545
- let textContent: string | undefined;
546
- let wasLossy = false;
547
- try {
548
- textContent = assetContentBuffer.toString('utf-8');
549
- wasLossy = isUtf8DecodingLossy(assetContentBuffer, textContent);
550
- } catch (e) { textContent = undefined; wasLossy = true; }
551
-
552
- if (!wasLossy && textContent !== undefined) {
553
- // If embedding, store the text content
554
- if (embedAssets) {
555
- finalContent = textContent;
556
- } else {
557
- finalContent = undefined; // Not embedding text
558
- }
559
- // If it's CSS, store its text content for parsing regardless of embedding
560
- if (asset.type === 'css') {
561
- cssContentForParsing = textContent;
562
- }
563
- } else {
564
- // Decoding failed or was lossy
565
- logger?.warn(`Could not decode ${asset.type} asset ${asset.url} as valid UTF-8 text.${embedAssets ? ' Falling back to base64 data URI.' : ''}`);
566
- cssContentForParsing = undefined; // Cannot parse if decoding failed
567
- // Embed as base64 data URI if requested
568
- if (embedAssets) {
569
- finalContent = `data:${effectiveMime};base64,${assetContentBuffer.toString('base64')}`;
570
- } else {
571
- finalContent = undefined;
572
- }
573
- }
574
- }
575
- // Embed BINARY types as base64 data URI if requested
576
- else if (BINARY_ASSET_TYPES.has(asset.type)) {
577
- if (embedAssets) {
578
- finalContent = `data:${effectiveMime};base64,${assetContentBuffer.toString('base64')}`;
579
- } else {
580
- finalContent = undefined; // Not embedding
581
- }
582
- cssContentForParsing = undefined; // Not CSS
583
- }
584
- // Handle 'other' types: attempt text decode, fallback to base64 if embedding
585
- else { // asset.type === 'other' or unknown
586
- cssContentForParsing = undefined; // Not CSS
587
- if (embedAssets) {
588
- try {
589
- const attemptedTextContent = assetContentBuffer.toString('utf-8');
590
- if (isUtf8DecodingLossy(assetContentBuffer, attemptedTextContent)) {
591
- logger?.warn(`Couldn't embed unclassified asset ${asset.url} as text due to invalid UTF-8 sequences. Falling back to base64 (octet-stream).`);
592
- finalContent = `data:application/octet-stream;base64,${assetContentBuffer.toString('base64')}`;
593
- } else {
594
- finalContent = attemptedTextContent;
595
- logger?.debug(`Successfully embedded unclassified asset ${asset.url} as text.`);
596
- }
597
- } catch (decodeError) {
598
- logger?.warn(`Error during text decoding for unclassified asset ${asset.url}: ${decodeError instanceof Error ? decodeError.message : String(decodeError)}. Falling back to base64.`);
599
- finalContent = `data:application/octet-stream;base64,${assetContentBuffer.toString('base64')}`;
600
- }
601
- } else {
602
- finalContent = undefined; // Not embedding
603
- }
604
- }
567
+ // Guess MIME type based on the asset's URL extension
568
+ const mimeInfo = guessMimeType(asset.url);
569
+ // Use the guessed MIME type or fallback to a generic binary type
570
+ const effectiveMime = mimeInfo.mime || 'application/octet-stream';
571
+
572
+ // Handle TEXT types (CSS, JS)
573
+ if (TEXT_ASSET_TYPES.has(asset.type)) {
574
+ let textContent: string | undefined;
575
+ let wasLossy = false;
576
+ try {
577
+ // Try decoding the buffer as UTF-8
578
+ textContent = assetContentBuffer.toString('utf-8');
579
+ // Check if the decoding process lost information (e.g., invalid sequences replaced)
580
+ wasLossy = isUtf8DecodingLossy(assetContentBuffer, textContent);
581
+ } catch (e) {
582
+ // Decoding itself failed
583
+ textContent = undefined;
584
+ wasLossy = true;
585
+ }
586
+
587
+ // If decoding was successful and not lossy
588
+ if (!wasLossy && textContent !== undefined) {
589
+ // If embedding, store the text content
590
+ if (embedAssets) {
591
+ finalContent = textContent;
592
+ } else {
593
+ finalContent = undefined; // Not embedding text, store undefined
594
+ }
595
+ // If it's CSS, store its text content for parsing regardless of embedding option
596
+ if (asset.type === 'css') {
597
+ cssContentForParsing = textContent;
598
+ }
599
+ } else {
600
+ // Decoding failed or was lossy
601
+ // Fixed log message: Added "asset" after type.
602
+ logger?.warn(`Could not decode ${asset.type} asset ${asset.url} as valid UTF-8 text.${embedAssets ? ' Falling back to base64 data URI.' : ''}`);
603
+ cssContentForParsing = undefined; // Cannot parse CSS if decoding failed
604
+ // Embed as base64 data URI if requested, using the effective MIME type
605
+ if (embedAssets) {
606
+ finalContent = `data:${effectiveMime};base64,${assetContentBuffer.toString('base64')}`;
607
+ } else {
608
+ finalContent = undefined; // Not embedding
609
+ }
610
+ }
611
+ }
612
+ // Handle BINARY types (image, font, video, audio)
613
+ else if (BINARY_ASSET_TYPES.has(asset.type)) {
614
+ // Embed as base64 data URI if requested
615
+ if (embedAssets) {
616
+ finalContent = `data:${effectiveMime};base64,${assetContentBuffer.toString('base64')}`;
617
+ } else {
618
+ finalContent = undefined; // Not embedding
619
+ }
620
+ cssContentForParsing = undefined; // Not CSS, so no parsing needed
621
+ }
622
+ // Handle 'other' or unknown types
623
+ else {
624
+ cssContentForParsing = undefined; // Assume not parseable as CSS
625
+ // If embedding, attempt to store as text, fallback to base64 if invalid UTF-8
626
+ if (embedAssets) {
627
+ try {
628
+ const attemptedTextContent = assetContentBuffer.toString('utf-8');
629
+ if (isUtf8DecodingLossy(assetContentBuffer, attemptedTextContent)) {
630
+ // If text decoding is lossy, warn and use base64
631
+ logger?.warn(`Couldn't embed unclassified asset ${asset.url} as text due to invalid UTF-8 sequences. Falling back to base64 (octet-stream).`);
632
+ finalContent = `data:application/octet-stream;base64,${assetContentBuffer.toString('base64')}`;
633
+ } else {
634
+ // Store as text if decoding worked
635
+ finalContent = attemptedTextContent;
636
+ logger?.debug(`Successfully embedded unclassified asset ${asset.url} as text.`);
637
+ }
638
+ } catch (decodeError) {
639
+ // If toString fails, warn and use base64
640
+ logger?.warn(`Error during text decoding for unclassified asset ${asset.url}: ${decodeError instanceof Error ? decodeError.message : String(decodeError)}. Falling back to base64.`);
641
+ finalContent = `data:application/octet-stream;base64,${assetContentBuffer.toString('base64')}`;
642
+ }
643
+ } else {
644
+ finalContent = undefined; // Not embedding
645
+ }
646
+ }
605
647
  } else { // Content was not fetched (e.g., embedAssets=false and not CSS)
606
- finalContent = undefined;
607
- cssContentForParsing = undefined;
648
+ finalContent = undefined;
649
+ cssContentForParsing = undefined;
608
650
  }
609
651
 
610
- // --- Store the final asset ---
611
- // Use the resolved URL as the key and in the asset object itself
652
+ // --- Store the final processed asset in the map ---
653
+ // Use the resolved URL as the key and ensure the asset object also uses the resolved URL
612
654
  finalAssetsMap.set(asset.url, { ...asset, url: asset.url, content: finalContent });
613
- // Note: URL was already added to processedOrQueuedUrls when initially queued or discovered
655
+ // Note: URL was already added to processedOrQueuedUrls when initially queued or discovered in CSS
614
656
 
615
657
  // --- Process CSS for nested assets ---
616
658
  // Only if it's CSS and we successfully decoded its content for parsing
617
659
  if (asset.type === 'css' && cssContentForParsing) {
618
660
  // Determine the base URL *for this specific CSS file* to resolve its relative links
619
- const cssBaseContextUrl = determineBaseUrl(asset.url, logger); // CSS URL is absolute here
620
- logger?.debug(`CSS base context for resolving nested assets within ${asset.url}: ${cssBaseContextUrl}`);
661
+ const cssBaseContextUrl = determineBaseUrl(asset.url, logger); // CSS URL is absolute here
662
+ logger?.debug(`CSS base context for resolving nested assets within ${asset.url}: ${cssBaseContextUrl}`);
621
663
 
622
664
  if (cssBaseContextUrl) {
623
- // Get the list of *potentially* new assets discovered in this CSS file's content
665
+ // Extract URLs found within this CSS content
624
666
  const newlyDiscoveredAssets = extractUrlsFromCSS(
625
667
  cssContentForParsing,
626
- cssBaseContextUrl, // Use CSS file's base URL
668
+ cssBaseContextUrl, // Use the CSS file's own URL as the base
627
669
  logger
628
670
  );
629
671
 
672
+ // If new assets were found in the CSS
630
673
  if (newlyDiscoveredAssets.length > 0) {
631
- logger?.debug(`Discovered ${newlyDiscoveredAssets.length} nested assets in CSS ${asset.url}. Checking against queue...`);
674
+ logger?.debug(`Discovered ${newlyDiscoveredAssets.length} nested assets in CSS ${asset.url}. Checking against queue...`);
675
+ // Process each newly discovered asset
632
676
  for (const newAsset of newlyDiscoveredAssets) {
633
- // CHECK: Add to queue only if this resolved URL hasn't been processed OR queued before.
634
- if (!processedOrQueuedUrls.has(newAsset.url)) {
635
- processedOrQueuedUrls.add(newAsset.url); // Mark as queued now
636
- assetsToProcess.push(newAsset); // Add to the main queue for the *next* iteration
637
- logger?.debug(` -> Queued new nested asset: ${newAsset.url}`);
638
- } else {
639
- logger?.debug(` -> Skipping already processed/queued nested asset: ${newAsset.url}`);
640
- }
677
+ // CHECK: Add to the main processing queue only if this resolved URL hasn't been processed OR queued before.
678
+ if (!processedOrQueuedUrls.has(newAsset.url)) {
679
+ processedOrQueuedUrls.add(newAsset.url); // Mark as queued now
680
+ assetsToProcess.push(newAsset); // Add to the queue for the *next* iteration
681
+ logger?.debug(` -> Queued new nested asset: ${newAsset.url}`);
682
+ } else {
683
+ // Skip if already handled
684
+ logger?.debug(` -> Skipping already processed/queued nested asset: ${newAsset.url}`);
685
+ }
641
686
  }
642
687
  }
643
688
  } else {
644
- logger?.warn(`Could not determine base URL context for CSS file ${asset.url}. Cannot resolve nested relative paths within it.`);
689
+ // Warn if the base URL for the CSS file couldn't be determined (shouldn't happen if asset.url was valid)
690
+ logger?.warn(`Could not determine base URL context for CSS file ${asset.url}. Cannot resolve nested relative paths within it.`);
645
691
  }
646
692
  } // End if(asset.type === 'css' && cssContentForParsing)
647
693
  } // End for loop over currentBatch
648
- } // End while loop
694
+ } // End while loop (assetsToProcess.length > 0)
649
695
 
650
- const finalIterationCount = iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS ? 'MAX+' : iterationCount;
696
+ // Log completion summary
697
+ const finalIterationCount = iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS ? `${MAX_ASSET_EXTRACTION_ITERATIONS}+ (limit hit)` : iterationCount;
651
698
  logger?.info(`✅ Asset extraction COMPLETE! Found ${finalAssetsMap.size} unique assets in ${finalIterationCount} iterations.`);
652
699
 
653
- // Return the original HTML content and the final list of processed assets
700
+ // Return the original HTML content and the final list of processed assets from the map
654
701
  return {
655
702
  htmlContent: parsed.htmlContent,
656
703
  assets: Array.from(finalAssetsMap.values())