portapack 0.3.0 → 0.3.1
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- package/.github/workflows/ci.yml +5 -4
- package/CHANGELOG.md +8 -0
- package/README.md +8 -13
- package/dist/cli/cli-entry.cjs +17 -38
- package/dist/cli/cli-entry.cjs.map +1 -1
- package/dist/index.js +17 -38
- package/dist/index.js.map +1 -1
- package/docs/.vitepress/config.ts +0 -1
- package/docs/cli.md +14 -67
- package/docs/configuration.md +101 -116
- package/docs/getting-started.md +74 -44
- package/package.json +1 -1
- package/src/core/extractor.ts +295 -248
- package/tests/unit/cli/cli.test.ts +1 -1
- package/tests/unit/core/extractor.test.ts +412 -208
- package/tests/unit/core/web-fetcher.test.ts +67 -67
- package/tsconfig.jest.json +1 -0
- package/docs/demo.md +0 -46
package/src/core/extractor.ts
CHANGED
@@ -2,21 +2,19 @@
|
|
2
2
|
* @file src/core/extractor.ts
|
3
3
|
* @description Handles discovery, resolution, fetching, and optional embedding of assets
|
4
4
|
* linked from HTML and recursively within CSS (@import, url()). This is the heart of finding EVERYTHING.
|
5
|
-
* @version 1.1.
|
5
|
+
* @version 1.1.6 - Revised fetchAsset error handling logic for Axios errors.
|
6
6
|
*/
|
7
7
|
|
8
8
|
// === Node.js Core Imports ===
|
9
9
|
import { readFile } from 'fs/promises';
|
10
10
|
import * as fs from 'fs'; // Required for statSync for sync directory check
|
11
|
-
import type { FileHandle } from 'fs/promises';
|
11
|
+
import type { FileHandle } from 'fs/promises'; // Import specific type if needed elsewhere
|
12
12
|
import path from 'path';
|
13
13
|
import { fileURLToPath, URL } from 'url'; // Crucial for file path/URL conversion
|
14
14
|
|
15
15
|
// === External Dependencies ===
|
16
|
-
// Using requireNamespace avoids potential ESM/CJS interop issues with mocks if they arise
|
17
|
-
// const axios = require('axios'); // Alternative if import * causes issues with mocks
|
18
16
|
import * as axiosNs from 'axios'; // Using namespace import for clarity
|
19
|
-
import type { AxiosError, AxiosRequestConfig, AxiosResponse, InternalAxiosRequestConfig } from 'axios';
|
17
|
+
import type { AxiosError, AxiosRequestConfig, AxiosResponse, InternalAxiosRequestConfig } from 'axios'; // Import necessary types
|
20
18
|
|
21
19
|
// === Project Imports ===
|
22
20
|
import type { Asset, ParsedHTML } from '../types'; // Adjust path if needed
|
@@ -46,10 +44,12 @@ type NodeJSErrnoException = Error & { code?: string };
|
|
46
44
|
*/
|
47
45
|
function isUtf8DecodingLossy(originalBuffer: Buffer, decodedString: string): boolean {
|
48
46
|
try {
|
47
|
+
// Re-encode the decoded string back to a buffer using UTF-8
|
49
48
|
const reEncodedBuffer = Buffer.from(decodedString, 'utf-8');
|
49
|
+
// Compare the re-encoded buffer with the original buffer
|
50
50
|
return !originalBuffer.equals(reEncodedBuffer);
|
51
51
|
} catch (e) {
|
52
|
-
//
|
52
|
+
// If an error occurs during re-encoding, it implies the original wasn't valid UTF-8
|
53
53
|
return true;
|
54
54
|
}
|
55
55
|
}
|
@@ -62,9 +62,11 @@ function isUtf8DecodingLossy(originalBuffer: Buffer, decodedString: string): boo
|
|
62
62
|
* @returns {string | undefined} The absolute base URL string ending in '/', or undefined if determination fails.
|
63
63
|
*/
|
64
64
|
function determineBaseUrl(inputPathOrUrl: string, logger?: Logger): string | undefined {
|
65
|
-
//
|
66
|
-
console.log(`[DEBUG determineBaseUrl] Input: "${inputPathOrUrl}"`);
|
65
|
+
// Log the input for debugging purposes
|
66
|
+
// console.log(`[DEBUG determineBaseUrl] Input: "${inputPathOrUrl}"`); // Keep debug log commented unless needed
|
67
67
|
logger?.debug(`Determining base URL for input: ${inputPathOrUrl}`);
|
68
|
+
|
69
|
+
// Handle invalid or empty input
|
68
70
|
if (!inputPathOrUrl) {
|
69
71
|
logger?.warn('Cannot determine base URL: inputPathOrUrl is empty or invalid.');
|
70
72
|
return undefined;
|
@@ -74,20 +76,20 @@ function determineBaseUrl(inputPathOrUrl: string, logger?: Logger): string | und
|
|
74
76
|
// Handle non-file URLs (HTTP, HTTPS)
|
75
77
|
if (/^https?:\/\//i.test(inputPathOrUrl)) {
|
76
78
|
const url = new URL(inputPathOrUrl);
|
77
|
-
//
|
79
|
+
// Construct the base URL by taking the path up to the last '/'
|
78
80
|
url.pathname = url.pathname.substring(0, url.pathname.lastIndexOf('/') + 1);
|
79
|
-
url.search = '';
|
81
|
+
url.search = ''; // Remove query parameters
|
82
|
+
url.hash = ''; // Remove fragments
|
80
83
|
const baseUrl = url.href;
|
81
84
|
logger?.debug(`Determined remote base URL: ${baseUrl}`);
|
82
|
-
// [DEBUG
|
83
|
-
|
84
|
-
return baseUrl;
|
85
|
+
// console.log(`[DEBUG determineBaseUrl] Determined Remote URL: "${baseUrl}"`); // Keep debug log commented unless needed
|
86
|
+
// Return the constructed base URL (usually ends in '/')
|
87
|
+
return baseUrl;
|
85
88
|
}
|
86
89
|
// Handle other protocols (warn and return undefined)
|
87
90
|
else if (inputPathOrUrl.includes('://') && !inputPathOrUrl.startsWith('file:')) {
|
88
91
|
logger?.warn(`Input "${inputPathOrUrl}" looks like a URL but uses an unsupported protocol. Cannot determine base URL.`);
|
89
|
-
|
90
|
-
console.log(`[DEBUG determineBaseUrl] Unsupported protocol.`);
|
92
|
+
// console.log(`[DEBUG determineBaseUrl] Unsupported protocol.`); // Keep debug log commented unless needed
|
91
93
|
return undefined;
|
92
94
|
}
|
93
95
|
// Handle file paths and file: URLs
|
@@ -97,32 +99,31 @@ function determineBaseUrl(inputPathOrUrl: string, logger?: Logger): string | und
|
|
97
99
|
|
98
100
|
// Convert input to an absolute path
|
99
101
|
if (inputPathOrUrl.startsWith('file:')) {
|
102
|
+
// Convert file URL to path
|
100
103
|
resourcePath = fileURLToPath(inputPathOrUrl);
|
101
104
|
// file: URLs ending in / strongly suggest a directory
|
102
105
|
isInputLikelyDirectory = inputPathOrUrl.endsWith('/');
|
103
106
|
} else {
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
107
|
+
// Resolve relative/absolute file paths
|
108
|
+
resourcePath = path.resolve(inputPathOrUrl);
|
109
|
+
// Check if the resolved path *actually* exists and is a directory
|
110
|
+
try {
|
111
|
+
// Use statSync carefully - assumes it's available and works (or mocked)
|
112
|
+
isInputLikelyDirectory = fs.statSync(resourcePath).isDirectory();
|
113
|
+
} catch {
|
114
|
+
// If stat fails (ENOENT, EACCES), assume it refers to a file path
|
115
|
+
isInputLikelyDirectory = false;
|
116
|
+
}
|
114
117
|
}
|
115
|
-
|
116
|
-
console.log(`[DEBUG determineBaseUrl] resourcePath: "${resourcePath}", isInputLikelyDirectory: ${isInputLikelyDirectory}`);
|
118
|
+
// console.log(`[DEBUG determineBaseUrl] resourcePath: "${resourcePath}", isInputLikelyDirectory: ${isInputLikelyDirectory}`); // Keep debug log commented unless needed
|
117
119
|
|
118
|
-
|
119
|
-
|
120
|
+
// The base directory is the directory containing the resourcePath,
|
121
|
+
// OR resourcePath itself if it was identified as a directory.
|
120
122
|
const baseDirPath = isInputLikelyDirectory ? resourcePath : path.dirname(resourcePath);
|
121
|
-
// [DEBUG
|
122
|
-
console.log(`[DEBUG determineBaseUrl] Calculated baseDirPath: "${baseDirPath}"`);
|
123
|
+
// console.log(`[DEBUG determineBaseUrl] Calculated baseDirPath: "${baseDirPath}"`); // Keep debug log commented unless needed
|
123
124
|
|
124
125
|
// Convert base directory path back to a file URL ending in '/'
|
125
|
-
let normalizedPathForURL = baseDirPath.replace(/\\/g, '/'); // Use forward slashes
|
126
|
+
let normalizedPathForURL = baseDirPath.replace(/\\/g, '/'); // Use forward slashes for URL consistency
|
126
127
|
// Ensure leading slash for Windows file URLs (e.g., /C:/...)
|
127
128
|
if (/^[A-Z]:\//i.test(normalizedPathForURL) && !normalizedPathForURL.startsWith('/')) {
|
128
129
|
normalizedPathForURL = '/' + normalizedPathForURL;
|
@@ -132,19 +133,18 @@ function determineBaseUrl(inputPathOrUrl: string, logger?: Logger): string | und
|
|
132
133
|
normalizedPathForURL += '/';
|
133
134
|
}
|
134
135
|
|
136
|
+
// Create the final file URL object and get its string representation
|
135
137
|
const fileUrl = new URL('file://' + normalizedPathForURL);
|
136
138
|
const fileUrlString = fileUrl.href;
|
137
139
|
|
138
140
|
logger?.debug(`Determined base URL: ${fileUrlString} (from: ${inputPathOrUrl}, resolved base dir: ${baseDirPath})`);
|
139
|
-
|
140
|
-
console.log(`[DEBUG determineBaseUrl] Determined File URL: "${fileUrlString}"`);
|
141
|
+
// console.log(`[DEBUG determineBaseUrl] Determined File URL: "${fileUrlString}"`); // Keep debug log commented unless needed
|
141
142
|
return fileUrlString;
|
142
|
-
|
143
143
|
}
|
144
144
|
} catch (error: unknown) {
|
145
|
+
// Handle any errors during base URL determination
|
145
146
|
const message = error instanceof Error ? error.message : String(error);
|
146
|
-
// [DEBUG
|
147
|
-
console.error(`[DEBUG determineBaseUrl] Error determining base URL: ${message}`);
|
147
|
+
// console.error(`[DEBUG determineBaseUrl] Error determining base URL: ${message}`); // Keep debug log commented unless needed
|
148
148
|
logger?.error(`💀 Failed to determine base URL for "${inputPathOrUrl}": ${message}${error instanceof Error && error.stack ? ` - Stack: ${error.stack}` : ''}`);
|
149
149
|
return undefined;
|
150
150
|
}
|
@@ -159,8 +159,10 @@ function determineBaseUrl(inputPathOrUrl: string, logger?: Logger): string | und
|
|
159
159
|
* @returns {URL | null} A validated, absolute URL object, or null if invalid/ignorable.
|
160
160
|
*/
|
161
161
|
function resolveAssetUrl(assetUrl: string, baseContextUrl?: string, logger?: Logger): URL | null {
|
162
|
+
// Trim whitespace from the URL
|
162
163
|
const trimmedUrl = assetUrl?.trim();
|
163
|
-
|
164
|
+
|
165
|
+
// Ignore empty URLs, data URIs, or fragment-only URLs
|
164
166
|
if (!trimmedUrl || trimmedUrl.startsWith('data:') || trimmedUrl.startsWith('#')) {
|
165
167
|
return null;
|
166
168
|
}
|
@@ -170,34 +172,39 @@ function resolveAssetUrl(assetUrl: string, baseContextUrl?: string, logger?: Log
|
|
170
172
|
// Handle protocol-relative URLs (e.g., //example.com/image.png)
|
171
173
|
if (resolvableUrl.startsWith('//') && baseContextUrl) {
|
172
174
|
try {
|
175
|
+
// Prepend the protocol from the base context URL
|
173
176
|
const base = new URL(baseContextUrl);
|
174
|
-
resolvableUrl = base.protocol + resolvableUrl;
|
177
|
+
resolvableUrl = base.protocol + resolvableUrl;
|
175
178
|
} catch (e) {
|
179
|
+
// Log a warning if the base protocol cannot be determined
|
176
180
|
logger?.warn(`Could not extract protocol from base "${baseContextUrl}" for protocol-relative URL "${trimmedUrl}". Skipping.`);
|
177
181
|
return null;
|
178
182
|
}
|
179
183
|
}
|
180
184
|
|
181
185
|
try {
|
182
|
-
// Use URL constructor for resolution. Handles absolute, relative paths, ../ etc.
|
183
|
-
// baseContextUrl provides the context for resolving relative URLs.
|
186
|
+
// Use URL constructor for resolution. Handles absolute paths, relative paths, ../ etc.
|
184
187
|
const resolved = new URL(resolvableUrl, baseContextUrl);
|
185
|
-
|
188
|
+
|
189
|
+
// Skip assets with unsupported protocols (e.g., mailto:, ws:)
|
186
190
|
if (!['http:', 'https:', 'file:'].includes(resolved.protocol)) {
|
187
|
-
|
188
|
-
|
191
|
+
logger?.debug(`Skipping asset with unsupported protocol: ${resolved.href}`);
|
192
|
+
return null;
|
189
193
|
}
|
194
|
+
// Return the resolved URL object
|
190
195
|
return resolved;
|
191
196
|
} catch (error: unknown) {
|
192
|
-
// Log errors during URL parsing/resolution
|
197
|
+
// Log errors during URL parsing/resolution
|
193
198
|
const message = error instanceof Error ? error.message : String(error);
|
194
|
-
// Avoid
|
199
|
+
// Avoid redundant warnings for relative paths when no base context was provided (expected failure)
|
195
200
|
if (!/^[a-z]+:/i.test(resolvableUrl) && !resolvableUrl.startsWith('/') && !baseContextUrl) {
|
196
201
|
logger?.warn(`Cannot resolve relative URL "${resolvableUrl}" - Base context URL was not provided or determined.`);
|
197
202
|
} else {
|
203
|
+
// Log other resolution failures
|
198
204
|
logger?.warn(`⚠️ Failed to parse/resolve URL "${resolvableUrl}" ${baseContextUrl ? 'against base "' + baseContextUrl + '"' : '(no base provided)'}: ${message}`);
|
199
205
|
}
|
200
|
-
|
206
|
+
// Return null if resolution fails
|
207
|
+
return null;
|
201
208
|
}
|
202
209
|
}
|
203
210
|
|
@@ -214,29 +221,27 @@ function resolveCssRelativeUrl(
|
|
214
221
|
cssBaseContextUrl: string, // e.g., file:///C:/mock/base/dir/css/deep.css or https://.../style.css
|
215
222
|
logger?: Logger
|
216
223
|
): string | null {
|
217
|
-
// [DEBUG
|
218
|
-
console.log(`[DEBUG resolveCssRelativeUrl] Input: relative="${relativeUrl}", base="${cssBaseContextUrl}"`);
|
224
|
+
// console.log(`[DEBUG resolveCssRelativeUrl] Input: relative="${relativeUrl}", base="${cssBaseContextUrl}"`); // Keep debug log commented unless needed
|
219
225
|
|
226
|
+
// Ignore empty, data URIs, or fragments
|
220
227
|
if (!relativeUrl || relativeUrl.startsWith('data:') || relativeUrl.startsWith('#')) {
|
221
|
-
return null;
|
228
|
+
return null;
|
222
229
|
}
|
223
230
|
|
224
231
|
try {
|
225
232
|
// Use the URL constructor which correctly handles relative paths including ../
|
226
|
-
// relative to the base URL provided.
|
233
|
+
// relative to the base URL provided (the CSS file's URL).
|
227
234
|
const resolvedUrl = new URL(relativeUrl, cssBaseContextUrl);
|
228
|
-
|
229
|
-
//
|
230
|
-
|
231
|
-
return resolvedUrl.href; // Return the resolved absolute URL string
|
235
|
+
// console.log(`[DEBUG resolveCssRelativeUrl] Resolved URL object href: "${resolvedUrl.href}"`); // Keep debug log commented unless needed
|
236
|
+
// Return the resolved absolute URL string
|
237
|
+
return resolvedUrl.href;
|
232
238
|
|
233
239
|
} catch (error) {
|
234
|
-
// Log warning if URL resolution fails
|
240
|
+
// Log warning if URL resolution fails
|
235
241
|
logger?.warn(
|
236
242
|
`Failed to resolve CSS URL: "${relativeUrl}" relative to "${cssBaseContextUrl}": ${String(error)}`
|
237
243
|
);
|
238
|
-
|
239
|
-
console.error(`[DEBUG resolveCssRelativeUrl] Error resolving: ${String(error)}`);
|
244
|
+
// console.error(`[DEBUG resolveCssRelativeUrl] Error resolving: ${String(error)}`); // Keep debug log commented unless needed
|
240
245
|
return null;
|
241
246
|
}
|
242
247
|
}
|
@@ -251,92 +256,92 @@ function resolveCssRelativeUrl(
|
|
251
256
|
* @returns {Promise<Buffer | null>} Asset content as a Buffer, or null on failure.
|
252
257
|
*/
|
253
258
|
async function fetchAsset(resolvedUrl: URL, logger?: Logger, timeout: number = 10000): Promise<Buffer | null> {
|
254
|
-
// [DEBUG
|
255
|
-
console.log(`[DEBUG fetchAsset] Attempting fetch for URL: ${resolvedUrl.href}`);
|
259
|
+
// console.log(`[DEBUG fetchAsset] Attempting fetch for URL: ${resolvedUrl.href}`); // Keep debug log commented unless needed
|
256
260
|
logger?.debug(`Attempting to fetch asset: ${resolvedUrl.href}`);
|
257
261
|
const protocol = resolvedUrl.protocol;
|
258
262
|
|
259
263
|
try {
|
264
|
+
// Handle HTTP and HTTPS protocols
|
260
265
|
if (protocol === 'http:' || protocol === 'https:') {
|
261
|
-
// Use axios
|
266
|
+
// Use axios to fetch remote content as an ArrayBuffer
|
262
267
|
const response: AxiosResponse<ArrayBuffer> = await axiosNs.default.get(resolvedUrl.href, {
|
263
|
-
responseType: 'arraybuffer',
|
268
|
+
responseType: 'arraybuffer', // Fetch as binary data
|
269
|
+
timeout: timeout, // Apply network timeout
|
264
270
|
});
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
}
|
271
|
+
logger?.debug(`Workspaceed remote asset ${resolvedUrl.href} (Status: ${response.status}, Type: ${response.headers['content-type'] || 'N/A'}, Size: ${response.data?.byteLength ?? 0} bytes)`);
|
272
|
+
// console.log(`[DEBUG fetchAsset] HTTP fetch SUCCESS for: ${resolvedUrl.href}, Status: ${response.status}`); // Keep debug log commented unless needed
|
273
|
+
// Return the fetched data as a Node.js Buffer
|
274
|
+
return Buffer.from(response.data);
|
275
|
+
}
|
276
|
+
// Handle file protocol
|
277
|
+
else if (protocol === 'file:') {
|
270
278
|
let filePath: string;
|
271
279
|
try {
|
272
|
-
// Convert file URL to
|
280
|
+
// Convert file URL to a system file path
|
281
|
+
// IMPORTANT: This strips query params and fragments from the URL
|
273
282
|
filePath = fileURLToPath(resolvedUrl);
|
274
283
|
} catch (e: any) {
|
275
|
-
// [DEBUG
|
276
|
-
|
277
|
-
|
278
|
-
return null;
|
284
|
+
// console.error(`[DEBUG fetchAsset] fileURLToPath FAILED for: ${resolvedUrl.href}`, e); // Keep debug log commented unless needed
|
285
|
+
logger?.error(`Could not convert file URL to path: ${resolvedUrl.href}. Error: ${e.message}`);
|
286
|
+
return null; // Return null if conversion fails
|
279
287
|
}
|
280
288
|
|
281
289
|
const normalizedForLog = path.normalize(filePath);
|
282
|
-
|
283
|
-
console.log(`[DEBUG fetchAsset] Attempting readFile with path: "${normalizedForLog}" (Original from URL: "${filePath}")`);
|
290
|
+
// console.log(`[DEBUG fetchAsset] Attempting readFile with path: "${normalizedForLog}" (Original from URL: "${filePath}")`); // Keep debug log commented unless needed
|
284
291
|
|
285
|
-
|
292
|
+
// Read file content using fs/promises
|
286
293
|
const data = await readFile(filePath); // This call uses the mock in tests
|
287
294
|
|
288
|
-
|
289
|
-
|
290
|
-
|
295
|
+
// console.log(`[DEBUG fetchAsset] readFile call SUCCEEDED for path: "${normalizedForLog}". Data length: ${data?.byteLength}`); // Keep debug log commented unless needed
|
296
|
+
logger?.debug(`Read local file ${filePath} (${data.byteLength} bytes)`);
|
297
|
+
// Return the file content as a Buffer
|
291
298
|
return data;
|
292
|
-
}
|
293
|
-
|
294
|
-
|
299
|
+
}
|
300
|
+
// Handle unsupported protocols
|
301
|
+
else {
|
302
|
+
// console.log(`[DEBUG fetchAsset] Unsupported protocol: ${protocol}`); // Keep debug log commented unless needed
|
295
303
|
logger?.warn(`Unsupported protocol "${protocol}" in URL: ${resolvedUrl.href}`);
|
296
|
-
|
304
|
+
return null;
|
297
305
|
}
|
298
306
|
} catch (error: unknown) {
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
//
|
304
|
-
|
305
|
-
|
306
|
-
const status =
|
307
|
-
const
|
308
|
-
|
309
|
-
const
|
310
|
-
// Format consistent with test expectations
|
311
|
-
const logMessage = `⚠️ Failed to fetch remote asset ${resolvedUrl.href}: Status ${status} - ${statusText}. Code: ${code}, Message: ${message}`;
|
307
|
+
// --- Handle Errors During Fetch/Read ---
|
308
|
+
const failedId = protocol === 'file:' ? path.normalize(fileURLToPath(resolvedUrl)) : resolvedUrl.href;
|
309
|
+
// console.error(`[DEBUG fetchAsset] CAUGHT Error for ${failedId}. Type: ${Object.prototype.toString.call(error)}, Constructor: ${error?.constructor?.name}, isAxiosError property: ${(error as any)?.isAxiosError}, Code: ${(error as any)?.code}`); // Keep for debugging if needed
|
310
|
+
|
311
|
+
// *** FIXED LOGIC: Check for AxiosError using its property *before* generic instanceof Error ***
|
312
|
+
if ((protocol === 'http:' || protocol === 'https:') && (error as any)?.isAxiosError === true) {
|
313
|
+
const axiosError = error as AxiosError; // Cast for easier property access
|
314
|
+
const status = axiosError.response?.status ?? 'N/A';
|
315
|
+
const code = axiosError.code ?? 'N/A'; // e.g., ECONNABORTED for timeout
|
316
|
+
// Use the specific log format
|
317
|
+
const logMessage = `⚠️ Failed to fetch remote asset ${resolvedUrl.href}: ${axiosError.message} (Code: ${code})`;
|
312
318
|
logger?.warn(logMessage);
|
313
319
|
}
|
314
|
-
// Check for
|
315
|
-
|
316
|
-
let failedPath = resolvedUrl.href;
|
320
|
+
// Check for file system errors *next*
|
321
|
+
else if (protocol === 'file:' && error instanceof Error) {
|
322
|
+
let failedPath = resolvedUrl.href;
|
317
323
|
try { failedPath = fileURLToPath(resolvedUrl); } catch { /* ignore */ }
|
318
|
-
failedPath = path.normalize(failedPath);
|
324
|
+
failedPath = path.normalize(failedPath);
|
319
325
|
|
320
|
-
if (
|
326
|
+
if ((error as NodeJSErrnoException).code === 'ENOENT') {
|
321
327
|
logger?.warn(`⚠️ File not found (ENOENT) for asset: ${failedPath}.`);
|
322
|
-
} else if (
|
323
|
-
|
324
|
-
|
325
|
-
// Also log the more generic message that the test currently expects
|
326
|
-
logger?.warn(`⚠️ Failed to read local asset ${failedPath}: ${error.message}`);
|
327
|
-
} else if (error instanceof Error) {
|
328
|
-
logger?.warn(`⚠️ Failed to read local asset ${failedPath}: ${error.message}`);
|
328
|
+
} else if ((error as NodeJSErrnoException).code === 'EACCES') {
|
329
|
+
// Log ONLY the specific EACCES message
|
330
|
+
logger?.warn(`⚠️ Permission denied (EACCES) reading asset: ${failedPath}.`);
|
329
331
|
} else {
|
330
|
-
logger?.warn(`⚠️
|
332
|
+
logger?.warn(`⚠️ Failed to read local asset ${failedPath}: ${error.message}`);
|
331
333
|
}
|
332
334
|
}
|
333
|
-
// Generic fallback for
|
335
|
+
// Generic fallback for *other* types of Errors (that are not Axios or known FS errors)
|
334
336
|
else if (error instanceof Error) {
|
335
337
|
logger?.warn(`⚠️ An unexpected error occurred processing asset ${resolvedUrl.href}: ${error.message}`);
|
336
|
-
}
|
338
|
+
}
|
339
|
+
// Fallback for non-Error throws (e.g., strings, numbers)
|
340
|
+
else {
|
337
341
|
logger?.warn(`⚠️ An unknown and unexpected error occurred processing asset ${resolvedUrl.href}: ${String(error)}`);
|
338
342
|
}
|
339
|
-
|
343
|
+
// Return null on ANY error
|
344
|
+
return null;
|
340
345
|
}
|
341
346
|
}
|
342
347
|
|
@@ -353,50 +358,57 @@ function extractUrlsFromCSS(
|
|
353
358
|
cssBaseContextUrl: string,
|
354
359
|
logger?: Logger
|
355
360
|
): Asset[] {
|
361
|
+
// Array to hold assets discovered within this CSS content
|
356
362
|
const newlyDiscovered: Asset[] = [];
|
357
|
-
//
|
363
|
+
// Set to track URLs processed within this specific CSS file to avoid adding duplicates from the same file
|
358
364
|
const processedInThisParse = new Set<string>();
|
359
365
|
|
360
|
-
// Regex for url(...) patterns, handling optional quotes
|
366
|
+
// Regex for url(...) patterns, handling optional quotes (non-greedy match for URL)
|
361
367
|
const urlRegex = /url\(\s*(['"]?)(.*?)\1\s*\)/gi;
|
362
|
-
// Regex for @import rules, handling url() or bare string, optional quotes
|
368
|
+
// Regex for @import rules, handling url() or bare string, optional quotes (non-greedy match for URL)
|
363
369
|
const importRegex = /@import\s+(?:url\(\s*(['"]?)(.*?)\1\s*\)|(['"])(.*?)\3)\s*;/gi;
|
364
370
|
|
365
371
|
/** Internal helper to process a found URL string */
|
366
372
|
const processFoundUrl = (rawUrl: string | undefined, ruleType: '@import' | 'url()') => {
|
367
|
-
if
|
373
|
+
// Skip if URL is empty, undefined, a data URI, or only a fragment
|
374
|
+
if (!rawUrl || rawUrl.trim() === '' || rawUrl.startsWith('data:') || rawUrl.startsWith('#')) return;
|
368
375
|
|
376
|
+
// Resolve the potentially relative URL against the CSS file's base URL
|
369
377
|
const resolvedUrl = resolveCssRelativeUrl(rawUrl, cssBaseContextUrl, logger);
|
370
378
|
|
371
|
-
// If successfully resolved and not already found in
|
379
|
+
// If successfully resolved and not already found *in this specific CSS file*
|
372
380
|
if (resolvedUrl && !processedInThisParse.has(resolvedUrl)) {
|
381
|
+
// Mark this resolved URL as processed for this CSS file
|
373
382
|
processedInThisParse.add(resolvedUrl);
|
374
|
-
|
383
|
+
// Guess the asset type (css, image, font, etc.) based on the resolved URL
|
384
|
+
const { assetType } = guessMimeType(resolvedUrl);
|
375
385
|
|
376
|
-
// Add to the list
|
386
|
+
// Add the discovered asset to the list for this CSS file
|
377
387
|
newlyDiscovered.push({
|
378
388
|
type: assetType,
|
379
|
-
url: resolvedUrl, //
|
389
|
+
url: resolvedUrl, // Store the resolved absolute URL string
|
380
390
|
content: undefined // Content will be fetched later if needed
|
381
391
|
});
|
382
|
-
|
392
|
+
logger?.debug(`Discovered nested ${assetType} asset (${ruleType}) in CSS ${cssBaseContextUrl}: ${resolvedUrl}`);
|
383
393
|
}
|
384
394
|
};
|
385
395
|
|
386
|
-
//
|
396
|
+
// Find all url(...) matches in the CSS content
|
387
397
|
let match;
|
388
398
|
while ((match = urlRegex.exec(cssContent)) !== null) {
|
389
|
-
|
399
|
+
// Group 2 captures the URL part inside url()
|
400
|
+
processFoundUrl(match[2], 'url()');
|
390
401
|
}
|
391
402
|
|
392
|
-
//
|
393
|
-
// Reset lastIndex as we're
|
394
|
-
importRegex.lastIndex = 0;
|
403
|
+
// Find all @import matches in the CSS content
|
404
|
+
// Reset lastIndex as we're reusing the regex object implicitly
|
405
|
+
importRegex.lastIndex = 0;
|
395
406
|
while ((match = importRegex.exec(cssContent)) !== null) {
|
396
407
|
// Group 2 captures url('...'), Group 4 captures bare "..."
|
397
408
|
processFoundUrl(match[2] || match[4], '@import');
|
398
409
|
}
|
399
410
|
|
411
|
+
// Return the list of assets discovered within this CSS content
|
400
412
|
return newlyDiscovered;
|
401
413
|
}
|
402
414
|
|
@@ -422,59 +434,65 @@ export async function extractAssets(
|
|
422
434
|
): Promise<ParsedHTML> {
|
423
435
|
logger?.info(`🚀 Starting asset extraction! Embed: ${embedAssets}. Input: ${inputPathOrUrl || '(HTML content only)'}`);
|
424
436
|
|
437
|
+
// Get the initial list of assets found directly in the HTML
|
425
438
|
const initialAssets: Asset[] = parsed.assets || [];
|
426
|
-
// Stores the final result: Map<resolved URL string, Asset object>
|
439
|
+
// Stores the final result: Map<resolved URL string, Asset object> to ensure uniqueness
|
427
440
|
const finalAssetsMap = new Map<string, Asset>();
|
428
|
-
// Queue holds assets to be processed
|
441
|
+
// Queue holds assets whose content needs to be processed (fetched/analyzed)
|
429
442
|
let assetsToProcess: Asset[] = [];
|
430
|
-
// Set to track URLs that are already processed (in finalAssetsMap)
|
443
|
+
// Set to track URLs that are either already fully processed (in finalAssetsMap)
|
444
|
+
// OR currently in the processing queue (assetsToProcess) to prevent reprocessing/loops.
|
431
445
|
const processedOrQueuedUrls = new Set<string>();
|
432
446
|
|
433
|
-
// --- Determine Base URL Context ---
|
447
|
+
// --- Determine Base URL Context for the HTML ---
|
434
448
|
const htmlBaseContextUrl = determineBaseUrl(inputPathOrUrl || '', logger);
|
449
|
+
// Warn if no base URL could be found and there are relative paths in the initial assets
|
435
450
|
if (!htmlBaseContextUrl && initialAssets.some(a => !/^[a-z]+:/i.test(a.url) && !a.url.startsWith('data:') && !a.url.startsWith('#') && !a.url.startsWith('/'))) {
|
436
451
|
logger?.warn("🚨 No valid base path/URL determined for the HTML source! Resolution of relative asset paths from HTML may fail.");
|
437
452
|
} else if (htmlBaseContextUrl) {
|
438
453
|
logger?.debug(`Using HTML base context URL: ${htmlBaseContextUrl}`);
|
439
454
|
}
|
440
455
|
|
441
|
-
// --- Initial Queue Population ---
|
456
|
+
// --- Initial Queue Population from HTML assets ---
|
442
457
|
logger?.debug(`Queueing ${initialAssets.length} initial assets parsed from HTML...`);
|
443
458
|
for (const asset of initialAssets) {
|
444
459
|
// Resolve the initial asset URL against the HTML base context
|
445
460
|
const resolvedUrlObj = resolveAssetUrl(asset.url, htmlBaseContextUrl, logger);
|
461
|
+
|
462
|
+
// Skip if URL is invalid, data URI, fragment, or unsupported protocol
|
446
463
|
if (!resolvedUrlObj) {
|
447
|
-
|
448
|
-
|
464
|
+
logger?.debug(` -> Skipping initial asset with unresolvable/ignorable URL: ${asset.url}`);
|
465
|
+
continue;
|
449
466
|
}
|
450
|
-
|
467
|
+
// Get the resolved absolute URL string
|
468
|
+
const urlToQueue = resolvedUrlObj.href;
|
451
469
|
|
452
|
-
//
|
453
|
-
if (!
|
454
|
-
|
470
|
+
// Check if this URL is already tracked (processed or queued)
|
471
|
+
if (!processedOrQueuedUrls.has(urlToQueue)) {
|
472
|
+
// Mark as queued (add to set *before* adding to array)
|
473
|
+
processedOrQueuedUrls.add(urlToQueue);
|
455
474
|
|
456
475
|
// Guess type from the resolved/original URL if not provided initially
|
457
476
|
const { assetType: guessedType } = guessMimeType(urlToQueue);
|
458
|
-
const initialType = asset.type ?? guessedType;
|
477
|
+
const initialType = asset.type ?? guessedType; // Use provided type or fallback to guessed type
|
459
478
|
|
460
|
-
// Add to the processing queue
|
479
|
+
// Add the resolved asset to the processing queue
|
461
480
|
assetsToProcess.push({
|
462
481
|
url: urlToQueue, // Use the resolved URL
|
463
482
|
type: initialType,
|
464
|
-
content: undefined
|
483
|
+
content: undefined // Content is initially undefined
|
465
484
|
});
|
466
|
-
|
467
|
-
} else if (urlToQueue.startsWith('data:')) {
|
468
|
-
logger?.debug(` -> Skipping data URI: ${urlToQueue.substring(0, 50)}...`);
|
485
|
+
logger?.debug(` -> Queued initial asset: ${urlToQueue} (Original raw: ${asset.url})`);
|
469
486
|
} else {
|
470
|
-
|
487
|
+
logger?.debug(` -> Skipping already processed/queued initial asset: ${urlToQueue}`);
|
471
488
|
}
|
472
489
|
}
|
473
490
|
|
474
|
-
// --- Main processing loop ---
|
491
|
+
// --- Main processing loop (continues as long as there are assets to process) ---
|
475
492
|
let iterationCount = 0;
|
476
493
|
while (assetsToProcess.length > 0) {
|
477
494
|
iterationCount++;
|
495
|
+
// Prevent potential infinite loops
|
478
496
|
if (iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS) {
|
479
497
|
logger?.error(`🛑 Asset extraction loop limit hit (${MAX_ASSET_EXTRACTION_ITERATIONS})! Aborting.`);
|
480
498
|
const remainingUrls = assetsToProcess.map(a => a.url).slice(0, 10).join(', ');
|
@@ -482,175 +500,204 @@ export async function extractAssets(
|
|
482
500
|
// Add assets remaining in queue to final map without content before breaking
|
483
501
|
assetsToProcess.forEach(asset => {
|
484
502
|
if (!finalAssetsMap.has(asset.url)) {
|
485
|
-
|
503
|
+
finalAssetsMap.set(asset.url, { ...asset, content: undefined });
|
486
504
|
}
|
487
505
|
});
|
488
|
-
assetsToProcess = []; // Clear queue
|
506
|
+
assetsToProcess = []; // Clear queue to stop the loop
|
489
507
|
break; // Exit loop
|
490
508
|
}
|
491
509
|
|
492
|
-
//
|
510
|
+
// Take a snapshot of the current queue to process in this iteration
|
493
511
|
const currentBatch = [...assetsToProcess];
|
494
|
-
|
512
|
+
// Clear the main queue; new assets found in this batch will be added here for the *next* iteration
|
513
|
+
assetsToProcess = [];
|
495
514
|
|
496
515
|
logger?.debug(`--- Processing batch ${iterationCount}: ${currentBatch.length} asset(s) ---`);
|
497
516
|
|
517
|
+
// Process each asset in the current batch
|
498
518
|
for (const asset of currentBatch) {
|
499
|
-
// Skip if
|
519
|
+
// Double-check: Skip if this asset somehow got fully processed in a previous iteration (shouldn't happen with current logic, but safe check)
|
500
520
|
if (finalAssetsMap.has(asset.url)) {
|
501
|
-
|
521
|
+
logger?.debug(`Skipping asset already in final map: ${asset.url}`);
|
502
522
|
continue;
|
503
523
|
}
|
504
524
|
|
505
|
-
let assetContentBuffer: Buffer | null = null;
|
506
|
-
let finalContent: string | undefined = undefined; //
|
507
|
-
let cssContentForParsing: string | undefined = undefined; //
|
525
|
+
let assetContentBuffer: Buffer | null = null; // To store fetched binary content
|
526
|
+
let finalContent: string | undefined = undefined; // Final content (text or data URI) for the Asset object
|
527
|
+
let cssContentForParsing: string | undefined = undefined; // Text content specifically for parsing CSS
|
508
528
|
|
509
529
|
// --- Determine if fetching is needed ---
|
510
|
-
// Fetch if
|
530
|
+
// Fetch if we need to embed all assets OR if it's CSS (we need content to parse for nested assets)
|
511
531
|
const needsFetching = embedAssets || asset.type === 'css';
|
512
532
|
let assetUrlObj: URL | null = null; // URL object needed for fetchAsset
|
513
533
|
|
514
534
|
if (needsFetching) {
|
515
535
|
// --- Create URL object for fetching ---
|
516
536
|
try {
|
517
|
-
|
537
|
+
// Asset URL should be absolute at this point
|
538
|
+
assetUrlObj = new URL(asset.url);
|
518
539
|
} catch (urlError) {
|
519
|
-
|
520
|
-
|
521
|
-
|
540
|
+
// Log error if creating URL object fails
|
541
|
+
logger?.warn(`Cannot create URL object for "${asset.url}", skipping fetch. Error: ${urlError instanceof Error ? urlError.message : String(urlError)}`);
|
542
|
+
// Store asset without content in the final map
|
543
|
+
finalAssetsMap.set(asset.url, { ...asset, content: undefined });
|
544
|
+
// Skip to next asset in the current batch
|
545
|
+
continue;
|
522
546
|
}
|
523
547
|
|
524
548
|
// --- Fetch Asset ---
|
525
549
|
if (assetUrlObj) {
|
550
|
+
// Call fetchAsset (which handles http/https/file and errors)
|
526
551
|
assetContentBuffer = await fetchAsset(assetUrlObj, logger);
|
527
552
|
// fetchAsset returns null on failure
|
528
553
|
}
|
529
554
|
} // End if(needsFetching)
|
530
555
|
|
531
|
-
// --- If fetching was
|
556
|
+
// --- If fetching was required but failed, store asset without content and continue ---
|
532
557
|
if (needsFetching && assetContentBuffer === null) {
|
533
|
-
|
558
|
+
logger?.debug(`Storing asset ${asset.url} without content due to fetch failure.`);
|
559
|
+
// Add to final map with undefined content
|
534
560
|
finalAssetsMap.set(asset.url, { ...asset, content: undefined });
|
535
|
-
|
561
|
+
// Skip to the next asset in the current batch
|
562
|
+
continue;
|
536
563
|
}
|
537
564
|
|
538
565
|
// --- Prepare Content for Storing/Embedding (if fetched successfully) ---
|
539
566
|
if (assetContentBuffer) { // Only proceed if content was fetched
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
|
590
|
-
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
567
|
+
// Guess MIME type based on the asset's URL extension
|
568
|
+
const mimeInfo = guessMimeType(asset.url);
|
569
|
+
// Use the guessed MIME type or fallback to a generic binary type
|
570
|
+
const effectiveMime = mimeInfo.mime || 'application/octet-stream';
|
571
|
+
|
572
|
+
// Handle TEXT types (CSS, JS)
|
573
|
+
if (TEXT_ASSET_TYPES.has(asset.type)) {
|
574
|
+
let textContent: string | undefined;
|
575
|
+
let wasLossy = false;
|
576
|
+
try {
|
577
|
+
// Try decoding the buffer as UTF-8
|
578
|
+
textContent = assetContentBuffer.toString('utf-8');
|
579
|
+
// Check if the decoding process lost information (e.g., invalid sequences replaced)
|
580
|
+
wasLossy = isUtf8DecodingLossy(assetContentBuffer, textContent);
|
581
|
+
} catch (e) {
|
582
|
+
// Decoding itself failed
|
583
|
+
textContent = undefined;
|
584
|
+
wasLossy = true;
|
585
|
+
}
|
586
|
+
|
587
|
+
// If decoding was successful and not lossy
|
588
|
+
if (!wasLossy && textContent !== undefined) {
|
589
|
+
// If embedding, store the text content
|
590
|
+
if (embedAssets) {
|
591
|
+
finalContent = textContent;
|
592
|
+
} else {
|
593
|
+
finalContent = undefined; // Not embedding text, store undefined
|
594
|
+
}
|
595
|
+
// If it's CSS, store its text content for parsing regardless of embedding option
|
596
|
+
if (asset.type === 'css') {
|
597
|
+
cssContentForParsing = textContent;
|
598
|
+
}
|
599
|
+
} else {
|
600
|
+
// Decoding failed or was lossy
|
601
|
+
// Warn with the asset type followed by the word "asset" and its URL; mention the base64 fallback only when embedding is enabled
|
602
|
+
logger?.warn(`Could not decode ${asset.type} asset ${asset.url} as valid UTF-8 text.${embedAssets ? ' Falling back to base64 data URI.' : ''}`);
|
603
|
+
cssContentForParsing = undefined; // Cannot parse CSS if decoding failed
|
604
|
+
// Embed as base64 data URI if requested, using the effective MIME type
|
605
|
+
if (embedAssets) {
|
606
|
+
finalContent = `data:${effectiveMime};base64,${assetContentBuffer.toString('base64')}`;
|
607
|
+
} else {
|
608
|
+
finalContent = undefined; // Not embedding
|
609
|
+
}
|
610
|
+
}
|
611
|
+
}
|
612
|
+
// Handle BINARY types (image, font, video, audio)
|
613
|
+
else if (BINARY_ASSET_TYPES.has(asset.type)) {
|
614
|
+
// Embed as base64 data URI if requested
|
615
|
+
if (embedAssets) {
|
616
|
+
finalContent = `data:${effectiveMime};base64,${assetContentBuffer.toString('base64')}`;
|
617
|
+
} else {
|
618
|
+
finalContent = undefined; // Not embedding
|
619
|
+
}
|
620
|
+
cssContentForParsing = undefined; // Not CSS, so no parsing needed
|
621
|
+
}
|
622
|
+
// Handle 'other' or unknown types
|
623
|
+
else {
|
624
|
+
cssContentForParsing = undefined; // Assume not parseable as CSS
|
625
|
+
// If embedding, attempt to store as text, fallback to base64 if invalid UTF-8
|
626
|
+
if (embedAssets) {
|
627
|
+
try {
|
628
|
+
const attemptedTextContent = assetContentBuffer.toString('utf-8');
|
629
|
+
if (isUtf8DecodingLossy(assetContentBuffer, attemptedTextContent)) {
|
630
|
+
// If text decoding is lossy, warn and use base64
|
631
|
+
logger?.warn(`Couldn't embed unclassified asset ${asset.url} as text due to invalid UTF-8 sequences. Falling back to base64 (octet-stream).`);
|
632
|
+
finalContent = `data:application/octet-stream;base64,${assetContentBuffer.toString('base64')}`;
|
633
|
+
} else {
|
634
|
+
// Store as text if decoding worked
|
635
|
+
finalContent = attemptedTextContent;
|
636
|
+
logger?.debug(`Successfully embedded unclassified asset ${asset.url} as text.`);
|
637
|
+
}
|
638
|
+
} catch (decodeError) {
|
639
|
+
// If toString fails, warn and use base64
|
640
|
+
logger?.warn(`Error during text decoding for unclassified asset ${asset.url}: ${decodeError instanceof Error ? decodeError.message : String(decodeError)}. Falling back to base64.`);
|
641
|
+
finalContent = `data:application/octet-stream;base64,${assetContentBuffer.toString('base64')}`;
|
642
|
+
}
|
643
|
+
} else {
|
644
|
+
finalContent = undefined; // Not embedding
|
645
|
+
}
|
646
|
+
}
|
605
647
|
} else { // Content was not fetched (e.g., embedAssets=false and not CSS)
|
606
|
-
|
607
|
-
|
648
|
+
finalContent = undefined;
|
649
|
+
cssContentForParsing = undefined;
|
608
650
|
}
|
609
651
|
|
610
|
-
// --- Store the final asset ---
|
611
|
-
// Use the resolved URL as the key and
|
652
|
+
// --- Store the final processed asset in the map ---
|
653
|
+
// Use the resolved URL as the key and ensure the asset object also uses the resolved URL
|
612
654
|
finalAssetsMap.set(asset.url, { ...asset, url: asset.url, content: finalContent });
|
613
|
-
// Note: URL was already added to processedOrQueuedUrls when initially queued or discovered
|
655
|
+
// Note: URL was already added to processedOrQueuedUrls when initially queued or discovered in CSS
|
614
656
|
|
615
657
|
// --- Process CSS for nested assets ---
|
616
658
|
// Only if it's CSS and we successfully decoded its content for parsing
|
617
659
|
if (asset.type === 'css' && cssContentForParsing) {
|
618
660
|
// Determine the base URL *for this specific CSS file* to resolve its relative links
|
619
|
-
|
620
|
-
|
661
|
+
const cssBaseContextUrl = determineBaseUrl(asset.url, logger); // CSS URL is absolute here
|
662
|
+
logger?.debug(`CSS base context for resolving nested assets within ${asset.url}: ${cssBaseContextUrl}`);
|
621
663
|
|
622
664
|
if (cssBaseContextUrl) {
|
623
|
-
//
|
665
|
+
// Extract URLs found within this CSS content
|
624
666
|
const newlyDiscoveredAssets = extractUrlsFromCSS(
|
625
667
|
cssContentForParsing,
|
626
|
-
cssBaseContextUrl, // Use CSS file's
|
668
|
+
cssBaseContextUrl, // Use the CSS file's own URL as the base
|
627
669
|
logger
|
628
670
|
);
|
629
671
|
|
672
|
+
// If new assets were found in the CSS
|
630
673
|
if (newlyDiscoveredAssets.length > 0) {
|
631
|
-
|
674
|
+
logger?.debug(`Discovered ${newlyDiscoveredAssets.length} nested assets in CSS ${asset.url}. Checking against queue...`);
|
675
|
+
// Process each newly discovered asset
|
632
676
|
for (const newAsset of newlyDiscoveredAssets) {
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
677
|
+
// CHECK: Add to the main processing queue only if this resolved URL hasn't been processed OR queued before.
|
678
|
+
if (!processedOrQueuedUrls.has(newAsset.url)) {
|
679
|
+
processedOrQueuedUrls.add(newAsset.url); // Mark as queued now
|
680
|
+
assetsToProcess.push(newAsset); // Add to the queue for the *next* iteration
|
681
|
+
logger?.debug(` -> Queued new nested asset: ${newAsset.url}`);
|
682
|
+
} else {
|
683
|
+
// Skip if already handled
|
684
|
+
logger?.debug(` -> Skipping already processed/queued nested asset: ${newAsset.url}`);
|
685
|
+
}
|
641
686
|
}
|
642
687
|
}
|
643
688
|
} else {
|
644
|
-
|
689
|
+
// Warn if the base URL for the CSS file couldn't be determined (shouldn't happen if asset.url was valid)
|
690
|
+
logger?.warn(`Could not determine base URL context for CSS file ${asset.url}. Cannot resolve nested relative paths within it.`);
|
645
691
|
}
|
646
692
|
} // End if(asset.type === 'css' && cssContentForParsing)
|
647
693
|
} // End for loop over currentBatch
|
648
|
-
} // End while loop
|
694
|
+
} // End while loop (assetsToProcess.length > 0)
|
649
695
|
|
650
|
-
|
696
|
+
// Log completion summary
|
697
|
+
const finalIterationCount = iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS ? `${MAX_ASSET_EXTRACTION_ITERATIONS}+ (limit hit)` : iterationCount;
|
651
698
|
logger?.info(`✅ Asset extraction COMPLETE! Found ${finalAssetsMap.size} unique assets in ${finalIterationCount} iterations.`);
|
652
699
|
|
653
|
-
// Return the original HTML content and the final list of processed assets
|
700
|
+
// Return the original HTML content and the final list of processed assets from the map
|
654
701
|
return {
|
655
702
|
htmlContent: parsed.htmlContent,
|
656
703
|
assets: Array.from(finalAssetsMap.values())
|