portapack 0.2.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/ci.yml +5 -4
- package/CHANGELOG.md +20 -0
- package/README.md +81 -219
- package/dist/cli/{cli-entry.js → cli-entry.cjs} +620 -513
- package/dist/cli/cli-entry.cjs.map +1 -0
- package/dist/index.d.ts +51 -56
- package/dist/index.js +517 -458
- package/dist/index.js.map +1 -1
- package/docs/.vitepress/config.ts +0 -1
- package/docs/cli.md +108 -45
- package/docs/configuration.md +101 -116
- package/docs/getting-started.md +74 -44
- package/jest.config.ts +18 -8
- package/jest.setup.cjs +66 -146
- package/package.json +5 -5
- package/src/cli/cli-entry.ts +15 -15
- package/src/cli/cli.ts +130 -119
- package/src/core/bundler.ts +174 -63
- package/src/core/extractor.ts +364 -277
- package/src/core/web-fetcher.ts +205 -141
- package/src/index.ts +161 -224
- package/tests/unit/cli/cli-entry.test.ts +66 -77
- package/tests/unit/cli/cli.test.ts +243 -145
- package/tests/unit/core/bundler.test.ts +334 -258
- package/tests/unit/core/extractor.test.ts +608 -1064
- package/tests/unit/core/minifier.test.ts +130 -221
- package/tests/unit/core/packer.test.ts +255 -106
- package/tests/unit/core/parser.test.ts +89 -458
- package/tests/unit/core/web-fetcher.test.ts +310 -265
- package/tests/unit/index.test.ts +206 -300
- package/tests/unit/utils/logger.test.ts +32 -28
- package/tsconfig.jest.json +8 -7
- package/tsup.config.ts +34 -29
- package/dist/cli/cli-entry.js.map +0 -1
- package/docs/demo.md +0 -46
- package/output.html +0 -1
- package/site-packed.html +0 -1
- package/test-output.html +0 -0
package/src/core/extractor.ts
CHANGED
@@ -2,24 +2,24 @@
|
|
2
2
|
* @file src/core/extractor.ts
|
3
3
|
* @description Handles discovery, resolution, fetching, and optional embedding of assets
|
4
4
|
* linked from HTML and recursively within CSS (@import, url()). This is the heart of finding EVERYTHING.
|
5
|
-
* @version 1.1.
|
5
|
+
* @version 1.1.6 - Revised fetchAsset error handling logic for Axios errors.
|
6
6
|
*/
|
7
7
|
|
8
8
|
// === Node.js Core Imports ===
|
9
9
|
import { readFile } from 'fs/promises';
|
10
10
|
import * as fs from 'fs'; // Required for statSync for sync directory check
|
11
|
-
import type { FileHandle } from 'fs/promises';
|
11
|
+
import type { FileHandle } from 'fs/promises'; // Import specific type if needed elsewhere
|
12
12
|
import path from 'path';
|
13
13
|
import { fileURLToPath, URL } from 'url'; // Crucial for file path/URL conversion
|
14
14
|
|
15
15
|
// === External Dependencies ===
|
16
|
-
import * as
|
17
|
-
import type { AxiosError, AxiosRequestConfig, AxiosResponse, InternalAxiosRequestConfig } from 'axios';
|
16
|
+
import * as axiosNs from 'axios'; // Using namespace import for clarity
|
17
|
+
import type { AxiosError, AxiosRequestConfig, AxiosResponse, InternalAxiosRequestConfig } from 'axios'; // Import necessary types
|
18
18
|
|
19
19
|
// === Project Imports ===
|
20
|
-
import type { Asset, ParsedHTML } from '../types';
|
21
|
-
import { guessMimeType } from '../utils/mime';
|
22
|
-
import { Logger } from '../utils/logger';
|
20
|
+
import type { Asset, ParsedHTML } from '../types'; // Adjust path if needed
|
21
|
+
import { guessMimeType } from '../utils/mime'; // Adjust path if needed
|
22
|
+
import { Logger } from '../utils/logger'; // Adjust path if needed
|
23
23
|
|
24
24
|
// === Constants ===
|
25
25
|
/** Set of asset types defined in Asset['type'] generally considered text-based */
|
@@ -31,6 +31,11 @@ const MAX_ASSET_EXTRACTION_ITERATIONS = 1000;
|
|
31
31
|
|
32
32
|
// === Helper Functions ===
|
33
33
|
|
34
|
+
/**
|
35
|
+
* Custom type for Node.js error objects with a `code` property.
|
36
|
+
*/
|
37
|
+
type NodeJSErrnoException = Error & { code?: string };
|
38
|
+
|
34
39
|
/**
|
35
40
|
* Checks if decoding a buffer as UTF-8 and re-encoding is lossy.
|
36
41
|
* @param {Buffer} originalBuffer The original binary buffer.
|
@@ -39,333 +44,380 @@ const MAX_ASSET_EXTRACTION_ITERATIONS = 1000;
|
|
39
44
|
*/
|
40
45
|
function isUtf8DecodingLossy(originalBuffer: Buffer, decodedString: string): boolean {
|
41
46
|
try {
|
47
|
+
// Re-encode the decoded string back to a buffer using UTF-8
|
42
48
|
const reEncodedBuffer = Buffer.from(decodedString, 'utf-8');
|
49
|
+
// Compare the re-encoded buffer with the original buffer
|
43
50
|
return !originalBuffer.equals(reEncodedBuffer);
|
44
51
|
} catch (e) {
|
52
|
+
// If an error occurs during re-encoding, it implies the original wasn't valid UTF-8
|
45
53
|
return true;
|
46
54
|
}
|
47
55
|
}
|
48
56
|
|
49
57
|
/**
|
50
58
|
* Determines the absolute base directory URL (http://, https://, or file:///) ending in '/'.
|
59
|
+
* This is crucial for resolving relative links found in the source document.
|
51
60
|
* @param {string} inputPathOrUrl - The original source HTML file path or a full HTTP/HTTPS URL.
|
52
61
|
* @param {Logger} [logger] - Optional logger instance.
|
53
62
|
* @returns {string | undefined} The absolute base URL string ending in '/', or undefined if determination fails.
|
54
63
|
*/
|
55
64
|
function determineBaseUrl(inputPathOrUrl: string, logger?: Logger): string | undefined {
|
65
|
+
// Log the input for debugging purposes
|
66
|
+
// console.log(`[DEBUG determineBaseUrl] Input: "${inputPathOrUrl}"`); // Keep debug log commented unless needed
|
56
67
|
logger?.debug(`Determining base URL for input: ${inputPathOrUrl}`);
|
68
|
+
|
69
|
+
// Handle invalid or empty input
|
57
70
|
if (!inputPathOrUrl) {
|
58
71
|
logger?.warn('Cannot determine base URL: inputPathOrUrl is empty or invalid.');
|
59
72
|
return undefined;
|
60
73
|
}
|
61
74
|
|
62
75
|
try {
|
76
|
+
// Handle non-file URLs (HTTP, HTTPS)
|
63
77
|
if (/^https?:\/\//i.test(inputPathOrUrl)) {
|
64
78
|
const url = new URL(inputPathOrUrl);
|
79
|
+
// Construct the base URL by taking the path up to the last '/'
|
65
80
|
url.pathname = url.pathname.substring(0, url.pathname.lastIndexOf('/') + 1);
|
66
|
-
url.search = '';
|
81
|
+
url.search = ''; // Remove query parameters
|
82
|
+
url.hash = ''; // Remove fragments
|
67
83
|
const baseUrl = url.href;
|
68
84
|
logger?.debug(`Determined remote base URL: ${baseUrl}`);
|
85
|
+
// console.log(`[DEBUG determineBaseUrl] Determined Remote URL: "${baseUrl}"`); // Keep debug log commented unless needed
|
86
|
+
// Return the constructed base URL (usually ends in '/')
|
69
87
|
return baseUrl;
|
70
88
|
}
|
89
|
+
// Handle other protocols (warn and return undefined)
|
71
90
|
else if (inputPathOrUrl.includes('://') && !inputPathOrUrl.startsWith('file:')) {
|
72
91
|
logger?.warn(`Input "${inputPathOrUrl}" looks like a URL but uses an unsupported protocol. Cannot determine base URL.`);
|
92
|
+
// console.log(`[DEBUG determineBaseUrl] Unsupported protocol.`); // Keep debug log commented unless needed
|
73
93
|
return undefined;
|
74
94
|
}
|
95
|
+
// Handle file paths and file: URLs
|
75
96
|
else {
|
76
|
-
let
|
97
|
+
let resourcePath: string; // Path to the actual file or dir input
|
98
|
+
let isInputLikelyDirectory = false;
|
99
|
+
|
100
|
+
// Convert input to an absolute path
|
77
101
|
if (inputPathOrUrl.startsWith('file:')) {
|
78
|
-
|
79
|
-
|
102
|
+
// Convert file URL to path
|
103
|
+
resourcePath = fileURLToPath(inputPathOrUrl);
|
104
|
+
// file: URLs ending in / strongly suggest a directory
|
105
|
+
isInputLikelyDirectory = inputPathOrUrl.endsWith('/');
|
80
106
|
} else {
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
107
|
+
// Resolve relative/absolute file paths
|
108
|
+
resourcePath = path.resolve(inputPathOrUrl);
|
109
|
+
// Check if the resolved path *actually* exists and is a directory
|
110
|
+
try {
|
111
|
+
// Use statSync carefully - assumes it's available and works (or mocked)
|
112
|
+
isInputLikelyDirectory = fs.statSync(resourcePath).isDirectory();
|
113
|
+
} catch {
|
114
|
+
// If stat fails (ENOENT, EACCES), assume it refers to a file path
|
115
|
+
isInputLikelyDirectory = false;
|
90
116
|
}
|
91
|
-
isDirectory = false;
|
92
117
|
}
|
93
|
-
|
94
|
-
|
118
|
+
// console.log(`[DEBUG determineBaseUrl] resourcePath: "${resourcePath}", isInputLikelyDirectory: ${isInputLikelyDirectory}`); // Keep debug log commented unless needed
|
119
|
+
|
120
|
+
// The base directory is the directory containing the resourcePath,
|
121
|
+
// OR resourcePath itself if it was identified as a directory.
|
122
|
+
const baseDirPath = isInputLikelyDirectory ? resourcePath : path.dirname(resourcePath);
|
123
|
+
// console.log(`[DEBUG determineBaseUrl] Calculated baseDirPath: "${baseDirPath}"`); // Keep debug log commented unless needed
|
124
|
+
|
125
|
+
// Convert base directory path back to a file URL ending in '/'
|
126
|
+
let normalizedPathForURL = baseDirPath.replace(/\\/g, '/'); // Use forward slashes for URL consistency
|
127
|
+
// Ensure leading slash for Windows file URLs (e.g., /C:/...)
|
95
128
|
if (/^[A-Z]:\//i.test(normalizedPathForURL) && !normalizedPathForURL.startsWith('/')) {
|
96
129
|
normalizedPathForURL = '/' + normalizedPathForURL;
|
97
130
|
}
|
131
|
+
// Ensure trailing slash for the directory URL
|
132
|
+
if (!normalizedPathForURL.endsWith('/')) {
|
133
|
+
normalizedPathForURL += '/';
|
134
|
+
}
|
135
|
+
|
136
|
+
// Create the final file URL object and get its string representation
|
98
137
|
const fileUrl = new URL('file://' + normalizedPathForURL);
|
99
|
-
|
100
|
-
|
101
|
-
logger?.debug(`Determined
|
138
|
+
const fileUrlString = fileUrl.href;
|
139
|
+
|
140
|
+
logger?.debug(`Determined base URL: ${fileUrlString} (from: ${inputPathOrUrl}, resolved base dir: ${baseDirPath})`);
|
141
|
+
// console.log(`[DEBUG determineBaseUrl] Determined File URL: "${fileUrlString}"`); // Keep debug log commented unless needed
|
102
142
|
return fileUrlString;
|
103
143
|
}
|
104
144
|
} catch (error: unknown) {
|
145
|
+
// Handle any errors during base URL determination
|
105
146
|
const message = error instanceof Error ? error.message : String(error);
|
106
|
-
|
147
|
+
// console.error(`[DEBUG determineBaseUrl] Error determining base URL: ${message}`); // Keep debug log commented unless needed
|
148
|
+
logger?.error(`💀 Failed to determine base URL for "${inputPathOrUrl}": ${message}${error instanceof Error && error.stack ? ` - Stack: ${error.stack}` : ''}`);
|
107
149
|
return undefined;
|
108
150
|
}
|
109
151
|
}
|
110
152
|
|
111
153
|
/**
|
112
154
|
* Resolves an asset URL relative to a base URL context.
|
113
|
-
*
|
114
|
-
* @param {string}
|
155
|
+
* Handles data URIs, fragments, protocol-relative URLs.
|
156
|
+
* @param {string} assetUrl - The raw URL string found in the source (e.g., href, src).
|
157
|
+
* @param {string} [baseContextUrl] - The absolute base URL of the containing document (HTML or CSS).
|
115
158
|
* @param {Logger} [logger] - Optional logger instance.
|
116
|
-
* @returns {URL | null} A validated, absolute URL object or null.
|
159
|
+
* @returns {URL | null} A validated, absolute URL object, or null if invalid/ignorable.
|
117
160
|
*/
|
118
161
|
function resolveAssetUrl(assetUrl: string, baseContextUrl?: string, logger?: Logger): URL | null {
|
162
|
+
// Trim whitespace from the URL
|
119
163
|
const trimmedUrl = assetUrl?.trim();
|
164
|
+
|
165
|
+
// Ignore empty URLs, data URIs, or fragment-only URLs
|
120
166
|
if (!trimmedUrl || trimmedUrl.startsWith('data:') || trimmedUrl.startsWith('#')) {
|
121
167
|
return null;
|
122
168
|
}
|
169
|
+
|
123
170
|
let resolvableUrl = trimmedUrl;
|
171
|
+
|
172
|
+
// Handle protocol-relative URLs (e.g., //example.com/image.png)
|
124
173
|
if (resolvableUrl.startsWith('//') && baseContextUrl) {
|
125
174
|
try {
|
175
|
+
// Prepend the protocol from the base context URL
|
126
176
|
const base = new URL(baseContextUrl);
|
127
177
|
resolvableUrl = base.protocol + resolvableUrl;
|
128
178
|
} catch (e) {
|
179
|
+
// Log a warning if the base protocol cannot be determined
|
129
180
|
logger?.warn(`Could not extract protocol from base "${baseContextUrl}" for protocol-relative URL "${trimmedUrl}". Skipping.`);
|
130
181
|
return null;
|
131
182
|
}
|
132
183
|
}
|
184
|
+
|
133
185
|
try {
|
186
|
+
// Use URL constructor for resolution. Handles absolute paths, relative paths, ../ etc.
|
134
187
|
const resolved = new URL(resolvableUrl, baseContextUrl);
|
188
|
+
|
189
|
+
// Skip assets with unsupported protocols (e.g., mailto:, ws:)
|
190
|
+
if (!['http:', 'https:', 'file:'].includes(resolved.protocol)) {
|
191
|
+
logger?.debug(`Skipping asset with unsupported protocol: ${resolved.href}`);
|
192
|
+
return null;
|
193
|
+
}
|
194
|
+
// Return the resolved URL object
|
135
195
|
return resolved;
|
136
196
|
} catch (error: unknown) {
|
197
|
+
// Log errors during URL parsing/resolution
|
137
198
|
const message = error instanceof Error ? error.message : String(error);
|
199
|
+
// Avoid redundant warnings for relative paths when no base context was provided (expected failure)
|
138
200
|
if (!/^[a-z]+:/i.test(resolvableUrl) && !resolvableUrl.startsWith('/') && !baseContextUrl) {
|
139
201
|
logger?.warn(`Cannot resolve relative URL "${resolvableUrl}" - Base context URL was not provided or determined.`);
|
140
202
|
} else {
|
203
|
+
// Log other resolution failures
|
141
204
|
logger?.warn(`⚠️ Failed to parse/resolve URL "${resolvableUrl}" ${baseContextUrl ? 'against base "' + baseContextUrl + '"' : '(no base provided)'}: ${message}`);
|
142
205
|
}
|
206
|
+
// Return null if resolution fails
|
143
207
|
return null;
|
144
208
|
}
|
145
209
|
}
|
146
210
|
|
147
211
|
/**
|
148
|
-
* Properly resolves CSS relative paths, handling "../" correctly.
|
149
|
-
*
|
150
|
-
*
|
151
|
-
* @param {string}
|
152
|
-
* @param {
|
153
|
-
* @
|
154
|
-
* @returns {string | null} The resolved absolute URL or null if resolution fails
|
212
|
+
* Properly resolves CSS relative paths (like url("../images/bg.png")), handling "../" correctly.
|
213
|
+
* Uses the CSS file's own location as the base for resolution.
|
214
|
+
* @param {string} relativeUrl - The relative URL string from CSS (e.g., "../images/bg.png").
|
215
|
+
* @param {string} cssBaseContextUrl - The absolute URL of the CSS file containing the relative URL.
|
216
|
+
* @param {Logger} [logger] - Optional logger instance.
|
217
|
+
* @returns {string | null} The resolved absolute URL string, or null if resolution fails/invalid.
|
155
218
|
*/
|
156
219
|
function resolveCssRelativeUrl(
|
157
220
|
relativeUrl: string,
|
158
|
-
cssBaseContextUrl: string,
|
221
|
+
cssBaseContextUrl: string, // e.g., file:///C:/mock/base/dir/css/deep.css or https://.../style.css
|
159
222
|
logger?: Logger
|
160
223
|
): string | null {
|
161
|
-
|
162
|
-
|
224
|
+
// console.log(`[DEBUG resolveCssRelativeUrl] Input: relative="${relativeUrl}", base="${cssBaseContextUrl}"`); // Keep debug log commented unless needed
|
225
|
+
|
226
|
+
// Ignore empty, data URIs, or fragments
|
227
|
+
if (!relativeUrl || relativeUrl.startsWith('data:') || relativeUrl.startsWith('#')) {
|
163
228
|
return null;
|
164
229
|
}
|
165
230
|
|
166
231
|
try {
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
// the final directory name when we already have a trailing slash.
|
174
|
-
let cssDir: string;
|
175
|
-
try {
|
176
|
-
const stat = fs.statSync(basePath);
|
177
|
-
if (stat.isDirectory()) {
|
178
|
-
cssDir = basePath;
|
179
|
-
} else {
|
180
|
-
cssDir = path.dirname(basePath);
|
181
|
-
}
|
182
|
-
} catch {
|
183
|
-
// If stat fails, assume it's a file path
|
184
|
-
cssDir = path.dirname(basePath);
|
185
|
-
}
|
186
|
-
|
187
|
-
// Resolve relativeUrl against this directory
|
188
|
-
let resolvedPath = path.resolve(cssDir, relativeUrl);
|
189
|
-
resolvedPath = resolvedPath.replace(/\\/g, '/'); // Normalize to forward slashes
|
232
|
+
// Use the URL constructor which correctly handles relative paths including ../
|
233
|
+
// relative to the base URL provided (the CSS file's URL).
|
234
|
+
const resolvedUrl = new URL(relativeUrl, cssBaseContextUrl);
|
235
|
+
// console.log(`[DEBUG resolveCssRelativeUrl] Resolved URL object href: "${resolvedUrl.href}"`); // Keep debug log commented unless needed
|
236
|
+
// Return the resolved absolute URL string
|
237
|
+
return resolvedUrl.href;
|
190
238
|
|
191
|
-
// On Windows, ensure file:///C:/something
|
192
|
-
if (/^[A-Z]:/i.test(resolvedPath) && !resolvedPath.startsWith('/')) {
|
193
|
-
resolvedPath = '/' + resolvedPath;
|
194
|
-
}
|
195
|
-
return `file://${resolvedPath}`;
|
196
|
-
} else {
|
197
|
-
// For http/https etc., do standard resolution
|
198
|
-
return new URL(relativeUrl, cssBaseContextUrl).href;
|
199
|
-
}
|
200
239
|
} catch (error) {
|
240
|
+
// Log warning if URL resolution fails
|
201
241
|
logger?.warn(
|
202
|
-
`Failed to resolve CSS URL: "${relativeUrl}"
|
242
|
+
`Failed to resolve CSS URL: "${relativeUrl}" relative to "${cssBaseContextUrl}": ${String(error)}`
|
203
243
|
);
|
244
|
+
// console.error(`[DEBUG resolveCssRelativeUrl] Error resolving: ${String(error)}`); // Keep debug log commented unless needed
|
204
245
|
return null;
|
205
246
|
}
|
206
247
|
}
|
207
248
|
|
208
249
|
|
209
250
|
/**
|
210
|
-
* Asynchronously fetches the content of a resolved asset URL.
|
251
|
+
* Asynchronously fetches the content of a resolved asset URL (http, https, file).
|
211
252
|
* @async
|
212
253
|
* @param {URL} resolvedUrl - The absolute URL object of the asset to fetch.
|
213
254
|
* @param {Logger} [logger] - Optional logger instance.
|
214
|
-
* @param {number} [timeout=10000] - Network timeout in milliseconds.
|
215
|
-
* @returns {Promise<Buffer | null>} Asset content as a Buffer, or null on failure.
|
216
|
-
*/
|
217
|
-
/**
|
218
|
-
* Asynchronously fetches the content of a resolved asset URL.
|
219
|
-
* @async
|
220
|
-
* @param {URL} resolvedUrl - The absolute URL object of the asset to fetch.
|
221
|
-
* @param {Logger} [logger] - Optional logger instance.
|
222
|
-
* @param {number} [timeout=10000] - Network timeout in milliseconds.
|
255
|
+
* @param {number} [timeout=10000] - Network timeout in milliseconds for HTTP(S) requests.
|
223
256
|
* @returns {Promise<Buffer | null>} Asset content as a Buffer, or null on failure.
|
224
257
|
*/
|
225
258
|
async function fetchAsset(resolvedUrl: URL, logger?: Logger, timeout: number = 10000): Promise<Buffer | null> {
|
259
|
+
// console.log(`[DEBUG fetchAsset] Attempting fetch for URL: ${resolvedUrl.href}`); // Keep debug log commented unless needed
|
226
260
|
logger?.debug(`Attempting to fetch asset: ${resolvedUrl.href}`);
|
227
261
|
const protocol = resolvedUrl.protocol;
|
228
262
|
|
229
263
|
try {
|
264
|
+
// Handle HTTP and HTTPS protocols
|
230
265
|
if (protocol === 'http:' || protocol === 'https:') {
|
231
|
-
|
232
|
-
|
266
|
+
// Use axios to fetch remote content as an ArrayBuffer
|
267
|
+
const response: AxiosResponse<ArrayBuffer> = await axiosNs.default.get(resolvedUrl.href, {
|
268
|
+
responseType: 'arraybuffer', // Fetch as binary data
|
269
|
+
timeout: timeout, // Apply network timeout
|
233
270
|
});
|
234
|
-
logger?.debug(`Workspaceed remote asset ${resolvedUrl.href} (Status: ${response.status}, Type: ${response.headers['content-type'] || 'N/A'}, Size: ${response.data
|
271
|
+
logger?.debug(`Workspaceed remote asset ${resolvedUrl.href} (Status: ${response.status}, Type: ${response.headers['content-type'] || 'N/A'}, Size: ${response.data?.byteLength ?? 0} bytes)`);
|
272
|
+
// console.log(`[DEBUG fetchAsset] HTTP fetch SUCCESS for: ${resolvedUrl.href}, Status: ${response.status}`); // Keep debug log commented unless needed
|
273
|
+
// Return the fetched data as a Node.js Buffer
|
235
274
|
return Buffer.from(response.data);
|
236
|
-
}
|
275
|
+
}
|
276
|
+
// Handle file protocol
|
277
|
+
else if (protocol === 'file:') {
|
237
278
|
let filePath: string;
|
238
279
|
try {
|
239
|
-
|
240
|
-
|
241
|
-
|
280
|
+
// Convert file URL to a system file path
|
281
|
+
// IMPORTANT: This strips query params and fragments from the URL
|
282
|
+
filePath = fileURLToPath(resolvedUrl);
|
283
|
+
} catch (e: any) {
|
284
|
+
// console.error(`[DEBUG fetchAsset] fileURLToPath FAILED for: ${resolvedUrl.href}`, e); // Keep debug log commented unless needed
|
242
285
|
logger?.error(`Could not convert file URL to path: ${resolvedUrl.href}. Error: ${e.message}`);
|
243
|
-
return null; //
|
244
|
-
|
245
|
-
|
246
|
-
const
|
286
|
+
return null; // Return null if conversion fails
|
287
|
+
}
|
288
|
+
|
289
|
+
const normalizedForLog = path.normalize(filePath);
|
290
|
+
// console.log(`[DEBUG fetchAsset] Attempting readFile with path: "${normalizedForLog}" (Original from URL: "${filePath}")`); // Keep debug log commented unless needed
|
291
|
+
|
292
|
+
// Read file content using fs/promises
|
293
|
+
const data = await readFile(filePath); // This call uses the mock in tests
|
294
|
+
|
295
|
+
// console.log(`[DEBUG fetchAsset] readFile call SUCCEEDED for path: "${normalizedForLog}". Data length: ${data?.byteLength}`); // Keep debug log commented unless needed
|
247
296
|
logger?.debug(`Read local file ${filePath} (${data.byteLength} bytes)`);
|
297
|
+
// Return the file content as a Buffer
|
248
298
|
return data;
|
249
|
-
}
|
250
|
-
|
251
|
-
|
299
|
+
}
|
300
|
+
// Handle unsupported protocols
|
301
|
+
else {
|
302
|
+
// console.log(`[DEBUG fetchAsset] Unsupported protocol: ${protocol}`); // Keep debug log commented unless needed
|
303
|
+
logger?.warn(`Unsupported protocol "${protocol}" in URL: ${resolvedUrl.href}`);
|
304
|
+
return null;
|
252
305
|
}
|
253
306
|
} catch (error: unknown) {
|
254
|
-
// --- Handle Errors
|
255
|
-
|
256
|
-
//
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
const
|
261
|
-
const
|
262
|
-
|
263
|
-
|
307
|
+
// --- Handle Errors During Fetch/Read ---
|
308
|
+
const failedId = protocol === 'file:' ? path.normalize(fileURLToPath(resolvedUrl)) : resolvedUrl.href;
|
309
|
+
// console.error(`[DEBUG fetchAsset] CAUGHT Error for ${failedId}. Type: ${Object.prototype.toString.call(error)}, Constructor: ${error?.constructor?.name}, isAxiosError property: ${(error as any)?.isAxiosError}, Code: ${(error as any)?.code}`); // Keep for debugging if needed
|
310
|
+
|
311
|
+
// *** FIXED LOGIC: Check for AxiosError using its property *before* generic instanceof Error ***
|
312
|
+
if ((protocol === 'http:' || protocol === 'https:') && (error as any)?.isAxiosError === true) {
|
313
|
+
const axiosError = error as AxiosError; // Cast for easier property access
|
314
|
+
const status = axiosError.response?.status ?? 'N/A';
|
315
|
+
const code = axiosError.code ?? 'N/A'; // e.g., ECONNABORTED for timeout
|
316
|
+
// Use the specific log format
|
317
|
+
const logMessage = `⚠️ Failed to fetch remote asset ${resolvedUrl.href}: ${axiosError.message} (Code: ${code})`;
|
264
318
|
logger?.warn(logMessage);
|
265
319
|
}
|
266
|
-
// Check for
|
267
|
-
else if (protocol === 'file:') {
|
268
|
-
// Determine the file path again for logging, handling potential errors
|
320
|
+
// Check for file system errors *next*
|
321
|
+
else if (protocol === 'file:' && error instanceof Error) {
|
269
322
|
let failedPath = resolvedUrl.href;
|
270
|
-
try { failedPath = fileURLToPath(resolvedUrl); } catch { /* ignore
|
323
|
+
try { failedPath = fileURLToPath(resolvedUrl); } catch { /* ignore */ }
|
324
|
+
failedPath = path.normalize(failedPath);
|
271
325
|
|
272
|
-
if (
|
326
|
+
if ((error as NodeJSErrnoException).code === 'ENOENT') {
|
273
327
|
logger?.warn(`⚠️ File not found (ENOENT) for asset: ${failedPath}.`);
|
274
|
-
} else if (
|
328
|
+
} else if ((error as NodeJSErrnoException).code === 'EACCES') {
|
329
|
+
// Log ONLY the specific EACCES message
|
275
330
|
logger?.warn(`⚠️ Permission denied (EACCES) reading asset: ${failedPath}.`);
|
276
|
-
} else if (error instanceof Error) { // Catch other errors during file reading (but not path conversion which is handled above)
|
277
|
-
logger?.warn(`⚠️ Failed to read local asset ${failedPath}: ${error.message}`);
|
278
331
|
} else {
|
279
|
-
|
332
|
+
logger?.warn(`⚠️ Failed to read local asset ${failedPath}: ${error.message}`);
|
280
333
|
}
|
281
334
|
}
|
282
|
-
//
|
283
|
-
// else if (error instanceof TypeError && error.message.includes('ERR_INVALID_URL')) { ... }
|
284
|
-
|
285
|
-
// Generic fallback for truly unexpected errors during fetch/read
|
335
|
+
// Generic fallback for *other* types of Errors (that are not Axios or known FS errors)
|
286
336
|
else if (error instanceof Error) {
|
287
337
|
logger?.warn(`⚠️ An unexpected error occurred processing asset ${resolvedUrl.href}: ${error.message}`);
|
288
|
-
}
|
338
|
+
}
|
339
|
+
// Fallback for non-Error throws (e.g., strings, numbers)
|
340
|
+
else {
|
289
341
|
logger?.warn(`⚠️ An unknown and unexpected error occurred processing asset ${resolvedUrl.href}: ${String(error)}`);
|
290
342
|
}
|
291
|
-
|
343
|
+
// Return null on ANY error
|
344
|
+
return null;
|
292
345
|
}
|
293
346
|
}
|
294
347
|
|
295
348
|
/**
|
296
|
-
* Extracts URLs from CSS content and resolves them
|
297
|
-
*
|
298
|
-
* @param {string}
|
299
|
-
* @param {
|
300
|
-
* @param {
|
301
|
-
* @
|
302
|
-
*/
|
303
|
-
/**
|
304
|
-
* Extracts URLs from CSS content and resolves them against the CSS base URL.
|
305
|
-
* Returns an array of *potentially* new Asset objects with resolved URLs.
|
349
|
+
* Extracts URLs from CSS content using regex and resolves them.
|
350
|
+
* Finds `url(...)` and `@import` rules.
|
351
|
+
* @param {string} cssContent - The CSS content string to parse.
|
352
|
+
* @param {string} cssBaseContextUrl - The absolute URL of the CSS file (used for resolving relative paths).
|
353
|
+
* @param {Logger} [logger] - Optional logger instance.
|
354
|
+
* @returns {Asset[]} An array of newly discovered Asset objects (type, resolved URL, content initially undefined).
|
306
355
|
*/
|
307
356
|
function extractUrlsFromCSS(
|
308
357
|
cssContent: string,
|
309
358
|
cssBaseContextUrl: string,
|
310
|
-
// discoveredAssets: Asset[], // REMOVE: This function will now RETURN the assets
|
311
|
-
// visitedUrls: Set<string>, // REMOVE
|
312
359
|
logger?: Logger
|
313
|
-
): Asset[] {
|
314
|
-
|
315
|
-
const
|
360
|
+
): Asset[] {
|
361
|
+
// Array to hold assets discovered within this CSS content
|
362
|
+
const newlyDiscovered: Asset[] = [];
|
363
|
+
// Set to track URLs processed within this specific CSS file to avoid adding duplicates from the same file
|
364
|
+
const processedInThisParse = new Set<string>();
|
316
365
|
|
366
|
+
// Regex for url(...) patterns, handling optional quotes (non-greedy match for URL)
|
317
367
|
const urlRegex = /url\(\s*(['"]?)(.*?)\1\s*\)/gi;
|
368
|
+
// Regex for @import rules, handling url() or bare string, optional quotes (non-greedy match for URL)
|
318
369
|
const importRegex = /@import\s+(?:url\(\s*(['"]?)(.*?)\1\s*\)|(['"])(.*?)\3)\s*;/gi;
|
319
370
|
|
371
|
+
/** Internal helper to process a found URL string */
|
320
372
|
const processFoundUrl = (rawUrl: string | undefined, ruleType: '@import' | 'url()') => {
|
321
|
-
if
|
373
|
+
// Skip if URL is empty, undefined, a data URI, or only a fragment
|
374
|
+
if (!rawUrl || rawUrl.trim() === '' || rawUrl.startsWith('data:') || rawUrl.startsWith('#')) return;
|
322
375
|
|
376
|
+
// Resolve the potentially relative URL against the CSS file's base URL
|
323
377
|
const resolvedUrl = resolveCssRelativeUrl(rawUrl, cssBaseContextUrl, logger);
|
324
378
|
|
325
|
-
//
|
379
|
+
// If successfully resolved and not already found *in this specific CSS file*
|
326
380
|
if (resolvedUrl && !processedInThisParse.has(resolvedUrl)) {
|
327
|
-
|
381
|
+
// Mark this resolved URL as processed for this CSS file
|
382
|
+
processedInThisParse.add(resolvedUrl);
|
383
|
+
// Guess the asset type (css, image, font, etc.) based on the resolved URL
|
328
384
|
const { assetType } = guessMimeType(resolvedUrl);
|
329
385
|
|
330
|
-
// Add to the list
|
386
|
+
// Add the discovered asset to the list for this CSS file
|
331
387
|
newlyDiscovered.push({
|
332
388
|
type: assetType,
|
333
|
-
url: resolvedUrl, //
|
334
|
-
content: undefined
|
389
|
+
url: resolvedUrl, // Store the resolved absolute URL string
|
390
|
+
content: undefined // Content will be fetched later if needed
|
335
391
|
});
|
336
392
|
logger?.debug(`Discovered nested ${assetType} asset (${ruleType}) in CSS ${cssBaseContextUrl}: ${resolvedUrl}`);
|
337
393
|
}
|
338
394
|
};
|
339
395
|
|
340
|
-
//
|
341
|
-
urlRegex.lastIndex = 0;
|
342
|
-
importRegex.lastIndex = 0;
|
396
|
+
// Find all url(...) matches in the CSS content
|
343
397
|
let match;
|
344
398
|
while ((match = urlRegex.exec(cssContent)) !== null) {
|
399
|
+
// Group 2 captures the URL part inside url()
|
345
400
|
processFoundUrl(match[2], 'url()');
|
346
401
|
}
|
402
|
+
|
403
|
+
// Find all @import matches in the CSS content
|
404
|
+
// Reset lastIndex as we're reusing the regex object implicitly
|
347
405
|
importRegex.lastIndex = 0;
|
348
406
|
while ((match = importRegex.exec(cssContent)) !== null) {
|
407
|
+
// Group 2 captures url('...'), Group 4 captures bare "..."
|
349
408
|
processFoundUrl(match[2] || match[4], '@import');
|
350
409
|
}
|
351
410
|
|
352
|
-
|
411
|
+
// Return the list of assets discovered within this CSS content
|
412
|
+
return newlyDiscovered;
|
353
413
|
}
|
354
414
|
|
355
|
-
/**
|
356
|
-
* Extracts all discoverable assets recursively from HTML and CSS.
|
357
|
-
* @async
|
358
|
-
* @export
|
359
|
-
* @param {ParsedHTML} parsed - Initial parsed HTML data.
|
360
|
-
* @param {boolean} [embedAssets=true] - Whether to embed content.
|
361
|
-
* @param {string} [inputPathOrUrl] - Original HTML source location.
|
362
|
-
* @param {Logger} [logger] - Optional logger instance.
|
363
|
-
* @returns {Promise<ParsedHTML>} Processed data with all assets.
|
364
|
-
*/
|
365
415
|
/**
|
366
416
|
* Extracts all discoverable assets recursively from HTML and CSS.
|
367
417
|
* Fetches assets if embedAssets is true or if the asset is CSS (to parse for more assets).
|
368
418
|
* Resolves URLs relative to their context (HTML base or CSS file location).
|
419
|
+
* Handles potential infinite loops with an iteration limit.
|
420
|
+
*
|
369
421
|
* @async
|
370
422
|
* @export
|
371
423
|
* @param {ParsedHTML} parsed - Initial parsed HTML data containing `htmlContent` and an initial `assets` array.
|
@@ -382,59 +434,65 @@ export async function extractAssets(
|
|
382
434
|
): Promise<ParsedHTML> {
|
383
435
|
logger?.info(`🚀 Starting asset extraction! Embed: ${embedAssets}. Input: ${inputPathOrUrl || '(HTML content only)'}`);
|
384
436
|
|
437
|
+
// Get the initial list of assets found directly in the HTML
|
385
438
|
const initialAssets: Asset[] = parsed.assets || [];
|
386
|
-
// Stores the final result: Map<resolved URL string, Asset object>
|
439
|
+
// Stores the final result: Map<resolved URL string, Asset object> to ensure uniqueness
|
387
440
|
const finalAssetsMap = new Map<string, Asset>();
|
388
|
-
// Queue holds assets to be processed
|
441
|
+
// Queue holds assets whose content needs to be processed (fetched/analyzed)
|
389
442
|
let assetsToProcess: Asset[] = [];
|
443
|
+
// Set to track URLs that are either already fully processed (in finalAssetsMap)
|
444
|
+
// OR currently in the processing queue (assetsToProcess) to prevent reprocessing/loops.
|
445
|
+
const processedOrQueuedUrls = new Set<string>();
|
390
446
|
|
391
|
-
// Determine
|
447
|
+
// --- Determine Base URL Context for the HTML ---
|
392
448
|
const htmlBaseContextUrl = determineBaseUrl(inputPathOrUrl || '', logger);
|
449
|
+
// Warn if no base URL could be found and there are relative paths in the initial assets
|
393
450
|
if (!htmlBaseContextUrl && initialAssets.some(a => !/^[a-z]+:/i.test(a.url) && !a.url.startsWith('data:') && !a.url.startsWith('#') && !a.url.startsWith('/'))) {
|
394
451
|
logger?.warn("🚨 No valid base path/URL determined for the HTML source! Resolution of relative asset paths from HTML may fail.");
|
395
452
|
} else if (htmlBaseContextUrl) {
|
396
453
|
logger?.debug(`Using HTML base context URL: ${htmlBaseContextUrl}`);
|
397
454
|
}
|
398
455
|
|
399
|
-
// ---
|
400
|
-
// Set to track URLs that are already processed (in finalAssetsMap) OR currently in the queue (assetsToProcess)
|
401
|
-
// This prevents adding the same asset to the queue multiple times.
|
402
|
-
const processedOrQueuedUrls = new Set<string>();
|
403
|
-
|
404
|
-
// --- Initial Queue Population ---
|
456
|
+
// --- Initial Queue Population from HTML assets ---
|
405
457
|
logger?.debug(`Queueing ${initialAssets.length} initial assets parsed from HTML...`);
|
406
458
|
for (const asset of initialAssets) {
|
407
459
|
// Resolve the initial asset URL against the HTML base context
|
408
460
|
const resolvedUrlObj = resolveAssetUrl(asset.url, htmlBaseContextUrl, logger);
|
409
|
-
// Use the resolved URL string if resolution succeeded, otherwise use the original
|
410
|
-
const urlToQueue = resolvedUrlObj ? resolvedUrlObj.href : asset.url;
|
411
461
|
|
412
|
-
// Skip
|
413
|
-
if (!
|
414
|
-
|
462
|
+
// Skip if URL is invalid, data URI, fragment, or unsupported protocol
|
463
|
+
if (!resolvedUrlObj) {
|
464
|
+
logger?.debug(` -> Skipping initial asset with unresolvable/ignorable URL: ${asset.url}`);
|
465
|
+
continue;
|
466
|
+
}
|
467
|
+
// Get the resolved absolute URL string
|
468
|
+
const urlToQueue = resolvedUrlObj.href;
|
469
|
+
|
470
|
+
// Check if this URL is already tracked (processed or queued)
|
471
|
+
if (!processedOrQueuedUrls.has(urlToQueue)) {
|
472
|
+
// Mark as queued (add to set *before* adding to array)
|
473
|
+
processedOrQueuedUrls.add(urlToQueue);
|
415
474
|
|
416
475
|
// Guess type from the resolved/original URL if not provided initially
|
417
476
|
const { assetType: guessedType } = guessMimeType(urlToQueue);
|
418
|
-
const initialType = asset.type ?? guessedType;
|
477
|
+
const initialType = asset.type ?? guessedType; // Use provided type or fallback to guessed type
|
419
478
|
|
420
|
-
// Add to the processing queue
|
479
|
+
// Add the resolved asset to the processing queue
|
421
480
|
assetsToProcess.push({
|
422
|
-
url: urlToQueue,
|
481
|
+
url: urlToQueue, // Use the resolved URL
|
423
482
|
type: initialType,
|
424
|
-
content: undefined
|
483
|
+
content: undefined // Content is initially undefined
|
425
484
|
});
|
426
485
|
logger?.debug(` -> Queued initial asset: ${urlToQueue} (Original raw: ${asset.url})`);
|
427
|
-
} else if (urlToQueue.startsWith('data:')) {
|
428
|
-
logger?.debug(` -> Skipping data URI: ${urlToQueue.substring(0, 50)}...`);
|
429
486
|
} else {
|
430
|
-
|
487
|
+
logger?.debug(` -> Skipping already processed/queued initial asset: ${urlToQueue}`);
|
431
488
|
}
|
432
489
|
}
|
433
490
|
|
434
|
-
// --- Main processing loop ---
|
491
|
+
// --- Main processing loop (continues as long as there are assets to process) ---
|
435
492
|
let iterationCount = 0;
|
436
493
|
while (assetsToProcess.length > 0) {
|
437
494
|
iterationCount++;
|
495
|
+
// Prevent potential infinite loops
|
438
496
|
if (iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS) {
|
439
497
|
logger?.error(`🛑 Asset extraction loop limit hit (${MAX_ASSET_EXTRACTION_ITERATIONS})! Aborting.`);
|
440
498
|
const remainingUrls = assetsToProcess.map(a => a.url).slice(0, 10).join(', ');
|
@@ -442,175 +500,204 @@ export async function extractAssets(
|
|
442
500
|
// Add assets remaining in queue to final map without content before breaking
|
443
501
|
assetsToProcess.forEach(asset => {
|
444
502
|
if (!finalAssetsMap.has(asset.url)) {
|
445
|
-
|
503
|
+
finalAssetsMap.set(asset.url, { ...asset, content: undefined });
|
446
504
|
}
|
447
505
|
});
|
448
|
-
assetsToProcess = []; // Clear queue
|
506
|
+
assetsToProcess = []; // Clear queue to stop the loop
|
449
507
|
break; // Exit loop
|
450
508
|
}
|
451
509
|
|
452
|
-
//
|
510
|
+
// Take a snapshot of the current queue to process in this iteration
|
453
511
|
const currentBatch = [...assetsToProcess];
|
454
|
-
|
512
|
+
// Clear the main queue; new assets found in this batch will be added here for the *next* iteration
|
513
|
+
assetsToProcess = [];
|
455
514
|
|
456
515
|
logger?.debug(`--- Processing batch ${iterationCount}: ${currentBatch.length} asset(s) ---`);
|
457
516
|
|
517
|
+
// Process each asset in the current batch
|
458
518
|
for (const asset of currentBatch) {
|
459
|
-
// Skip if
|
519
|
+
// Double-check: Skip if this asset somehow got fully processed in a previous iteration (shouldn't happen with current logic, but safe check)
|
460
520
|
if (finalAssetsMap.has(asset.url)) {
|
461
521
|
logger?.debug(`Skipping asset already in final map: ${asset.url}`);
|
462
522
|
continue;
|
463
523
|
}
|
464
524
|
|
465
|
-
let assetContentBuffer: Buffer | null = null;
|
466
|
-
let finalContent: string | undefined = undefined; //
|
467
|
-
let cssContentForParsing: string | undefined = undefined; //
|
525
|
+
let assetContentBuffer: Buffer | null = null; // To store fetched binary content
|
526
|
+
let finalContent: string | undefined = undefined; // Final content (text or data URI) for the Asset object
|
527
|
+
let cssContentForParsing: string | undefined = undefined; // Text content specifically for parsing CSS
|
468
528
|
|
469
529
|
// --- Determine if fetching is needed ---
|
530
|
+
// Fetch if we need to embed all assets OR if it's CSS (we need content to parse for nested assets)
|
470
531
|
const needsFetching = embedAssets || asset.type === 'css';
|
471
|
-
let assetUrlObj: URL | null = null;
|
532
|
+
let assetUrlObj: URL | null = null; // URL object needed for fetchAsset
|
472
533
|
|
473
534
|
if (needsFetching) {
|
474
535
|
// --- Create URL object for fetching ---
|
475
536
|
try {
|
537
|
+
// Asset URL should be absolute at this point
|
476
538
|
assetUrlObj = new URL(asset.url);
|
477
539
|
} catch (urlError) {
|
540
|
+
// Log error if creating URL object fails
|
478
541
|
logger?.warn(`Cannot create URL object for "${asset.url}", skipping fetch. Error: ${urlError instanceof Error ? urlError.message : String(urlError)}`);
|
542
|
+
// Store asset without content in the final map
|
479
543
|
finalAssetsMap.set(asset.url, { ...asset, content: undefined });
|
480
|
-
|
544
|
+
// Skip to next asset in the current batch
|
545
|
+
continue;
|
481
546
|
}
|
482
547
|
|
483
548
|
// --- Fetch Asset ---
|
484
549
|
if (assetUrlObj) {
|
550
|
+
// Call fetchAsset (which handles http/https/file and errors)
|
485
551
|
assetContentBuffer = await fetchAsset(assetUrlObj, logger);
|
552
|
+
// fetchAsset returns null on failure
|
486
553
|
}
|
487
554
|
} // End if(needsFetching)
|
488
555
|
|
489
|
-
// --- If fetching was
|
556
|
+
// --- If fetching was required but failed, store asset without content and continue ---
|
490
557
|
if (needsFetching && assetContentBuffer === null) {
|
491
558
|
logger?.debug(`Storing asset ${asset.url} without content due to fetch failure.`);
|
559
|
+
// Add to final map with undefined content
|
492
560
|
finalAssetsMap.set(asset.url, { ...asset, content: undefined });
|
493
|
-
|
561
|
+
// Skip to the next asset in the current batch
|
562
|
+
continue;
|
494
563
|
}
|
495
564
|
|
496
565
|
// --- Prepare Content for Storing/Embedding (if fetched successfully) ---
|
497
566
|
if (assetContentBuffer) { // Only proceed if content was fetched
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
+
// Guess MIME type based on the asset's URL extension
|
568
|
+
const mimeInfo = guessMimeType(asset.url);
|
569
|
+
// Use the guessed MIME type or fallback to a generic binary type
|
570
|
+
const effectiveMime = mimeInfo.mime || 'application/octet-stream';
|
571
|
+
|
572
|
+
// Handle TEXT types (CSS, JS)
|
573
|
+
if (TEXT_ASSET_TYPES.has(asset.type)) {
|
574
|
+
let textContent: string | undefined;
|
575
|
+
let wasLossy = false;
|
576
|
+
try {
|
577
|
+
// Try decoding the buffer as UTF-8
|
578
|
+
textContent = assetContentBuffer.toString('utf-8');
|
579
|
+
// Check if the decoding process lost information (e.g., invalid sequences replaced)
|
580
|
+
wasLossy = isUtf8DecodingLossy(assetContentBuffer, textContent);
|
581
|
+
} catch (e) {
|
582
|
+
// Decoding itself failed
|
583
|
+
textContent = undefined;
|
584
|
+
wasLossy = true;
|
585
|
+
}
|
586
|
+
|
587
|
+
// If decoding was successful and not lossy
|
588
|
+
if (!wasLossy && textContent !== undefined) {
|
589
|
+
// If embedding, store the text content
|
590
|
+
if (embedAssets) {
|
591
|
+
finalContent = textContent;
|
592
|
+
} else {
|
593
|
+
finalContent = undefined; // Not embedding text, store undefined
|
594
|
+
}
|
595
|
+
// If it's CSS, store its text content for parsing regardless of embedding option
|
596
|
+
if (asset.type === 'css') {
|
597
|
+
cssContentForParsing = textContent;
|
598
|
+
}
|
599
|
+
} else {
|
600
|
+
// Decoding failed or was lossy
|
601
|
+
// Fixed log message: Added "asset" after type.
|
602
|
+
logger?.warn(`Could not decode ${asset.type} asset ${asset.url} as valid UTF-8 text.${embedAssets ? ' Falling back to base64 data URI.' : ''}`);
|
603
|
+
cssContentForParsing = undefined; // Cannot parse CSS if decoding failed
|
604
|
+
// Embed as base64 data URI if requested, using the effective MIME type
|
605
|
+
if (embedAssets) {
|
606
|
+
finalContent = `data:${effectiveMime};base64,${assetContentBuffer.toString('base64')}`;
|
607
|
+
} else {
|
608
|
+
finalContent = undefined; // Not embedding
|
609
|
+
}
|
610
|
+
}
|
611
|
+
}
|
612
|
+
// Handle BINARY types (image, font, video, audio)
|
613
|
+
else if (BINARY_ASSET_TYPES.has(asset.type)) {
|
614
|
+
// Embed as base64 data URI if requested
|
615
|
+
if (embedAssets) {
|
616
|
+
finalContent = `data:${effectiveMime};base64,${assetContentBuffer.toString('base64')}`;
|
617
|
+
} else {
|
618
|
+
finalContent = undefined; // Not embedding
|
619
|
+
}
|
620
|
+
cssContentForParsing = undefined; // Not CSS, so no parsing needed
|
621
|
+
}
|
622
|
+
// Handle 'other' or unknown types
|
623
|
+
else {
|
624
|
+
cssContentForParsing = undefined; // Assume not parseable as CSS
|
625
|
+
// If embedding, attempt to store as text, fallback to base64 if invalid UTF-8
|
626
|
+
if (embedAssets) {
|
627
|
+
try {
|
628
|
+
const attemptedTextContent = assetContentBuffer.toString('utf-8');
|
629
|
+
if (isUtf8DecodingLossy(assetContentBuffer, attemptedTextContent)) {
|
630
|
+
// If text decoding is lossy, warn and use base64
|
631
|
+
logger?.warn(`Couldn't embed unclassified asset ${asset.url} as text due to invalid UTF-8 sequences. Falling back to base64 (octet-stream).`);
|
632
|
+
finalContent = `data:application/octet-stream;base64,${assetContentBuffer.toString('base64')}`;
|
633
|
+
} else {
|
634
|
+
// Store as text if decoding worked
|
635
|
+
finalContent = attemptedTextContent;
|
636
|
+
logger?.debug(`Successfully embedded unclassified asset ${asset.url} as text.`);
|
637
|
+
}
|
638
|
+
} catch (decodeError) {
|
639
|
+
// If toString fails, warn and use base64
|
640
|
+
logger?.warn(`Error during text decoding for unclassified asset ${asset.url}: ${decodeError instanceof Error ? decodeError.message : String(decodeError)}. Falling back to base64.`);
|
641
|
+
finalContent = `data:application/octet-stream;base64,${assetContentBuffer.toString('base64')}`;
|
642
|
+
}
|
643
|
+
} else {
|
644
|
+
finalContent = undefined; // Not embedding
|
645
|
+
}
|
646
|
+
}
|
647
|
+
} else { // Content was not fetched (e.g., embedAssets=false and not CSS)
|
648
|
+
finalContent = undefined;
|
649
|
+
cssContentForParsing = undefined;
|
567
650
|
}
|
568
651
|
|
569
|
-
// --- Store the final asset ---
|
570
|
-
// Use the resolved URL as the key and
|
652
|
+
// --- Store the final processed asset in the map ---
|
653
|
+
// Use the resolved URL as the key and ensure the asset object also uses the resolved URL
|
571
654
|
finalAssetsMap.set(asset.url, { ...asset, url: asset.url, content: finalContent });
|
572
|
-
// Note: URL
|
655
|
+
// Note: URL was already added to processedOrQueuedUrls when initially queued or discovered in CSS
|
573
656
|
|
574
657
|
// --- Process CSS for nested assets ---
|
575
658
|
// Only if it's CSS and we successfully decoded its content for parsing
|
576
659
|
if (asset.type === 'css' && cssContentForParsing) {
|
577
|
-
// Determine the base URL *for this specific CSS file*
|
578
|
-
const cssBaseContextUrl = determineBaseUrl(asset.url, logger);
|
579
|
-
|
660
|
+
// Determine the base URL *for this specific CSS file* to resolve its relative links
|
661
|
+
const cssBaseContextUrl = determineBaseUrl(asset.url, logger); // CSS URL is absolute here
|
662
|
+
logger?.debug(`CSS base context for resolving nested assets within ${asset.url}: ${cssBaseContextUrl}`);
|
580
663
|
|
581
664
|
if (cssBaseContextUrl) {
|
582
|
-
//
|
665
|
+
// Extract URLs found within this CSS content
|
583
666
|
const newlyDiscoveredAssets = extractUrlsFromCSS(
|
584
667
|
cssContentForParsing,
|
585
|
-
cssBaseContextUrl,
|
668
|
+
cssBaseContextUrl, // Use the CSS file's own URL as the base
|
586
669
|
logger
|
587
670
|
);
|
588
671
|
|
672
|
+
// If new assets were found in the CSS
|
589
673
|
if (newlyDiscoveredAssets.length > 0) {
|
590
674
|
logger?.debug(`Discovered ${newlyDiscoveredAssets.length} nested assets in CSS ${asset.url}. Checking against queue...`);
|
675
|
+
// Process each newly discovered asset
|
591
676
|
for (const newAsset of newlyDiscoveredAssets) {
|
592
|
-
// CHECK: Add to queue only if this resolved URL hasn't been processed OR queued before.
|
593
|
-
// Use the 'processedOrQueuedUrls' Set which tracks both.
|
677
|
+
// CHECK: Add to the main processing queue only if this resolved URL hasn't been processed OR queued before.
|
594
678
|
if (!processedOrQueuedUrls.has(newAsset.url)) {
|
595
679
|
processedOrQueuedUrls.add(newAsset.url); // Mark as queued now
|
596
|
-
assetsToProcess.push(newAsset);
|
597
|
-
|
680
|
+
assetsToProcess.push(newAsset); // Add to the queue for the *next* iteration
|
681
|
+
logger?.debug(` -> Queued new nested asset: ${newAsset.url}`);
|
598
682
|
} else {
|
599
|
-
|
683
|
+
// Skip if already handled
|
684
|
+
logger?.debug(` -> Skipping already processed/queued nested asset: ${newAsset.url}`);
|
600
685
|
}
|
601
686
|
}
|
602
687
|
}
|
603
688
|
} else {
|
604
|
-
|
689
|
+
// Warn if the base URL for the CSS file couldn't be determined (shouldn't happen if asset.url was valid)
|
690
|
+
logger?.warn(`Could not determine base URL context for CSS file ${asset.url}. Cannot resolve nested relative paths within it.`);
|
605
691
|
}
|
606
692
|
} // End if(asset.type === 'css' && cssContentForParsing)
|
607
693
|
} // End for loop over currentBatch
|
608
|
-
} // End while loop
|
694
|
+
} // End while loop (assetsToProcess.length > 0)
|
609
695
|
|
610
|
-
|
696
|
+
// Log completion summary
|
697
|
+
const finalIterationCount = iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS ? `${MAX_ASSET_EXTRACTION_ITERATIONS}+ (limit hit)` : iterationCount;
|
611
698
|
logger?.info(`✅ Asset extraction COMPLETE! Found ${finalAssetsMap.size} unique assets in ${finalIterationCount} iterations.`);
|
612
699
|
|
613
|
-
// Return the original HTML content and the final list of processed assets
|
700
|
+
// Return the original HTML content and the final list of processed assets from the map
|
614
701
|
return {
|
615
702
|
htmlContent: parsed.htmlContent,
|
616
703
|
assets: Array.from(finalAssetsMap.values())
|