portapack 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc.json +67 -8
- package/.releaserc.js +25 -27
- package/CHANGELOG.md +14 -22
- package/LICENSE.md +21 -0
- package/README.md +22 -53
- package/commitlint.config.js +30 -34
- package/dist/cli/cli-entry.cjs +183 -98
- package/dist/cli/cli-entry.cjs.map +1 -1
- package/dist/index.d.ts +0 -3
- package/dist/index.js +178 -97
- package/dist/index.js.map +1 -1
- package/docs/.vitepress/config.ts +38 -33
- package/docs/.vitepress/sidebar-generator.ts +89 -38
- package/docs/architecture.md +186 -0
- package/docs/cli.md +23 -23
- package/docs/code-of-conduct.md +7 -1
- package/docs/configuration.md +12 -11
- package/docs/contributing.md +6 -2
- package/docs/deployment.md +10 -5
- package/docs/development.md +8 -5
- package/docs/getting-started.md +13 -13
- package/docs/index.md +1 -1
- package/docs/public/android-chrome-192x192.png +0 -0
- package/docs/public/android-chrome-512x512.png +0 -0
- package/docs/public/apple-touch-icon.png +0 -0
- package/docs/public/favicon-16x16.png +0 -0
- package/docs/public/favicon-32x32.png +0 -0
- package/docs/public/favicon.ico +0 -0
- package/docs/roadmap.md +233 -0
- package/docs/site.webmanifest +1 -0
- package/docs/troubleshooting.md +12 -1
- package/examples/main.ts +5 -30
- package/examples/sample-project/script.js +1 -1
- package/jest.config.ts +8 -13
- package/nodemon.json +5 -10
- package/package.json +2 -5
- package/src/cli/cli-entry.ts +2 -2
- package/src/cli/cli.ts +21 -16
- package/src/cli/options.ts +127 -113
- package/src/core/bundler.ts +253 -222
- package/src/core/extractor.ts +632 -565
- package/src/core/minifier.ts +173 -162
- package/src/core/packer.ts +141 -137
- package/src/core/parser.ts +74 -73
- package/src/core/web-fetcher.ts +270 -258
- package/src/index.ts +18 -17
- package/src/types.ts +9 -11
- package/src/utils/font.ts +12 -6
- package/src/utils/logger.ts +110 -105
- package/src/utils/meta.ts +75 -76
- package/src/utils/mime.ts +50 -50
- package/src/utils/slugify.ts +33 -34
- package/tests/unit/cli/cli-entry.test.ts +72 -70
- package/tests/unit/cli/cli.test.ts +314 -278
- package/tests/unit/cli/options.test.ts +294 -301
- package/tests/unit/core/bundler.test.ts +426 -329
- package/tests/unit/core/extractor.test.ts +793 -549
- package/tests/unit/core/minifier.test.ts +374 -274
- package/tests/unit/core/packer.test.ts +298 -264
- package/tests/unit/core/parser.test.ts +538 -150
- package/tests/unit/core/web-fetcher.test.ts +389 -359
- package/tests/unit/index.test.ts +238 -197
- package/tests/unit/utils/font.test.ts +26 -21
- package/tests/unit/utils/logger.test.ts +267 -260
- package/tests/unit/utils/meta.test.ts +29 -28
- package/tests/unit/utils/mime.test.ts +73 -74
- package/tests/unit/utils/slugify.test.ts +14 -12
- package/tsconfig.build.json +9 -10
- package/tsconfig.jest.json +1 -1
- package/tsconfig.json +2 -2
- package/tsup.config.ts +8 -9
- package/typedoc.json +5 -9
- /package/docs/{portapack-transparent.png → public/portapack-transparent.png} +0 -0
- /package/docs/{portapack.jpg → public/portapack.jpg} +0 -0
package/src/core/extractor.ts
CHANGED
@@ -2,7 +2,6 @@
|
|
2
2
|
* @file src/core/extractor.ts
|
3
3
|
* @description Handles discovery, resolution, fetching, and optional embedding of assets
|
4
4
|
* linked from HTML and recursively within CSS (@import, url()). This is the heart of finding EVERYTHING.
|
5
|
-
* @version 1.1.6 - Revised fetchAsset error handling logic for Axios errors.
|
6
5
|
*/
|
7
6
|
|
8
7
|
// === Node.js Core Imports ===
|
@@ -14,7 +13,12 @@ import { fileURLToPath, URL } from 'url'; // Crucial for file path/URL conversio
|
|
14
13
|
|
15
14
|
// === External Dependencies ===
|
16
15
|
import * as axiosNs from 'axios'; // Using namespace import for clarity
|
17
|
-
import type {
|
16
|
+
import type {
|
17
|
+
AxiosError,
|
18
|
+
AxiosRequestConfig,
|
19
|
+
AxiosResponse,
|
20
|
+
InternalAxiosRequestConfig,
|
21
|
+
} from 'axios'; // Import necessary types
|
18
22
|
|
19
23
|
// === Project Imports ===
|
20
24
|
import type { Asset, ParsedHTML } from '../types'; // Adjust path if needed
|
@@ -30,7 +34,6 @@ const BINARY_ASSET_TYPES: Set<Asset['type']> = new Set(['image', 'font', 'video'
|
|
30
34
|
const MAX_ASSET_EXTRACTION_ITERATIONS = 1000;
|
31
35
|
|
32
36
|
// === Helper Functions ===
|
33
|
-
|
34
37
|
/**
|
35
38
|
* Custom type for Node.js error objects with a `code` property.
|
36
39
|
*/
|
@@ -43,15 +46,15 @@ type NodeJSErrnoException = Error & { code?: string };
|
|
43
46
|
* @returns {boolean} True if re-encoding doesn't match original buffer (lossy), false otherwise.
|
44
47
|
*/
|
45
48
|
function isUtf8DecodingLossy(originalBuffer: Buffer, decodedString: string): boolean {
  // Round-trip check: encode the decoded text back to UTF-8 bytes and compare with the
  // source bytes. Any difference means the string does not faithfully represent the buffer.
  try {
    const reEncodedBuffer = Buffer.from(decodedString, 'utf-8');
    return !originalBuffer.equals(reEncodedBuffer);
  } catch {
    // Re-encoding or comparison threw (e.g. decodedString is not actually a string);
    // report the decode as lossy so the caller does not trust the text form.
    return true;
  }
}
|
56
59
|
|
57
60
|
/**
|
@@ -62,92 +65,98 @@ function isUtf8DecodingLossy(originalBuffer: Buffer, decodedString: string): boo
|
|
62
65
|
* @returns {string | undefined} The absolute base URL string ending in '/', or undefined if determination fails.
|
63
66
|
*/
|
64
67
|
function determineBaseUrl(inputPathOrUrl: string, logger?: Logger): string | undefined {
  // Derives the "directory" URL that relative asset references should be resolved against:
  //  - http(s) input  -> same origin, path cut after the last '/', query and fragment removed
  //  - file: URL/path -> file: URL of the containing directory (or of the directory itself)
  //  - anything else containing '://' -> unsupported, returns undefined
  logger?.debug(`Determining base URL for input: ${inputPathOrUrl}`);

  // Nothing to resolve against when no input was supplied.
  if (!inputPathOrUrl) {
    logger?.warn('Cannot determine base URL: inputPathOrUrl is empty or invalid.');
    return undefined;
  }

  try {
    // --- Remote documents (HTTP / HTTPS) ---
    if (/^https?:\/\//i.test(inputPathOrUrl)) {
      const url = new URL(inputPathOrUrl);
      // Keep everything up to and including the last '/' of the path.
      url.pathname = url.pathname.substring(0, url.pathname.lastIndexOf('/') + 1);
      url.search = ''; // Remove query parameters
      url.hash = ''; // Remove fragments
      const baseUrl = url.href;
      logger?.debug(`Determined remote base URL: ${baseUrl}`);
      return baseUrl;
    }

    // --- Other URL schemes are not supported ---
    if (inputPathOrUrl.includes('://') && !inputPathOrUrl.startsWith('file:')) {
      logger?.warn(
        `Input "${inputPathOrUrl}" looks like a URL but uses an unsupported protocol. Cannot determine base URL.`
      );
      return undefined;
    }

    // --- Local file paths and file: URLs ---
    let resourcePath: string; // Absolute path of the input file or directory
    let isInputLikelyDirectory = false;

    if (inputPathOrUrl.startsWith('file:')) {
      resourcePath = fileURLToPath(inputPathOrUrl);
      // A file: URL ending in '/' strongly suggests a directory.
      isInputLikelyDirectory = inputPathOrUrl.endsWith('/');
    } else {
      resourcePath = path.resolve(inputPathOrUrl);
      try {
        // Only an existing directory counts as a directory.
        isInputLikelyDirectory = fs.statSync(resourcePath).isDirectory();
      } catch {
        // stat failed (e.g. ENOENT, EACCES): assume the input names a file.
        isInputLikelyDirectory = false;
      }
    }

    // The base is the input itself when it is a directory, otherwise its parent directory.
    const baseDirPath = isInputLikelyDirectory ? resourcePath : path.dirname(resourcePath);

    // Normalise to URL-style separators.
    let normalizedPathForURL = baseDirPath.replace(/\\/g, '/');
    // Windows drive paths need a leading slash inside a file URL (e.g. /C:/...).
    if (/^[A-Z]:\//i.test(normalizedPathForURL) && !normalizedPathForURL.startsWith('/')) {
      normalizedPathForURL = '/' + normalizedPathForURL;
    }
    // A directory URL must end with '/' so relative references resolve inside it.
    if (!normalizedPathForURL.endsWith('/')) {
      normalizedPathForURL += '/';
    }

    // Escape the characters that are legal in file names but structural in URLs. Without
    // this, '#' starts a fragment, '?' starts a query and '%' is read as an escape, so the
    // resulting base URL would point at the wrong directory. '%' must be escaped first.
    const escapedPathForURL = normalizedPathForURL
      .replace(/%/g, '%25')
      .replace(/#/g, '%23')
      .replace(/\?/g, '%3F');

    // Let the URL parser perform the remaining encoding (e.g. spaces).
    const fileUrlString = new URL('file://' + escapedPathForURL).href;

    logger?.debug(
      `Determined base URL: ${fileUrlString} (from: ${inputPathOrUrl}, resolved base dir: ${baseDirPath})`
    );
    return fileUrlString;
  } catch (error: unknown) {
    // Any parsing/conversion failure means no usable base URL.
    const message = error instanceof Error ? error.message : String(error);
    logger?.error(
      `💀 Failed to determine base URL for "${inputPathOrUrl}": ${message}${error instanceof Error && error.stack ? ` - Stack: ${error.stack}` : ''}`
    );
    return undefined;
  }
}
|
152
161
|
|
153
162
|
/**
|
@@ -159,53 +168,59 @@ function determineBaseUrl(inputPathOrUrl: string, logger?: Logger): string | und
|
|
159
168
|
* @returns {URL | null} A validated, absolute URL object, or null if invalid/ignorable.
|
160
169
|
*/
|
161
170
|
function resolveAssetUrl(assetUrl: string, baseContextUrl?: string, logger?: Logger): URL | null {
  // Turns a raw reference found in HTML into an absolute URL object, or null when the
  // reference is empty, a data URI, a bare fragment, unresolvable, or not http/https/file.
  const candidate = assetUrl?.trim();
  if (!candidate || candidate.startsWith('data:') || candidate.startsWith('#')) {
    return null;
  }

  let target = candidate;

  // Protocol-relative reference (//host/path): borrow the scheme of the base context.
  if (baseContextUrl && target.startsWith('//')) {
    let scheme: string;
    try {
      scheme = new URL(baseContextUrl).protocol;
    } catch {
      logger?.warn(
        `Could not extract protocol from base "${baseContextUrl}" for protocol-relative URL "${candidate}". Skipping.`
      );
      return null;
    }
    target = scheme + target;
  }

  try {
    // The URL constructor handles absolute URLs, relative paths, '../' segments, etc.
    const absolute = new URL(target, baseContextUrl);
    const isSupported =
      absolute.protocol === 'http:' ||
      absolute.protocol === 'https:' ||
      absolute.protocol === 'file:';
    if (isSupported) {
      return absolute;
    }
    // e.g. mailto:, ws: - nothing that can be fetched as an asset.
    logger?.debug(`Skipping asset with unsupported protocol: ${absolute.href}`);
    return null;
  } catch (error: unknown) {
    const reason = error instanceof Error ? error.message : String(error);
    // A relative reference without any base is an expected failure; give it its own message.
    const looksRelative = !/^[a-z]+:/i.test(target) && !target.startsWith('/');
    if (looksRelative && !baseContextUrl) {
      logger?.warn(
        `Cannot resolve relative URL "${target}" - Base context URL was not provided or determined.`
      );
    } else {
      logger?.warn(
        `⚠️ Failed to parse/resolve URL "${target}" ${baseContextUrl ? 'against base "' + baseContextUrl + '"' : '(no base provided)'}: ${reason}`
      );
    }
    return null;
  }
}
|
210
225
|
|
211
226
|
/**
|
@@ -217,36 +232,34 @@ function resolveAssetUrl(assetUrl: string, baseContextUrl?: string, logger?: Log
|
|
217
232
|
* @returns {string | null} The resolved absolute URL string, or null if resolution fails/invalid.
|
218
233
|
*/
|
219
234
|
function resolveCssRelativeUrl(
  relativeUrl: string,
  cssBaseContextUrl: string, // e.g., file:///C:/mock/base/dir/css/deep.css or https://.../style.css
  logger?: Logger
): string | null {
  // Resolves a reference found inside a stylesheet against that stylesheet's own URL and
  // returns the absolute URL string, or null when the reference is ignorable or invalid.

  // Trim first so padded values are classified correctly. Without this, a whitespace-only
  // value resolves to the stylesheet itself and ' #frag' / ' data:...' slip past the checks.
  const trimmedUrl = relativeUrl?.trim();

  // Ignore empty values, data URIs and fragment-only references.
  if (!trimmedUrl || trimmedUrl.startsWith('data:') || trimmedUrl.startsWith('#')) {
    return null;
  }

  try {
    // The URL constructor handles relative paths (including '../') against the CSS file URL.
    return new URL(trimmedUrl, cssBaseContextUrl).href;
  } catch (error: unknown) {
    logger?.warn(
      `Failed to resolve CSS URL: "${relativeUrl}" relative to "${cssBaseContextUrl}": ${String(error)}`
    );
    return null;
  }
}
|
248
262
|
|
249
|
-
|
250
263
|
/**
|
251
264
|
* Asynchronously fetches the content of a resolved asset URL (http, https, file).
|
252
265
|
* @async
|
@@ -255,94 +268,103 @@ function resolveCssRelativeUrl(
|
|
255
268
|
* @param {number} [timeout=10000] - Network timeout in milliseconds for HTTP(S) requests.
|
256
269
|
* @returns {Promise<Buffer | null>} Asset content as a Buffer, or null on failure.
|
257
270
|
*/
|
258
|
-
async function fetchAsset(
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
271
|
+
async function fetchAsset(
  resolvedUrl: URL,
  logger?: Logger,
  timeout: number = 10000
): Promise<Buffer | null> {
  // Loads the bytes behind an already-resolved URL: axios for http(s), readFile for file:.
  // Never throws; every failure is logged and reported as null.
  logger?.debug(`Attempting to fetch asset: ${resolvedUrl.href}`);
  const protocol = resolvedUrl.protocol;

  try {
    // --- Remote assets ---
    if (protocol === 'http:' || protocol === 'https:') {
      const response: AxiosResponse<ArrayBuffer> = await axiosNs.default.get(resolvedUrl.href, {
        responseType: 'arraybuffer', // Fetch as binary data
        timeout: timeout, // Apply network timeout
      });
      logger?.debug(
        `Fetched remote asset ${resolvedUrl.href} (Status: ${response.status}, Type: ${response.headers['content-type'] || 'N/A'}, Size: ${response.data?.byteLength ?? 0} bytes)`
      );
      return Buffer.from(response.data);
    }

    // --- Local assets ---
    if (protocol === 'file:') {
      let filePath: string;
      try {
        // NOTE: converting to a path drops any query string or fragment on the URL.
        filePath = fileURLToPath(resolvedUrl);
      } catch (e: unknown) {
        const reason = e instanceof Error ? e.message : String(e);
        logger?.error(`Could not convert file URL to path: ${resolvedUrl.href}. Error: ${reason}`);
        return null;
      }

      const data = await readFile(filePath);
      logger?.debug(`Read local file ${filePath} (${data.byteLength} bytes)`);
      return data;
    }

    // --- Anything else cannot be fetched ---
    logger?.warn(`Unsupported protocol "${protocol}" in URL: ${resolvedUrl.href}`);
    return null;
  } catch (error: unknown) {
    const isAxiosFailure =
      typeof error === 'object' &&
      error !== null &&
      (error as { isAxiosError?: unknown }).isAxiosError === true;

    if ((protocol === 'http:' || protocol === 'https:') && isAxiosFailure) {
      // Network-level failure reported by axios (code is e.g. ECONNABORTED on timeout).
      const axiosError = error as AxiosError;
      const code = axiosError.code ?? 'N/A';
      logger?.warn(
        `⚠️ Failed to fetch remote asset ${resolvedUrl.href}: ${axiosError.message} (Code: ${code})`
      );
    } else if (protocol === 'file:' && error instanceof Error) {
      // File-system failure: report the local path rather than the URL when possible.
      let failedPath = resolvedUrl.href;
      try {
        failedPath = fileURLToPath(resolvedUrl);
      } catch {
        /* keep the URL string as the identifier */
      }
      failedPath = path.normalize(failedPath);

      const fsCode = (error as NodeJSErrnoException).code;
      if (fsCode === 'ENOENT') {
        logger?.warn(`⚠️ File not found (ENOENT) for asset: ${failedPath}.`);
      } else if (fsCode === 'EACCES') {
        logger?.warn(`⚠️ Permission denied (EACCES) reading asset: ${failedPath}.`);
      } else {
        logger?.warn(`⚠️ Failed to read local asset ${failedPath}: ${error.message}`);
      }
    } else if (error instanceof Error) {
      // Errors that are neither axios failures nor file-system failures.
      logger?.warn(
        `⚠️ An unexpected error occurred processing asset ${resolvedUrl.href}: ${error.message}`
      );
    } else {
      // Non-Error throwables (strings, numbers, ...).
      logger?.warn(
        `⚠️ An unknown and unexpected error occurred processing asset ${resolvedUrl.href}: ${String(error)}`
      );
    }
    // Every failure path yields null.
    return null;
  }
}
|
347
369
|
|
348
370
|
/**
|
@@ -354,62 +376,65 @@ async function fetchAsset(resolvedUrl: URL, logger?: Logger, timeout: number = 1
|
|
354
376
|
* @returns {Asset[]} An array of newly discovered Asset objects (type, resolved URL, content initially undefined).
|
355
377
|
*/
|
356
378
|
function extractUrlsFromCSS(
  cssContent: string,
  cssBaseContextUrl: string,
  logger?: Logger
): Asset[] {
  // Scans stylesheet text for url(...) references and @import rules, resolves each against
  // the stylesheet's URL, and returns one Asset (content unset) per distinct resolved URL.
  const found: Asset[] = [];
  // Resolved URLs already recorded for this stylesheet; prevents duplicates within one parse.
  const seenUrls = new Set<string>();

  /** Validates, resolves and records a single raw reference. */
  const record = (candidate: string | undefined, ruleType: '@import' | 'url()'): void => {
    // Nothing to do for missing/blank values, data URIs or fragment-only references.
    if (!candidate || candidate.trim() === '') return;
    if (candidate.startsWith('data:') || candidate.startsWith('#')) return;

    const absoluteUrl = resolveCssRelativeUrl(candidate, cssBaseContextUrl, logger);
    if (!absoluteUrl || seenUrls.has(absoluteUrl)) return;
    seenUrls.add(absoluteUrl);

    // Classify the asset from its resolved URL.
    const { assetType } = guessMimeType(absoluteUrl);
    found.push({
      type: assetType,
      url: absoluteUrl,
      content: undefined, // fetched later if needed
    });
    logger?.debug(
      `Discovered nested ${assetType} asset (${ruleType}) in CSS ${cssBaseContextUrl}: ${absoluteUrl}`
    );
  };

  // Pass 1: url(...) with optional matching quotes; capture group 2 is the reference.
  const urlPattern = /url\(\s*(['"]?)(.*?)\1\s*\)/gi;
  for (let hit = urlPattern.exec(cssContent); hit !== null; hit = urlPattern.exec(cssContent)) {
    record(hit[2], 'url()');
  }

  // Pass 2: @import url(...) (group 2) or @import "..." (group 4).
  const importPattern = /@import\s+(?:url\(\s*(['"]?)(.*?)\1\s*\)|(['"])(.*?)\3)\s*;/gi;
  for (
    let hit = importPattern.exec(cssContent);
    hit !== null;
    hit = importPattern.exec(cssContent)
  ) {
    record(hit[2] || hit[4], '@import');
  }

  return found;
}
|
414
439
|
|
415
440
|
/**
|
@@ -427,279 +452,321 @@ function extractUrlsFromCSS(
|
|
427
452
|
* @returns {Promise<ParsedHTML>} Processed data with `htmlContent` and the final `assets` array containing all discovered assets (with content if `embedAssets` was true and fetch succeeded).
|
428
453
|
*/
|
429
454
|
export async function extractAssets(
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
455
|
+
parsed: ParsedHTML,
|
456
|
+
embedAssets = true,
|
457
|
+
inputPathOrUrl?: string,
|
458
|
+
logger?: Logger
|
434
459
|
): Promise<ParsedHTML> {
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
460
|
+
logger?.info(
|
461
|
+
`🚀 Starting asset extraction! Embed: ${embedAssets}. Input: ${inputPathOrUrl || '(HTML content only)'}`
|
462
|
+
);
|
463
|
+
|
464
|
+
// Get the initial list of assets found directly in the HTML
|
465
|
+
const initialAssets: Asset[] = parsed.assets || [];
|
466
|
+
// Stores the final result: Map<resolved URL string, Asset object> to ensure uniqueness
|
467
|
+
const finalAssetsMap = new Map<string, Asset>();
|
468
|
+
// Queue holds assets whose content needs to be processed (fetched/analyzed)
|
469
|
+
let assetsToProcess: Asset[] = [];
|
470
|
+
// Set to track URLs that are either already fully processed (in finalAssetsMap)
|
471
|
+
// OR currently in the processing queue (assetsToProcess) to prevent reprocessing/loops.
|
472
|
+
const processedOrQueuedUrls = new Set<string>();
|
473
|
+
|
474
|
+
// --- Determine Base URL Context for the HTML ---
|
475
|
+
const htmlBaseContextUrl = determineBaseUrl(inputPathOrUrl || '', logger);
|
476
|
+
// Warn if no base URL could be found and there are relative paths in the initial assets
|
477
|
+
if (
|
478
|
+
!htmlBaseContextUrl &&
|
479
|
+
initialAssets.some(
|
480
|
+
a =>
|
481
|
+
!/^[a-z]+:/i.test(a.url) &&
|
482
|
+
!a.url.startsWith('data:') &&
|
483
|
+
!a.url.startsWith('#') &&
|
484
|
+
!a.url.startsWith('/')
|
485
|
+
)
|
486
|
+
) {
|
487
|
+
logger?.warn(
|
488
|
+
'🚨 No valid base path/URL determined for the HTML source! Resolution of relative asset paths from HTML may fail.'
|
489
|
+
);
|
490
|
+
} else if (htmlBaseContextUrl) {
|
491
|
+
logger?.debug(`Using HTML base context URL: ${htmlBaseContextUrl}`);
|
492
|
+
}
|
493
|
+
|
494
|
+
// --- Initial Queue Population from HTML assets ---
|
495
|
+
logger?.debug(`Queueing ${initialAssets.length} initial assets parsed from HTML...`);
|
496
|
+
for (const asset of initialAssets) {
|
497
|
+
// Resolve the initial asset URL against the HTML base context
|
498
|
+
const resolvedUrlObj = resolveAssetUrl(asset.url, htmlBaseContextUrl, logger);
|
499
|
+
|
500
|
+
// Skip if URL is invalid, data URI, fragment, or unsupported protocol
|
501
|
+
if (!resolvedUrlObj) {
|
502
|
+
logger?.debug(` -> Skipping initial asset with unresolvable/ignorable URL: ${asset.url}`);
|
503
|
+
continue;
|
454
504
|
}
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
505
|
+
// Get the resolved absolute URL string
|
506
|
+
const urlToQueue = resolvedUrlObj.href;
|
507
|
+
|
508
|
+
// Check if this URL is already tracked (processed or queued)
|
509
|
+
if (!processedOrQueuedUrls.has(urlToQueue)) {
|
510
|
+
// Mark as queued (add to set *before* adding to array)
|
511
|
+
processedOrQueuedUrls.add(urlToQueue);
|
512
|
+
|
513
|
+
// Guess type from the resolved/original URL if not provided initially
|
514
|
+
const { assetType: guessedType } = guessMimeType(urlToQueue);
|
515
|
+
const initialType = asset.type ?? guessedType; // Use provided type or fallback to guessed type
|
516
|
+
|
517
|
+
// Add the resolved asset to the processing queue
|
518
|
+
assetsToProcess.push({
|
519
|
+
url: urlToQueue, // Use the resolved URL
|
520
|
+
type: initialType,
|
521
|
+
content: undefined, // Content is initially undefined
|
522
|
+
});
|
523
|
+
logger?.debug(` -> Queued initial asset: ${urlToQueue} (Original raw: ${asset.url})`);
|
524
|
+
} else {
|
525
|
+
logger?.debug(` -> Skipping already processed/queued initial asset: ${urlToQueue}`);
|
526
|
+
}
|
527
|
+
}
|
528
|
+
|
529
|
+
// --- Main processing loop (continues as long as there are assets to process) ---
|
530
|
+
let iterationCount = 0;
|
531
|
+
while (assetsToProcess.length > 0) {
|
532
|
+
iterationCount++;
|
533
|
+
// Prevent potential infinite loops
|
534
|
+
if (iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS) {
|
535
|
+
logger?.error(
|
536
|
+
`🛑 Asset extraction loop limit hit (${MAX_ASSET_EXTRACTION_ITERATIONS})! Aborting.`
|
537
|
+
);
|
538
|
+
const remainingUrls = assetsToProcess
|
539
|
+
.map(a => a.url)
|
540
|
+
.slice(0, 10)
|
541
|
+
.join(', ');
|
542
|
+
logger?.error(
|
543
|
+
`Remaining queue sample (${assetsToProcess.length} items): ${remainingUrls}...`
|
544
|
+
);
|
545
|
+
// Add assets remaining in queue to final map without content before breaking
|
546
|
+
assetsToProcess.forEach(asset => {
|
547
|
+
if (!finalAssetsMap.has(asset.url)) {
|
548
|
+
finalAssetsMap.set(asset.url, { ...asset, content: undefined });
|
488
549
|
}
|
550
|
+
});
|
551
|
+
assetsToProcess = []; // Clear queue to stop the loop
|
552
|
+
break; // Exit loop
|
489
553
|
}
|
490
554
|
|
491
|
-
//
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
// Prevent potential infinite loops
|
496
|
-
if (iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS) {
|
497
|
-
logger?.error(`🛑 Asset extraction loop limit hit (${MAX_ASSET_EXTRACTION_ITERATIONS})! Aborting.`);
|
498
|
-
const remainingUrls = assetsToProcess.map(a => a.url).slice(0, 10).join(', ');
|
499
|
-
logger?.error(`Remaining queue sample (${assetsToProcess.length} items): ${remainingUrls}...`);
|
500
|
-
// Add assets remaining in queue to final map without content before breaking
|
501
|
-
assetsToProcess.forEach(asset => {
|
502
|
-
if (!finalAssetsMap.has(asset.url)) {
|
503
|
-
finalAssetsMap.set(asset.url, { ...asset, content: undefined });
|
504
|
-
}
|
505
|
-
});
|
506
|
-
assetsToProcess = []; // Clear queue to stop the loop
|
507
|
-
break; // Exit loop
|
508
|
-
}
|
555
|
+
// Take a snapshot of the current queue to process in this iteration
|
556
|
+
const currentBatch = [...assetsToProcess];
|
557
|
+
// Clear the main queue; new assets found in this batch will be added here for the *next* iteration
|
558
|
+
assetsToProcess = [];
|
509
559
|
|
510
|
-
|
511
|
-
const currentBatch = [...assetsToProcess];
|
512
|
-
// Clear the main queue; new assets found in this batch will be added here for the *next* iteration
|
513
|
-
assetsToProcess = [];
|
560
|
+
logger?.debug(`--- Processing batch ${iterationCount}: ${currentBatch.length} asset(s) ---`);
|
514
561
|
|
515
|
-
|
562
|
+
// Process each asset in the current batch
|
563
|
+
for (const asset of currentBatch) {
|
564
|
+
// Double-check: Skip if this asset somehow got fully processed in a previous iteration (shouldn't happen with current logic, but safe check)
|
565
|
+
if (finalAssetsMap.has(asset.url)) {
|
566
|
+
logger?.debug(`Skipping asset already in final map: ${asset.url}`);
|
567
|
+
continue;
|
568
|
+
}
|
516
569
|
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
if (finalAssetsMap.has(asset.url)) {
|
521
|
-
logger?.debug(`Skipping asset already in final map: ${asset.url}`);
|
522
|
-
continue;
|
523
|
-
}
|
570
|
+
let assetContentBuffer: Buffer | null = null; // To store fetched binary content
|
571
|
+
let finalContent: string | undefined = undefined; // Final content (text or data URI) for the Asset object
|
572
|
+
let cssContentForParsing: string | undefined = undefined; // Text content specifically for parsing CSS
|
524
573
|
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
continue;
|
546
|
-
}
|
547
|
-
|
548
|
-
// --- Fetch Asset ---
|
549
|
-
if (assetUrlObj) {
|
550
|
-
// Call fetchAsset (which handles http/https/file and errors)
|
551
|
-
assetContentBuffer = await fetchAsset(assetUrlObj, logger);
|
552
|
-
// fetchAsset returns null on failure
|
553
|
-
}
|
554
|
-
} // End if(needsFetching)
|
555
|
-
|
556
|
-
// --- If fetching was required but failed, store asset without content and continue ---
|
557
|
-
if (needsFetching && assetContentBuffer === null) {
|
558
|
-
logger?.debug(`Storing asset ${asset.url} without content due to fetch failure.`);
|
559
|
-
// Add to final map with undefined content
|
560
|
-
finalAssetsMap.set(asset.url, { ...asset, content: undefined });
|
561
|
-
// Skip to the next asset in the current batch
|
562
|
-
continue;
|
563
|
-
}
|
574
|
+
// --- Determine if fetching is needed ---
|
575
|
+
// Fetch if we need to embed all assets OR if it's CSS (we need content to parse for nested assets)
|
576
|
+
const needsFetching = embedAssets || asset.type === 'css';
|
577
|
+
let assetUrlObj: URL | null = null; // URL object needed for fetchAsset
|
578
|
+
|
579
|
+
if (needsFetching) {
|
580
|
+
// --- Create URL object for fetching ---
|
581
|
+
try {
|
582
|
+
// Asset URL should be absolute at this point
|
583
|
+
assetUrlObj = new URL(asset.url);
|
584
|
+
} catch (urlError) {
|
585
|
+
// Log error if creating URL object fails
|
586
|
+
logger?.warn(
|
587
|
+
`Cannot create URL object for "${asset.url}", skipping fetch. Error: ${urlError instanceof Error ? urlError.message : String(urlError)}`
|
588
|
+
);
|
589
|
+
// Store asset without content in the final map
|
590
|
+
finalAssetsMap.set(asset.url, { ...asset, content: undefined });
|
591
|
+
// Skip to next asset in the current batch
|
592
|
+
continue;
|
593
|
+
}
|
564
594
|
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
|
590
|
-
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
const attemptedTextContent = assetContentBuffer.toString('utf-8');
|
629
|
-
if (isUtf8DecodingLossy(assetContentBuffer, attemptedTextContent)) {
|
630
|
-
// If text decoding is lossy, warn and use base64
|
631
|
-
logger?.warn(`Couldn't embed unclassified asset ${asset.url} as text due to invalid UTF-8 sequences. Falling back to base64 (octet-stream).`);
|
632
|
-
finalContent = `data:application/octet-stream;base64,${assetContentBuffer.toString('base64')}`;
|
633
|
-
} else {
|
634
|
-
// Store as text if decoding worked
|
635
|
-
finalContent = attemptedTextContent;
|
636
|
-
logger?.debug(`Successfully embedded unclassified asset ${asset.url} as text.`);
|
637
|
-
}
|
638
|
-
} catch (decodeError) {
|
639
|
-
// If toString fails, warn and use base64
|
640
|
-
logger?.warn(`Error during text decoding for unclassified asset ${asset.url}: ${decodeError instanceof Error ? decodeError.message : String(decodeError)}. Falling back to base64.`);
|
641
|
-
finalContent = `data:application/octet-stream;base64,${assetContentBuffer.toString('base64')}`;
|
642
|
-
}
|
643
|
-
} else {
|
644
|
-
finalContent = undefined; // Not embedding
|
645
|
-
}
|
646
|
-
}
|
647
|
-
} else { // Content was not fetched (e.g., embedAssets=false and not CSS)
|
648
|
-
finalContent = undefined;
|
649
|
-
cssContentForParsing = undefined;
|
595
|
+
// --- Fetch Asset ---
|
596
|
+
if (assetUrlObj) {
|
597
|
+
// Call fetchAsset (which handles http/https/file and errors)
|
598
|
+
assetContentBuffer = await fetchAsset(assetUrlObj, logger);
|
599
|
+
// fetchAsset returns null on failure
|
600
|
+
}
|
601
|
+
} // End if(needsFetching)
|
602
|
+
|
603
|
+
// --- If fetching was required but failed, store asset without content and continue ---
|
604
|
+
if (needsFetching && assetContentBuffer === null) {
|
605
|
+
logger?.debug(`Storing asset ${asset.url} without content due to fetch failure.`);
|
606
|
+
// Add to final map with undefined content
|
607
|
+
finalAssetsMap.set(asset.url, { ...asset, content: undefined });
|
608
|
+
// Skip to the next asset in the current batch
|
609
|
+
continue;
|
610
|
+
}
|
611
|
+
|
612
|
+
// --- Prepare Content for Storing/Embedding (if fetched successfully) ---
|
613
|
+
if (assetContentBuffer) {
|
614
|
+
// Only proceed if content was fetched
|
615
|
+
// Guess MIME type based on the asset's URL extension
|
616
|
+
const mimeInfo = guessMimeType(asset.url);
|
617
|
+
// Use the guessed MIME type or fallback to a generic binary type
|
618
|
+
const effectiveMime = mimeInfo.mime || 'application/octet-stream';
|
619
|
+
|
620
|
+
// Handle TEXT types (CSS, JS)
|
621
|
+
if (TEXT_ASSET_TYPES.has(asset.type)) {
|
622
|
+
let textContent: string | undefined;
|
623
|
+
let wasLossy = false;
|
624
|
+
try {
|
625
|
+
// Try decoding the buffer as UTF-8
|
626
|
+
textContent = assetContentBuffer.toString('utf-8');
|
627
|
+
// Check if the decoding process lost information (e.g., invalid sequences replaced)
|
628
|
+
wasLossy = isUtf8DecodingLossy(assetContentBuffer, textContent);
|
629
|
+
} catch (e) {
|
630
|
+
// Decoding itself failed
|
631
|
+
textContent = undefined;
|
632
|
+
wasLossy = true;
|
633
|
+
}
|
634
|
+
|
635
|
+
// If decoding was successful and not lossy
|
636
|
+
if (!wasLossy && textContent !== undefined) {
|
637
|
+
// If embedding, store the text content
|
638
|
+
if (embedAssets) {
|
639
|
+
finalContent = textContent;
|
640
|
+
} else {
|
641
|
+
finalContent = undefined; // Not embedding text, store undefined
|
642
|
+
}
|
643
|
+
// If it's CSS, store its text content for parsing regardless of embedding option
|
644
|
+
if (asset.type === 'css') {
|
645
|
+
cssContentForParsing = textContent;
|
646
|
+
}
|
647
|
+
} else {
|
648
|
+
// Decoding failed or was lossy
|
649
|
+
logger?.warn(
|
650
|
+
`Could not decode ${asset.type} asset ${asset.url} as valid UTF-8 text.${embedAssets ? ' Falling back to base64 data URI.' : ''}`
|
651
|
+
);
|
652
|
+
cssContentForParsing = undefined; // Cannot parse CSS if decoding failed
|
653
|
+
// Embed as base64 data URI if requested, using the effective MIME type
|
654
|
+
if (embedAssets) {
|
655
|
+
finalContent = `data:${effectiveMime};base64,${assetContentBuffer.toString('base64')}`;
|
656
|
+
} else {
|
657
|
+
finalContent = undefined; // Not embedding
|
650
658
|
}
|
659
|
+
}
|
660
|
+
}
|
661
|
+
// Handle BINARY types (image, font, video, audio)
|
662
|
+
else if (BINARY_ASSET_TYPES.has(asset.type)) {
|
663
|
+
// Embed as base64 data URI if requested
|
664
|
+
if (embedAssets) {
|
665
|
+
finalContent = `data:${effectiveMime};base64,${assetContentBuffer.toString('base64')}`;
|
666
|
+
} else {
|
667
|
+
finalContent = undefined; // Not embedding
|
668
|
+
}
|
669
|
+
cssContentForParsing = undefined; // Not CSS, so no parsing needed
|
670
|
+
}
|
671
|
+
// Handle 'other' or unknown types
|
672
|
+
else {
|
673
|
+
cssContentForParsing = undefined; // Assume not parseable as CSS
|
674
|
+
// If embedding, attempt to store as text, fallback to base64 if invalid UTF-8
|
675
|
+
if (embedAssets) {
|
676
|
+
try {
|
677
|
+
const attemptedTextContent = assetContentBuffer.toString('utf-8');
|
678
|
+
if (isUtf8DecodingLossy(assetContentBuffer, attemptedTextContent)) {
|
679
|
+
// If text decoding is lossy, warn and use base64
|
680
|
+
logger?.warn(
|
681
|
+
`Couldn't embed unclassified asset ${asset.url} as text due to invalid UTF-8 sequences. Falling back to base64 (octet-stream).`
|
682
|
+
);
|
683
|
+
finalContent = `data:application/octet-stream;base64,${assetContentBuffer.toString('base64')}`;
|
684
|
+
} else {
|
685
|
+
// Store as text if decoding worked
|
686
|
+
finalContent = attemptedTextContent;
|
687
|
+
logger?.debug(`Successfully embedded unclassified asset ${asset.url} as text.`);
|
688
|
+
}
|
689
|
+
} catch (decodeError) {
|
690
|
+
// If toString fails, warn and use base64
|
691
|
+
logger?.warn(
|
692
|
+
`Error during text decoding for unclassified asset ${asset.url}: ${decodeError instanceof Error ? decodeError.message : String(decodeError)}. Falling back to base64.`
|
693
|
+
);
|
694
|
+
finalContent = `data:application/octet-stream;base64,${assetContentBuffer.toString('base64')}`;
|
695
|
+
}
|
696
|
+
} else {
|
697
|
+
finalContent = undefined; // Not embedding
|
698
|
+
}
|
699
|
+
}
|
700
|
+
} else {
|
701
|
+
// Content was not fetched (e.g., embedAssets=false and not CSS)
|
702
|
+
finalContent = undefined;
|
703
|
+
cssContentForParsing = undefined;
|
704
|
+
}
|
705
|
+
|
706
|
+
// --- Store the final processed asset in the map ---
|
707
|
+
// Use the resolved URL as the key and ensure the asset object also uses the resolved URL
|
708
|
+
finalAssetsMap.set(asset.url, { ...asset, url: asset.url, content: finalContent });
|
709
|
+
// Note: URL was already added to processedOrQueuedUrls when initially queued or discovered in CSS
|
710
|
+
|
711
|
+
// --- Process CSS for nested assets ---
|
712
|
+
// Only if it's CSS and we successfully decoded its content for parsing
|
713
|
+
if (asset.type === 'css' && cssContentForParsing) {
|
714
|
+
// Determine the base URL *for this specific CSS file* to resolve its relative links
|
715
|
+
const cssBaseContextUrl = determineBaseUrl(asset.url, logger); // CSS URL is absolute here
|
716
|
+
logger?.debug(
|
717
|
+
`CSS base context for resolving nested assets within ${asset.url}: ${cssBaseContextUrl}`
|
718
|
+
);
|
651
719
|
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
689
|
-
|
690
|
-
|
691
|
-
|
692
|
-
|
693
|
-
|
694
|
-
|
695
|
-
|
696
|
-
|
697
|
-
|
698
|
-
|
699
|
-
|
700
|
-
|
701
|
-
|
702
|
-
|
703
|
-
|
704
|
-
|
705
|
-
}
|
720
|
+
if (cssBaseContextUrl) {
|
721
|
+
// Extract URLs found within this CSS content
|
722
|
+
const newlyDiscoveredAssets = extractUrlsFromCSS(
|
723
|
+
cssContentForParsing,
|
724
|
+
cssBaseContextUrl, // Use the CSS file's own URL as the base
|
725
|
+
logger
|
726
|
+
);
|
727
|
+
|
728
|
+
// If new assets were found in the CSS
|
729
|
+
if (newlyDiscoveredAssets.length > 0) {
|
730
|
+
logger?.debug(
|
731
|
+
`Discovered ${newlyDiscoveredAssets.length} nested assets in CSS ${asset.url}. Checking against queue...`
|
732
|
+
);
|
733
|
+
// Process each newly discovered asset
|
734
|
+
for (const newAsset of newlyDiscoveredAssets) {
|
735
|
+
// CHECK: Add to the main processing queue only if this resolved URL hasn't been processed OR queued before.
|
736
|
+
if (!processedOrQueuedUrls.has(newAsset.url)) {
|
737
|
+
processedOrQueuedUrls.add(newAsset.url); // Mark as queued now
|
738
|
+
assetsToProcess.push(newAsset); // Add to the queue for the *next* iteration
|
739
|
+
logger?.debug(` -> Queued new nested asset: ${newAsset.url}`);
|
740
|
+
} else {
|
741
|
+
// Skip if already handled
|
742
|
+
logger?.debug(
|
743
|
+
` -> Skipping already processed/queued nested asset: ${newAsset.url}`
|
744
|
+
);
|
745
|
+
}
|
746
|
+
}
|
747
|
+
}
|
748
|
+
} else {
|
749
|
+
// Warn if the base URL for the CSS file couldn't be determined (shouldn't happen if asset.url was valid)
|
750
|
+
logger?.warn(
|
751
|
+
`Could not determine base URL context for CSS file ${asset.url}. Cannot resolve nested relative paths within it.`
|
752
|
+
);
|
753
|
+
}
|
754
|
+
} // End if(asset.type === 'css' && cssContentForParsing)
|
755
|
+
} // End for loop over currentBatch
|
756
|
+
} // End while loop (assetsToProcess.length > 0)
|
757
|
+
|
758
|
+
// Log completion summary
|
759
|
+
const finalIterationCount =
|
760
|
+
iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS
|
761
|
+
? `${MAX_ASSET_EXTRACTION_ITERATIONS}+ (limit hit)`
|
762
|
+
: iterationCount;
|
763
|
+
logger?.info(
|
764
|
+
`✅ Asset extraction COMPLETE! Found ${finalAssetsMap.size} unique assets in ${finalIterationCount} iterations.`
|
765
|
+
);
|
766
|
+
|
767
|
+
// Return the original HTML content and the final list of processed assets from the map
|
768
|
+
return {
|
769
|
+
htmlContent: parsed.htmlContent,
|
770
|
+
assets: Array.from(finalAssetsMap.values()),
|
771
|
+
};
|
772
|
+
}
|