portapack 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. package/.eslintrc.json +67 -8
  2. package/.releaserc.js +25 -27
  3. package/CHANGELOG.md +14 -22
  4. package/LICENSE.md +21 -0
  5. package/README.md +22 -53
  6. package/commitlint.config.js +30 -34
  7. package/dist/cli/cli-entry.cjs +183 -98
  8. package/dist/cli/cli-entry.cjs.map +1 -1
  9. package/dist/index.d.ts +0 -3
  10. package/dist/index.js +178 -97
  11. package/dist/index.js.map +1 -1
  12. package/docs/.vitepress/config.ts +38 -33
  13. package/docs/.vitepress/sidebar-generator.ts +89 -38
  14. package/docs/architecture.md +186 -0
  15. package/docs/cli.md +23 -23
  16. package/docs/code-of-conduct.md +7 -1
  17. package/docs/configuration.md +12 -11
  18. package/docs/contributing.md +6 -2
  19. package/docs/deployment.md +10 -5
  20. package/docs/development.md +8 -5
  21. package/docs/getting-started.md +13 -13
  22. package/docs/index.md +1 -1
  23. package/docs/public/android-chrome-192x192.png +0 -0
  24. package/docs/public/android-chrome-512x512.png +0 -0
  25. package/docs/public/apple-touch-icon.png +0 -0
  26. package/docs/public/favicon-16x16.png +0 -0
  27. package/docs/public/favicon-32x32.png +0 -0
  28. package/docs/public/favicon.ico +0 -0
  29. package/docs/roadmap.md +233 -0
  30. package/docs/site.webmanifest +1 -0
  31. package/docs/troubleshooting.md +12 -1
  32. package/examples/main.ts +5 -30
  33. package/examples/sample-project/script.js +1 -1
  34. package/jest.config.ts +8 -13
  35. package/nodemon.json +5 -10
  36. package/package.json +2 -5
  37. package/src/cli/cli-entry.ts +2 -2
  38. package/src/cli/cli.ts +21 -16
  39. package/src/cli/options.ts +127 -113
  40. package/src/core/bundler.ts +253 -222
  41. package/src/core/extractor.ts +632 -565
  42. package/src/core/minifier.ts +173 -162
  43. package/src/core/packer.ts +141 -137
  44. package/src/core/parser.ts +74 -73
  45. package/src/core/web-fetcher.ts +270 -258
  46. package/src/index.ts +18 -17
  47. package/src/types.ts +9 -11
  48. package/src/utils/font.ts +12 -6
  49. package/src/utils/logger.ts +110 -105
  50. package/src/utils/meta.ts +75 -76
  51. package/src/utils/mime.ts +50 -50
  52. package/src/utils/slugify.ts +33 -34
  53. package/tests/unit/cli/cli-entry.test.ts +72 -70
  54. package/tests/unit/cli/cli.test.ts +314 -278
  55. package/tests/unit/cli/options.test.ts +294 -301
  56. package/tests/unit/core/bundler.test.ts +426 -329
  57. package/tests/unit/core/extractor.test.ts +793 -549
  58. package/tests/unit/core/minifier.test.ts +374 -274
  59. package/tests/unit/core/packer.test.ts +298 -264
  60. package/tests/unit/core/parser.test.ts +538 -150
  61. package/tests/unit/core/web-fetcher.test.ts +389 -359
  62. package/tests/unit/index.test.ts +238 -197
  63. package/tests/unit/utils/font.test.ts +26 -21
  64. package/tests/unit/utils/logger.test.ts +267 -260
  65. package/tests/unit/utils/meta.test.ts +29 -28
  66. package/tests/unit/utils/mime.test.ts +73 -74
  67. package/tests/unit/utils/slugify.test.ts +14 -12
  68. package/tsconfig.build.json +9 -10
  69. package/tsconfig.jest.json +1 -1
  70. package/tsconfig.json +2 -2
  71. package/tsup.config.ts +8 -9
  72. package/typedoc.json +5 -9
  73. /package/docs/{portapack-transparent.png → public/portapack-transparent.png} +0 -0
  74. /package/docs/{portapack.jpg → public/portapack.jpg} +0 -0
@@ -2,7 +2,6 @@
2
2
  * @file src/core/extractor.ts
3
3
  * @description Handles discovery, resolution, fetching, and optional embedding of assets
4
4
  * linked from HTML and recursively within CSS (@import, url()). This is the heart of finding EVERYTHING.
5
- * @version 1.1.6 - Revised fetchAsset error handling logic for Axios errors.
6
5
  */
7
6
 
8
7
  // === Node.js Core Imports ===
@@ -14,7 +13,12 @@ import { fileURLToPath, URL } from 'url'; // Crucial for file path/URL conversio
14
13
 
15
14
  // === External Dependencies ===
16
15
  import * as axiosNs from 'axios'; // Using namespace import for clarity
17
- import type { AxiosError, AxiosRequestConfig, AxiosResponse, InternalAxiosRequestConfig } from 'axios'; // Import necessary types
16
+ import type {
17
+ AxiosError,
18
+ AxiosRequestConfig,
19
+ AxiosResponse,
20
+ InternalAxiosRequestConfig,
21
+ } from 'axios'; // Import necessary types
18
22
 
19
23
  // === Project Imports ===
20
24
  import type { Asset, ParsedHTML } from '../types'; // Adjust path if needed
@@ -30,7 +34,6 @@ const BINARY_ASSET_TYPES: Set<Asset['type']> = new Set(['image', 'font', 'video'
30
34
  const MAX_ASSET_EXTRACTION_ITERATIONS = 1000;
31
35
 
32
36
  // === Helper Functions ===
33
-
34
37
  /**
35
38
  * Custom type for Node.js error objects with a `code` property.
36
39
  */
@@ -43,15 +46,15 @@ type NodeJSErrnoException = Error & { code?: string };
43
46
  * @returns {boolean} True if re-encoding doesn't match original buffer (lossy), false otherwise.
44
47
  */
45
48
  function isUtf8DecodingLossy(originalBuffer: Buffer, decodedString: string): boolean {
46
- try {
47
- // Re-encode the decoded string back to a buffer using UTF-8
48
- const reEncodedBuffer = Buffer.from(decodedString, 'utf-8');
49
- // Compare the re-encoded buffer with the original buffer
50
- return !originalBuffer.equals(reEncodedBuffer);
51
- } catch (e) {
52
- // If an error occurs during re-encoding, it implies the original wasn't valid UTF-8
53
- return true;
54
- }
49
+ try {
50
+ // Re-encode the decoded string back to a buffer using UTF-8
51
+ const reEncodedBuffer = Buffer.from(decodedString, 'utf-8');
52
+ // Compare the re-encoded buffer with the original buffer
53
+ return !originalBuffer.equals(reEncodedBuffer);
54
+ } catch (e) {
55
+ // If an error occurs during re-encoding, it implies the original wasn't valid UTF-8
56
+ return true;
57
+ }
55
58
  }
56
59
 
57
60
  /**
@@ -62,92 +65,98 @@ function isUtf8DecodingLossy(originalBuffer: Buffer, decodedString: string): boo
62
65
  * @returns {string | undefined} The absolute base URL string ending in '/', or undefined if determination fails.
63
66
  */
64
67
  function determineBaseUrl(inputPathOrUrl: string, logger?: Logger): string | undefined {
65
- // Log the input for debugging purposes
66
- // console.log(`[DEBUG determineBaseUrl] Input: "${inputPathOrUrl}"`); // Keep debug log commented unless needed
67
- logger?.debug(`Determining base URL for input: ${inputPathOrUrl}`);
68
-
69
- // Handle invalid or empty input
70
- if (!inputPathOrUrl) {
71
- logger?.warn('Cannot determine base URL: inputPathOrUrl is empty or invalid.');
72
- return undefined;
68
+ // Log the input for debugging purposes
69
+ // console.log(`[DEBUG determineBaseUrl] Input: "${inputPathOrUrl}"`); // Keep debug log commented unless needed
70
+ logger?.debug(`Determining base URL for input: ${inputPathOrUrl}`);
71
+
72
+ // Handle invalid or empty input
73
+ if (!inputPathOrUrl) {
74
+ logger?.warn('Cannot determine base URL: inputPathOrUrl is empty or invalid.');
75
+ return undefined;
76
+ }
77
+
78
+ try {
79
+ // Handle non-file URLs (HTTP, HTTPS)
80
+ if (/^https?:\/\//i.test(inputPathOrUrl)) {
81
+ const url = new URL(inputPathOrUrl);
82
+ // Construct the base URL by taking the path up to the last '/'
83
+ url.pathname = url.pathname.substring(0, url.pathname.lastIndexOf('/') + 1);
84
+ url.search = ''; // Remove query parameters
85
+ url.hash = ''; // Remove fragments
86
+ const baseUrl = url.href;
87
+ logger?.debug(`Determined remote base URL: ${baseUrl}`);
88
+ // console.log(`[DEBUG determineBaseUrl] Determined Remote URL: "${baseUrl}"`); // Keep debug log commented unless needed
89
+ // Return the constructed base URL (usually ends in '/')
90
+ return baseUrl;
73
91
  }
74
-
75
- try {
76
- // Handle non-file URLs (HTTP, HTTPS)
77
- if (/^https?:\/\//i.test(inputPathOrUrl)) {
78
- const url = new URL(inputPathOrUrl);
79
- // Construct the base URL by taking the path up to the last '/'
80
- url.pathname = url.pathname.substring(0, url.pathname.lastIndexOf('/') + 1);
81
- url.search = ''; // Remove query parameters
82
- url.hash = ''; // Remove fragments
83
- const baseUrl = url.href;
84
- logger?.debug(`Determined remote base URL: ${baseUrl}`);
85
- // console.log(`[DEBUG determineBaseUrl] Determined Remote URL: "${baseUrl}"`); // Keep debug log commented unless needed
86
- // Return the constructed base URL (usually ends in '/')
87
- return baseUrl;
88
- }
89
- // Handle other protocols (warn and return undefined)
90
- else if (inputPathOrUrl.includes('://') && !inputPathOrUrl.startsWith('file:')) {
91
- logger?.warn(`Input "${inputPathOrUrl}" looks like a URL but uses an unsupported protocol. Cannot determine base URL.`);
92
- // console.log(`[DEBUG determineBaseUrl] Unsupported protocol.`); // Keep debug log commented unless needed
93
- return undefined;
94
- }
95
- // Handle file paths and file: URLs
96
- else {
97
- let resourcePath: string; // Path to the actual file or dir input
98
- let isInputLikelyDirectory = false;
99
-
100
- // Convert input to an absolute path
101
- if (inputPathOrUrl.startsWith('file:')) {
102
- // Convert file URL to path
103
- resourcePath = fileURLToPath(inputPathOrUrl);
104
- // file: URLs ending in / strongly suggest a directory
105
- isInputLikelyDirectory = inputPathOrUrl.endsWith('/');
106
- } else {
107
- // Resolve relative/absolute file paths
108
- resourcePath = path.resolve(inputPathOrUrl);
109
- // Check if the resolved path *actually* exists and is a directory
110
- try {
111
- // Use statSync carefully - assumes it's available and works (or mocked)
112
- isInputLikelyDirectory = fs.statSync(resourcePath).isDirectory();
113
- } catch {
114
- // If stat fails (ENOENT, EACCES), assume it refers to a file path
115
- isInputLikelyDirectory = false;
116
- }
117
- }
118
- // console.log(`[DEBUG determineBaseUrl] resourcePath: "${resourcePath}", isInputLikelyDirectory: ${isInputLikelyDirectory}`); // Keep debug log commented unless needed
119
-
120
- // The base directory is the directory containing the resourcePath,
121
- // OR resourcePath itself if it was identified as a directory.
122
- const baseDirPath = isInputLikelyDirectory ? resourcePath : path.dirname(resourcePath);
123
- // console.log(`[DEBUG determineBaseUrl] Calculated baseDirPath: "${baseDirPath}"`); // Keep debug log commented unless needed
124
-
125
- // Convert base directory path back to a file URL ending in '/'
126
- let normalizedPathForURL = baseDirPath.replace(/\\/g, '/'); // Use forward slashes for URL consistency
127
- // Ensure leading slash for Windows file URLs (e.g., /C:/...)
128
- if (/^[A-Z]:\//i.test(normalizedPathForURL) && !normalizedPathForURL.startsWith('/')) {
129
- normalizedPathForURL = '/' + normalizedPathForURL;
130
- }
131
- // Ensure trailing slash for the directory URL
132
- if (!normalizedPathForURL.endsWith('/')) {
133
- normalizedPathForURL += '/';
134
- }
135
-
136
- // Create the final file URL object and get its string representation
137
- const fileUrl = new URL('file://' + normalizedPathForURL);
138
- const fileUrlString = fileUrl.href;
139
-
140
- logger?.debug(`Determined base URL: ${fileUrlString} (from: ${inputPathOrUrl}, resolved base dir: ${baseDirPath})`);
141
- // console.log(`[DEBUG determineBaseUrl] Determined File URL: "${fileUrlString}"`); // Keep debug log commented unless needed
142
- return fileUrlString;
92
+ // Handle other protocols (warn and return undefined)
93
+ else if (inputPathOrUrl.includes('://') && !inputPathOrUrl.startsWith('file:')) {
94
+ logger?.warn(
95
+ `Input "${inputPathOrUrl}" looks like a URL but uses an unsupported protocol. Cannot determine base URL.`
96
+ );
97
+ // console.log(`[DEBUG determineBaseUrl] Unsupported protocol.`); // Keep debug log commented unless needed
98
+ return undefined;
99
+ }
100
+ // Handle file paths and file: URLs
101
+ else {
102
+ let resourcePath: string; // Path to the actual file or dir input
103
+ let isInputLikelyDirectory = false;
104
+
105
+ // Convert input to an absolute path
106
+ if (inputPathOrUrl.startsWith('file:')) {
107
+ // Convert file URL to path
108
+ resourcePath = fileURLToPath(inputPathOrUrl);
109
+ // file: URLs ending in / strongly suggest a directory
110
+ isInputLikelyDirectory = inputPathOrUrl.endsWith('/');
111
+ } else {
112
+ // Resolve relative/absolute file paths
113
+ resourcePath = path.resolve(inputPathOrUrl);
114
+ // Check if the resolved path *actually* exists and is a directory
115
+ try {
116
+ // Use statSync carefully - assumes it's available and works (or mocked)
117
+ isInputLikelyDirectory = fs.statSync(resourcePath).isDirectory();
118
+ } catch {
119
+ // If stat fails (ENOENT, EACCES), assume it refers to a file path
120
+ isInputLikelyDirectory = false;
143
121
  }
144
- } catch (error: unknown) {
145
- // Handle any errors during base URL determination
146
- const message = error instanceof Error ? error.message : String(error);
147
- // console.error(`[DEBUG determineBaseUrl] Error determining base URL: ${message}`); // Keep debug log commented unless needed
148
- logger?.error(`💀 Failed to determine base URL for "${inputPathOrUrl}": ${message}${error instanceof Error && error.stack ? ` - Stack: ${error.stack}` : ''}`);
149
- return undefined;
122
+ }
123
+ // console.log(`[DEBUG determineBaseUrl] resourcePath: "${resourcePath}", isInputLikelyDirectory: ${isInputLikelyDirectory}`); // Keep debug log commented unless needed
124
+
125
+ // The base directory is the directory containing the resourcePath,
126
+ // OR resourcePath itself if it was identified as a directory.
127
+ const baseDirPath = isInputLikelyDirectory ? resourcePath : path.dirname(resourcePath);
128
+ // console.log(`[DEBUG determineBaseUrl] Calculated baseDirPath: "${baseDirPath}"`); // Keep debug log commented unless needed
129
+
130
+ // Convert base directory path back to a file URL ending in '/'
131
+ let normalizedPathForURL = baseDirPath.replace(/\\/g, '/'); // Use forward slashes for URL consistency
132
+ // Ensure leading slash for Windows file URLs (e.g., /C:/...)
133
+ if (/^[A-Z]:\//i.test(normalizedPathForURL) && !normalizedPathForURL.startsWith('/')) {
134
+ normalizedPathForURL = '/' + normalizedPathForURL;
135
+ }
136
+ // Ensure trailing slash for the directory URL
137
+ if (!normalizedPathForURL.endsWith('/')) {
138
+ normalizedPathForURL += '/';
139
+ }
140
+
141
+ // Create the final file URL object and get its string representation
142
+ const fileUrl = new URL('file://' + normalizedPathForURL);
143
+ const fileUrlString = fileUrl.href;
144
+
145
+ logger?.debug(
146
+ `Determined base URL: ${fileUrlString} (from: ${inputPathOrUrl}, resolved base dir: ${baseDirPath})`
147
+ );
148
+ // console.log(`[DEBUG determineBaseUrl] Determined File URL: "${fileUrlString}"`); // Keep debug log commented unless needed
149
+ return fileUrlString;
150
150
  }
151
+ } catch (error: unknown) {
152
+ // Handle any errors during base URL determination
153
+ const message = error instanceof Error ? error.message : String(error);
154
+ // console.error(`[DEBUG determineBaseUrl] Error determining base URL: ${message}`); // Keep debug log commented unless needed
155
+ logger?.error(
156
+ `💀 Failed to determine base URL for "${inputPathOrUrl}": ${message}${error instanceof Error && error.stack ? ` - Stack: ${error.stack}` : ''}`
157
+ );
158
+ return undefined;
159
+ }
151
160
  }
152
161
 
153
162
  /**
@@ -159,53 +168,59 @@ function determineBaseUrl(inputPathOrUrl: string, logger?: Logger): string | und
159
168
  * @returns {URL | null} A validated, absolute URL object, or null if invalid/ignorable.
160
169
  */
161
170
  function resolveAssetUrl(assetUrl: string, baseContextUrl?: string, logger?: Logger): URL | null {
162
- // Trim whitespace from the URL
163
- const trimmedUrl = assetUrl?.trim();
171
+ // Trim whitespace from the URL
172
+ const trimmedUrl = assetUrl?.trim();
164
173
 
165
- // Ignore empty URLs, data URIs, or fragment-only URLs
166
- if (!trimmedUrl || trimmedUrl.startsWith('data:') || trimmedUrl.startsWith('#')) {
167
- return null;
168
- }
174
+ // Ignore empty URLs, data URIs, or fragment-only URLs
175
+ if (!trimmedUrl || trimmedUrl.startsWith('data:') || trimmedUrl.startsWith('#')) {
176
+ return null;
177
+ }
169
178
 
170
- let resolvableUrl = trimmedUrl;
179
+ let resolvableUrl = trimmedUrl;
171
180
 
172
- // Handle protocol-relative URLs (e.g., //example.com/image.png)
173
- if (resolvableUrl.startsWith('//') && baseContextUrl) {
174
- try {
175
- // Prepend the protocol from the base context URL
176
- const base = new URL(baseContextUrl);
177
- resolvableUrl = base.protocol + resolvableUrl;
178
- } catch (e) {
179
- // Log a warning if the base protocol cannot be determined
180
- logger?.warn(`Could not extract protocol from base "${baseContextUrl}" for protocol-relative URL "${trimmedUrl}". Skipping.`);
181
- return null;
182
- }
181
+ // Handle protocol-relative URLs (e.g., //example.com/image.png)
182
+ if (resolvableUrl.startsWith('//') && baseContextUrl) {
183
+ try {
184
+ // Prepend the protocol from the base context URL
185
+ const base = new URL(baseContextUrl);
186
+ resolvableUrl = base.protocol + resolvableUrl;
187
+ } catch (e) {
188
+ // Log a warning if the base protocol cannot be determined
189
+ logger?.warn(
190
+ `Could not extract protocol from base "${baseContextUrl}" for protocol-relative URL "${trimmedUrl}". Skipping.`
191
+ );
192
+ return null;
183
193
  }
194
+ }
184
195
 
185
- try {
186
- // Use URL constructor for resolution. Handles absolute paths, relative paths, ../ etc.
187
- const resolved = new URL(resolvableUrl, baseContextUrl);
196
+ try {
197
+ // Use URL constructor for resolution. Handles absolute paths, relative paths, ../ etc.
198
+ const resolved = new URL(resolvableUrl, baseContextUrl);
188
199
 
189
- // Skip assets with unsupported protocols (e.g., mailto:, ws:)
190
- if (!['http:', 'https:', 'file:'].includes(resolved.protocol)) {
191
- logger?.debug(`Skipping asset with unsupported protocol: ${resolved.href}`);
192
- return null;
193
- }
194
- // Return the resolved URL object
195
- return resolved;
196
- } catch (error: unknown) {
197
- // Log errors during URL parsing/resolution
198
- const message = error instanceof Error ? error.message : String(error);
199
- // Avoid redundant warnings for relative paths when no base context was provided (expected failure)
200
- if (!/^[a-z]+:/i.test(resolvableUrl) && !resolvableUrl.startsWith('/') && !baseContextUrl) {
201
- logger?.warn(`Cannot resolve relative URL "${resolvableUrl}" - Base context URL was not provided or determined.`);
202
- } else {
203
- // Log other resolution failures
204
- logger?.warn(`⚠️ Failed to parse/resolve URL "${resolvableUrl}" ${baseContextUrl ? 'against base "' + baseContextUrl + '"' : '(no base provided)'}: ${message}`);
205
- }
206
- // Return null if resolution fails
207
- return null;
200
+ // Skip assets with unsupported protocols (e.g., mailto:, ws:)
201
+ if (!['http:', 'https:', 'file:'].includes(resolved.protocol)) {
202
+ logger?.debug(`Skipping asset with unsupported protocol: ${resolved.href}`);
203
+ return null;
208
204
  }
205
+ // Return the resolved URL object
206
+ return resolved;
207
+ } catch (error: unknown) {
208
+ // Log errors during URL parsing/resolution
209
+ const message = error instanceof Error ? error.message : String(error);
210
+ // Avoid redundant warnings for relative paths when no base context was provided (expected failure)
211
+ if (!/^[a-z]+:/i.test(resolvableUrl) && !resolvableUrl.startsWith('/') && !baseContextUrl) {
212
+ logger?.warn(
213
+ `Cannot resolve relative URL "${resolvableUrl}" - Base context URL was not provided or determined.`
214
+ );
215
+ } else {
216
+ // Log other resolution failures
217
+ logger?.warn(
218
+ `⚠️ Failed to parse/resolve URL "${resolvableUrl}" ${baseContextUrl ? 'against base "' + baseContextUrl + '"' : '(no base provided)'}: ${message}`
219
+ );
220
+ }
221
+ // Return null if resolution fails
222
+ return null;
223
+ }
209
224
  }
210
225
 
211
226
  /**
@@ -217,36 +232,34 @@ function resolveAssetUrl(assetUrl: string, baseContextUrl?: string, logger?: Log
217
232
  * @returns {string | null} The resolved absolute URL string, or null if resolution fails/invalid.
218
233
  */
219
234
  function resolveCssRelativeUrl(
220
- relativeUrl: string,
221
- cssBaseContextUrl: string, // e.g., file:///C:/mock/base/dir/css/deep.css or https://.../style.css
222
- logger?: Logger
235
+ relativeUrl: string,
236
+ cssBaseContextUrl: string, // e.g., file:///C:/mock/base/dir/css/deep.css or https://.../style.css
237
+ logger?: Logger
223
238
  ): string | null {
224
- // console.log(`[DEBUG resolveCssRelativeUrl] Input: relative="${relativeUrl}", base="${cssBaseContextUrl}"`); // Keep debug log commented unless needed
225
-
226
- // Ignore empty, data URIs, or fragments
227
- if (!relativeUrl || relativeUrl.startsWith('data:') || relativeUrl.startsWith('#')) {
228
- return null;
229
- }
230
-
231
- try {
232
- // Use the URL constructor which correctly handles relative paths including ../
233
- // relative to the base URL provided (the CSS file's URL).
234
- const resolvedUrl = new URL(relativeUrl, cssBaseContextUrl);
235
- // console.log(`[DEBUG resolveCssRelativeUrl] Resolved URL object href: "${resolvedUrl.href}"`); // Keep debug log commented unless needed
236
- // Return the resolved absolute URL string
237
- return resolvedUrl.href;
238
-
239
- } catch (error) {
240
- // Log warning if URL resolution fails
241
- logger?.warn(
242
- `Failed to resolve CSS URL: "${relativeUrl}" relative to "${cssBaseContextUrl}": ${String(error)}`
243
- );
244
- // console.error(`[DEBUG resolveCssRelativeUrl] Error resolving: ${String(error)}`); // Keep debug log commented unless needed
245
- return null;
246
- }
239
+ // console.log(`[DEBUG resolveCssRelativeUrl] Input: relative="${relativeUrl}", base="${cssBaseContextUrl}"`); // Keep debug log commented unless needed
240
+
241
+ // Ignore empty, data URIs, or fragments
242
+ if (!relativeUrl || relativeUrl.startsWith('data:') || relativeUrl.startsWith('#')) {
243
+ return null;
244
+ }
245
+
246
+ try {
247
+ // Use the URL constructor which correctly handles relative paths including ../
248
+ // relative to the base URL provided (the CSS file's URL).
249
+ const resolvedUrl = new URL(relativeUrl, cssBaseContextUrl);
250
+ // console.log(`[DEBUG resolveCssRelativeUrl] Resolved URL object href: "${resolvedUrl.href}"`); // Keep debug log commented unless needed
251
+ // Return the resolved absolute URL string
252
+ return resolvedUrl.href;
253
+ } catch (error) {
254
+ // Log warning if URL resolution fails
255
+ logger?.warn(
256
+ `Failed to resolve CSS URL: "${relativeUrl}" relative to "${cssBaseContextUrl}": ${String(error)}`
257
+ );
258
+ // console.error(`[DEBUG resolveCssRelativeUrl] Error resolving: ${String(error)}`); // Keep debug log commented unless needed
259
+ return null;
260
+ }
247
261
  }
248
262
 
249
-
250
263
  /**
251
264
  * Asynchronously fetches the content of a resolved asset URL (http, https, file).
252
265
  * @async
@@ -255,94 +268,103 @@ function resolveCssRelativeUrl(
255
268
  * @param {number} [timeout=10000] - Network timeout in milliseconds for HTTP(S) requests.
256
269
  * @returns {Promise<Buffer | null>} Asset content as a Buffer, or null on failure.
257
270
  */
258
- async function fetchAsset(resolvedUrl: URL, logger?: Logger, timeout: number = 10000): Promise<Buffer | null> {
259
- // console.log(`[DEBUG fetchAsset] Attempting fetch for URL: ${resolvedUrl.href}`); // Keep debug log commented unless needed
260
- logger?.debug(`Attempting to fetch asset: ${resolvedUrl.href}`);
261
- const protocol = resolvedUrl.protocol;
262
-
263
- try {
264
- // Handle HTTP and HTTPS protocols
265
- if (protocol === 'http:' || protocol === 'https:') {
266
- // Use axios to fetch remote content as an ArrayBuffer
267
- const response: AxiosResponse<ArrayBuffer> = await axiosNs.default.get(resolvedUrl.href, {
268
- responseType: 'arraybuffer', // Fetch as binary data
269
- timeout: timeout, // Apply network timeout
270
- });
271
- logger?.debug(`Workspaceed remote asset ${resolvedUrl.href} (Status: ${response.status}, Type: ${response.headers['content-type'] || 'N/A'}, Size: ${response.data?.byteLength ?? 0} bytes)`);
272
- // console.log(`[DEBUG fetchAsset] HTTP fetch SUCCESS for: ${resolvedUrl.href}, Status: ${response.status}`); // Keep debug log commented unless needed
273
- // Return the fetched data as a Node.js Buffer
274
- return Buffer.from(response.data);
275
- }
276
- // Handle file protocol
277
- else if (protocol === 'file:') {
278
- let filePath: string;
279
- try {
280
- // Convert file URL to a system file path
281
- // IMPORTANT: This strips query params and fragments from the URL
282
- filePath = fileURLToPath(resolvedUrl);
283
- } catch (e: any) {
284
- // console.error(`[DEBUG fetchAsset] fileURLToPath FAILED for: ${resolvedUrl.href}`, e); // Keep debug log commented unless needed
285
- logger?.error(`Could not convert file URL to path: ${resolvedUrl.href}. Error: ${e.message}`);
286
- return null; // Return null if conversion fails
287
- }
288
-
289
- const normalizedForLog = path.normalize(filePath);
290
- // console.log(`[DEBUG fetchAsset] Attempting readFile with path: "${normalizedForLog}" (Original from URL: "${filePath}")`); // Keep debug log commented unless needed
271
+ async function fetchAsset(
272
+ resolvedUrl: URL,
273
+ logger?: Logger,
274
+ timeout: number = 10000
275
+ ): Promise<Buffer | null> {
276
+ // console.log(`[DEBUG fetchAsset] Attempting fetch for URL: ${resolvedUrl.href}`); // Keep debug log commented unless needed
277
+ logger?.debug(`Attempting to fetch asset: ${resolvedUrl.href}`);
278
+ const protocol = resolvedUrl.protocol;
279
+
280
+ try {
281
+ // Handle HTTP and HTTPS protocols
282
+ if (protocol === 'http:' || protocol === 'https:') {
283
+ // Use axios to fetch remote content as an ArrayBuffer
284
+ const response: AxiosResponse<ArrayBuffer> = await axiosNs.default.get(resolvedUrl.href, {
285
+ responseType: 'arraybuffer', // Fetch as binary data
286
+ timeout: timeout, // Apply network timeout
287
+ });
288
+ logger?.debug(
289
+ `Workspaceed remote asset ${resolvedUrl.href} (Status: ${response.status}, Type: ${response.headers['content-type'] || 'N/A'}, Size: ${response.data?.byteLength ?? 0} bytes)`
290
+ );
291
+ // Return the fetched data as a Node.js Buffer
292
+ return Buffer.from(response.data);
293
+ }
294
+ // Handle file protocol
295
+ else if (protocol === 'file:') {
296
+ let filePath: string;
297
+ try {
298
+ // Convert file URL to a system file path
299
+ // IMPORTANT: This strips query params and fragments from the URL
300
+ filePath = fileURLToPath(resolvedUrl);
301
+ } catch (e: any) {
302
+ logger?.error(
303
+ `Could not convert file URL to path: ${resolvedUrl.href}. Error: ${e.message}`
304
+ );
305
+ return null; // Return null if conversion fails
306
+ }
291
307
 
292
- // Read file content using fs/promises
293
- const data = await readFile(filePath); // This call uses the mock in tests
308
+ const normalizedForLog = path.normalize(filePath);
294
309
 
295
- // console.log(`[DEBUG fetchAsset] readFile call SUCCEEDED for path: "${normalizedForLog}". Data length: ${data?.byteLength}`); // Keep debug log commented unless needed
296
- logger?.debug(`Read local file ${filePath} (${data.byteLength} bytes)`);
297
- // Return the file content as a Buffer
298
- return data;
299
- }
300
- // Handle unsupported protocols
301
- else {
302
- // console.log(`[DEBUG fetchAsset] Unsupported protocol: ${protocol}`); // Keep debug log commented unless needed
303
- logger?.warn(`Unsupported protocol "${protocol}" in URL: ${resolvedUrl.href}`);
304
- return null;
305
- }
306
- } catch (error: unknown) {
307
- // --- Handle Errors During Fetch/Read ---
308
- const failedId = protocol === 'file:' ? path.normalize(fileURLToPath(resolvedUrl)) : resolvedUrl.href;
309
- // console.error(`[DEBUG fetchAsset] CAUGHT Error for ${failedId}. Type: ${Object.prototype.toString.call(error)}, Constructor: ${error?.constructor?.name}, isAxiosError property: ${(error as any)?.isAxiosError}, Code: ${(error as any)?.code}`); // Keep for debugging if needed
310
-
311
- // *** FIXED LOGIC: Check for AxiosError using its property *before* generic instanceof Error ***
312
- if ((protocol === 'http:' || protocol === 'https:') && (error as any)?.isAxiosError === true) {
313
- const axiosError = error as AxiosError; // Cast for easier property access
314
- const status = axiosError.response?.status ?? 'N/A';
315
- const code = axiosError.code ?? 'N/A'; // e.g., ECONNABORTED for timeout
316
- // Use the specific log format
317
- const logMessage = `⚠️ Failed to fetch remote asset ${resolvedUrl.href}: ${axiosError.message} (Code: ${code})`;
318
- logger?.warn(logMessage);
319
- }
320
- // Check for file system errors *next*
321
- else if (protocol === 'file:' && error instanceof Error) {
322
- let failedPath = resolvedUrl.href;
323
- try { failedPath = fileURLToPath(resolvedUrl); } catch { /* ignore */ }
324
- failedPath = path.normalize(failedPath);
325
-
326
- if ((error as NodeJSErrnoException).code === 'ENOENT') {
327
- logger?.warn(`⚠️ File not found (ENOENT) for asset: ${failedPath}.`);
328
- } else if ((error as NodeJSErrnoException).code === 'EACCES') {
329
- // Log ONLY the specific EACCES message
330
- logger?.warn(`⚠️ Permission denied (EACCES) reading asset: ${failedPath}.`);
331
- } else {
332
- logger?.warn(`⚠️ Failed to read local asset ${failedPath}: ${error.message}`);
333
- }
334
- }
335
- // Generic fallback for *other* types of Errors (that are not Axios or known FS errors)
336
- else if (error instanceof Error) {
337
- logger?.warn(`⚠️ An unexpected error occurred processing asset ${resolvedUrl.href}: ${error.message}`);
338
- }
339
- // Fallback for non-Error throws (e.g., strings, numbers)
340
- else {
341
- logger?.warn(`⚠️ An unknown and unexpected error occurred processing asset ${resolvedUrl.href}: ${String(error)}`);
342
- }
343
- // Return null on ANY error
344
- return null;
310
+ // Read file content using fs/promises
311
+ const data = await readFile(filePath); // This call uses the mock in tests
312
+ logger?.debug(`Read local file ${filePath} (${data.byteLength} bytes)`);
313
+ // Return the file content as a Buffer
314
+ return data;
315
+ }
316
+ // Handle unsupported protocols
317
+ else {
318
+ // console.log(`[DEBUG fetchAsset] Unsupported protocol: ${protocol}`); // Keep debug log commented unless needed
319
+ logger?.warn(`Unsupported protocol "${protocol}" in URL: ${resolvedUrl.href}`);
320
+ return null;
321
+ }
322
+ } catch (error: unknown) {
323
+ // --- Handle Errors During Fetch/Read ---
324
+ const failedId =
325
+ protocol === 'file:' ? path.normalize(fileURLToPath(resolvedUrl)) : resolvedUrl.href;
326
+ if ((protocol === 'http:' || protocol === 'https:') && (error as any)?.isAxiosError === true) {
327
+ const axiosError = error as AxiosError; // Cast for easier property access
328
+ const status = axiosError.response?.status ?? 'N/A';
329
+ const code = axiosError.code ?? 'N/A'; // e.g., ECONNABORTED for timeout
330
+ // Use the specific log format
331
+ const logMessage = `⚠️ Failed to fetch remote asset ${resolvedUrl.href}: ${axiosError.message} (Code: ${code})`;
332
+ logger?.warn(logMessage);
333
+ }
334
+ // Check for file system errors *next*
335
+ else if (protocol === 'file:' && error instanceof Error) {
336
+ let failedPath = resolvedUrl.href;
337
+ try {
338
+ failedPath = fileURLToPath(resolvedUrl);
339
+ } catch {
340
+ /* ignore */
341
+ }
342
+ failedPath = path.normalize(failedPath);
343
+
344
+ if ((error as NodeJSErrnoException).code === 'ENOENT') {
345
+ logger?.warn(`⚠️ File not found (ENOENT) for asset: ${failedPath}.`);
346
+ } else if ((error as NodeJSErrnoException).code === 'EACCES') {
347
+ // Log ONLY the specific EACCES message
348
+ logger?.warn(`⚠️ Permission denied (EACCES) reading asset: ${failedPath}.`);
349
+ } else {
350
+ logger?.warn(`⚠️ Failed to read local asset ${failedPath}: ${error.message}`);
351
+ }
352
+ }
353
+ // Generic fallback for *other* types of Errors (that are not Axios or known FS errors)
354
+ else if (error instanceof Error) {
355
+ logger?.warn(
356
+ `⚠️ An unexpected error occurred processing asset ${resolvedUrl.href}: ${error.message}`
357
+ );
345
358
  }
359
+ // Fallback for non-Error throws (e.g., strings, numbers)
360
+ else {
361
+ logger?.warn(
362
+ `⚠️ An unknown and unexpected error occurred processing asset ${resolvedUrl.href}: ${String(error)}`
363
+ );
364
+ }
365
+ // Return null on ANY error
366
+ return null;
367
+ }
346
368
  }
347
369
 
348
370
  /**
@@ -354,62 +376,65 @@ async function fetchAsset(resolvedUrl: URL, logger?: Logger, timeout: number = 1
354
376
  * @returns {Asset[]} An array of newly discovered Asset objects (type, resolved URL, content initially undefined).
355
377
  */
356
378
  function extractUrlsFromCSS(
357
- cssContent: string,
358
- cssBaseContextUrl: string,
359
- logger?: Logger
379
+ cssContent: string,
380
+ cssBaseContextUrl: string,
381
+ logger?: Logger
360
382
  ): Asset[] {
361
- // Array to hold assets discovered within this CSS content
362
- const newlyDiscovered: Asset[] = [];
363
- // Set to track URLs processed within this specific CSS file to avoid adding duplicates from the same file
364
- const processedInThisParse = new Set<string>();
365
-
366
- // Regex for url(...) patterns, handling optional quotes (non-greedy match for URL)
367
- const urlRegex = /url\(\s*(['"]?)(.*?)\1\s*\)/gi;
368
- // Regex for @import rules, handling url() or bare string, optional quotes (non-greedy match for URL)
369
- const importRegex = /@import\s+(?:url\(\s*(['"]?)(.*?)\1\s*\)|(['"])(.*?)\3)\s*;/gi;
370
-
371
- /** Internal helper to process a found URL string */
372
- const processFoundUrl = (rawUrl: string | undefined, ruleType: '@import' | 'url()') => {
373
- // Skip if URL is empty, undefined, a data URI, or only a fragment
374
- if (!rawUrl || rawUrl.trim() === '' || rawUrl.startsWith('data:') || rawUrl.startsWith('#')) return;
375
-
376
- // Resolve the potentially relative URL against the CSS file's base URL
377
- const resolvedUrl = resolveCssRelativeUrl(rawUrl, cssBaseContextUrl, logger);
378
-
379
- // If successfully resolved and not already found *in this specific CSS file*
380
- if (resolvedUrl && !processedInThisParse.has(resolvedUrl)) {
381
- // Mark this resolved URL as processed for this CSS file
382
- processedInThisParse.add(resolvedUrl);
383
- // Guess the asset type (css, image, font, etc.) based on the resolved URL
384
- const { assetType } = guessMimeType(resolvedUrl);
385
-
386
- // Add the discovered asset to the list for this CSS file
387
- newlyDiscovered.push({
388
- type: assetType,
389
- url: resolvedUrl, // Store the resolved absolute URL string
390
- content: undefined // Content will be fetched later if needed
391
- });
392
- logger?.debug(`Discovered nested ${assetType} asset (${ruleType}) in CSS ${cssBaseContextUrl}: ${resolvedUrl}`);
393
- }
394
- };
395
-
396
- // Find all url(...) matches in the CSS content
397
- let match;
398
- while ((match = urlRegex.exec(cssContent)) !== null) {
399
- // Group 2 captures the URL part inside url()
400
- processFoundUrl(match[2], 'url()');
383
+ // Array to hold assets discovered within this CSS content
384
+ const newlyDiscovered: Asset[] = [];
385
+ // Set to track URLs processed within this specific CSS file to avoid adding duplicates from the same file
386
+ const processedInThisParse = new Set<string>();
387
+
388
+ // Regex for url(...) patterns, handling optional quotes (non-greedy match for URL)
389
+ const urlRegex = /url\(\s*(['"]?)(.*?)\1\s*\)/gi;
390
+ // Regex for @import rules, handling url() or bare string, optional quotes (non-greedy match for URL)
391
+ const importRegex = /@import\s+(?:url\(\s*(['"]?)(.*?)\1\s*\)|(['"])(.*?)\3)\s*;/gi;
392
+
393
+ /** Internal helper to process a found URL string */
394
+ const processFoundUrl = (rawUrl: string | undefined, ruleType: '@import' | 'url()') => {
395
+ // Skip if URL is empty, undefined, a data URI, or only a fragment
396
+ if (!rawUrl || rawUrl.trim() === '' || rawUrl.startsWith('data:') || rawUrl.startsWith('#'))
397
+ return;
398
+
399
+ // Resolve the potentially relative URL against the CSS file's base URL
400
+ const resolvedUrl = resolveCssRelativeUrl(rawUrl, cssBaseContextUrl, logger);
401
+
402
+ // If successfully resolved and not already found *in this specific CSS file*
403
+ if (resolvedUrl && !processedInThisParse.has(resolvedUrl)) {
404
+ // Mark this resolved URL as processed for this CSS file
405
+ processedInThisParse.add(resolvedUrl);
406
+ // Guess the asset type (css, image, font, etc.) based on the resolved URL
407
+ const { assetType } = guessMimeType(resolvedUrl);
408
+
409
+ // Add the discovered asset to the list for this CSS file
410
+ newlyDiscovered.push({
411
+ type: assetType,
412
+ url: resolvedUrl, // Store the resolved absolute URL string
413
+ content: undefined, // Content will be fetched later if needed
414
+ });
415
+ logger?.debug(
416
+ `Discovered nested ${assetType} asset (${ruleType}) in CSS ${cssBaseContextUrl}: ${resolvedUrl}`
417
+ );
401
418
  }
402
-
403
- // Find all @import matches in the CSS content
404
- // Reset lastIndex as we're reusing the regex object implicitly
405
- importRegex.lastIndex = 0;
406
- while ((match = importRegex.exec(cssContent)) !== null) {
407
- // Group 2 captures url('...'), Group 4 captures bare "..."
408
- processFoundUrl(match[2] || match[4], '@import');
409
- }
410
-
411
- // Return the list of assets discovered within this CSS content
412
- return newlyDiscovered;
419
+ };
420
+
421
+ // Find all url(...) matches in the CSS content
422
+ let match;
423
+ while ((match = urlRegex.exec(cssContent)) !== null) {
424
+ // Group 2 captures the URL part inside url()
425
+ processFoundUrl(match[2], 'url()');
426
+ }
427
+
428
+ // Find all @import matches in the CSS content
429
+ // Reset lastIndex as we're reusing the regex object implicitly
430
+ importRegex.lastIndex = 0;
431
+ while ((match = importRegex.exec(cssContent)) !== null) {
432
+ // Group 2 captures url('...'), Group 4 captures bare "..."
433
+ processFoundUrl(match[2] || match[4], '@import');
434
+ }
435
+
436
+ // Return the list of assets discovered within this CSS content
437
+ return newlyDiscovered;
413
438
  }
414
439
 
415
440
  /**
@@ -427,279 +452,321 @@ function extractUrlsFromCSS(
427
452
  * @returns {Promise<ParsedHTML>} Processed data with `htmlContent` and the final `assets` array containing all discovered assets (with content if `embedAssets` was true and fetch succeeded).
428
453
  */
429
454
  export async function extractAssets(
430
- parsed: ParsedHTML,
431
- embedAssets = true,
432
- inputPathOrUrl?: string,
433
- logger?: Logger
455
+ parsed: ParsedHTML,
456
+ embedAssets = true,
457
+ inputPathOrUrl?: string,
458
+ logger?: Logger
434
459
  ): Promise<ParsedHTML> {
435
- logger?.info(`🚀 Starting asset extraction! Embed: ${embedAssets}. Input: ${inputPathOrUrl || '(HTML content only)'}`);
436
-
437
- // Get the initial list of assets found directly in the HTML
438
- const initialAssets: Asset[] = parsed.assets || [];
439
- // Stores the final result: Map<resolved URL string, Asset object> to ensure uniqueness
440
- const finalAssetsMap = new Map<string, Asset>();
441
- // Queue holds assets whose content needs to be processed (fetched/analyzed)
442
- let assetsToProcess: Asset[] = [];
443
- // Set to track URLs that are either already fully processed (in finalAssetsMap)
444
- // OR currently in the processing queue (assetsToProcess) to prevent reprocessing/loops.
445
- const processedOrQueuedUrls = new Set<string>();
446
-
447
- // --- Determine Base URL Context for the HTML ---
448
- const htmlBaseContextUrl = determineBaseUrl(inputPathOrUrl || '', logger);
449
- // Warn if no base URL could be found and there are relative paths in the initial assets
450
- if (!htmlBaseContextUrl && initialAssets.some(a => !/^[a-z]+:/i.test(a.url) && !a.url.startsWith('data:') && !a.url.startsWith('#') && !a.url.startsWith('/'))) {
451
- logger?.warn("🚨 No valid base path/URL determined for the HTML source! Resolution of relative asset paths from HTML may fail.");
452
- } else if (htmlBaseContextUrl) {
453
- logger?.debug(`Using HTML base context URL: ${htmlBaseContextUrl}`);
460
+ logger?.info(
461
+ `🚀 Starting asset extraction! Embed: ${embedAssets}. Input: ${inputPathOrUrl || '(HTML content only)'}`
462
+ );
463
+
464
+ // Get the initial list of assets found directly in the HTML
465
+ const initialAssets: Asset[] = parsed.assets || [];
466
+ // Stores the final result: Map<resolved URL string, Asset object> to ensure uniqueness
467
+ const finalAssetsMap = new Map<string, Asset>();
468
+ // Queue holds assets whose content needs to be processed (fetched/analyzed)
469
+ let assetsToProcess: Asset[] = [];
470
+ // Set to track URLs that are either already fully processed (in finalAssetsMap)
471
+ // OR currently in the processing queue (assetsToProcess) to prevent reprocessing/loops.
472
+ const processedOrQueuedUrls = new Set<string>();
473
+
474
+ // --- Determine Base URL Context for the HTML ---
475
+ const htmlBaseContextUrl = determineBaseUrl(inputPathOrUrl || '', logger);
476
+ // Warn if no base URL could be found and there are relative paths in the initial assets
477
+ if (
478
+ !htmlBaseContextUrl &&
479
+ initialAssets.some(
480
+ a =>
481
+ !/^[a-z]+:/i.test(a.url) &&
482
+ !a.url.startsWith('data:') &&
483
+ !a.url.startsWith('#') &&
484
+ !a.url.startsWith('/')
485
+ )
486
+ ) {
487
+ logger?.warn(
488
+ '🚨 No valid base path/URL determined for the HTML source! Resolution of relative asset paths from HTML may fail.'
489
+ );
490
+ } else if (htmlBaseContextUrl) {
491
+ logger?.debug(`Using HTML base context URL: ${htmlBaseContextUrl}`);
492
+ }
493
+
494
+ // --- Initial Queue Population from HTML assets ---
495
+ logger?.debug(`Queueing ${initialAssets.length} initial assets parsed from HTML...`);
496
+ for (const asset of initialAssets) {
497
+ // Resolve the initial asset URL against the HTML base context
498
+ const resolvedUrlObj = resolveAssetUrl(asset.url, htmlBaseContextUrl, logger);
499
+
500
+ // Skip if URL is invalid, data URI, fragment, or unsupported protocol
501
+ if (!resolvedUrlObj) {
502
+ logger?.debug(` -> Skipping initial asset with unresolvable/ignorable URL: ${asset.url}`);
503
+ continue;
454
504
  }
455
-
456
- // --- Initial Queue Population from HTML assets ---
457
- logger?.debug(`Queueing ${initialAssets.length} initial assets parsed from HTML...`);
458
- for (const asset of initialAssets) {
459
- // Resolve the initial asset URL against the HTML base context
460
- const resolvedUrlObj = resolveAssetUrl(asset.url, htmlBaseContextUrl, logger);
461
-
462
- // Skip if URL is invalid, data URI, fragment, or unsupported protocol
463
- if (!resolvedUrlObj) {
464
- logger?.debug(` -> Skipping initial asset with unresolvable/ignorable URL: ${asset.url}`);
465
- continue;
466
- }
467
- // Get the resolved absolute URL string
468
- const urlToQueue = resolvedUrlObj.href;
469
-
470
- // Check if this URL is already tracked (processed or queued)
471
- if (!processedOrQueuedUrls.has(urlToQueue)) {
472
- // Mark as queued (add to set *before* adding to array)
473
- processedOrQueuedUrls.add(urlToQueue);
474
-
475
- // Guess type from the resolved/original URL if not provided initially
476
- const { assetType: guessedType } = guessMimeType(urlToQueue);
477
- const initialType = asset.type ?? guessedType; // Use provided type or fallback to guessed type
478
-
479
- // Add the resolved asset to the processing queue
480
- assetsToProcess.push({
481
- url: urlToQueue, // Use the resolved URL
482
- type: initialType,
483
- content: undefined // Content is initially undefined
484
- });
485
- logger?.debug(` -> Queued initial asset: ${urlToQueue} (Original raw: ${asset.url})`);
486
- } else {
487
- logger?.debug(` -> Skipping already processed/queued initial asset: ${urlToQueue}`);
505
+ // Get the resolved absolute URL string
506
+ const urlToQueue = resolvedUrlObj.href;
507
+
508
+ // Check if this URL is already tracked (processed or queued)
509
+ if (!processedOrQueuedUrls.has(urlToQueue)) {
510
+ // Mark as queued (add to set *before* adding to array)
511
+ processedOrQueuedUrls.add(urlToQueue);
512
+
513
+ // Guess type from the resolved/original URL if not provided initially
514
+ const { assetType: guessedType } = guessMimeType(urlToQueue);
515
+ const initialType = asset.type ?? guessedType; // Use provided type or fallback to guessed type
516
+
517
+ // Add the resolved asset to the processing queue
518
+ assetsToProcess.push({
519
+ url: urlToQueue, // Use the resolved URL
520
+ type: initialType,
521
+ content: undefined, // Content is initially undefined
522
+ });
523
+ logger?.debug(` -> Queued initial asset: ${urlToQueue} (Original raw: ${asset.url})`);
524
+ } else {
525
+ logger?.debug(` -> Skipping already processed/queued initial asset: ${urlToQueue}`);
526
+ }
527
+ }
528
+
529
+ // --- Main processing loop (continues as long as there are assets to process) ---
530
+ let iterationCount = 0;
531
+ while (assetsToProcess.length > 0) {
532
+ iterationCount++;
533
+ // Prevent potential infinite loops
534
+ if (iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS) {
535
+ logger?.error(
536
+ `🛑 Asset extraction loop limit hit (${MAX_ASSET_EXTRACTION_ITERATIONS})! Aborting.`
537
+ );
538
+ const remainingUrls = assetsToProcess
539
+ .map(a => a.url)
540
+ .slice(0, 10)
541
+ .join(', ');
542
+ logger?.error(
543
+ `Remaining queue sample (${assetsToProcess.length} items): ${remainingUrls}...`
544
+ );
545
+ // Add assets remaining in queue to final map without content before breaking
546
+ assetsToProcess.forEach(asset => {
547
+ if (!finalAssetsMap.has(asset.url)) {
548
+ finalAssetsMap.set(asset.url, { ...asset, content: undefined });
488
549
  }
550
+ });
551
+ assetsToProcess = []; // Clear queue to stop the loop
552
+ break; // Exit loop
489
553
  }
490
554
 
491
- // --- Main processing loop (continues as long as there are assets to process) ---
492
- let iterationCount = 0;
493
- while (assetsToProcess.length > 0) {
494
- iterationCount++;
495
- // Prevent potential infinite loops
496
- if (iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS) {
497
- logger?.error(`🛑 Asset extraction loop limit hit (${MAX_ASSET_EXTRACTION_ITERATIONS})! Aborting.`);
498
- const remainingUrls = assetsToProcess.map(a => a.url).slice(0, 10).join(', ');
499
- logger?.error(`Remaining queue sample (${assetsToProcess.length} items): ${remainingUrls}...`);
500
- // Add assets remaining in queue to final map without content before breaking
501
- assetsToProcess.forEach(asset => {
502
- if (!finalAssetsMap.has(asset.url)) {
503
- finalAssetsMap.set(asset.url, { ...asset, content: undefined });
504
- }
505
- });
506
- assetsToProcess = []; // Clear queue to stop the loop
507
- break; // Exit loop
508
- }
555
+ // Take a snapshot of the current queue to process in this iteration
556
+ const currentBatch = [...assetsToProcess];
557
+ // Clear the main queue; new assets found in this batch will be added here for the *next* iteration
558
+ assetsToProcess = [];
509
559
 
510
- // Take a snapshot of the current queue to process in this iteration
511
- const currentBatch = [...assetsToProcess];
512
- // Clear the main queue; new assets found in this batch will be added here for the *next* iteration
513
- assetsToProcess = [];
560
+ logger?.debug(`--- Processing batch ${iterationCount}: ${currentBatch.length} asset(s) ---`);
514
561
 
515
- logger?.debug(`--- Processing batch ${iterationCount}: ${currentBatch.length} asset(s) ---`);
562
+ // Process each asset in the current batch
563
+ for (const asset of currentBatch) {
564
+ // Double-check: Skip if this asset somehow got fully processed in a previous iteration (shouldn't happen with current logic, but safe check)
565
+ if (finalAssetsMap.has(asset.url)) {
566
+ logger?.debug(`Skipping asset already in final map: ${asset.url}`);
567
+ continue;
568
+ }
516
569
 
517
- // Process each asset in the current batch
518
- for (const asset of currentBatch) {
519
- // Double-check: Skip if this asset somehow got fully processed in a previous iteration (shouldn't happen with current logic, but safe check)
520
- if (finalAssetsMap.has(asset.url)) {
521
- logger?.debug(`Skipping asset already in final map: ${asset.url}`);
522
- continue;
523
- }
570
+ let assetContentBuffer: Buffer | null = null; // To store fetched binary content
571
+ let finalContent: string | undefined = undefined; // Final content (text or data URI) for the Asset object
572
+ let cssContentForParsing: string | undefined = undefined; // Text content specifically for parsing CSS
524
573
 
525
- let assetContentBuffer: Buffer | null = null; // To store fetched binary content
526
- let finalContent: string | undefined = undefined; // Final content (text or data URI) for the Asset object
527
- let cssContentForParsing: string | undefined = undefined; // Text content specifically for parsing CSS
528
-
529
- // --- Determine if fetching is needed ---
530
- // Fetch if we need to embed all assets OR if it's CSS (we need content to parse for nested assets)
531
- const needsFetching = embedAssets || asset.type === 'css';
532
- let assetUrlObj: URL | null = null; // URL object needed for fetchAsset
533
-
534
- if (needsFetching) {
535
- // --- Create URL object for fetching ---
536
- try {
537
- // Asset URL should be absolute at this point
538
- assetUrlObj = new URL(asset.url);
539
- } catch (urlError) {
540
- // Log error if creating URL object fails
541
- logger?.warn(`Cannot create URL object for "${asset.url}", skipping fetch. Error: ${urlError instanceof Error ? urlError.message : String(urlError)}`);
542
- // Store asset without content in the final map
543
- finalAssetsMap.set(asset.url, { ...asset, content: undefined });
544
- // Skip to next asset in the current batch
545
- continue;
546
- }
547
-
548
- // --- Fetch Asset ---
549
- if (assetUrlObj) {
550
- // Call fetchAsset (which handles http/https/file and errors)
551
- assetContentBuffer = await fetchAsset(assetUrlObj, logger);
552
- // fetchAsset returns null on failure
553
- }
554
- } // End if(needsFetching)
555
-
556
- // --- If fetching was required but failed, store asset without content and continue ---
557
- if (needsFetching && assetContentBuffer === null) {
558
- logger?.debug(`Storing asset ${asset.url} without content due to fetch failure.`);
559
- // Add to final map with undefined content
560
- finalAssetsMap.set(asset.url, { ...asset, content: undefined });
561
- // Skip to the next asset in the current batch
562
- continue;
563
- }
574
+ // --- Determine if fetching is needed ---
575
+ // Fetch if we need to embed all assets OR if it's CSS (we need content to parse for nested assets)
576
+ const needsFetching = embedAssets || asset.type === 'css';
577
+ let assetUrlObj: URL | null = null; // URL object needed for fetchAsset
578
+
579
+ if (needsFetching) {
580
+ // --- Create URL object for fetching ---
581
+ try {
582
+ // Asset URL should be absolute at this point
583
+ assetUrlObj = new URL(asset.url);
584
+ } catch (urlError) {
585
+ // Log error if creating URL object fails
586
+ logger?.warn(
587
+ `Cannot create URL object for "${asset.url}", skipping fetch. Error: ${urlError instanceof Error ? urlError.message : String(urlError)}`
588
+ );
589
+ // Store asset without content in the final map
590
+ finalAssetsMap.set(asset.url, { ...asset, content: undefined });
591
+ // Skip to next asset in the current batch
592
+ continue;
593
+ }
564
594
 
565
- // --- Prepare Content for Storing/Embedding (if fetched successfully) ---
566
- if (assetContentBuffer) { // Only proceed if content was fetched
567
- // Guess MIME type based on the asset's URL extension
568
- const mimeInfo = guessMimeType(asset.url);
569
- // Use the guessed MIME type or fallback to a generic binary type
570
- const effectiveMime = mimeInfo.mime || 'application/octet-stream';
571
-
572
- // Handle TEXT types (CSS, JS)
573
- if (TEXT_ASSET_TYPES.has(asset.type)) {
574
- let textContent: string | undefined;
575
- let wasLossy = false;
576
- try {
577
- // Try decoding the buffer as UTF-8
578
- textContent = assetContentBuffer.toString('utf-8');
579
- // Check if the decoding process lost information (e.g., invalid sequences replaced)
580
- wasLossy = isUtf8DecodingLossy(assetContentBuffer, textContent);
581
- } catch (e) {
582
- // Decoding itself failed
583
- textContent = undefined;
584
- wasLossy = true;
585
- }
586
-
587
- // If decoding was successful and not lossy
588
- if (!wasLossy && textContent !== undefined) {
589
- // If embedding, store the text content
590
- if (embedAssets) {
591
- finalContent = textContent;
592
- } else {
593
- finalContent = undefined; // Not embedding text, store undefined
594
- }
595
- // If it's CSS, store its text content for parsing regardless of embedding option
596
- if (asset.type === 'css') {
597
- cssContentForParsing = textContent;
598
- }
599
- } else {
600
- // Decoding failed or was lossy
601
- // Fixed log message: Added "asset" after type.
602
- logger?.warn(`Could not decode ${asset.type} asset ${asset.url} as valid UTF-8 text.${embedAssets ? ' Falling back to base64 data URI.' : ''}`);
603
- cssContentForParsing = undefined; // Cannot parse CSS if decoding failed
604
- // Embed as base64 data URI if requested, using the effective MIME type
605
- if (embedAssets) {
606
- finalContent = `data:${effectiveMime};base64,${assetContentBuffer.toString('base64')}`;
607
- } else {
608
- finalContent = undefined; // Not embedding
609
- }
610
- }
611
- }
612
- // Handle BINARY types (image, font, video, audio)
613
- else if (BINARY_ASSET_TYPES.has(asset.type)) {
614
- // Embed as base64 data URI if requested
615
- if (embedAssets) {
616
- finalContent = `data:${effectiveMime};base64,${assetContentBuffer.toString('base64')}`;
617
- } else {
618
- finalContent = undefined; // Not embedding
619
- }
620
- cssContentForParsing = undefined; // Not CSS, so no parsing needed
621
- }
622
- // Handle 'other' or unknown types
623
- else {
624
- cssContentForParsing = undefined; // Assume not parseable as CSS
625
- // If embedding, attempt to store as text, fallback to base64 if invalid UTF-8
626
- if (embedAssets) {
627
- try {
628
- const attemptedTextContent = assetContentBuffer.toString('utf-8');
629
- if (isUtf8DecodingLossy(assetContentBuffer, attemptedTextContent)) {
630
- // If text decoding is lossy, warn and use base64
631
- logger?.warn(`Couldn't embed unclassified asset ${asset.url} as text due to invalid UTF-8 sequences. Falling back to base64 (octet-stream).`);
632
- finalContent = `data:application/octet-stream;base64,${assetContentBuffer.toString('base64')}`;
633
- } else {
634
- // Store as text if decoding worked
635
- finalContent = attemptedTextContent;
636
- logger?.debug(`Successfully embedded unclassified asset ${asset.url} as text.`);
637
- }
638
- } catch (decodeError) {
639
- // If toString fails, warn and use base64
640
- logger?.warn(`Error during text decoding for unclassified asset ${asset.url}: ${decodeError instanceof Error ? decodeError.message : String(decodeError)}. Falling back to base64.`);
641
- finalContent = `data:application/octet-stream;base64,${assetContentBuffer.toString('base64')}`;
642
- }
643
- } else {
644
- finalContent = undefined; // Not embedding
645
- }
646
- }
647
- } else { // Content was not fetched (e.g., embedAssets=false and not CSS)
648
- finalContent = undefined;
649
- cssContentForParsing = undefined;
595
+ // --- Fetch Asset ---
596
+ if (assetUrlObj) {
597
+ // Call fetchAsset (which handles http/https/file and errors)
598
+ assetContentBuffer = await fetchAsset(assetUrlObj, logger);
599
+ // fetchAsset returns null on failure
600
+ }
601
+ } // End if(needsFetching)
602
+
603
+ // --- If fetching was required but failed, store asset without content and continue ---
604
+ if (needsFetching && assetContentBuffer === null) {
605
+ logger?.debug(`Storing asset ${asset.url} without content due to fetch failure.`);
606
+ // Add to final map with undefined content
607
+ finalAssetsMap.set(asset.url, { ...asset, content: undefined });
608
+ // Skip to the next asset in the current batch
609
+ continue;
610
+ }
611
+
612
+ // --- Prepare Content for Storing/Embedding (if fetched successfully) ---
613
+ if (assetContentBuffer) {
614
+ // Only proceed if content was fetched
615
+ // Guess MIME type based on the asset's URL extension
616
+ const mimeInfo = guessMimeType(asset.url);
617
+ // Use the guessed MIME type or fallback to a generic binary type
618
+ const effectiveMime = mimeInfo.mime || 'application/octet-stream';
619
+
620
+ // Handle TEXT types (CSS, JS)
621
+ if (TEXT_ASSET_TYPES.has(asset.type)) {
622
+ let textContent: string | undefined;
623
+ let wasLossy = false;
624
+ try {
625
+ // Try decoding the buffer as UTF-8
626
+ textContent = assetContentBuffer.toString('utf-8');
627
+ // Check if the decoding process lost information (e.g., invalid sequences replaced)
628
+ wasLossy = isUtf8DecodingLossy(assetContentBuffer, textContent);
629
+ } catch (e) {
630
+ // Decoding itself failed
631
+ textContent = undefined;
632
+ wasLossy = true;
633
+ }
634
+
635
+ // If decoding was successful and not lossy
636
+ if (!wasLossy && textContent !== undefined) {
637
+ // If embedding, store the text content
638
+ if (embedAssets) {
639
+ finalContent = textContent;
640
+ } else {
641
+ finalContent = undefined; // Not embedding text, store undefined
642
+ }
643
+ // If it's CSS, store its text content for parsing regardless of embedding option
644
+ if (asset.type === 'css') {
645
+ cssContentForParsing = textContent;
646
+ }
647
+ } else {
648
+ // Decoding failed or was lossy
649
+ logger?.warn(
650
+ `Could not decode ${asset.type} asset ${asset.url} as valid UTF-8 text.${embedAssets ? ' Falling back to base64 data URI.' : ''}`
651
+ );
652
+ cssContentForParsing = undefined; // Cannot parse CSS if decoding failed
653
+ // Embed as base64 data URI if requested, using the effective MIME type
654
+ if (embedAssets) {
655
+ finalContent = `data:${effectiveMime};base64,${assetContentBuffer.toString('base64')}`;
656
+ } else {
657
+ finalContent = undefined; // Not embedding
650
658
  }
659
+ }
660
+ }
661
+ // Handle BINARY types (image, font, video, audio)
662
+ else if (BINARY_ASSET_TYPES.has(asset.type)) {
663
+ // Embed as base64 data URI if requested
664
+ if (embedAssets) {
665
+ finalContent = `data:${effectiveMime};base64,${assetContentBuffer.toString('base64')}`;
666
+ } else {
667
+ finalContent = undefined; // Not embedding
668
+ }
669
+ cssContentForParsing = undefined; // Not CSS, so no parsing needed
670
+ }
671
+ // Handle 'other' or unknown types
672
+ else {
673
+ cssContentForParsing = undefined; // Assume not parseable as CSS
674
+ // If embedding, attempt to store as text, fallback to base64 if invalid UTF-8
675
+ if (embedAssets) {
676
+ try {
677
+ const attemptedTextContent = assetContentBuffer.toString('utf-8');
678
+ if (isUtf8DecodingLossy(assetContentBuffer, attemptedTextContent)) {
679
+ // If text decoding is lossy, warn and use base64
680
+ logger?.warn(
681
+ `Couldn't embed unclassified asset ${asset.url} as text due to invalid UTF-8 sequences. Falling back to base64 (octet-stream).`
682
+ );
683
+ finalContent = `data:application/octet-stream;base64,${assetContentBuffer.toString('base64')}`;
684
+ } else {
685
+ // Store as text if decoding worked
686
+ finalContent = attemptedTextContent;
687
+ logger?.debug(`Successfully embedded unclassified asset ${asset.url} as text.`);
688
+ }
689
+ } catch (decodeError) {
690
+ // If toString fails, warn and use base64
691
+ logger?.warn(
692
+ `Error during text decoding for unclassified asset ${asset.url}: ${decodeError instanceof Error ? decodeError.message : String(decodeError)}. Falling back to base64.`
693
+ );
694
+ finalContent = `data:application/octet-stream;base64,${assetContentBuffer.toString('base64')}`;
695
+ }
696
+ } else {
697
+ finalContent = undefined; // Not embedding
698
+ }
699
+ }
700
+ } else {
701
+ // Content was not fetched (e.g., embedAssets=false and not CSS)
702
+ finalContent = undefined;
703
+ cssContentForParsing = undefined;
704
+ }
705
+
706
+ // --- Store the final processed asset in the map ---
707
+ // Use the resolved URL as the key and ensure the asset object also uses the resolved URL
708
+ finalAssetsMap.set(asset.url, { ...asset, url: asset.url, content: finalContent });
709
+ // Note: URL was already added to processedOrQueuedUrls when initially queued or discovered in CSS
710
+
711
+ // --- Process CSS for nested assets ---
712
+ // Only if it's CSS and we successfully decoded its content for parsing
713
+ if (asset.type === 'css' && cssContentForParsing) {
714
+ // Determine the base URL *for this specific CSS file* to resolve its relative links
715
+ const cssBaseContextUrl = determineBaseUrl(asset.url, logger); // CSS URL is absolute here
716
+ logger?.debug(
717
+ `CSS base context for resolving nested assets within ${asset.url}: ${cssBaseContextUrl}`
718
+ );
651
719
 
652
- // --- Store the final processed asset in the map ---
653
- // Use the resolved URL as the key and ensure the asset object also uses the resolved URL
654
- finalAssetsMap.set(asset.url, { ...asset, url: asset.url, content: finalContent });
655
- // Note: URL was already added to processedOrQueuedUrls when initially queued or discovered in CSS
656
-
657
- // --- Process CSS for nested assets ---
658
- // Only if it's CSS and we successfully decoded its content for parsing
659
- if (asset.type === 'css' && cssContentForParsing) {
660
- // Determine the base URL *for this specific CSS file* to resolve its relative links
661
- const cssBaseContextUrl = determineBaseUrl(asset.url, logger); // CSS URL is absolute here
662
- logger?.debug(`CSS base context for resolving nested assets within ${asset.url}: ${cssBaseContextUrl}`);
663
-
664
- if (cssBaseContextUrl) {
665
- // Extract URLs found within this CSS content
666
- const newlyDiscoveredAssets = extractUrlsFromCSS(
667
- cssContentForParsing,
668
- cssBaseContextUrl, // Use the CSS file's own URL as the base
669
- logger
670
- );
671
-
672
- // If new assets were found in the CSS
673
- if (newlyDiscoveredAssets.length > 0) {
674
- logger?.debug(`Discovered ${newlyDiscoveredAssets.length} nested assets in CSS ${asset.url}. Checking against queue...`);
675
- // Process each newly discovered asset
676
- for (const newAsset of newlyDiscoveredAssets) {
677
- // CHECK: Add to the main processing queue only if this resolved URL hasn't been processed OR queued before.
678
- if (!processedOrQueuedUrls.has(newAsset.url)) {
679
- processedOrQueuedUrls.add(newAsset.url); // Mark as queued now
680
- assetsToProcess.push(newAsset); // Add to the queue for the *next* iteration
681
- logger?.debug(` -> Queued new nested asset: ${newAsset.url}`);
682
- } else {
683
- // Skip if already handled
684
- logger?.debug(` -> Skipping already processed/queued nested asset: ${newAsset.url}`);
685
- }
686
- }
687
- }
688
- } else {
689
- // Warn if the base URL for the CSS file couldn't be determined (shouldn't happen if asset.url was valid)
690
- logger?.warn(`Could not determine base URL context for CSS file ${asset.url}. Cannot resolve nested relative paths within it.`);
691
- }
692
- } // End if(asset.type === 'css' && cssContentForParsing)
693
- } // End for loop over currentBatch
694
- } // End while loop (assetsToProcess.length > 0)
695
-
696
- // Log completion summary
697
- const finalIterationCount = iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS ? `${MAX_ASSET_EXTRACTION_ITERATIONS}+ (limit hit)` : iterationCount;
698
- logger?.info(`✅ Asset extraction COMPLETE! Found ${finalAssetsMap.size} unique assets in ${finalIterationCount} iterations.`);
699
-
700
- // Return the original HTML content and the final list of processed assets from the map
701
- return {
702
- htmlContent: parsed.htmlContent,
703
- assets: Array.from(finalAssetsMap.values())
704
- };
705
- }
720
+ if (cssBaseContextUrl) {
721
+ // Extract URLs found within this CSS content
722
+ const newlyDiscoveredAssets = extractUrlsFromCSS(
723
+ cssContentForParsing,
724
+ cssBaseContextUrl, // Use the CSS file's own URL as the base
725
+ logger
726
+ );
727
+
728
+ // If new assets were found in the CSS
729
+ if (newlyDiscoveredAssets.length > 0) {
730
+ logger?.debug(
731
+ `Discovered ${newlyDiscoveredAssets.length} nested assets in CSS ${asset.url}. Checking against queue...`
732
+ );
733
+ // Process each newly discovered asset
734
+ for (const newAsset of newlyDiscoveredAssets) {
735
+ // CHECK: Add to the main processing queue only if this resolved URL hasn't been processed OR queued before.
736
+ if (!processedOrQueuedUrls.has(newAsset.url)) {
737
+ processedOrQueuedUrls.add(newAsset.url); // Mark as queued now
738
+ assetsToProcess.push(newAsset); // Add to the queue for the *next* iteration
739
+ logger?.debug(` -> Queued new nested asset: ${newAsset.url}`);
740
+ } else {
741
+ // Skip if already handled
742
+ logger?.debug(
743
+ ` -> Skipping already processed/queued nested asset: ${newAsset.url}`
744
+ );
745
+ }
746
+ }
747
+ }
748
+ } else {
749
+ // Warn if the base URL for the CSS file couldn't be determined (shouldn't happen if asset.url was valid)
750
+ logger?.warn(
751
+ `Could not determine base URL context for CSS file ${asset.url}. Cannot resolve nested relative paths within it.`
752
+ );
753
+ }
754
+ } // End if(asset.type === 'css' && cssContentForParsing)
755
+ } // End for loop over currentBatch
756
+ } // End while loop (assetsToProcess.length > 0)
757
+
758
+ // Log completion summary
759
+ const finalIterationCount =
760
+ iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS
761
+ ? `${MAX_ASSET_EXTRACTION_ITERATIONS}+ (limit hit)`
762
+ : iterationCount;
763
+ logger?.info(
764
+ `✅ Asset extraction COMPLETE! Found ${finalAssetsMap.size} unique assets in ${finalIterationCount} iterations.`
765
+ );
766
+
767
+ // Return the original HTML content and the final list of processed assets from the map
768
+ return {
769
+ htmlContent: parsed.htmlContent,
770
+ assets: Array.from(finalAssetsMap.values()),
771
+ };
772
+ }