portapack 0.3.0 → 0.3.2

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/.eslintrc.json +67 -8
  2. package/.github/workflows/ci.yml +5 -4
  3. package/.releaserc.js +25 -27
  4. package/CHANGELOG.md +12 -19
  5. package/LICENSE.md +21 -0
  6. package/README.md +34 -36
  7. package/commitlint.config.js +30 -34
  8. package/dist/cli/cli-entry.cjs +199 -135
  9. package/dist/cli/cli-entry.cjs.map +1 -1
  10. package/dist/index.d.ts +0 -3
  11. package/dist/index.js +194 -134
  12. package/dist/index.js.map +1 -1
  13. package/docs/.vitepress/config.ts +36 -34
  14. package/docs/.vitepress/sidebar-generator.ts +89 -38
  15. package/docs/cli.md +29 -82
  16. package/docs/code-of-conduct.md +7 -1
  17. package/docs/configuration.md +103 -117
  18. package/docs/contributing.md +6 -2
  19. package/docs/deployment.md +10 -5
  20. package/docs/development.md +8 -5
  21. package/docs/getting-started.md +76 -45
  22. package/docs/index.md +1 -1
  23. package/docs/public/android-chrome-192x192.png +0 -0
  24. package/docs/public/android-chrome-512x512.png +0 -0
  25. package/docs/public/apple-touch-icon.png +0 -0
  26. package/docs/public/favicon-16x16.png +0 -0
  27. package/docs/public/favicon-32x32.png +0 -0
  28. package/docs/public/favicon.ico +0 -0
  29. package/docs/site.webmanifest +1 -0
  30. package/docs/troubleshooting.md +12 -1
  31. package/examples/main.ts +7 -10
  32. package/examples/sample-project/script.js +1 -1
  33. package/jest.config.ts +8 -13
  34. package/nodemon.json +5 -10
  35. package/package.json +2 -5
  36. package/src/cli/cli-entry.ts +2 -2
  37. package/src/cli/cli.ts +21 -16
  38. package/src/cli/options.ts +127 -113
  39. package/src/core/bundler.ts +254 -221
  40. package/src/core/extractor.ts +639 -520
  41. package/src/core/minifier.ts +173 -162
  42. package/src/core/packer.ts +141 -137
  43. package/src/core/parser.ts +74 -73
  44. package/src/core/web-fetcher.ts +270 -258
  45. package/src/index.ts +18 -17
  46. package/src/types.ts +9 -11
  47. package/src/utils/font.ts +12 -6
  48. package/src/utils/logger.ts +110 -105
  49. package/src/utils/meta.ts +75 -76
  50. package/src/utils/mime.ts +50 -50
  51. package/src/utils/slugify.ts +33 -34
  52. package/tests/unit/cli/cli-entry.test.ts +72 -70
  53. package/tests/unit/cli/cli.test.ts +314 -278
  54. package/tests/unit/cli/options.test.ts +294 -301
  55. package/tests/unit/core/bundler.test.ts +426 -329
  56. package/tests/unit/core/extractor.test.ts +828 -380
  57. package/tests/unit/core/minifier.test.ts +374 -274
  58. package/tests/unit/core/packer.test.ts +298 -264
  59. package/tests/unit/core/parser.test.ts +538 -150
  60. package/tests/unit/core/web-fetcher.test.ts +389 -359
  61. package/tests/unit/index.test.ts +238 -197
  62. package/tests/unit/utils/font.test.ts +26 -21
  63. package/tests/unit/utils/logger.test.ts +267 -260
  64. package/tests/unit/utils/meta.test.ts +29 -28
  65. package/tests/unit/utils/mime.test.ts +73 -74
  66. package/tests/unit/utils/slugify.test.ts +14 -12
  67. package/tsconfig.build.json +9 -10
  68. package/tsconfig.jest.json +2 -1
  69. package/tsconfig.json +2 -2
  70. package/tsup.config.ts +8 -8
  71. package/typedoc.json +5 -9
  72. package/docs/demo.md +0 -46
  73. /package/docs/{portapack-transparent.png → public/portapack-transparent.png} +0 -0
  74. /package/docs/{portapack.jpg → public/portapack.jpg} +0 -0
@@ -2,21 +2,23 @@
2
2
  * @file src/core/extractor.ts
3
3
  * @description Handles discovery, resolution, fetching, and optional embedding of assets
4
4
  * linked from HTML and recursively within CSS (@import, url()). This is the heart of finding EVERYTHING.
5
- * @version 1.1.4 - Added console logs for debugging path/URL resolution. Refined determineBaseUrl.
6
5
  */
7
6
 
8
7
  // === Node.js Core Imports ===
9
8
  import { readFile } from 'fs/promises';
10
9
  import * as fs from 'fs'; // Required for statSync for sync directory check
11
- import type { FileHandle } from 'fs/promises';
10
+ import type { FileHandle } from 'fs/promises'; // Import specific type if needed elsewhere
12
11
  import path from 'path';
13
12
  import { fileURLToPath, URL } from 'url'; // Crucial for file path/URL conversion
14
13
 
15
14
  // === External Dependencies ===
16
- // Using requireNamespace avoids potential ESM/CJS interop issues with mocks if they arise
17
- // const axios = require('axios'); // Alternative if import * causes issues with mocks
18
15
  import * as axiosNs from 'axios'; // Using namespace import for clarity
19
- import type { AxiosError, AxiosRequestConfig, AxiosResponse, InternalAxiosRequestConfig } from 'axios';
16
+ import type {
17
+ AxiosError,
18
+ AxiosRequestConfig,
19
+ AxiosResponse,
20
+ InternalAxiosRequestConfig,
21
+ } from 'axios'; // Import necessary types
20
22
 
21
23
  // === Project Imports ===
22
24
  import type { Asset, ParsedHTML } from '../types'; // Adjust path if needed
@@ -32,7 +34,6 @@ const BINARY_ASSET_TYPES: Set<Asset['type']> = new Set(['image', 'font', 'video'
32
34
  const MAX_ASSET_EXTRACTION_ITERATIONS = 1000;
33
35
 
34
36
  // === Helper Functions ===
35
-
36
37
  /**
37
38
  * Custom type for Node.js error objects with a `code` property.
38
39
  */
@@ -45,13 +46,15 @@ type NodeJSErrnoException = Error & { code?: string };
45
46
  * @returns {boolean} True if re-encoding doesn't match original buffer (lossy), false otherwise.
46
47
  */
47
48
  function isUtf8DecodingLossy(originalBuffer: Buffer, decodedString: string): boolean {
48
- try {
49
- const reEncodedBuffer = Buffer.from(decodedString, 'utf-8');
50
- return !originalBuffer.equals(reEncodedBuffer);
51
- } catch (e) {
52
- // Error during re-encoding likely means original wasn't valid UTF-8
53
- return true;
54
- }
49
+ try {
50
+ // Re-encode the decoded string back to a buffer using UTF-8
51
+ const reEncodedBuffer = Buffer.from(decodedString, 'utf-8');
52
+ // Compare the re-encoded buffer with the original buffer
53
+ return !originalBuffer.equals(reEncodedBuffer);
54
+ } catch (e) {
55
+ // If an error occurs during re-encoding, it implies the original wasn't valid UTF-8
56
+ return true;
57
+ }
55
58
  }
56
59
 
57
60
  /**
@@ -62,92 +65,98 @@ function isUtf8DecodingLossy(originalBuffer: Buffer, decodedString: string): boo
62
65
  * @returns {string | undefined} The absolute base URL string ending in '/', or undefined if determination fails.
63
66
  */
64
67
  function determineBaseUrl(inputPathOrUrl: string, logger?: Logger): string | undefined {
65
- // [DEBUG LOG] Added for diagnostics
66
- console.log(`[DEBUG determineBaseUrl] Input: "${inputPathOrUrl}"`);
67
- logger?.debug(`Determining base URL for input: ${inputPathOrUrl}`);
68
- if (!inputPathOrUrl) {
69
- logger?.warn('Cannot determine base URL: inputPathOrUrl is empty or invalid.');
70
- return undefined;
68
+ // Log the input for debugging purposes
69
+ // console.log(`[DEBUG determineBaseUrl] Input: "${inputPathOrUrl}"`); // Keep debug log commented unless needed
70
+ logger?.debug(`Determining base URL for input: ${inputPathOrUrl}`);
71
+
72
+ // Handle invalid or empty input
73
+ if (!inputPathOrUrl) {
74
+ logger?.warn('Cannot determine base URL: inputPathOrUrl is empty or invalid.');
75
+ return undefined;
76
+ }
77
+
78
+ try {
79
+ // Handle non-file URLs (HTTP, HTTPS)
80
+ if (/^https?:\/\//i.test(inputPathOrUrl)) {
81
+ const url = new URL(inputPathOrUrl);
82
+ // Construct the base URL by taking the path up to the last '/'
83
+ url.pathname = url.pathname.substring(0, url.pathname.lastIndexOf('/') + 1);
84
+ url.search = ''; // Remove query parameters
85
+ url.hash = ''; // Remove fragments
86
+ const baseUrl = url.href;
87
+ logger?.debug(`Determined remote base URL: ${baseUrl}`);
88
+ // console.log(`[DEBUG determineBaseUrl] Determined Remote URL: "${baseUrl}"`); // Keep debug log commented unless needed
89
+ // Return the constructed base URL (usually ends in '/')
90
+ return baseUrl;
71
91
  }
72
-
73
- try {
74
- // Handle non-file URLs (HTTP, HTTPS)
75
- if (/^https?:\/\//i.test(inputPathOrUrl)) {
76
- const url = new URL(inputPathOrUrl);
77
- // Get URL up to the last slash in the path
78
- url.pathname = url.pathname.substring(0, url.pathname.lastIndexOf('/') + 1);
79
- url.search = ''; url.hash = ''; // Clear query params/fragments
80
- const baseUrl = url.href;
81
- logger?.debug(`Determined remote base URL: ${baseUrl}`);
82
- // [DEBUG LOG] Added for diagnostics
83
- console.log(`[DEBUG determineBaseUrl] Determined Remote URL: "${baseUrl}"`);
84
- return baseUrl; // URLs from constructor usually end in '/' if path ends in '/'
85
- }
86
- // Handle other protocols (warn and return undefined)
87
- else if (inputPathOrUrl.includes('://') && !inputPathOrUrl.startsWith('file:')) {
88
- logger?.warn(`Input "${inputPathOrUrl}" looks like a URL but uses an unsupported protocol. Cannot determine base URL.`);
89
- // [DEBUG LOG] Added for diagnostics
90
- console.log(`[DEBUG determineBaseUrl] Unsupported protocol.`);
91
- return undefined;
92
- }
93
- // Handle file paths and file: URLs
94
- else {
95
- let resourcePath: string; // Path to the actual file or dir input
96
- let isInputLikelyDirectory = false;
97
-
98
- // Convert input to an absolute path
99
- if (inputPathOrUrl.startsWith('file:')) {
100
- resourcePath = fileURLToPath(inputPathOrUrl);
101
- // file: URLs ending in / strongly suggest a directory
102
- isInputLikelyDirectory = inputPathOrUrl.endsWith('/');
103
- } else {
104
- resourcePath = path.resolve(inputPathOrUrl); // Resolve relative/absolute file paths
105
- // Check if the resolved path *actually* exists and is a directory
106
- // This distinguishes 'C:\path\to\dir' from 'C:\path\to\file.html'
107
- try {
108
- // Use statSync carefully - assumes it's available and works (or mocked)
109
- isInputLikelyDirectory = fs.statSync(resourcePath).isDirectory();
110
- } catch {
111
- // If stat fails (ENOENT, EACCES), assume it refers to a file path
112
- isInputLikelyDirectory = false;
113
- }
114
- }
115
- // [DEBUG LOG] Added for diagnostics
116
- console.log(`[DEBUG determineBaseUrl] resourcePath: "${resourcePath}", isInputLikelyDirectory: ${isInputLikelyDirectory}`);
117
-
118
- // The base directory is the directory containing the resourcePath,
119
- // OR resourcePath itself if it was identified as a directory.
120
- const baseDirPath = isInputLikelyDirectory ? resourcePath : path.dirname(resourcePath);
121
- // [DEBUG LOG] Added for diagnostics
122
- console.log(`[DEBUG determineBaseUrl] Calculated baseDirPath: "${baseDirPath}"`);
123
-
124
- // Convert base directory path back to a file URL ending in '/'
125
- let normalizedPathForURL = baseDirPath.replace(/\\/g, '/'); // Use forward slashes
126
- // Ensure leading slash for Windows file URLs (e.g., /C:/...)
127
- if (/^[A-Z]:\//i.test(normalizedPathForURL) && !normalizedPathForURL.startsWith('/')) {
128
- normalizedPathForURL = '/' + normalizedPathForURL;
129
- }
130
- // Ensure trailing slash for the directory URL
131
- if (!normalizedPathForURL.endsWith('/')) {
132
- normalizedPathForURL += '/';
133
- }
134
-
135
- const fileUrl = new URL('file://' + normalizedPathForURL);
136
- const fileUrlString = fileUrl.href;
137
-
138
- logger?.debug(`Determined base URL: ${fileUrlString} (from: ${inputPathOrUrl}, resolved base dir: ${baseDirPath})`);
139
- // [DEBUG LOG] Added for diagnostics
140
- console.log(`[DEBUG determineBaseUrl] Determined File URL: "${fileUrlString}"`);
141
- return fileUrlString;
142
-
92
+ // Handle other protocols (warn and return undefined)
93
+ else if (inputPathOrUrl.includes('://') && !inputPathOrUrl.startsWith('file:')) {
94
+ logger?.warn(
95
+ `Input "${inputPathOrUrl}" looks like a URL but uses an unsupported protocol. Cannot determine base URL.`
96
+ );
97
+ // console.log(`[DEBUG determineBaseUrl] Unsupported protocol.`); // Keep debug log commented unless needed
98
+ return undefined;
99
+ }
100
+ // Handle file paths and file: URLs
101
+ else {
102
+ let resourcePath: string; // Path to the actual file or dir input
103
+ let isInputLikelyDirectory = false;
104
+
105
+ // Convert input to an absolute path
106
+ if (inputPathOrUrl.startsWith('file:')) {
107
+ // Convert file URL to path
108
+ resourcePath = fileURLToPath(inputPathOrUrl);
109
+ // file: URLs ending in / strongly suggest a directory
110
+ isInputLikelyDirectory = inputPathOrUrl.endsWith('/');
111
+ } else {
112
+ // Resolve relative/absolute file paths
113
+ resourcePath = path.resolve(inputPathOrUrl);
114
+ // Check if the resolved path *actually* exists and is a directory
115
+ try {
116
+ // Use statSync carefully - assumes it's available and works (or mocked)
117
+ isInputLikelyDirectory = fs.statSync(resourcePath).isDirectory();
118
+ } catch {
119
+ // If stat fails (ENOENT, EACCES), assume it refers to a file path
120
+ isInputLikelyDirectory = false;
143
121
  }
144
- } catch (error: unknown) {
145
- const message = error instanceof Error ? error.message : String(error);
146
- // [DEBUG LOG] Added for diagnostics
147
- console.error(`[DEBUG determineBaseUrl] Error determining base URL: ${message}`);
148
- logger?.error(`💀 Failed to determine base URL for "${inputPathOrUrl}": ${message}${error instanceof Error && error.stack ? ` - Stack: ${error.stack}` : ''}`);
149
- return undefined;
122
+ }
123
+ // console.log(`[DEBUG determineBaseUrl] resourcePath: "${resourcePath}", isInputLikelyDirectory: ${isInputLikelyDirectory}`); // Keep debug log commented unless needed
124
+
125
+ // The base directory is the directory containing the resourcePath,
126
+ // OR resourcePath itself if it was identified as a directory.
127
+ const baseDirPath = isInputLikelyDirectory ? resourcePath : path.dirname(resourcePath);
128
+ // console.log(`[DEBUG determineBaseUrl] Calculated baseDirPath: "${baseDirPath}"`); // Keep debug log commented unless needed
129
+
130
+ // Convert base directory path back to a file URL ending in '/'
131
+ let normalizedPathForURL = baseDirPath.replace(/\\/g, '/'); // Use forward slashes for URL consistency
132
+ // Ensure leading slash for Windows file URLs (e.g., /C:/...)
133
+ if (/^[A-Z]:\//i.test(normalizedPathForURL) && !normalizedPathForURL.startsWith('/')) {
134
+ normalizedPathForURL = '/' + normalizedPathForURL;
135
+ }
136
+ // Ensure trailing slash for the directory URL
137
+ if (!normalizedPathForURL.endsWith('/')) {
138
+ normalizedPathForURL += '/';
139
+ }
140
+
141
+ // Create the final file URL object and get its string representation
142
+ const fileUrl = new URL('file://' + normalizedPathForURL);
143
+ const fileUrlString = fileUrl.href;
144
+
145
+ logger?.debug(
146
+ `Determined base URL: ${fileUrlString} (from: ${inputPathOrUrl}, resolved base dir: ${baseDirPath})`
147
+ );
148
+ // console.log(`[DEBUG determineBaseUrl] Determined File URL: "${fileUrlString}"`); // Keep debug log commented unless needed
149
+ return fileUrlString;
150
150
  }
151
+ } catch (error: unknown) {
152
+ // Handle any errors during base URL determination
153
+ const message = error instanceof Error ? error.message : String(error);
154
+ // console.error(`[DEBUG determineBaseUrl] Error determining base URL: ${message}`); // Keep debug log commented unless needed
155
+ logger?.error(
156
+ `💀 Failed to determine base URL for "${inputPathOrUrl}": ${message}${error instanceof Error && error.stack ? ` - Stack: ${error.stack}` : ''}`
157
+ );
158
+ return undefined;
159
+ }
151
160
  }
152
161
 
153
162
  /**
@@ -159,46 +168,59 @@ function determineBaseUrl(inputPathOrUrl: string, logger?: Logger): string | und
159
168
  * @returns {URL | null} A validated, absolute URL object, or null if invalid/ignorable.
160
169
  */
161
170
  function resolveAssetUrl(assetUrl: string, baseContextUrl?: string, logger?: Logger): URL | null {
162
- const trimmedUrl = assetUrl?.trim();
163
- // Ignore empty, data URIs, or fragment-only URLs
164
- if (!trimmedUrl || trimmedUrl.startsWith('data:') || trimmedUrl.startsWith('#')) {
165
- return null;
166
- }
171
+ // Trim whitespace from the URL
172
+ const trimmedUrl = assetUrl?.trim();
167
173
 
168
- let resolvableUrl = trimmedUrl;
174
+ // Ignore empty URLs, data URIs, or fragment-only URLs
175
+ if (!trimmedUrl || trimmedUrl.startsWith('data:') || trimmedUrl.startsWith('#')) {
176
+ return null;
177
+ }
169
178
 
170
- // Handle protocol-relative URLs (e.g., //example.com/image.png)
171
- if (resolvableUrl.startsWith('//') && baseContextUrl) {
172
- try {
173
- const base = new URL(baseContextUrl);
174
- resolvableUrl = base.protocol + resolvableUrl; // Prepend the base protocol (http: or https:)
175
- } catch (e) {
176
- logger?.warn(`Could not extract protocol from base "${baseContextUrl}" for protocol-relative URL "${trimmedUrl}". Skipping.`);
177
- return null;
178
- }
179
- }
179
+ let resolvableUrl = trimmedUrl;
180
180
 
181
+ // Handle protocol-relative URLs (e.g., //example.com/image.png)
182
+ if (resolvableUrl.startsWith('//') && baseContextUrl) {
181
183
  try {
182
- // Use URL constructor for resolution. Handles absolute, relative paths, ../ etc.
183
- // baseContextUrl provides the context for resolving relative URLs.
184
- const resolved = new URL(resolvableUrl, baseContextUrl);
185
- // Don't attempt to fetch ws://, mailto:, etc. Add protocols as needed.
186
- if (!['http:', 'https:', 'file:'].includes(resolved.protocol)) {
187
- logger?.debug(`Skipping asset with unsupported protocol: ${resolved.href}`);
188
- return null;
189
- }
190
- return resolved;
191
- } catch (error: unknown) {
192
- // Log errors during URL parsing/resolution but don't halt the process
193
- const message = error instanceof Error ? error.message : String(error);
194
- // Avoid warning for relative paths when no base was provided (e.g., direct HTML string input)
195
- if (!/^[a-z]+:/i.test(resolvableUrl) && !resolvableUrl.startsWith('/') && !baseContextUrl) {
196
- logger?.warn(`Cannot resolve relative URL "${resolvableUrl}" - Base context URL was not provided or determined.`);
197
- } else {
198
- logger?.warn(`⚠️ Failed to parse/resolve URL "${resolvableUrl}" ${baseContextUrl ? 'against base "' + baseContextUrl + '"' : '(no base provided)'}: ${message}`);
199
- }
200
- return null; // Return null if resolution fails
184
+ // Prepend the protocol from the base context URL
185
+ const base = new URL(baseContextUrl);
186
+ resolvableUrl = base.protocol + resolvableUrl;
187
+ } catch (e) {
188
+ // Log a warning if the base protocol cannot be determined
189
+ logger?.warn(
190
+ `Could not extract protocol from base "${baseContextUrl}" for protocol-relative URL "${trimmedUrl}". Skipping.`
191
+ );
192
+ return null;
201
193
  }
194
+ }
195
+
196
+ try {
197
+ // Use URL constructor for resolution. Handles absolute paths, relative paths, ../ etc.
198
+ const resolved = new URL(resolvableUrl, baseContextUrl);
199
+
200
+ // Skip assets with unsupported protocols (e.g., mailto:, ws:)
201
+ if (!['http:', 'https:', 'file:'].includes(resolved.protocol)) {
202
+ logger?.debug(`Skipping asset with unsupported protocol: ${resolved.href}`);
203
+ return null;
204
+ }
205
+ // Return the resolved URL object
206
+ return resolved;
207
+ } catch (error: unknown) {
208
+ // Log errors during URL parsing/resolution
209
+ const message = error instanceof Error ? error.message : String(error);
210
+ // Avoid redundant warnings for relative paths when no base context was provided (expected failure)
211
+ if (!/^[a-z]+:/i.test(resolvableUrl) && !resolvableUrl.startsWith('/') && !baseContextUrl) {
212
+ logger?.warn(
213
+ `Cannot resolve relative URL "${resolvableUrl}" - Base context URL was not provided or determined.`
214
+ );
215
+ } else {
216
+ // Log other resolution failures
217
+ logger?.warn(
218
+ `⚠️ Failed to parse/resolve URL "${resolvableUrl}" ${baseContextUrl ? 'against base "' + baseContextUrl + '"' : '(no base provided)'}: ${message}`
219
+ );
220
+ }
221
+ // Return null if resolution fails
222
+ return null;
223
+ }
202
224
  }
203
225
 
204
226
  /**
@@ -210,38 +232,34 @@ function resolveAssetUrl(assetUrl: string, baseContextUrl?: string, logger?: Log
210
232
  * @returns {string | null} The resolved absolute URL string, or null if resolution fails/invalid.
211
233
  */
212
234
  function resolveCssRelativeUrl(
213
- relativeUrl: string,
214
- cssBaseContextUrl: string, // e.g., file:///C:/mock/base/dir/css/deep.css or https://.../style.css
215
- logger?: Logger
235
+ relativeUrl: string,
236
+ cssBaseContextUrl: string, // e.g., file:///C:/mock/base/dir/css/deep.css or https://.../style.css
237
+ logger?: Logger
216
238
  ): string | null {
217
- // [DEBUG LOG] Added for diagnostics
218
- console.log(`[DEBUG resolveCssRelativeUrl] Input: relative="${relativeUrl}", base="${cssBaseContextUrl}"`);
219
-
220
- if (!relativeUrl || relativeUrl.startsWith('data:') || relativeUrl.startsWith('#')) {
221
- return null; // Ignore empty, data URIs, or fragments
222
- }
223
-
224
- try {
225
- // Use the URL constructor which correctly handles relative paths including ../
226
- // relative to the base URL provided.
227
- const resolvedUrl = new URL(relativeUrl, cssBaseContextUrl);
228
-
229
- // [DEBUG LOG] Added for diagnostics
230
- console.log(`[DEBUG resolveCssRelativeUrl] Resolved URL object href: "${resolvedUrl.href}"`);
231
- return resolvedUrl.href; // Return the resolved absolute URL string
232
-
233
- } catch (error) {
234
- // Log warning if URL resolution fails for some reason
235
- logger?.warn(
236
- `Failed to resolve CSS URL: "${relativeUrl}" relative to "${cssBaseContextUrl}": ${String(error)}`
237
- );
238
- // [DEBUG LOG] Added for diagnostics
239
- console.error(`[DEBUG resolveCssRelativeUrl] Error resolving: ${String(error)}`);
240
- return null;
241
- }
239
+ // console.log(`[DEBUG resolveCssRelativeUrl] Input: relative="${relativeUrl}", base="${cssBaseContextUrl}"`); // Keep debug log commented unless needed
240
+
241
+ // Ignore empty, data URIs, or fragments
242
+ if (!relativeUrl || relativeUrl.startsWith('data:') || relativeUrl.startsWith('#')) {
243
+ return null;
244
+ }
245
+
246
+ try {
247
+ // Use the URL constructor which correctly handles relative paths including ../
248
+ // relative to the base URL provided (the CSS file's URL).
249
+ const resolvedUrl = new URL(relativeUrl, cssBaseContextUrl);
250
+ // console.log(`[DEBUG resolveCssRelativeUrl] Resolved URL object href: "${resolvedUrl.href}"`); // Keep debug log commented unless needed
251
+ // Return the resolved absolute URL string
252
+ return resolvedUrl.href;
253
+ } catch (error) {
254
+ // Log warning if URL resolution fails
255
+ logger?.warn(
256
+ `Failed to resolve CSS URL: "${relativeUrl}" relative to "${cssBaseContextUrl}": ${String(error)}`
257
+ );
258
+ // console.error(`[DEBUG resolveCssRelativeUrl] Error resolving: ${String(error)}`); // Keep debug log commented unless needed
259
+ return null;
260
+ }
242
261
  }
243
262
 
244
-
245
263
  /**
246
264
  * Asynchronously fetches the content of a resolved asset URL (http, https, file).
247
265
  * @async
@@ -250,94 +268,108 @@ function resolveCssRelativeUrl(
250
268
  * @param {number} [timeout=10000] - Network timeout in milliseconds for HTTP(S) requests.
251
269
  * @returns {Promise<Buffer | null>} Asset content as a Buffer, or null on failure.
252
270
  */
253
- async function fetchAsset(resolvedUrl: URL, logger?: Logger, timeout: number = 10000): Promise<Buffer | null> {
254
- // [DEBUG LOG] Added for diagnostics
255
- console.log(`[DEBUG fetchAsset] Attempting fetch for URL: ${resolvedUrl.href}`);
256
- logger?.debug(`Attempting to fetch asset: ${resolvedUrl.href}`);
257
- const protocol = resolvedUrl.protocol;
258
-
259
- try {
260
- if (protocol === 'http:' || protocol === 'https:') {
261
- // Use axios namespace import's default property
262
- const response: AxiosResponse<ArrayBuffer> = await axiosNs.default.get(resolvedUrl.href, {
263
- responseType: 'arraybuffer', timeout: timeout,
264
- });
265
- logger?.debug(`Workspaceed remote asset ${resolvedUrl.href} (Status: ${response.status}, Type: ${response.headers['content-type'] || 'N/A'}, Size: ${response.data?.byteLength ?? 0} bytes)`);
266
- // [DEBUG LOG] Added for diagnostics
267
- console.log(`[DEBUG fetchAsset] HTTP fetch SUCCESS for: ${resolvedUrl.href}, Status: ${response.status}`);
268
- return Buffer.from(response.data);
269
- } else if (protocol === 'file:') {
270
- let filePath: string;
271
- try {
272
- // Convert file URL to path. IMPORTANT: This strips query params and fragments.
273
- filePath = fileURLToPath(resolvedUrl);
274
- } catch (e: any) {
275
- // [DEBUG LOG] Added for diagnostics
276
- console.error(`[DEBUG fetchAsset] fileURLToPath FAILED for: ${resolvedUrl.href}`, e);
277
- logger?.error(`Could not convert file URL to path: ${resolvedUrl.href}. Error: ${e.message}`);
278
- return null;
279
- }
271
+ async function fetchAsset(
272
+ resolvedUrl: URL,
273
+ logger?: Logger,
274
+ timeout: number = 10000
275
+ ): Promise<Buffer | null> {
276
+ // console.log(`[DEBUG fetchAsset] Attempting fetch for URL: ${resolvedUrl.href}`); // Keep debug log commented unless needed
277
+ logger?.debug(`Attempting to fetch asset: ${resolvedUrl.href}`);
278
+ const protocol = resolvedUrl.protocol;
279
+
280
+ try {
281
+ // Handle HTTP and HTTPS protocols
282
+ if (protocol === 'http:' || protocol === 'https:') {
283
+ // Use axios to fetch remote content as an ArrayBuffer
284
+ const response: AxiosResponse<ArrayBuffer> = await axiosNs.default.get(resolvedUrl.href, {
285
+ responseType: 'arraybuffer', // Fetch as binary data
286
+ timeout: timeout, // Apply network timeout
287
+ });
288
+ logger?.debug(
289
+ `Workspaceed remote asset ${resolvedUrl.href} (Status: ${response.status}, Type: ${response.headers['content-type'] || 'N/A'}, Size: ${response.data?.byteLength ?? 0} bytes)`
290
+ );
291
+ // console.log(`[DEBUG fetchAsset] HTTP fetch SUCCESS for: ${resolvedUrl.href}, Status: ${response.status}`); // Keep debug log commented unless needed
292
+ // Return the fetched data as a Node.js Buffer
293
+ return Buffer.from(response.data);
294
+ }
295
+ // Handle file protocol
296
+ else if (protocol === 'file:') {
297
+ let filePath: string;
298
+ try {
299
+ // Convert file URL to a system file path
300
+ // IMPORTANT: This strips query params and fragments from the URL
301
+ filePath = fileURLToPath(resolvedUrl);
302
+ } catch (e: any) {
303
+ // console.error(`[DEBUG fetchAsset] fileURLToPath FAILED for: ${resolvedUrl.href}`, e); // Keep debug log commented unless needed
304
+ logger?.error(
305
+ `Could not convert file URL to path: ${resolvedUrl.href}. Error: ${e.message}`
306
+ );
307
+ return null; // Return null if conversion fails
308
+ }
280
309
 
281
- const normalizedForLog = path.normalize(filePath);
282
- // [DEBUG LOG] Added for diagnostics
283
- console.log(`[DEBUG fetchAsset] Attempting readFile with path: "${normalizedForLog}" (Original from URL: "${filePath}")`);
310
+ const normalizedForLog = path.normalize(filePath);
311
+ // console.log(`[DEBUG fetchAsset] Attempting readFile with path: "${normalizedForLog}" (Original from URL: "${filePath}")`); // Keep debug log commented unless needed
284
312
 
285
- // Read file using fs/promises
286
- const data = await readFile(filePath); // This call uses the mock in tests
313
+ // Read file content using fs/promises
314
+ const data = await readFile(filePath); // This call uses the mock in tests
287
315
 
288
- // [DEBUG LOG] Added for diagnostics
289
- console.log(`[DEBUG fetchAsset] readFile call SUCCEEDED for path: "${normalizedForLog}". Data length: ${data?.byteLength}`);
290
- logger?.debug(`Read local file ${filePath} (${data.byteLength} bytes)`);
291
- return data;
292
- } else {
293
- // [DEBUG LOG] Added for diagnostics
294
- console.log(`[DEBUG fetchAsset] Unsupported protocol: ${protocol}`);
295
- logger?.warn(`Unsupported protocol "${protocol}" in URL: ${resolvedUrl.href}`);
296
- return null;
297
- }
298
- } catch (error: unknown) {
299
- // [DEBUG LOG] Added for diagnostics
300
- const failedId = protocol === 'file:' ? path.normalize(fileURLToPath(resolvedUrl)) : resolvedUrl.href;
301
- console.error(`[DEBUG fetchAsset] fetch/read FAILED for: "${failedId}". Error:`, error);
302
-
303
- // --- Handle Errors Based on Protocol/Context ---
304
- // Use the imported namespace directly for isAxiosError check
305
- if ((protocol === 'http:' || protocol === 'https:') && axiosNs.isAxiosError(error)) {
306
- const status = error.response?.status ?? 'N/A';
307
- const statusText = error.response?.statusText ?? 'Error';
308
- const code = error.code ?? 'N/A';
309
- const message = error.message;
310
- // Format consistent with test expectations
311
- const logMessage = `⚠️ Failed to fetch remote asset ${resolvedUrl.href}: Status ${status} - ${statusText}. Code: ${code}, Message: ${message}`;
312
- logger?.warn(logMessage);
313
- }
314
- // Check for specific FS errors (only relevant if protocol was file:)
315
- if (error instanceof Error && (error as { code?: string }).code === 'ENOENT') {
316
- let failedPath = resolvedUrl.href; // Fallback path for logging if conversion fails
317
- try { failedPath = fileURLToPath(resolvedUrl); } catch { /* ignore */ }
318
- failedPath = path.normalize(failedPath); // Normalize for consistent logging
319
-
320
- if (error instanceof Error && (error as NodeJSErrnoException).code === 'ENOENT') {
321
- logger?.warn(`⚠️ File not found (ENOENT) for asset: ${failedPath}.`);
322
- } else if (error instanceof Error && (error as NodeJSErrnoException).code === 'EACCES') {
323
- // Log EACCES specifically for tests to catch if needed
324
- logger?.warn(`⚠️ Permission denied (EACCES) reading asset: ${failedPath}.`);
325
- // Also log the more generic message that the test currently expects
326
- logger?.warn(`⚠️ Failed to read local asset ${failedPath}: ${error.message}`);
327
- } else if (error instanceof Error) {
328
- logger?.warn(`⚠️ Failed to read local asset ${failedPath}: ${error.message}`);
329
- } else {
330
- logger?.warn(`⚠️ An unknown error occurred while reading local asset ${failedPath}: ${String(error)}`);
331
- }
332
- }
333
- // Generic fallback for truly unexpected errors during fetch/read
334
- else if (error instanceof Error) {
335
- logger?.warn(`⚠️ An unexpected error occurred processing asset ${resolvedUrl.href}: ${error.message}`);
336
- } else {
337
- logger?.warn(`⚠️ An unknown and unexpected error occurred processing asset ${resolvedUrl.href}: ${String(error)}`);
338
- }
339
- return null; // Return null on ANY fetch/read error caught here
316
+ // console.log(`[DEBUG fetchAsset] readFile call SUCCEEDED for path: "${normalizedForLog}". Data length: ${data?.byteLength}`); // Keep debug log commented unless needed
317
+ logger?.debug(`Read local file ${filePath} (${data.byteLength} bytes)`);
318
+ // Return the file content as a Buffer
319
+ return data;
320
+ }
321
+ // Handle unsupported protocols
322
+ else {
323
+ // console.log(`[DEBUG fetchAsset] Unsupported protocol: ${protocol}`); // Keep debug log commented unless needed
324
+ logger?.warn(`Unsupported protocol "${protocol}" in URL: ${resolvedUrl.href}`);
325
+ return null;
326
+ }
327
+ } catch (error: unknown) {
328
+ // --- Handle Errors During Fetch/Read ---
329
+ const failedId =
330
+ protocol === 'file:' ? path.normalize(fileURLToPath(resolvedUrl)) : resolvedUrl.href;
331
+ if ((protocol === 'http:' || protocol === 'https:') && (error as any)?.isAxiosError === true) {
332
+ const axiosError = error as AxiosError; // Cast for easier property access
333
+ const status = axiosError.response?.status ?? 'N/A';
334
+ const code = axiosError.code ?? 'N/A'; // e.g., ECONNABORTED for timeout
335
+ // Use the specific log format
336
+ const logMessage = `⚠️ Failed to fetch remote asset ${resolvedUrl.href}: ${axiosError.message} (Code: ${code})`;
337
+ logger?.warn(logMessage);
338
+ }
339
+ // Check for file system errors *next*
340
+ else if (protocol === 'file:' && error instanceof Error) {
341
+ let failedPath = resolvedUrl.href;
342
+ try {
343
+ failedPath = fileURLToPath(resolvedUrl);
344
+ } catch {
345
+ /* ignore */
346
+ }
347
+ failedPath = path.normalize(failedPath);
348
+
349
+ if ((error as NodeJSErrnoException).code === 'ENOENT') {
350
+ logger?.warn(`⚠️ File not found (ENOENT) for asset: ${failedPath}.`);
351
+ } else if ((error as NodeJSErrnoException).code === 'EACCES') {
352
+ // Log ONLY the specific EACCES message
353
+ logger?.warn(`⚠️ Permission denied (EACCES) reading asset: ${failedPath}.`);
354
+ } else {
355
+ logger?.warn(`⚠️ Failed to read local asset ${failedPath}: ${error.message}`);
356
+ }
357
+ }
358
+ // Generic fallback for *other* types of Errors (that are not Axios or known FS errors)
359
+ else if (error instanceof Error) {
360
+ logger?.warn(
361
+ `⚠️ An unexpected error occurred processing asset ${resolvedUrl.href}: ${error.message}`
362
+ );
340
363
  }
364
+ // Fallback for non-Error throws (e.g., strings, numbers)
365
+ else {
366
+ logger?.warn(
367
+ `⚠️ An unknown and unexpected error occurred processing asset ${resolvedUrl.href}: ${String(error)}`
368
+ );
369
+ }
370
+ // Return null on ANY error
371
+ return null;
372
+ }
341
373
  }
342
374
 
343
375
  /**
@@ -349,55 +381,65 @@ async function fetchAsset(resolvedUrl: URL, logger?: Logger, timeout: number = 1
349
381
  * @returns {Asset[]} An array of newly discovered Asset objects (type, resolved URL, content initially undefined).
350
382
  */
351
383
  function extractUrlsFromCSS(
352
- cssContent: string,
353
- cssBaseContextUrl: string,
354
- logger?: Logger
384
+ cssContent: string,
385
+ cssBaseContextUrl: string,
386
+ logger?: Logger
355
387
  ): Asset[] {
356
- const newlyDiscovered: Asset[] = [];
357
- // Track URLs processed within this specific CSS file to avoid adding duplicates from the same file
358
- const processedInThisParse = new Set<string>();
359
-
360
- // Regex for url(...) patterns, handling optional quotes
361
- const urlRegex = /url\(\s*(['"]?)(.*?)\1\s*\)/gi;
362
- // Regex for @import rules, handling url() or bare string, optional quotes
363
- const importRegex = /@import\s+(?:url\(\s*(['"]?)(.*?)\1\s*\)|(['"])(.*?)\3)\s*;/gi;
364
-
365
- /** Internal helper to process a found URL string */
366
- const processFoundUrl = (rawUrl: string | undefined, ruleType: '@import' | 'url()') => {
367
- if (!rawUrl || rawUrl.trim() === '' || rawUrl.startsWith('data:')) return;
368
-
369
- const resolvedUrl = resolveCssRelativeUrl(rawUrl, cssBaseContextUrl, logger);
370
-
371
- // If successfully resolved and not already found in *this* CSS file
372
- if (resolvedUrl && !processedInThisParse.has(resolvedUrl)) {
373
- processedInThisParse.add(resolvedUrl);
374
- const { assetType } = guessMimeType(resolvedUrl); // Guess type based on resolved URL
375
-
376
- // Add to the list of assets discovered in this pass
377
- newlyDiscovered.push({
378
- type: assetType,
379
- url: resolvedUrl, // The resolved absolute URL string
380
- content: undefined // Content will be fetched later if needed
381
- });
382
- logger?.debug(`Discovered nested ${assetType} asset (${ruleType}) in CSS ${cssBaseContextUrl}: ${resolvedUrl}`);
383
- }
384
- };
385
-
386
- // Execute regex for url(...)
387
- let match;
388
- while ((match = urlRegex.exec(cssContent)) !== null) {
389
- processFoundUrl(match[2], 'url()'); // Group 2 captures the URL part
388
+ // Array to hold assets discovered within this CSS content
389
+ const newlyDiscovered: Asset[] = [];
390
+ // Set to track URLs processed within this specific CSS file to avoid adding duplicates from the same file
391
+ const processedInThisParse = new Set<string>();
392
+
393
+ // Regex for url(...) patterns, handling optional quotes (non-greedy match for URL)
394
+ const urlRegex = /url\(\s*(['"]?)(.*?)\1\s*\)/gi;
395
+ // Regex for @import rules, handling url() or bare string, optional quotes (non-greedy match for URL)
396
+ const importRegex = /@import\s+(?:url\(\s*(['"]?)(.*?)\1\s*\)|(['"])(.*?)\3)\s*;/gi;
397
+
398
+ /** Internal helper to process a found URL string */
399
+ const processFoundUrl = (rawUrl: string | undefined, ruleType: '@import' | 'url()') => {
400
+ // Skip if URL is empty, undefined, a data URI, or only a fragment
401
+ if (!rawUrl || rawUrl.trim() === '' || rawUrl.startsWith('data:') || rawUrl.startsWith('#'))
402
+ return;
403
+
404
+ // Resolve the potentially relative URL against the CSS file's base URL
405
+ const resolvedUrl = resolveCssRelativeUrl(rawUrl, cssBaseContextUrl, logger);
406
+
407
+ // If successfully resolved and not already found *in this specific CSS file*
408
+ if (resolvedUrl && !processedInThisParse.has(resolvedUrl)) {
409
+ // Mark this resolved URL as processed for this CSS file
410
+ processedInThisParse.add(resolvedUrl);
411
+ // Guess the asset type (css, image, font, etc.) based on the resolved URL
412
+ const { assetType } = guessMimeType(resolvedUrl);
413
+
414
+ // Add the discovered asset to the list for this CSS file
415
+ newlyDiscovered.push({
416
+ type: assetType,
417
+ url: resolvedUrl, // Store the resolved absolute URL string
418
+ content: undefined, // Content will be fetched later if needed
419
+ });
420
+ logger?.debug(
421
+ `Discovered nested ${assetType} asset (${ruleType}) in CSS ${cssBaseContextUrl}: ${resolvedUrl}`
422
+ );
390
423
  }
391
-
392
- // Execute regex for @import
393
- // Reset lastIndex as we're using the same regex instance implicitly if defined outside loop
394
- importRegex.lastIndex = 0; // Explicitly reset
395
- while ((match = importRegex.exec(cssContent)) !== null) {
396
- // Group 2 captures url('...'), Group 4 captures bare "..."
397
- processFoundUrl(match[2] || match[4], '@import');
398
- }
399
-
400
- return newlyDiscovered;
424
+ };
425
+
426
+ // Find all url(...) matches in the CSS content
427
+ let match;
428
+ while ((match = urlRegex.exec(cssContent)) !== null) {
429
+ // Group 2 captures the URL part inside url()
430
+ processFoundUrl(match[2], 'url()');
431
+ }
432
+
433
+ // Find all @import matches in the CSS content
434
+ // Reset lastIndex as we're reusing the regex object implicitly
435
+ importRegex.lastIndex = 0;
436
+ while ((match = importRegex.exec(cssContent)) !== null) {
437
+ // Group 2 captures url('...'), Group 4 captures bare "..."
438
+ processFoundUrl(match[2] || match[4], '@import');
439
+ }
440
+
441
+ // Return the list of assets discovered within this CSS content
442
+ return newlyDiscovered;
401
443
  }
402
444
 
403
445
  /**
@@ -415,244 +457,321 @@ function extractUrlsFromCSS(
415
457
  * @returns {Promise<ParsedHTML>} Processed data with `htmlContent` and the final `assets` array containing all discovered assets (with content if `embedAssets` was true and fetch succeeded).
416
458
  */
417
459
  export async function extractAssets(
418
- parsed: ParsedHTML,
419
- embedAssets = true,
420
- inputPathOrUrl?: string,
421
- logger?: Logger
460
+ parsed: ParsedHTML,
461
+ embedAssets = true,
462
+ inputPathOrUrl?: string,
463
+ logger?: Logger
422
464
  ): Promise<ParsedHTML> {
423
- logger?.info(`🚀 Starting asset extraction! Embed: ${embedAssets}. Input: ${inputPathOrUrl || '(HTML content only)'}`);
424
-
425
- const initialAssets: Asset[] = parsed.assets || [];
426
- // Stores the final result: Map<resolved URL string, Asset object>
427
- const finalAssetsMap = new Map<string, Asset>();
428
- // Queue holds assets to be processed: { url: string (resolved), type: ..., content?: ... }
429
- let assetsToProcess: Asset[] = [];
430
- // Set to track URLs that are already processed (in finalAssetsMap) OR currently in the queue (assetsToProcess)
431
- const processedOrQueuedUrls = new Set<string>();
432
-
433
- // --- Determine Base URL Context ---
434
- const htmlBaseContextUrl = determineBaseUrl(inputPathOrUrl || '', logger);
435
- if (!htmlBaseContextUrl && initialAssets.some(a => !/^[a-z]+:/i.test(a.url) && !a.url.startsWith('data:') && !a.url.startsWith('#') && !a.url.startsWith('/'))) {
436
- logger?.warn("🚨 No valid base path/URL determined for the HTML source! Resolution of relative asset paths from HTML may fail.");
437
- } else if (htmlBaseContextUrl) {
438
- logger?.debug(`Using HTML base context URL: ${htmlBaseContextUrl}`);
465
+ logger?.info(
466
+ `🚀 Starting asset extraction! Embed: ${embedAssets}. Input: ${inputPathOrUrl || '(HTML content only)'}`
467
+ );
468
+
469
+ // Get the initial list of assets found directly in the HTML
470
+ const initialAssets: Asset[] = parsed.assets || [];
471
+ // Stores the final result: Map<resolved URL string, Asset object> to ensure uniqueness
472
+ const finalAssetsMap = new Map<string, Asset>();
473
+ // Queue holds assets whose content needs to be processed (fetched/analyzed)
474
+ let assetsToProcess: Asset[] = [];
475
+ // Set to track URLs that are either already fully processed (in finalAssetsMap)
476
+ // OR currently in the processing queue (assetsToProcess) to prevent reprocessing/loops.
477
+ const processedOrQueuedUrls = new Set<string>();
478
+
479
+ // --- Determine Base URL Context for the HTML ---
480
+ const htmlBaseContextUrl = determineBaseUrl(inputPathOrUrl || '', logger);
481
+ // Warn if no base URL could be found and there are relative paths in the initial assets
482
+ if (
483
+ !htmlBaseContextUrl &&
484
+ initialAssets.some(
485
+ a =>
486
+ !/^[a-z]+:/i.test(a.url) &&
487
+ !a.url.startsWith('data:') &&
488
+ !a.url.startsWith('#') &&
489
+ !a.url.startsWith('/')
490
+ )
491
+ ) {
492
+ logger?.warn(
493
+ '🚨 No valid base path/URL determined for the HTML source! Resolution of relative asset paths from HTML may fail.'
494
+ );
495
+ } else if (htmlBaseContextUrl) {
496
+ logger?.debug(`Using HTML base context URL: ${htmlBaseContextUrl}`);
497
+ }
498
+
499
+ // --- Initial Queue Population from HTML assets ---
500
+ logger?.debug(`Queueing ${initialAssets.length} initial assets parsed from HTML...`);
501
+ for (const asset of initialAssets) {
502
+ // Resolve the initial asset URL against the HTML base context
503
+ const resolvedUrlObj = resolveAssetUrl(asset.url, htmlBaseContextUrl, logger);
504
+
505
+ // Skip if URL is invalid, data URI, fragment, or unsupported protocol
506
+ if (!resolvedUrlObj) {
507
+ logger?.debug(` -> Skipping initial asset with unresolvable/ignorable URL: ${asset.url}`);
508
+ continue;
439
509
  }
440
-
441
- // --- Initial Queue Population ---
442
- logger?.debug(`Queueing ${initialAssets.length} initial assets parsed from HTML...`);
443
- for (const asset of initialAssets) {
444
- // Resolve the initial asset URL against the HTML base context
445
- const resolvedUrlObj = resolveAssetUrl(asset.url, htmlBaseContextUrl, logger);
446
- if (!resolvedUrlObj) {
447
- logger?.debug(` -> Skipping initial asset with unresolvable/ignorable URL: ${asset.url}`);
448
- continue; // Skip if URL is invalid or data URI etc.
449
- }
450
- const urlToQueue = resolvedUrlObj.href; // Use the resolved absolute URL string
451
-
452
- // Skip data URIs and check if this URL is already tracked
453
- if (!urlToQueue.startsWith('data:') && !processedOrQueuedUrls.has(urlToQueue)) {
454
- processedOrQueuedUrls.add(urlToQueue); // Mark as queued
455
-
456
- // Guess type from the resolved/original URL if not provided initially
457
- const { assetType: guessedType } = guessMimeType(urlToQueue);
458
- const initialType = asset.type ?? guessedType;
459
-
460
- // Add to the processing queue
461
- assetsToProcess.push({
462
- url: urlToQueue, // Use the resolved URL
463
- type: initialType,
464
- content: undefined
465
- });
466
- logger?.debug(` -> Queued initial asset: ${urlToQueue} (Original raw: ${asset.url})`);
467
- } else if (urlToQueue.startsWith('data:')) {
468
- logger?.debug(` -> Skipping data URI: ${urlToQueue.substring(0, 50)}...`);
469
- } else {
470
- logger?.debug(` -> Skipping already processed/queued initial asset: ${urlToQueue}`);
510
+ // Get the resolved absolute URL string
511
+ const urlToQueue = resolvedUrlObj.href;
512
+
513
+ // Check if this URL is already tracked (processed or queued)
514
+ if (!processedOrQueuedUrls.has(urlToQueue)) {
515
+ // Mark as queued (add to set *before* adding to array)
516
+ processedOrQueuedUrls.add(urlToQueue);
517
+
518
+ // Guess type from the resolved/original URL if not provided initially
519
+ const { assetType: guessedType } = guessMimeType(urlToQueue);
520
+ const initialType = asset.type ?? guessedType; // Use provided type or fallback to guessed type
521
+
522
+ // Add the resolved asset to the processing queue
523
+ assetsToProcess.push({
524
+ url: urlToQueue, // Use the resolved URL
525
+ type: initialType,
526
+ content: undefined, // Content is initially undefined
527
+ });
528
+ logger?.debug(` -> Queued initial asset: ${urlToQueue} (Original raw: ${asset.url})`);
529
+ } else {
530
+ logger?.debug(` -> Skipping already processed/queued initial asset: ${urlToQueue}`);
531
+ }
532
+ }
533
+
534
+ // --- Main processing loop (continues as long as there are assets to process) ---
535
+ let iterationCount = 0;
536
+ while (assetsToProcess.length > 0) {
537
+ iterationCount++;
538
+ // Prevent potential infinite loops
539
+ if (iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS) {
540
+ logger?.error(
541
+ `🛑 Asset extraction loop limit hit (${MAX_ASSET_EXTRACTION_ITERATIONS})! Aborting.`
542
+ );
543
+ const remainingUrls = assetsToProcess
544
+ .map(a => a.url)
545
+ .slice(0, 10)
546
+ .join(', ');
547
+ logger?.error(
548
+ `Remaining queue sample (${assetsToProcess.length} items): ${remainingUrls}...`
549
+ );
550
+ // Add assets remaining in queue to final map without content before breaking
551
+ assetsToProcess.forEach(asset => {
552
+ if (!finalAssetsMap.has(asset.url)) {
553
+ finalAssetsMap.set(asset.url, { ...asset, content: undefined });
471
554
  }
555
+ });
556
+ assetsToProcess = []; // Clear queue to stop the loop
557
+ break; // Exit loop
472
558
  }
473
559
 
474
- // --- Main processing loop ---
475
- let iterationCount = 0;
476
- while (assetsToProcess.length > 0) {
477
- iterationCount++;
478
- if (iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS) {
479
- logger?.error(`🛑 Asset extraction loop limit hit (${MAX_ASSET_EXTRACTION_ITERATIONS})! Aborting.`);
480
- const remainingUrls = assetsToProcess.map(a => a.url).slice(0, 10).join(', ');
481
- logger?.error(`Remaining queue sample (${assetsToProcess.length} items): ${remainingUrls}...`);
482
- // Add assets remaining in queue to final map without content before breaking
483
- assetsToProcess.forEach(asset => {
484
- if (!finalAssetsMap.has(asset.url)) {
485
- finalAssetsMap.set(asset.url, { ...asset, content: undefined });
486
- }
487
- });
488
- assetsToProcess = []; // Clear queue
489
- break; // Exit loop
490
- }
560
+ // Take a snapshot of the current queue to process in this iteration
561
+ const currentBatch = [...assetsToProcess];
562
+ // Clear the main queue; new assets found in this batch will be added here for the *next* iteration
563
+ assetsToProcess = [];
491
564
 
492
- // Process assets in batches for clarity in logs
493
- const currentBatch = [...assetsToProcess];
494
- assetsToProcess = []; // Clear queue for the next batch discovered in this iteration
565
+ logger?.debug(`--- Processing batch ${iterationCount}: ${currentBatch.length} asset(s) ---`);
495
566
 
496
- logger?.debug(`--- Processing batch ${iterationCount}: ${currentBatch.length} asset(s) ---`);
567
+ // Process each asset in the current batch
568
+ for (const asset of currentBatch) {
569
+ // Double-check: Skip if this asset somehow got fully processed in a previous iteration (shouldn't happen with current logic, but safe check)
570
+ if (finalAssetsMap.has(asset.url)) {
571
+ logger?.debug(`Skipping asset already in final map: ${asset.url}`);
572
+ continue;
573
+ }
497
574
 
498
- for (const asset of currentBatch) {
499
- // Skip if already fully processed (e.g., added in a previous batch)
500
- if (finalAssetsMap.has(asset.url)) {
501
- logger?.debug(`Skipping asset already in final map: ${asset.url}`);
502
- continue;
503
- }
575
+ let assetContentBuffer: Buffer | null = null; // To store fetched binary content
576
+ let finalContent: string | undefined = undefined; // Final content (text or data URI) for the Asset object
577
+ let cssContentForParsing: string | undefined = undefined; // Text content specifically for parsing CSS
504
578
 
505
- let assetContentBuffer: Buffer | null = null;
506
- let finalContent: string | undefined = undefined; // For embedding
507
- let cssContentForParsing: string | undefined = undefined; // For CSS parsing
508
-
509
- // --- Determine if fetching is needed ---
510
- // Fetch if embedding everything OR if it's CSS (need content for parsing)
511
- const needsFetching = embedAssets || asset.type === 'css';
512
- let assetUrlObj: URL | null = null; // URL object needed for fetchAsset
513
-
514
- if (needsFetching) {
515
- // --- Create URL object for fetching ---
516
- try {
517
- assetUrlObj = new URL(asset.url); // Asset URL should be absolute here
518
- } catch (urlError) {
519
- logger?.warn(`Cannot create URL object for "${asset.url}", skipping fetch. Error: ${urlError instanceof Error ? urlError.message : String(urlError)}`);
520
- finalAssetsMap.set(asset.url, { ...asset, content: undefined }); // Store asset without content
521
- continue; // Skip to next asset in batch
522
- }
523
-
524
- // --- Fetch Asset ---
525
- if (assetUrlObj) {
526
- assetContentBuffer = await fetchAsset(assetUrlObj, logger);
527
- // fetchAsset returns null on failure
528
- }
529
- } // End if(needsFetching)
530
-
531
- // --- If fetching was needed but failed, store asset without content and skip ---
532
- if (needsFetching && assetContentBuffer === null) {
533
- logger?.debug(`Storing asset ${asset.url} without content due to fetch failure.`);
534
- finalAssetsMap.set(asset.url, { ...asset, content: undefined });
535
- continue; // Skip to next asset in batch
536
- }
579
+ // --- Determine if fetching is needed ---
580
+ // Fetch if we need to embed all assets OR if it's CSS (we need content to parse for nested assets)
581
+ const needsFetching = embedAssets || asset.type === 'css';
582
+ let assetUrlObj: URL | null = null; // URL object needed for fetchAsset
583
+
584
+ if (needsFetching) {
585
+ // --- Create URL object for fetching ---
586
+ try {
587
+ // Asset URL should be absolute at this point
588
+ assetUrlObj = new URL(asset.url);
589
+ } catch (urlError) {
590
+ // Log error if creating URL object fails
591
+ logger?.warn(
592
+ `Cannot create URL object for "${asset.url}", skipping fetch. Error: ${urlError instanceof Error ? urlError.message : String(urlError)}`
593
+ );
594
+ // Store asset without content in the final map
595
+ finalAssetsMap.set(asset.url, { ...asset, content: undefined });
596
+ // Skip to next asset in the current batch
597
+ continue;
598
+ }
537
599
 
538
- // --- Prepare Content for Storing/Embedding (if fetched successfully) ---
539
- if (assetContentBuffer) { // Only proceed if content was fetched
540
- const mimeInfo = guessMimeType(asset.url); // Guess MIME based on URL extension
541
- const effectiveMime = mimeInfo.mime || 'application/octet-stream'; // Fallback MIME
542
-
543
- // Try to decode TEXT types as UTF-8
544
- if (TEXT_ASSET_TYPES.has(asset.type)) {
545
- let textContent: string | undefined;
546
- let wasLossy = false;
547
- try {
548
- textContent = assetContentBuffer.toString('utf-8');
549
- wasLossy = isUtf8DecodingLossy(assetContentBuffer, textContent);
550
- } catch (e) { textContent = undefined; wasLossy = true; }
551
-
552
- if (!wasLossy && textContent !== undefined) {
553
- // If embedding, store the text content
554
- if (embedAssets) {
555
- finalContent = textContent;
556
- } else {
557
- finalContent = undefined; // Not embedding text
558
- }
559
- // If it's CSS, store its text content for parsing regardless of embedding
560
- if (asset.type === 'css') {
561
- cssContentForParsing = textContent;
562
- }
563
- } else {
564
- // Decoding failed or was lossy
565
- logger?.warn(`Could not decode ${asset.type} asset ${asset.url} as valid UTF-8 text.${embedAssets ? ' Falling back to base64 data URI.' : ''}`);
566
- cssContentForParsing = undefined; // Cannot parse if decoding failed
567
- // Embed as base64 data URI if requested
568
- if (embedAssets) {
569
- finalContent = `data:${effectiveMime};base64,${assetContentBuffer.toString('base64')}`;
570
- } else {
571
- finalContent = undefined;
572
- }
573
- }
574
- }
575
- // Embed BINARY types as base64 data URI if requested
576
- else if (BINARY_ASSET_TYPES.has(asset.type)) {
577
- if (embedAssets) {
578
- finalContent = `data:${effectiveMime};base64,${assetContentBuffer.toString('base64')}`;
579
- } else {
580
- finalContent = undefined; // Not embedding
581
- }
582
- cssContentForParsing = undefined; // Not CSS
583
- }
584
- // Handle 'other' types: attempt text decode, fallback to base64 if embedding
585
- else { // asset.type === 'other' or unknown
586
- cssContentForParsing = undefined; // Not CSS
587
- if (embedAssets) {
588
- try {
589
- const attemptedTextContent = assetContentBuffer.toString('utf-8');
590
- if (isUtf8DecodingLossy(assetContentBuffer, attemptedTextContent)) {
591
- logger?.warn(`Couldn't embed unclassified asset ${asset.url} as text due to invalid UTF-8 sequences. Falling back to base64 (octet-stream).`);
592
- finalContent = `data:application/octet-stream;base64,${assetContentBuffer.toString('base64')}`;
593
- } else {
594
- finalContent = attemptedTextContent;
595
- logger?.debug(`Successfully embedded unclassified asset ${asset.url} as text.`);
596
- }
597
- } catch (decodeError) {
598
- logger?.warn(`Error during text decoding for unclassified asset ${asset.url}: ${decodeError instanceof Error ? decodeError.message : String(decodeError)}. Falling back to base64.`);
599
- finalContent = `data:application/octet-stream;base64,${assetContentBuffer.toString('base64')}`;
600
- }
601
- } else {
602
- finalContent = undefined; // Not embedding
603
- }
604
- }
605
- } else { // Content was not fetched (e.g., embedAssets=false and not CSS)
606
- finalContent = undefined;
607
- cssContentForParsing = undefined;
600
+ // --- Fetch Asset ---
601
+ if (assetUrlObj) {
602
+ // Call fetchAsset (which handles http/https/file and errors)
603
+ assetContentBuffer = await fetchAsset(assetUrlObj, logger);
604
+ // fetchAsset returns null on failure
605
+ }
606
+ } // End if(needsFetching)
607
+
608
+ // --- If fetching was required but failed, store asset without content and continue ---
609
+ if (needsFetching && assetContentBuffer === null) {
610
+ logger?.debug(`Storing asset ${asset.url} without content due to fetch failure.`);
611
+ // Add to final map with undefined content
612
+ finalAssetsMap.set(asset.url, { ...asset, content: undefined });
613
+ // Skip to the next asset in the current batch
614
+ continue;
615
+ }
616
+
617
+ // --- Prepare Content for Storing/Embedding (if fetched successfully) ---
618
+ if (assetContentBuffer) {
619
+ // Only proceed if content was fetched
620
+ // Guess MIME type based on the asset's URL extension
621
+ const mimeInfo = guessMimeType(asset.url);
622
+ // Use the guessed MIME type or fallback to a generic binary type
623
+ const effectiveMime = mimeInfo.mime || 'application/octet-stream';
624
+
625
+ // Handle TEXT types (CSS, JS)
626
+ if (TEXT_ASSET_TYPES.has(asset.type)) {
627
+ let textContent: string | undefined;
628
+ let wasLossy = false;
629
+ try {
630
+ // Try decoding the buffer as UTF-8
631
+ textContent = assetContentBuffer.toString('utf-8');
632
+ // Check if the decoding process lost information (e.g., invalid sequences replaced)
633
+ wasLossy = isUtf8DecodingLossy(assetContentBuffer, textContent);
634
+ } catch (e) {
635
+ // Decoding itself failed
636
+ textContent = undefined;
637
+ wasLossy = true;
638
+ }
639
+
640
+ // If decoding was successful and not lossy
641
+ if (!wasLossy && textContent !== undefined) {
642
+ // If embedding, store the text content
643
+ if (embedAssets) {
644
+ finalContent = textContent;
645
+ } else {
646
+ finalContent = undefined; // Not embedding text, store undefined
647
+ }
648
+ // If it's CSS, store its text content for parsing regardless of embedding option
649
+ if (asset.type === 'css') {
650
+ cssContentForParsing = textContent;
608
651
  }
652
+ } else {
653
+ // Decoding failed or was lossy
654
+ logger?.warn(
655
+ `Could not decode ${asset.type} asset ${asset.url} as valid UTF-8 text.${embedAssets ? ' Falling back to base64 data URI.' : ''}`
656
+ );
657
+ cssContentForParsing = undefined; // Cannot parse CSS if decoding failed
658
+ // Embed as base64 data URI if requested, using the effective MIME type
659
+ if (embedAssets) {
660
+ finalContent = `data:${effectiveMime};base64,${assetContentBuffer.toString('base64')}`;
661
+ } else {
662
+ finalContent = undefined; // Not embedding
663
+ }
664
+ }
665
+ }
666
+ // Handle BINARY types (image, font, video, audio)
667
+ else if (BINARY_ASSET_TYPES.has(asset.type)) {
668
+ // Embed as base64 data URI if requested
669
+ if (embedAssets) {
670
+ finalContent = `data:${effectiveMime};base64,${assetContentBuffer.toString('base64')}`;
671
+ } else {
672
+ finalContent = undefined; // Not embedding
673
+ }
674
+ cssContentForParsing = undefined; // Not CSS, so no parsing needed
675
+ }
676
+ // Handle 'other' or unknown types
677
+ else {
678
+ cssContentForParsing = undefined; // Assume not parseable as CSS
679
+ // If embedding, attempt to store as text, fallback to base64 if invalid UTF-8
680
+ if (embedAssets) {
681
+ try {
682
+ const attemptedTextContent = assetContentBuffer.toString('utf-8');
683
+ if (isUtf8DecodingLossy(assetContentBuffer, attemptedTextContent)) {
684
+ // If text decoding is lossy, warn and use base64
685
+ logger?.warn(
686
+ `Couldn't embed unclassified asset ${asset.url} as text due to invalid UTF-8 sequences. Falling back to base64 (octet-stream).`
687
+ );
688
+ finalContent = `data:application/octet-stream;base64,${assetContentBuffer.toString('base64')}`;
689
+ } else {
690
+ // Store as text if decoding worked
691
+ finalContent = attemptedTextContent;
692
+ logger?.debug(`Successfully embedded unclassified asset ${asset.url} as text.`);
693
+ }
694
+ } catch (decodeError) {
695
+ // If toString fails, warn and use base64
696
+ logger?.warn(
697
+ `Error during text decoding for unclassified asset ${asset.url}: ${decodeError instanceof Error ? decodeError.message : String(decodeError)}. Falling back to base64.`
698
+ );
699
+ finalContent = `data:application/octet-stream;base64,${assetContentBuffer.toString('base64')}`;
700
+ }
701
+ } else {
702
+ finalContent = undefined; // Not embedding
703
+ }
704
+ }
705
+ } else {
706
+ // Content was not fetched (e.g., embedAssets=false and not CSS)
707
+ finalContent = undefined;
708
+ cssContentForParsing = undefined;
709
+ }
710
+
711
+ // --- Store the final processed asset in the map ---
712
+ // Use the resolved URL as the key and ensure the asset object also uses the resolved URL
713
+ finalAssetsMap.set(asset.url, { ...asset, url: asset.url, content: finalContent });
714
+ // Note: URL was already added to processedOrQueuedUrls when initially queued or discovered in CSS
715
+
716
+ // --- Process CSS for nested assets ---
717
+ // Only if it's CSS and we successfully decoded its content for parsing
718
+ if (asset.type === 'css' && cssContentForParsing) {
719
+ // Determine the base URL *for this specific CSS file* to resolve its relative links
720
+ const cssBaseContextUrl = determineBaseUrl(asset.url, logger); // CSS URL is absolute here
721
+ logger?.debug(
722
+ `CSS base context for resolving nested assets within ${asset.url}: ${cssBaseContextUrl}`
723
+ );
609
724
 
610
- // --- Store the final asset ---
611
- // Use the resolved URL as the key and in the asset object itself
612
- finalAssetsMap.set(asset.url, { ...asset, url: asset.url, content: finalContent });
613
- // Note: URL was already added to processedOrQueuedUrls when initially queued or discovered
614
-
615
- // --- Process CSS for nested assets ---
616
- // Only if it's CSS and we successfully decoded its content for parsing
617
- if (asset.type === 'css' && cssContentForParsing) {
618
- // Determine the base URL *for this specific CSS file* to resolve its relative links
619
- const cssBaseContextUrl = determineBaseUrl(asset.url, logger); // CSS URL is absolute here
620
- logger?.debug(`CSS base context for resolving nested assets within ${asset.url}: ${cssBaseContextUrl}`);
621
-
622
- if (cssBaseContextUrl) {
623
- // Get the list of *potentially* new assets discovered in this CSS file's content
624
- const newlyDiscoveredAssets = extractUrlsFromCSS(
625
- cssContentForParsing,
626
- cssBaseContextUrl, // Use CSS file's base URL
627
- logger
628
- );
629
-
630
- if (newlyDiscoveredAssets.length > 0) {
631
- logger?.debug(`Discovered ${newlyDiscoveredAssets.length} nested assets in CSS ${asset.url}. Checking against queue...`);
632
- for (const newAsset of newlyDiscoveredAssets) {
633
- // CHECK: Add to queue only if this resolved URL hasn't been processed OR queued before.
634
- if (!processedOrQueuedUrls.has(newAsset.url)) {
635
- processedOrQueuedUrls.add(newAsset.url); // Mark as queued now
636
- assetsToProcess.push(newAsset); // Add to the main queue for the *next* iteration
637
- logger?.debug(` -> Queued new nested asset: ${newAsset.url}`);
638
- } else {
639
- logger?.debug(` -> Skipping already processed/queued nested asset: ${newAsset.url}`);
640
- }
641
- }
642
- }
643
- } else {
644
- logger?.warn(`Could not determine base URL context for CSS file ${asset.url}. Cannot resolve nested relative paths within it.`);
645
- }
646
- } // End if(asset.type === 'css' && cssContentForParsing)
647
- } // End for loop over currentBatch
648
- } // End while loop
649
-
650
- const finalIterationCount = iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS ? 'MAX+' : iterationCount;
651
- logger?.info(`✅ Asset extraction COMPLETE! Found ${finalAssetsMap.size} unique assets in ${finalIterationCount} iterations.`);
652
-
653
- // Return the original HTML content and the final list of processed assets
654
- return {
655
- htmlContent: parsed.htmlContent,
656
- assets: Array.from(finalAssetsMap.values())
657
- };
658
- }
725
+ if (cssBaseContextUrl) {
726
+ // Extract URLs found within this CSS content
727
+ const newlyDiscoveredAssets = extractUrlsFromCSS(
728
+ cssContentForParsing,
729
+ cssBaseContextUrl, // Use the CSS file's own URL as the base
730
+ logger
731
+ );
732
+
733
+ // If new assets were found in the CSS
734
+ if (newlyDiscoveredAssets.length > 0) {
735
+ logger?.debug(
736
+ `Discovered ${newlyDiscoveredAssets.length} nested assets in CSS ${asset.url}. Checking against queue...`
737
+ );
738
+ // Process each newly discovered asset
739
+ for (const newAsset of newlyDiscoveredAssets) {
740
+ // CHECK: Add to the main processing queue only if this resolved URL hasn't been processed OR queued before.
741
+ if (!processedOrQueuedUrls.has(newAsset.url)) {
742
+ processedOrQueuedUrls.add(newAsset.url); // Mark as queued now
743
+ assetsToProcess.push(newAsset); // Add to the queue for the *next* iteration
744
+ logger?.debug(` -> Queued new nested asset: ${newAsset.url}`);
745
+ } else {
746
+ // Skip if already handled
747
+ logger?.debug(
748
+ ` -> Skipping already processed/queued nested asset: ${newAsset.url}`
749
+ );
750
+ }
751
+ }
752
+ }
753
+ } else {
754
+ // Warn if the base URL for the CSS file couldn't be determined (shouldn't happen if asset.url was valid)
755
+ logger?.warn(
756
+ `Could not determine base URL context for CSS file ${asset.url}. Cannot resolve nested relative paths within it.`
757
+ );
758
+ }
759
+ } // End if(asset.type === 'css' && cssContentForParsing)
760
+ } // End for loop over currentBatch
761
+ } // End while loop (assetsToProcess.length > 0)
762
+
763
+ // Log completion summary
764
+ const finalIterationCount =
765
+ iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS
766
+ ? `${MAX_ASSET_EXTRACTION_ITERATIONS}+ (limit hit)`
767
+ : iterationCount;
768
+ logger?.info(
769
+ `✅ Asset extraction COMPLETE! Found ${finalAssetsMap.size} unique assets in ${finalIterationCount} iterations.`
770
+ );
771
+
772
+ // Return the original HTML content and the final list of processed assets from the map
773
+ return {
774
+ htmlContent: parsed.htmlContent,
775
+ assets: Array.from(finalAssetsMap.values()),
776
+ };
777
+ }