portapack 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. package/.eslintrc.json +9 -0
  2. package/.github/workflows/ci.yml +73 -0
  3. package/.github/workflows/deploy-pages.yml +56 -0
  4. package/.prettierrc +9 -0
  5. package/.releaserc.js +29 -0
  6. package/CHANGELOG.md +21 -0
  7. package/README.md +288 -0
  8. package/commitlint.config.js +36 -0
  9. package/dist/cli/cli-entry.js +1694 -0
  10. package/dist/cli/cli-entry.js.map +1 -0
  11. package/dist/index.d.ts +275 -0
  12. package/dist/index.js +1405 -0
  13. package/dist/index.js.map +1 -0
  14. package/docs/.vitepress/config.ts +89 -0
  15. package/docs/.vitepress/sidebar-generator.ts +73 -0
  16. package/docs/cli.md +117 -0
  17. package/docs/code-of-conduct.md +65 -0
  18. package/docs/configuration.md +151 -0
  19. package/docs/contributing.md +107 -0
  20. package/docs/demo.md +46 -0
  21. package/docs/deployment.md +132 -0
  22. package/docs/development.md +168 -0
  23. package/docs/getting-started.md +106 -0
  24. package/docs/index.md +40 -0
  25. package/docs/portapack-transparent.png +0 -0
  26. package/docs/portapack.jpg +0 -0
  27. package/docs/troubleshooting.md +107 -0
  28. package/examples/main.ts +118 -0
  29. package/examples/sample-project/index.html +12 -0
  30. package/examples/sample-project/logo.png +1 -0
  31. package/examples/sample-project/script.js +1 -0
  32. package/examples/sample-project/styles.css +1 -0
  33. package/jest.config.ts +124 -0
  34. package/jest.setup.cjs +211 -0
  35. package/nodemon.json +11 -0
  36. package/output.html +1 -0
  37. package/package.json +161 -0
  38. package/site-packed.html +1 -0
  39. package/src/cli/cli-entry.ts +28 -0
  40. package/src/cli/cli.ts +139 -0
  41. package/src/cli/options.ts +151 -0
  42. package/src/core/bundler.ts +201 -0
  43. package/src/core/extractor.ts +618 -0
  44. package/src/core/minifier.ts +233 -0
  45. package/src/core/packer.ts +191 -0
  46. package/src/core/parser.ts +115 -0
  47. package/src/core/web-fetcher.ts +292 -0
  48. package/src/index.ts +262 -0
  49. package/src/types.ts +163 -0
  50. package/src/utils/font.ts +41 -0
  51. package/src/utils/logger.ts +139 -0
  52. package/src/utils/meta.ts +100 -0
  53. package/src/utils/mime.ts +90 -0
  54. package/src/utils/slugify.ts +70 -0
  55. package/test-output.html +0 -0
  56. package/tests/__fixtures__/sample-project/index.html +5 -0
  57. package/tests/unit/cli/cli-entry.test.ts +104 -0
  58. package/tests/unit/cli/cli.test.ts +230 -0
  59. package/tests/unit/cli/options.test.ts +316 -0
  60. package/tests/unit/core/bundler.test.ts +287 -0
  61. package/tests/unit/core/extractor.test.ts +1129 -0
  62. package/tests/unit/core/minifier.test.ts +414 -0
  63. package/tests/unit/core/packer.test.ts +193 -0
  64. package/tests/unit/core/parser.test.ts +540 -0
  65. package/tests/unit/core/web-fetcher.test.ts +374 -0
  66. package/tests/unit/index.test.ts +339 -0
  67. package/tests/unit/utils/font.test.ts +81 -0
  68. package/tests/unit/utils/logger.test.ts +275 -0
  69. package/tests/unit/utils/meta.test.ts +70 -0
  70. package/tests/unit/utils/mime.test.ts +96 -0
  71. package/tests/unit/utils/slugify.test.ts +71 -0
  72. package/tsconfig.build.json +11 -0
  73. package/tsconfig.jest.json +17 -0
  74. package/tsconfig.json +20 -0
  75. package/tsup.config.ts +71 -0
  76. package/typedoc.json +28 -0
@@ -0,0 +1,618 @@
1
+ /**
2
+ * @file src/core/extractor.ts
3
+ * @description Handles discovery, resolution, fetching, and optional embedding of assets
4
+ * linked from HTML and recursively within CSS (@import, url()). This is the heart of finding EVERYTHING.
5
+ * @version 1.1.3 - Fixed CSS path resolution and handling of 'other' asset types.
6
+ */
7
+
8
// === Node.js Core Imports ===
import * as fs from 'fs'; // Required for statSync for sync directory check
import { readFile } from 'fs/promises';
import type { FileHandle } from 'fs/promises';
import path from 'path';
import { fileURLToPath, pathToFileURL, URL } from 'url'; // Crucial for file path/URL conversion

// === External Dependencies ===
import * as axios from 'axios'; // Using namespace import for clarity
import type { AxiosError, AxiosRequestConfig, AxiosResponse, InternalAxiosRequestConfig } from 'axios';

// === Project Imports ===
import type { Asset, ParsedHTML } from '../types';
import { guessMimeType } from '../utils/mime';
import { Logger } from '../utils/logger';
23
+
24
+ // === Constants ===
25
/** Asset kinds (from Asset['type']) that are decoded and handled as UTF-8 text. */
const TEXT_ASSET_TYPES = new Set<Asset['type']>(['css', 'js']);

/** Asset kinds (from Asset['type']) treated as binary and embedded as Base64 data URIs. */
const BINARY_ASSET_TYPES = new Set<Asset['type']>(['image', 'font', 'video', 'audio']);

/** Upper bound on discovery-loop passes; guards against endless import cycles. */
const MAX_ASSET_EXTRACTION_ITERATIONS = 1000;
31
+
32
+ // === Helper Functions ===
33
+
34
/**
 * Reports whether a UTF-8 decode of a buffer lost information.
 *
 * The decoded string is encoded back to UTF-8 and compared byte-for-byte with
 * the original; any difference (or any failure while comparing) counts as lossy.
 *
 * @param originalBuffer - The raw bytes that were decoded.
 * @param decodedString - The result of `originalBuffer.toString('utf-8')`.
 * @returns `true` when the round trip does not reproduce the original bytes.
 */
function isUtf8DecodingLossy(originalBuffer: Buffer, decodedString: string): boolean {
    try {
        const roundTripped = Buffer.from(decodedString, 'utf-8');
        const identical = originalBuffer.equals(roundTripped);
        return identical ? false : true;
    } catch {
        // Treat any failure to re-encode/compare as a lossy decode.
        return true;
    }
}
48
+
49
/**
 * Determines the absolute base directory URL (http://, https://, or file:///) ending in '/'.
 *
 * - For http(s) input the last path segment, query and fragment are dropped.
 * - For local paths and `file:` URLs the containing directory is used (or the path
 *   itself when it is an existing directory) and converted with `pathToFileURL`,
 *   so characters such as `#`, `?`, `%` and spaces in directory names are
 *   percent-encoded instead of being misread as URL syntax.
 * - Any other `scheme://` input is rejected.
 *
 * @param inputPathOrUrl - The original source HTML file path or a full HTTP/HTTPS URL.
 * @param logger - Optional logger instance.
 * @returns The absolute base URL string ending in '/', or undefined if determination fails.
 */
function determineBaseUrl(inputPathOrUrl: string, logger?: Logger): string | undefined {
    logger?.debug(`Determining base URL for input: ${inputPathOrUrl}`);
    if (!inputPathOrUrl) {
        logger?.warn('Cannot determine base URL: inputPathOrUrl is empty or invalid.');
        return undefined;
    }

    try {
        if (/^https?:\/\//i.test(inputPathOrUrl)) {
            const url = new URL(inputPathOrUrl);
            // Keep everything up to and including the last '/', i.e. the "directory" of the document.
            url.pathname = url.pathname.substring(0, url.pathname.lastIndexOf('/') + 1);
            url.search = '';
            url.hash = '';
            const baseUrl = url.href;
            logger?.debug(`Determined remote base URL: ${baseUrl}`);
            return baseUrl;
        }

        if (inputPathOrUrl.includes('://') && !inputPathOrUrl.startsWith('file:')) {
            logger?.warn(`Input "${inputPathOrUrl}" looks like a URL but uses an unsupported protocol. Cannot determine base URL.`);
            return undefined;
        }

        let absolutePath: string;
        if (inputPathOrUrl.startsWith('file:')) {
            try {
                absolutePath = fileURLToPath(inputPathOrUrl);
            } catch (e: unknown) {
                const reason = e instanceof Error ? e.message : String(e);
                logger?.error(`💀 Failed to convert file URL "${inputPathOrUrl}" to path: ${reason}`);
                return undefined;
            }
        } else {
            absolutePath = path.resolve(inputPathOrUrl);
        }

        // A path that cannot be stat'ed is assumed to name a file, so its parent directory is the base.
        let isDirectory = false;
        try {
            isDirectory = fs.statSync(absolutePath).isDirectory();
        } catch (statError: unknown) {
            if (statError instanceof Error && (statError as NodeJS.ErrnoException).code === 'ENOENT') {
                logger?.debug(`Path "${absolutePath}" not found. Assuming input represents a file, using its parent directory as base.`);
            } else {
                logger?.warn(`Could not stat local path "${absolutePath}" during base URL determination: ${statError instanceof Error ? statError.message : String(statError)}. Assuming input represents a file.`);
            }
            isDirectory = false;
        }

        const dirPath = isDirectory ? absolutePath : path.dirname(absolutePath);
        // pathToFileURL handles drive letters and escapes '#', '?', '%' etc. in the path.
        let fileUrlString = pathToFileURL(dirPath).href;
        if (!fileUrlString.endsWith('/')) {
            fileUrlString += '/';
        }
        logger?.debug(`Determined local base URL: ${fileUrlString} (from: ${inputPathOrUrl}, resolved dir: ${dirPath}, isDir: ${isDirectory})`);
        return fileUrlString;
    } catch (error: unknown) {
        const message = error instanceof Error ? error.message : String(error);
        logger?.error(`💀 Failed to determine base URL for "${inputPathOrUrl}": ${message}${error instanceof Error ? ` - Stack: ${error.stack}` : ''}`);
        return undefined;
    }
}
110
+
111
/**
 * Turns a raw asset reference into an absolute URL object.
 *
 * Blank values, `data:` URIs and fragment-only references yield `null`.
 * Protocol-relative references (`//host/path`) borrow the protocol of the
 * base context when one is supplied. Everything else is resolved with the
 * WHATWG `URL` constructor; failures are logged and reported as `null`.
 *
 * @param assetUrl - The raw URL string found in the source.
 * @param baseContextUrl - The absolute base URL of the containing document.
 * @param logger - Optional logger instance.
 * @returns A validated, absolute URL object or null.
 */
function resolveAssetUrl(assetUrl: string, baseContextUrl?: string, logger?: Logger): URL | null {
    const cleaned = assetUrl?.trim();
    if (!cleaned) {
        return null;
    }
    if (cleaned.startsWith('data:') || cleaned.startsWith('#')) {
        return null;
    }

    let candidate = cleaned;
    const isProtocolRelative = candidate.startsWith('//');
    if (isProtocolRelative && baseContextUrl) {
        try {
            candidate = new URL(baseContextUrl).protocol + candidate;
        } catch {
            logger?.warn(`Could not extract protocol from base "${baseContextUrl}" for protocol-relative URL "${cleaned}". Skipping.`);
            return null;
        }
    }

    try {
        return new URL(candidate, baseContextUrl);
    } catch (error: unknown) {
        const message = error instanceof Error ? error.message : String(error);
        const hasScheme = /^[a-z]+:/i.test(candidate);
        const isRootRelative = candidate.startsWith('/');
        if (hasScheme || isRootRelative || baseContextUrl) {
            logger?.warn(`⚠️ Failed to parse/resolve URL "${candidate}" ${baseContextUrl ? 'against base "' + baseContextUrl + '"' : '(no base provided)'}: ${message}`);
        } else {
            logger?.warn(`Cannot resolve relative URL "${candidate}" - Base context URL was not provided or determined.`);
        }
        return null;
    }
}
146
+
147
/**
 * Resolves a URL found inside a stylesheet against that stylesheet's base URL.
 *
 * Resolution uses the WHATWG `URL` constructor for every scheme, so:
 *  - "../" segments are collapsed correctly,
 *  - absolute (`https://...`) and protocol-relative references inside a local
 *    stylesheet stay remote instead of being joined onto a filesystem path,
 *  - the returned `file:` URLs are properly percent-encoded.
 *
 * For a `file:` base that does NOT end in '/', the filesystem is probed: if the
 * base names an existing directory it is treated as one (a '/' is appended) so
 * its last segment is not dropped; otherwise the base is treated as a file and
 * references resolve against its parent directory.
 *
 * @param relativeUrl - The URL as written in the CSS (e.g. "../images/bg.png").
 * @param cssBaseContextUrl - The base URL of the CSS file (or its directory).
 * @param logger - Optional logger instance.
 * @returns The resolved absolute URL string, or null when the reference is empty,
 *          a `data:` URI, a fragment-only reference, or cannot be resolved.
 */
function resolveCssRelativeUrl(
    relativeUrl: string,
    cssBaseContextUrl: string,
    logger?: Logger
): string | null {
    const candidate = relativeUrl?.trim();
    // Nothing to fetch for empty values, inline data, or same-document fragments such as url(#gradient).
    if (!candidate || candidate.startsWith('data:') || candidate.startsWith('#')) {
        return null;
    }

    try {
        const base = new URL(cssBaseContextUrl);

        if (base.protocol === 'file:' && !base.pathname.endsWith('/')) {
            let baseIsDirectory = false;
            try {
                baseIsDirectory = fs.statSync(fileURLToPath(base)).isDirectory();
            } catch {
                // Not present on disk (or not convertible): assume the base names a file.
                baseIsDirectory = false;
            }
            if (baseIsDirectory) {
                base.pathname += '/';
            }
        }

        return new URL(candidate, base).href;
    } catch (error) {
        logger?.warn(
            `Failed to resolve CSS URL: "${relativeUrl}" against "${cssBaseContextUrl}": ${String(error)}`
        );
        return null;
    }
}
207
+
208
+
209
/**
 * Asynchronously fetches the content of a resolved asset URL.
 *
 * `http:`/`https:` URLs are downloaded with axios as an array buffer; `file:`
 * URLs are read from disk. Any other protocol is rejected with a warning.
 * This function never throws: every failure is logged and reported as `null`.
 *
 * @param resolvedUrl - The absolute URL object of the asset to fetch.
 * @param logger - Optional logger instance.
 * @param timeout - Network timeout in milliseconds (default 10000); only used for remote fetches.
 * @returns Asset content as a Buffer, or null on failure.
 */
async function fetchAsset(resolvedUrl: URL, logger?: Logger, timeout: number = 10000): Promise<Buffer | null> {
    logger?.debug(`Attempting to fetch asset: ${resolvedUrl.href}`);
    const protocol = resolvedUrl.protocol;
    const isRemote = protocol === 'http:' || protocol === 'https:';

    try {
        if (isRemote) {
            const response: AxiosResponse<ArrayBuffer> = await axios.default.get(resolvedUrl.href, {
                responseType: 'arraybuffer',
                timeout: timeout,
            });
            logger?.debug(`Fetched remote asset ${resolvedUrl.href} (Status: ${response.status}, Type: ${response.headers['content-type'] || 'N/A'}, Size: ${response.data.byteLength} bytes)`);
            return Buffer.from(response.data);
        }

        if (protocol === 'file:') {
            let filePath: string;
            try {
                filePath = fileURLToPath(resolvedUrl);
            } catch (e: unknown) {
                // Path conversion failures get their own error-level message; nothing can be read without a path.
                const reason = e instanceof Error ? e.message : String(e);
                logger?.error(`Could not convert file URL to path: ${resolvedUrl.href}. Error: ${reason}`);
                return null;
            }
            // May throw ENOENT, EACCES, etc.; handled by the outer catch.
            const data = await readFile(filePath);
            logger?.debug(`Read local file ${filePath} (${data.byteLength} bytes)`);
            return data;
        }

        logger?.warn(`Unsupported protocol "${protocol}" in URL: ${resolvedUrl.href}`);
        return null;
    } catch (error: unknown) {
        if (isRemote && axios.default.isAxiosError(error)) {
            const status = error.response?.status ?? 'N/A';
            const statusText = error.response?.statusText ?? 'Error';
            const code = error.code ?? 'N/A';
            const message = error.message;
            logger?.warn(`⚠️ Failed to fetch remote asset ${resolvedUrl.href}: Status ${status} - ${statusText}. Code: ${code}, Message: ${message}`);
        } else if (protocol === 'file:') {
            // Prefer the filesystem path in messages; fall back to the href if conversion fails.
            let failedPath = resolvedUrl.href;
            try {
                failedPath = fileURLToPath(resolvedUrl);
            } catch {
                /* keep the original href */
            }

            const errnoCode = error instanceof Error ? (error as NodeJS.ErrnoException).code : undefined;
            if (errnoCode === 'ENOENT') {
                logger?.warn(`⚠️ File not found (ENOENT) for asset: ${failedPath}.`);
            } else if (errnoCode === 'EACCES') {
                logger?.warn(`⚠️ Permission denied (EACCES) reading asset: ${failedPath}.`);
            } else if (error instanceof Error) {
                logger?.warn(`⚠️ Failed to read local asset ${failedPath}: ${error.message}`);
            } else {
                logger?.warn(`⚠️ An unknown error occurred while reading local asset ${failedPath}: ${String(error)}`);
            }
        } else if (error instanceof Error) {
            // Non-axios failure on a remote fetch, or anything else unexpected.
            logger?.warn(`⚠️ An unexpected error occurred processing asset ${resolvedUrl.href}: ${error.message}`);
        } else {
            logger?.warn(`⚠️ An unknown and unexpected error occurred processing asset ${resolvedUrl.href}: ${String(error)}`);
        }
        return null;
    }
}
294
+
295
/**
 * Extracts asset references from CSS content and resolves them against the CSS base URL.
 *
 * `@import` rules are scanned first and their targets are always typed as `css`
 * (an imported resource is a stylesheet regardless of its file extension), so
 * they are fetched and scanned for nested assets later. Remaining `url(...)`
 * references are typed via `guessMimeType`. Each resolved URL is reported at
 * most once per call; de-duplication across files is the caller's job.
 *
 * @param cssContent - The CSS text to scan.
 * @param cssBaseContextUrl - The base URL used to resolve references found in the CSS.
 * @param logger - Optional logger instance.
 * @returns Newly discovered assets with resolved URLs and no content.
 */
function extractUrlsFromCSS(
    cssContent: string,
    cssBaseContextUrl: string,
    logger?: Logger
): Asset[] {
    const newlyDiscovered: Asset[] = [];
    // URLs already reported from *this* stylesheet.
    const processedInThisParse = new Set<string>();

    // `[^;]*` tolerates trailing media queries / layer() / supports() conditions before the ';'.
    const importRegex = /@import\s+(?:url\(\s*(['"]?)(.*?)\1\s*\)|(['"])(.*?)\3)[^;]*;/gi;
    const urlRegex = /url\(\s*(['"]?)(.*?)\1\s*\)/gi;

    const processFoundUrl = (rawUrl: string | undefined, ruleType: '@import' | 'url()'): void => {
        if (!rawUrl || rawUrl.trim() === '' || rawUrl.startsWith('data:')) return;

        const resolvedUrl = resolveCssRelativeUrl(rawUrl, cssBaseContextUrl, logger);
        if (!resolvedUrl || processedInThisParse.has(resolvedUrl)) return;
        processedInThisParse.add(resolvedUrl);

        const assetType: Asset['type'] =
            ruleType === '@import' ? 'css' : guessMimeType(resolvedUrl).assetType;

        newlyDiscovered.push({
            type: assetType,
            url: resolvedUrl,
            content: undefined
        });
        logger?.debug(`Discovered nested ${assetType} asset (${ruleType}) in CSS ${cssBaseContextUrl}: ${resolvedUrl}`);
    };

    let match: RegExpExecArray | null;

    // Imports first: the generic url() pattern also matches `@import url(...)`,
    // and whichever pass sees a URL first decides its type.
    while ((match = importRegex.exec(cssContent)) !== null) {
        processFoundUrl(match[2] ?? match[4], '@import');
    }
    while ((match = urlRegex.exec(cssContent)) !== null) {
        processFoundUrl(match[2], 'url()');
    }

    return newlyDiscovered;
}
354
+
355
/** Outcome of converting a fetched buffer into what gets stored / parsed. */
interface PreparedAssetContent {
    /** Text or data URI stored on the asset; undefined when nothing is embedded. */
    content: string | undefined;
    /** Cleanly decoded stylesheet text to scan for nested assets; undefined otherwise. */
    cssText: string | undefined;
}

/**
 * Converts a fetched asset buffer into embeddable content and, for CSS, parseable text.
 * Text types are stored as UTF-8 (base64 data URI if decoding is lossy), binary types
 * as base64 data URIs, and unclassified types as text when valid UTF-8, else base64.
 */
function prepareAssetContent(
    asset: Asset,
    buffer: Buffer,
    embedAssets: boolean,
    logger?: Logger
): PreparedAssetContent {
    const effectiveMime = guessMimeType(asset.url).mime || 'application/octet-stream';
    const toDataUri = (mime: string): string => `data:${mime};base64,${buffer.toString('base64')}`;

    if (TEXT_ASSET_TYPES.has(asset.type)) {
        let decoded: string | undefined;
        let lossy: boolean;
        try {
            decoded = buffer.toString('utf-8');
            lossy = isUtf8DecodingLossy(buffer, decoded);
        } catch {
            decoded = undefined;
            lossy = true;
        }

        if (lossy || decoded === undefined) {
            logger?.warn(`Could not decode ${asset.type} ${asset.url} as valid UTF-8 text.${embedAssets ? ' Falling back to base64 data URI.' : ''}`);
            return { content: embedAssets ? toDataUri(effectiveMime) : undefined, cssText: undefined };
        }
        return {
            content: embedAssets ? decoded : undefined,
            // CSS text is kept for parsing whether or not it is embedded.
            cssText: asset.type === 'css' ? decoded : undefined,
        };
    }

    if (BINARY_ASSET_TYPES.has(asset.type)) {
        return { content: embedAssets ? toDataUri(effectiveMime) : undefined, cssText: undefined };
    }

    // 'other' / unknown types.
    if (!embedAssets) {
        return { content: undefined, cssText: undefined };
    }
    try {
        const attemptedText = buffer.toString('utf-8');
        if (isUtf8DecodingLossy(buffer, attemptedText)) {
            logger?.warn(`Couldn't embed unclassified asset ${asset.url} as text due to invalid UTF-8 sequences. Falling back to base64 (octet-stream).`);
            return { content: toDataUri('application/octet-stream'), cssText: undefined };
        }
        logger?.debug(`Successfully embedded unclassified asset ${asset.url} as text.`);
        return { content: attemptedText, cssText: undefined };
    } catch (decodeError) {
        logger?.warn(`Error during text decoding for unclassified asset ${asset.url}: ${decodeError instanceof Error ? decodeError.message : String(decodeError)}. Falling back to base64.`);
        return { content: toDataUri('application/octet-stream'), cssText: undefined };
    }
}

/**
 * Extracts all discoverable assets recursively from HTML and CSS.
 *
 * Initial assets are resolved against the HTML base context and queued. The queue
 * is then drained in batches: an asset is fetched when `embedAssets` is true or
 * when it is CSS (so it can be scanned for `@import` / `url()` references), and
 * anything newly found in CSS is queued for the next batch. Each resolved URL is
 * queued at most once, and the loop stops after `MAX_ASSET_EXTRACTION_ITERATIONS`
 * batches, recording whatever is still queued without content.
 *
 * @param parsed - Initial parsed HTML data containing `htmlContent` and an initial `assets` array.
 * @param embedAssets - Whether to store fetched content (text or data URI) on each asset. Defaults to true.
 * @param inputPathOrUrl - Original location (file path or URL) of the HTML; used as the base for relative URLs.
 * @param logger - Optional logger instance for detailed logging.
 * @returns The unchanged `htmlContent` plus every discovered asset (with content when embedding and the fetch succeeded).
 */
export async function extractAssets(
    parsed: ParsedHTML,
    embedAssets = true,
    inputPathOrUrl?: string,
    logger?: Logger
): Promise<ParsedHTML> {
    logger?.info(`🚀 Starting asset extraction! Embed: ${embedAssets}. Input: ${inputPathOrUrl || '(HTML content only)'}`);

    const initialAssets: Asset[] = parsed.assets || [];
    /** Result set, keyed by resolved URL. */
    const finalAssetsMap = new Map<string, Asset>();
    /** Every URL that has ever been queued (and therefore processed or pending). */
    const processedOrQueuedUrls = new Set<string>();
    /** Pending work for the next batch. */
    let assetsToProcess: Asset[] = [];

    const recordWithoutContent = (entry: Asset): void => {
        finalAssetsMap.set(entry.url, { ...entry, content: undefined });
    };

    // Base context for references written in the HTML itself.
    const htmlBaseContextUrl = determineBaseUrl(inputPathOrUrl || '', logger);
    if (htmlBaseContextUrl) {
        logger?.debug(`Using HTML base context URL: ${htmlBaseContextUrl}`);
    } else {
        const hasRelativeAsset = initialAssets.some(candidate => {
            const ref = candidate.url;
            if (/^[a-z]+:/i.test(ref)) return false;
            return !(ref.startsWith('data:') || ref.startsWith('#') || ref.startsWith('/'));
        });
        if (hasRelativeAsset) {
            logger?.warn("🚨 No valid base path/URL determined for the HTML source! Resolution of relative asset paths from HTML may fail.");
        }
    }

    // --- Seed the queue from the HTML ---
    logger?.debug(`Queueing ${initialAssets.length} initial assets parsed from HTML...`);
    for (const seed of initialAssets) {
        const resolved = resolveAssetUrl(seed.url, htmlBaseContextUrl, logger);
        // Fall back to the raw reference when it could not be resolved.
        const urlToQueue = resolved ? resolved.href : seed.url;

        if (urlToQueue.startsWith('data:')) {
            logger?.debug(` -> Skipping data URI: ${urlToQueue.substring(0, 50)}...`);
            continue;
        }
        if (processedOrQueuedUrls.has(urlToQueue)) {
            logger?.debug(` -> Skipping already queued initial asset: ${urlToQueue}`);
            continue;
        }

        processedOrQueuedUrls.add(urlToQueue);
        const { assetType: guessedType } = guessMimeType(urlToQueue);
        assetsToProcess.push({
            url: urlToQueue,
            type: seed.type ?? guessedType,
            content: undefined
        });
        logger?.debug(` -> Queued initial asset: ${urlToQueue} (Original raw: ${seed.url})`);
    }

    // --- Drain the queue batch by batch ---
    let iterationCount = 0;
    while (assetsToProcess.length > 0) {
        iterationCount += 1;

        if (iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS) {
            logger?.error(`🛑 Asset extraction loop limit hit (${MAX_ASSET_EXTRACTION_ITERATIONS})! Aborting.`);
            const remainingUrls = assetsToProcess.map(a => a.url).slice(0, 10).join(', ');
            logger?.error(`Remaining queue sample (${assetsToProcess.length} items): ${remainingUrls}...`);
            // Keep what is still pending in the result, just without content.
            for (const pending of assetsToProcess) {
                if (!finalAssetsMap.has(pending.url)) {
                    recordWithoutContent(pending);
                }
            }
            assetsToProcess = [];
            break;
        }

        const currentBatch = assetsToProcess.slice();
        assetsToProcess = []; // refilled by anything discovered during this batch

        logger?.debug(`--- Processing batch ${iterationCount}: ${currentBatch.length} asset(s) ---`);

        for (const asset of currentBatch) {
            if (finalAssetsMap.has(asset.url)) {
                logger?.debug(`Skipping asset already in final map: ${asset.url}`);
                continue;
            }

            // CSS is always fetched so it can be scanned, even when not embedding.
            const needsFetching = embedAssets || asset.type === 'css';
            let assetContentBuffer: Buffer | null = null;

            if (needsFetching) {
                let assetUrlObj: URL;
                try {
                    assetUrlObj = new URL(asset.url);
                } catch (urlError) {
                    logger?.warn(`Cannot create URL object for "${asset.url}", skipping fetch. Error: ${urlError instanceof Error ? urlError.message : String(urlError)}`);
                    recordWithoutContent(asset);
                    continue;
                }

                assetContentBuffer = await fetchAsset(assetUrlObj, logger);

                if (assetContentBuffer === null) {
                    logger?.debug(`Storing asset ${asset.url} without content due to fetch failure.`);
                    recordWithoutContent(asset);
                    continue;
                }
            }

            const prepared: PreparedAssetContent = assetContentBuffer
                ? prepareAssetContent(asset, assetContentBuffer, embedAssets, logger)
                : { content: undefined, cssText: undefined };

            finalAssetsMap.set(asset.url, { ...asset, url: asset.url, content: prepared.content });

            // Only successfully decoded, non-empty CSS is scanned for nested assets.
            if (asset.type !== 'css' || !prepared.cssText) {
                continue;
            }

            const cssBaseContextUrl = determineBaseUrl(asset.url, logger);
            logger?.debug(`CSS base context for resolving nested assets within ${asset.url}: ${cssBaseContextUrl}`);

            if (!cssBaseContextUrl) {
                logger?.warn(`Could not determine base URL context for CSS file ${asset.url}. Cannot resolve nested relative paths within it.`);
                continue;
            }

            const nestedAssets = extractUrlsFromCSS(prepared.cssText, cssBaseContextUrl, logger);
            if (nestedAssets.length === 0) {
                continue;
            }

            logger?.debug(`Discovered ${nestedAssets.length} nested assets in CSS ${asset.url}. Checking against queue...`);
            for (const nested of nestedAssets) {
                if (processedOrQueuedUrls.has(nested.url)) {
                    logger?.debug(` -> Skipping already processed/queued nested asset: ${nested.url}`);
                } else {
                    processedOrQueuedUrls.add(nested.url);
                    assetsToProcess.push(nested);
                    logger?.debug(` -> Queued new nested asset: ${nested.url}`);
                }
            }
        }
    }

    const finalIterationCount = iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS ? 'MAX+' : iterationCount;
    logger?.info(`✅ Asset extraction COMPLETE! Found ${finalAssetsMap.size} unique assets in ${finalIterationCount} iterations.`);

    return {
        htmlContent: parsed.htmlContent,
        assets: Array.from(finalAssetsMap.values())
    };
}