portapack 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc.json +9 -0
- package/.github/workflows/ci.yml +73 -0
- package/.github/workflows/deploy-pages.yml +56 -0
- package/.prettierrc +9 -0
- package/.releaserc.js +29 -0
- package/CHANGELOG.md +21 -0
- package/README.md +288 -0
- package/commitlint.config.js +36 -0
- package/dist/cli/cli-entry.js +1694 -0
- package/dist/cli/cli-entry.js.map +1 -0
- package/dist/index.d.ts +275 -0
- package/dist/index.js +1405 -0
- package/dist/index.js.map +1 -0
- package/docs/.vitepress/config.ts +89 -0
- package/docs/.vitepress/sidebar-generator.ts +73 -0
- package/docs/cli.md +117 -0
- package/docs/code-of-conduct.md +65 -0
- package/docs/configuration.md +151 -0
- package/docs/contributing.md +107 -0
- package/docs/demo.md +46 -0
- package/docs/deployment.md +132 -0
- package/docs/development.md +168 -0
- package/docs/getting-started.md +106 -0
- package/docs/index.md +40 -0
- package/docs/portapack-transparent.png +0 -0
- package/docs/portapack.jpg +0 -0
- package/docs/troubleshooting.md +107 -0
- package/examples/main.ts +118 -0
- package/examples/sample-project/index.html +12 -0
- package/examples/sample-project/logo.png +1 -0
- package/examples/sample-project/script.js +1 -0
- package/examples/sample-project/styles.css +1 -0
- package/jest.config.ts +124 -0
- package/jest.setup.cjs +211 -0
- package/nodemon.json +11 -0
- package/output.html +1 -0
- package/package.json +161 -0
- package/site-packed.html +1 -0
- package/src/cli/cli-entry.ts +28 -0
- package/src/cli/cli.ts +139 -0
- package/src/cli/options.ts +151 -0
- package/src/core/bundler.ts +201 -0
- package/src/core/extractor.ts +618 -0
- package/src/core/minifier.ts +233 -0
- package/src/core/packer.ts +191 -0
- package/src/core/parser.ts +115 -0
- package/src/core/web-fetcher.ts +292 -0
- package/src/index.ts +262 -0
- package/src/types.ts +163 -0
- package/src/utils/font.ts +41 -0
- package/src/utils/logger.ts +139 -0
- package/src/utils/meta.ts +100 -0
- package/src/utils/mime.ts +90 -0
- package/src/utils/slugify.ts +70 -0
- package/test-output.html +0 -0
- package/tests/__fixtures__/sample-project/index.html +5 -0
- package/tests/unit/cli/cli-entry.test.ts +104 -0
- package/tests/unit/cli/cli.test.ts +230 -0
- package/tests/unit/cli/options.test.ts +316 -0
- package/tests/unit/core/bundler.test.ts +287 -0
- package/tests/unit/core/extractor.test.ts +1129 -0
- package/tests/unit/core/minifier.test.ts +414 -0
- package/tests/unit/core/packer.test.ts +193 -0
- package/tests/unit/core/parser.test.ts +540 -0
- package/tests/unit/core/web-fetcher.test.ts +374 -0
- package/tests/unit/index.test.ts +339 -0
- package/tests/unit/utils/font.test.ts +81 -0
- package/tests/unit/utils/logger.test.ts +275 -0
- package/tests/unit/utils/meta.test.ts +70 -0
- package/tests/unit/utils/mime.test.ts +96 -0
- package/tests/unit/utils/slugify.test.ts +71 -0
- package/tsconfig.build.json +11 -0
- package/tsconfig.jest.json +17 -0
- package/tsconfig.json +20 -0
- package/tsup.config.ts +71 -0
- package/typedoc.json +28 -0
@@ -0,0 +1,618 @@
|
|
1
|
+
/**
|
2
|
+
* @file src/core/extractor.ts
|
3
|
+
* @description Handles discovery, resolution, fetching, and optional embedding of assets
|
4
|
+
* linked from HTML and recursively within CSS (@import, url()). This is the heart of finding EVERYTHING.
|
5
|
+
* @version 1.1.3 - Fixed CSS path resolution and handling of 'other' asset types.
|
6
|
+
*/
|
7
|
+
|
8
|
+
// === Node.js Core Imports ===
|
9
|
+
import { readFile } from 'fs/promises';
|
10
|
+
import * as fs from 'fs'; // Required for statSync for sync directory check
|
11
|
+
import type { FileHandle } from 'fs/promises';
|
12
|
+
import path from 'path';
|
13
|
+
import { fileURLToPath, pathToFileURL, URL } from 'url'; // Crucial for file path/URL conversion
|
14
|
+
|
15
|
+
// === External Dependencies ===
|
16
|
+
import * as axios from 'axios'; // Using namespace import for clarity
|
17
|
+
import type { AxiosError, AxiosRequestConfig, AxiosResponse, InternalAxiosRequestConfig } from 'axios';
|
18
|
+
|
19
|
+
// === Project Imports ===
|
20
|
+
import type { Asset, ParsedHTML } from '../types';
|
21
|
+
import { guessMimeType } from '../utils/mime';
|
22
|
+
import { Logger } from '../utils/logger';
|
23
|
+
|
24
|
+
// === Constants ===
|
25
|
+
/** Asset types (values of Asset['type']) whose fetched content is handled as UTF-8 text. */
const TEXT_ASSET_TYPES = new Set<Asset['type']>(['css', 'js']);

/** Asset types (values of Asset['type']) treated as binary and embedded as Base64 data URIs. */
const BINARY_ASSET_TYPES = new Set<Asset['type']>(['image', 'font', 'video', 'audio']);

/** Upper bound on discovery-loop batches; guards against endless asset cycles. */
const MAX_ASSET_EXTRACTION_ITERATIONS = 1000;
|
31
|
+
|
32
|
+
// === Helper Functions ===
|
33
|
+
|
34
|
+
/**
 * Reports whether decoding a buffer as UTF-8 lost information.
 *
 * The decoded string is encoded back to bytes and compared with the source
 * buffer; any difference means the bytes were not valid UTF-8 text.
 *
 * @param originalBuffer - The raw bytes that were decoded.
 * @param decodedString - The result of `originalBuffer.toString('utf-8')`.
 * @returns `true` when the round trip does not reproduce the original bytes
 *          (or when the comparison itself throws), `false` otherwise.
 */
function isUtf8DecodingLossy(originalBuffer: Buffer, decodedString: string): boolean {
  try {
    const roundTripped = Buffer.from(decodedString, 'utf-8');
    const bytesSurvived = originalBuffer.equals(roundTripped);
    return bytesSurvived ? false : true;
  } catch {
    // Treat any failure to re-encode/compare as a lossy decode.
    return true;
  }
}
|
48
|
+
|
49
|
+
/**
 * Determines the absolute base directory URL (http://, https://, or file:///) for an
 * input location. The returned string always ends in '/'.
 *
 * - http(s) URLs: the last path segment, query and hash are stripped.
 * - `file:` URLs and plain filesystem paths: the path is made absolute; if it is an
 *   existing directory it is used as-is, otherwise its parent directory is used.
 * - Any other `scheme://` input is rejected.
 *
 * The local file URL is produced with `pathToFileURL`, so characters that are
 * significant in URLs (`#`, `?`, `%`) inside directory names are percent-encoded
 * instead of being misread as a fragment, query or escape sequence.
 *
 * @param inputPathOrUrl - The original source HTML file path or a full HTTP/HTTPS URL.
 * @param logger - Optional logger instance.
 * @returns The absolute base URL string ending in '/', or undefined if determination fails.
 */
function determineBaseUrl(inputPathOrUrl: string, logger?: Logger): string | undefined {
  logger?.debug(`Determining base URL for input: ${inputPathOrUrl}`);
  if (!inputPathOrUrl) {
    logger?.warn('Cannot determine base URL: inputPathOrUrl is empty or invalid.');
    return undefined;
  }

  try {
    // --- Remote input: keep everything up to and including the last '/' of the path ---
    if (/^https?:\/\//i.test(inputPathOrUrl)) {
      const url = new URL(inputPathOrUrl);
      url.pathname = url.pathname.substring(0, url.pathname.lastIndexOf('/') + 1);
      url.search = '';
      url.hash = '';
      const baseUrl = url.href;
      logger?.debug(`Determined remote base URL: ${baseUrl}`);
      return baseUrl;
    }

    // --- Some other scheme (ftp://, s3://, ...): not supported ---
    if (inputPathOrUrl.includes('://') && !inputPathOrUrl.startsWith('file:')) {
      logger?.warn(`Input "${inputPathOrUrl}" looks like a URL but uses an unsupported protocol. Cannot determine base URL.`);
      return undefined;
    }

    // --- Local input: file: URL or filesystem path ---
    let absolutePath: string;
    if (inputPathOrUrl.startsWith('file:')) {
      try {
        absolutePath = fileURLToPath(inputPathOrUrl);
      } catch (e: unknown) {
        const reason = e instanceof Error ? e.message : String(e);
        logger?.error(`💀 Failed to convert file URL "${inputPathOrUrl}" to path: ${reason}`);
        return undefined;
      }
    } else {
      absolutePath = path.resolve(inputPathOrUrl);
    }

    // A path that cannot be stat'ed is assumed to name a file, so its parent is the base.
    let isDirectory = false;
    try {
      isDirectory = fs.statSync(absolutePath).isDirectory();
    } catch (statError: unknown) {
      if (statError instanceof Error && (statError as NodeJS.ErrnoException).code === 'ENOENT') {
        logger?.debug(`Path "${absolutePath}" not found. Assuming input represents a file, using its parent directory as base.`);
      } else {
        logger?.warn(`Could not stat local path "${absolutePath}" during base URL determination: ${statError instanceof Error ? statError.message : String(statError)}. Assuming input represents a file.`);
      }
      isDirectory = false;
    }

    const dirPath = isDirectory ? absolutePath : path.dirname(absolutePath);

    // pathToFileURL handles Windows drive letters and percent-encodes '#', '?' and '%',
    // which plain string concatenation into `new URL('file://' + path)` would misparse.
    let fileUrlString = pathToFileURL(dirPath).href;
    if (!fileUrlString.endsWith('/')) {
      fileUrlString += '/';
    }
    logger?.debug(`Determined local base URL: ${fileUrlString} (from: ${inputPathOrUrl}, resolved dir: ${dirPath}, isDir: ${isDirectory})`);
    return fileUrlString;
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    logger?.error(`💀 Failed to determine base URL for "${inputPathOrUrl}": ${message}${error instanceof Error ? ` - Stack: ${error.stack}` : ''}`);
    return undefined;
  }
}
|
110
|
+
|
111
|
+
/**
 * Turns a raw asset reference into an absolute URL, using the containing
 * document's base URL for relative references.
 *
 * Empty values, `data:` URIs and fragment-only references (`#...`) are not
 * assets to fetch and yield `null`. Protocol-relative references (`//host/...`)
 * inherit the protocol of the base URL when one is available.
 *
 * @param assetUrl - The raw URL string found in the source.
 * @param baseContextUrl - The absolute base URL of the containing document.
 * @param logger - Optional logger instance; receives a warning when resolution fails.
 * @returns The resolved absolute URL object, or `null` when the reference is
 *          skipped or cannot be parsed.
 */
function resolveAssetUrl(assetUrl: string, baseContextUrl?: string, logger?: Logger): URL | null {
  const candidate = assetUrl?.trim();
  if (!candidate) {
    return null;
  }
  if (candidate.startsWith('data:') || candidate.startsWith('#')) {
    return null;
  }

  // Protocol-relative reference: borrow the scheme from the base URL.
  let target = candidate;
  if (baseContextUrl && target.startsWith('//')) {
    let inheritedProtocol: string;
    try {
      inheritedProtocol = new URL(baseContextUrl).protocol;
    } catch {
      logger?.warn(`Could not extract protocol from base "${baseContextUrl}" for protocol-relative URL "${candidate}". Skipping.`);
      return null;
    }
    target = inheritedProtocol + target;
  }

  try {
    return new URL(target, baseContextUrl);
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    const hasScheme = /^[a-z]+:/i.test(target);
    const isRooted = target.startsWith('/');
    if (!hasScheme && !isRooted && !baseContextUrl) {
      // A bare relative path with nothing to resolve it against.
      logger?.warn(`Cannot resolve relative URL "${target}" - Base context URL was not provided or determined.`);
    } else {
      const baseDescription = baseContextUrl ? 'against base "' + baseContextUrl + '"' : '(no base provided)';
      logger?.warn(`⚠️ Failed to parse/resolve URL "${target}" ${baseDescription}: ${message}`);
    }
    return null;
  }
}
|
146
|
+
|
147
|
+
/**
 * Resolves a URL found inside a CSS file (`url()` / `@import`) to an absolute URL string.
 *
 * Handling, in order:
 * - Empty values, `data:` URIs and fragment-only references (`#id`, which point into the
 *   current document rather than at an asset) return `null`.
 * - References that already carry a scheme (`https://...`, `file:///...`) are returned
 *   normalized and are never joined onto the CSS directory. Without this, a local
 *   stylesheet importing `https://fonts.example.com/x.css` would be rewritten to a
 *   bogus local path.
 * - For a `file:` base, the reference is resolved on the filesystem so that `../`
 *   segments behave as expected. If the base cannot be stat'ed, a base that ends with a
 *   path separator is still treated as a directory so its last segment is not dropped.
 * - For any other base (http/https), standard URL resolution is used.
 *
 * @param relativeUrl - The URL as written in the CSS (e.g. "../images/bg.png").
 * @param cssBaseContextUrl - The base URL against which the CSS file's references resolve.
 * @param logger - Optional logger instance; receives a warning when resolution throws.
 * @returns The resolved absolute URL string, or `null` if skipped or resolution fails.
 */
function resolveCssRelativeUrl(
  relativeUrl: string,
  cssBaseContextUrl: string,
  logger?: Logger
): string | null {
  const reference = relativeUrl?.trim();

  // Nothing to fetch for empty values, inline data, or in-document fragment references.
  if (!reference || reference.startsWith('data:') || reference.startsWith('#')) {
    return null;
  }

  try {
    // Already absolute (scheme of 2+ chars, so a Windows drive letter like "C:" is not
    // mistaken for a scheme): normalize and return without touching the base.
    if (/^[a-z][a-z0-9+.-]+:/i.test(reference)) {
      return new URL(reference).href;
    }

    if (cssBaseContextUrl.startsWith('file:')) {
      // Turn the CSS base URL into a filesystem path.
      const basePath = fileURLToPath(cssBaseContextUrl);

      // Use the base itself when it is a directory, otherwise its parent directory.
      let cssDir: string;
      try {
        cssDir = fs.statSync(basePath).isDirectory() ? basePath : path.dirname(basePath);
      } catch {
        // stat failed: a trailing separator still signals "directory"; otherwise assume a file.
        cssDir = /[\\/]$/.test(basePath) ? basePath : path.dirname(basePath);
      }

      // Resolve against that directory and normalize to forward slashes.
      let resolvedPath = path.resolve(cssDir, reference).replace(/\\/g, '/');

      // On Windows, ensure the file:///C:/... form.
      if (/^[A-Z]:/i.test(resolvedPath)) {
        resolvedPath = '/' + resolvedPath;
      }
      return `file://${resolvedPath}`;
    }

    // For http/https etc., do standard resolution.
    return new URL(reference, cssBaseContextUrl).href;
  } catch (error) {
    logger?.warn(
      `Failed to resolve CSS URL: "${relativeUrl}" against "${cssBaseContextUrl}": ${String(error)}`
    );
    return null;
  }
}
|
207
|
+
|
208
|
+
|
209
|
+
/**
 * Asynchronously fetches the content of a resolved asset URL.
 *
 * - `http:` / `https:` URLs are downloaded with axios as an array buffer.
 * - `file:` URLs are converted to a filesystem path and read from disk.
 * - Any other protocol is rejected with a warning.
 *
 * This function never throws: every failure is logged and reported as `null`.
 *
 * @param resolvedUrl - The absolute URL object of the asset to fetch.
 * @param logger - Optional logger instance.
 * @param timeout - Network timeout in milliseconds (default 10000); only used for http(s).
 * @returns Asset content as a Buffer, or null on failure.
 */
async function fetchAsset(resolvedUrl: URL, logger?: Logger, timeout: number = 10000): Promise<Buffer | null> {
  logger?.debug(`Attempting to fetch asset: ${resolvedUrl.href}`);
  const protocol = resolvedUrl.protocol;

  try {
    if (protocol === 'http:' || protocol === 'https:') {
      const response: AxiosResponse<ArrayBuffer> = await axios.default.get(resolvedUrl.href, {
        responseType: 'arraybuffer',
        timeout: timeout,
      });
      logger?.debug(`Fetched remote asset ${resolvedUrl.href} (Status: ${response.status}, Type: ${response.headers['content-type'] || 'N/A'}, Size: ${response.data.byteLength} bytes)`);
      return Buffer.from(response.data);
    } else if (protocol === 'file:') {
      let filePath: string;
      try {
        filePath = fileURLToPath(resolvedUrl);
      } catch (e: unknown) {
        // Path conversion failed: there is nothing to read, so report and stop here.
        const reason = e instanceof Error ? e.message : String(e);
        logger?.error(`Could not convert file URL to path: ${resolvedUrl.href}. Error: ${reason}`);
        return null;
      }
      // May reject with ENOENT, EACCES etc.; handled by the outer catch.
      const data = await readFile(filePath);
      logger?.debug(`Read local file ${filePath} (${data.byteLength} bytes)`);
      return data;
    } else {
      logger?.warn(`Unsupported protocol "${protocol}" in URL: ${resolvedUrl.href}`);
      return null;
    }
  } catch (error: unknown) {
    // --- Report the failure in terms of the protocol that was being handled ---

    if ((protocol === 'http:' || protocol === 'https:') && axios.default.isAxiosError(error)) {
      const status = error.response?.status ?? 'N/A';
      const statusText = error.response?.statusText ?? 'Error';
      const code = error.code ?? 'N/A';
      const message = error.message;
      const logMessage = `⚠️ Failed to fetch remote asset ${resolvedUrl.href}: Status ${status} - ${statusText}. Code: ${code}, Message: ${message}`;
      logger?.warn(logMessage);
    } else if (protocol === 'file:') {
      // Recover the filesystem path for the message; fall back to the href if that fails.
      let failedPath = resolvedUrl.href;
      try {
        failedPath = fileURLToPath(resolvedUrl);
      } catch {
        /* keep the original href */
      }

      if (error instanceof Error && (error as NodeJS.ErrnoException).code === 'ENOENT') {
        logger?.warn(`⚠️ File not found (ENOENT) for asset: ${failedPath}.`);
      } else if (error instanceof Error && (error as NodeJS.ErrnoException).code === 'EACCES') {
        logger?.warn(`⚠️ Permission denied (EACCES) reading asset: ${failedPath}.`);
      } else if (error instanceof Error) {
        logger?.warn(`⚠️ Failed to read local asset ${failedPath}: ${error.message}`);
      } else {
        logger?.warn(`⚠️ An unknown error occurred while reading local asset ${failedPath}: ${String(error)}`);
      }
    } else if (error instanceof Error) {
      // Generic fallback for unexpected errors (e.g. a non-axios error on an http(s) fetch).
      logger?.warn(`⚠️ An unexpected error occurred processing asset ${resolvedUrl.href}: ${error.message}`);
    } else {
      logger?.warn(`⚠️ An unknown and unexpected error occurred processing asset ${resolvedUrl.href}: ${String(error)}`);
    }
    return null; // Any fetch/read error results in null.
  }
}
|
294
|
+
|
295
|
+
/**
 * Scans CSS text for `url(...)` references and `@import` rules and resolves each one
 * against the CSS file's base URL.
 *
 * The function does not track global state: it returns every asset it found (deduplicated
 * only within this one stylesheet) and leaves it to the caller to decide which of them
 * are actually new.
 *
 * Skipped references: empty values, `data:` URIs, and fragment-only references such as
 * `url(#filter)`, which point into the current document rather than at a fetchable asset.
 *
 * `@import` rules are matched in both the `url(...)` and bare-string forms, including
 * rules that carry trailing media/supports conditions (`@import "print.css" print;`).
 *
 * @param cssContent - The CSS text to scan.
 * @param cssBaseContextUrl - The base URL against which the stylesheet's references resolve.
 * @param logger - Optional logger instance.
 * @returns Assets discovered in this stylesheet, each with a resolved URL and no content.
 */
function extractUrlsFromCSS(
  cssContent: string,
  cssBaseContextUrl: string,
  logger?: Logger
): Asset[] {
  const newlyDiscovered: Asset[] = [];
  // Resolved URLs already emitted for this stylesheet, to avoid duplicates from one file.
  const processedInThisParse = new Set<string>();

  const urlRegex = /url\(\s*(['"]?)(.*?)\1\s*\)/gi;
  // `[^;]*` before the terminating ';' admits optional media/supports/layer conditions.
  const importRegex = /@import\s+(?:url\(\s*(['"]?)(.*?)\1\s*\)|(['"])(.*?)\3)[^;]*;/gi;

  const processFoundUrl = (rawUrl: string | undefined, ruleType: '@import' | 'url()'): void => {
    const candidate = rawUrl?.trim();
    if (!candidate || candidate.startsWith('data:') || candidate.startsWith('#')) return;

    const resolvedUrl = resolveCssRelativeUrl(candidate, cssBaseContextUrl, logger);
    if (!resolvedUrl || processedInThisParse.has(resolvedUrl)) return;

    processedInThisParse.add(resolvedUrl);
    const { assetType } = guessMimeType(resolvedUrl);
    newlyDiscovered.push({
      type: assetType,
      url: resolvedUrl,
      content: undefined,
    });
    logger?.debug(`Discovered nested ${assetType} asset (${ruleType}) in CSS ${cssBaseContextUrl}: ${resolvedUrl}`);
  };

  // Both regexes are freshly created above, so their lastIndex starts at 0.
  let match: RegExpExecArray | null;
  while ((match = urlRegex.exec(cssContent)) !== null) {
    processFoundUrl(match[2], 'url()');
  }
  while ((match = importRegex.exec(cssContent)) !== null) {
    // Group 2 holds the url(...) form, group 4 the bare-string form.
    processFoundUrl(match[2] || match[4], '@import');
  }

  return newlyDiscovered;
}
|
354
|
+
|
355
|
+
/**
 * Builds a Base64 data URI for the given bytes.
 */
function toBase64DataUri(mime: string, buffer: Buffer): string {
  return `data:${mime};base64,${buffer.toString('base64')}`;
}

/**
 * Converts a successfully fetched asset buffer into what `extractAssets` stores:
 * `finalContent` (the embeddable content, only set when embedding) and
 * `cssContentForParsing` (decoded text, only set for CSS that decoded cleanly).
 */
function prepareAssetContent(
  asset: Asset,
  buffer: Buffer,
  embedAssets: boolean,
  logger?: Logger
): { finalContent: string | undefined; cssContentForParsing: string | undefined } {
  const mimeInfo = guessMimeType(asset.url);
  const effectiveMime = mimeInfo.mime || 'application/octet-stream';

  // --- Text types (css/js): decode as UTF-8, fall back to Base64 if the bytes are not valid text ---
  if (TEXT_ASSET_TYPES.has(asset.type)) {
    let textContent: string | undefined;
    let wasLossy = false;
    try {
      textContent = buffer.toString('utf-8');
      wasLossy = isUtf8DecodingLossy(buffer, textContent);
    } catch (e) {
      textContent = undefined;
      wasLossy = true;
    }

    if (!wasLossy && textContent !== undefined) {
      return {
        finalContent: embedAssets ? textContent : undefined,
        // CSS text is kept for parsing regardless of whether it is embedded.
        cssContentForParsing: asset.type === 'css' ? textContent : undefined,
      };
    }

    logger?.warn(`Could not decode ${asset.type} ${asset.url} as valid UTF-8 text.${embedAssets ? ' Falling back to base64 data URI.' : ''}`);
    return {
      finalContent: embedAssets ? toBase64DataUri(effectiveMime, buffer) : undefined,
      cssContentForParsing: undefined, // Cannot parse what could not be decoded.
    };
  }

  // --- Binary types: Base64 data URI when embedding ---
  if (BINARY_ASSET_TYPES.has(asset.type)) {
    return {
      finalContent: embedAssets ? toBase64DataUri(effectiveMime, buffer) : undefined,
      cssContentForParsing: undefined,
    };
  }

  // --- 'other'/unknown types: try text first, fall back to an octet-stream data URI ---
  if (!embedAssets) {
    return { finalContent: undefined, cssContentForParsing: undefined };
  }
  let finalContent: string;
  try {
    const attemptedTextContent = buffer.toString('utf-8');
    if (isUtf8DecodingLossy(buffer, attemptedTextContent)) {
      logger?.warn(`Couldn't embed unclassified asset ${asset.url} as text due to invalid UTF-8 sequences. Falling back to base64 (octet-stream).`);
      finalContent = toBase64DataUri('application/octet-stream', buffer);
    } else {
      finalContent = attemptedTextContent;
      logger?.debug(`Successfully embedded unclassified asset ${asset.url} as text.`);
    }
  } catch (decodeError) {
    logger?.warn(`Error during text decoding for unclassified asset ${asset.url}: ${decodeError instanceof Error ? decodeError.message : String(decodeError)}. Falling back to base64.`);
    finalContent = toBase64DataUri('application/octet-stream', buffer);
  }
  return { finalContent, cssContentForParsing: undefined };
}

/**
 * Extracts all discoverable assets recursively from HTML and CSS.
 *
 * Starting from the assets already parsed out of the HTML, each asset URL is resolved
 * against the HTML base location and queued. The queue is processed in batches: an asset
 * is fetched when `embedAssets` is true or when it is CSS (CSS is always fetched so that
 * nested `@import` / `url()` references can be discovered). References found inside a CSS
 * file are resolved relative to that CSS file's own location and queued for the next
 * batch. A URL is only ever queued once, and the number of batches is capped by
 * `MAX_ASSET_EXTRACTION_ITERATIONS`; if the cap is hit, the still-queued assets are
 * recorded without content.
 *
 * Assets are fetched one at a time. An asset whose URL is invalid or whose fetch fails is
 * still included in the result, with `content` left undefined.
 *
 * @param parsed - Initial parsed HTML data containing `htmlContent` and an initial `assets` array.
 * @param embedAssets - Whether to store fetched content on each asset (text for valid UTF-8
 *   css/js/other, Base64 data URI otherwise). If false, content stays undefined but assets
 *   are still discovered.
 * @param inputPathOrUrl - The original source location (file path or URL) of the HTML, used
 *   as the base for resolving relative paths in the HTML.
 * @param logger - Optional logger instance for detailed logging.
 * @returns The unchanged `htmlContent` and the final list of unique assets.
 */
export async function extractAssets(
  parsed: ParsedHTML,
  embedAssets = true,
  inputPathOrUrl?: string,
  logger?: Logger
): Promise<ParsedHTML> {
  logger?.info(`🚀 Starting asset extraction! Embed: ${embedAssets}. Input: ${inputPathOrUrl || '(HTML content only)'}`);

  const initialAssets: Asset[] = parsed.assets || [];
  // Final result, keyed by resolved URL string.
  const finalAssetsMap = new Map<string, Asset>();
  // Assets waiting to be processed (URLs already resolved).
  let assetsToProcess: Asset[] = [];

  // Base URL context for resolving relative paths found in the HTML itself.
  const htmlBaseContextUrl = determineBaseUrl(inputPathOrUrl || '', logger);
  if (!htmlBaseContextUrl && initialAssets.some(a => !/^[a-z]+:/i.test(a.url) && !a.url.startsWith('data:') && !a.url.startsWith('#') && !a.url.startsWith('/'))) {
    logger?.warn("🚨 No valid base path/URL determined for the HTML source! Resolution of relative asset paths from HTML may fail.");
  } else if (htmlBaseContextUrl) {
    logger?.debug(`Using HTML base context URL: ${htmlBaseContextUrl}`);
  }

  // Every URL that has been queued or processed; prevents queueing the same asset twice.
  const processedOrQueuedUrls = new Set<string>();

  // --- Initial queue population ---
  logger?.debug(`Queueing ${initialAssets.length} initial assets parsed from HTML...`);
  for (const asset of initialAssets) {
    // Use the resolved URL when resolution succeeded, otherwise keep the raw one.
    const resolvedUrlObj = resolveAssetUrl(asset.url, htmlBaseContextUrl, logger);
    const urlToQueue = resolvedUrlObj ? resolvedUrlObj.href : asset.url;

    if (urlToQueue.startsWith('data:')) {
      logger?.debug(` -> Skipping data URI: ${urlToQueue.substring(0, 50)}...`);
      continue;
    }
    if (processedOrQueuedUrls.has(urlToQueue)) {
      logger?.debug(` -> Skipping already queued initial asset: ${urlToQueue}`);
      continue;
    }

    processedOrQueuedUrls.add(urlToQueue);

    // Fall back to a type guessed from the URL when the parser supplied none.
    const { assetType: guessedType } = guessMimeType(urlToQueue);
    const initialType = asset.type ?? guessedType;

    assetsToProcess.push({
      url: urlToQueue,
      type: initialType,
      content: undefined,
    });
    logger?.debug(` -> Queued initial asset: ${urlToQueue} (Original raw: ${asset.url})`);
  }

  // --- Main processing loop: one iteration per batch ---
  let iterationCount = 0;
  while (assetsToProcess.length > 0) {
    iterationCount++;
    if (iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS) {
      logger?.error(`🛑 Asset extraction loop limit hit (${MAX_ASSET_EXTRACTION_ITERATIONS})! Aborting.`);
      const remainingUrls = assetsToProcess.map(a => a.url).slice(0, 10).join(', ');
      logger?.error(`Remaining queue sample (${assetsToProcess.length} items): ${remainingUrls}...`);
      // Record whatever is still queued, without content, before giving up.
      for (const pending of assetsToProcess) {
        if (!finalAssetsMap.has(pending.url)) {
          finalAssetsMap.set(pending.url, { ...pending, content: undefined });
        }
      }
      assetsToProcess = [];
      break;
    }

    // Take the current queue as this batch; newly discovered assets go to the next one.
    const currentBatch = [...assetsToProcess];
    assetsToProcess = [];

    logger?.debug(`--- Processing batch ${iterationCount}: ${currentBatch.length} asset(s) ---`);

    for (const asset of currentBatch) {
      if (finalAssetsMap.has(asset.url)) {
        logger?.debug(`Skipping asset already in final map: ${asset.url}`);
        continue;
      }

      let finalContent: string | undefined = undefined; // What gets stored on the asset.
      let cssContentForParsing: string | undefined = undefined; // Decoded CSS text, if any.

      // CSS is fetched even when not embedding, so nested assets can be discovered.
      const needsFetching = embedAssets || asset.type === 'css';

      if (needsFetching) {
        let assetUrlObj: URL;
        try {
          assetUrlObj = new URL(asset.url);
        } catch (urlError) {
          logger?.warn(`Cannot create URL object for "${asset.url}", skipping fetch. Error: ${urlError instanceof Error ? urlError.message : String(urlError)}`);
          finalAssetsMap.set(asset.url, { ...asset, content: undefined });
          continue;
        }

        const assetContentBuffer = await fetchAsset(assetUrlObj, logger);
        if (assetContentBuffer === null) {
          logger?.debug(`Storing asset ${asset.url} without content due to fetch failure.`);
          finalAssetsMap.set(asset.url, { ...asset, content: undefined });
          continue;
        }

        ({ finalContent, cssContentForParsing } = prepareAssetContent(asset, assetContentBuffer, embedAssets, logger));
      }

      // --- Store the final asset (its URL is already tracked in processedOrQueuedUrls) ---
      finalAssetsMap.set(asset.url, { ...asset, content: finalContent });

      // --- Discover nested assets in CSS that decoded successfully ---
      if (asset.type === 'css' && cssContentForParsing) {
        // Nested references resolve relative to this CSS file, not to the HTML.
        const cssBaseContextUrl = determineBaseUrl(asset.url, logger);
        logger?.debug(`CSS base context for resolving nested assets within ${asset.url}: ${cssBaseContextUrl}`);

        if (!cssBaseContextUrl) {
          logger?.warn(`Could not determine base URL context for CSS file ${asset.url}. Cannot resolve nested relative paths within it.`);
          continue;
        }

        const newlyDiscoveredAssets = extractUrlsFromCSS(
          cssContentForParsing,
          cssBaseContextUrl,
          logger
        );

        if (newlyDiscoveredAssets.length > 0) {
          logger?.debug(`Discovered ${newlyDiscoveredAssets.length} nested assets in CSS ${asset.url}. Checking against queue...`);
          for (const newAsset of newlyDiscoveredAssets) {
            // Queue only URLs that were never queued or processed before.
            if (!processedOrQueuedUrls.has(newAsset.url)) {
              processedOrQueuedUrls.add(newAsset.url);
              assetsToProcess.push(newAsset);
              logger?.debug(` -> Queued new nested asset: ${newAsset.url}`);
            } else {
              logger?.debug(` -> Skipping already processed/queued nested asset: ${newAsset.url}`);
            }
          }
        }
      }
    }
  }

  const finalIterationCount = iterationCount > MAX_ASSET_EXTRACTION_ITERATIONS ? 'MAX+' : iterationCount;
  logger?.info(`✅ Asset extraction COMPLETE! Found ${finalAssetsMap.size} unique assets in ${finalIterationCount} iterations.`);

  // The HTML is returned untouched; only the asset list is rebuilt.
  return {
    htmlContent: parsed.htmlContent,
    assets: Array.from(finalAssetsMap.values()),
  };
}
|