portapack 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. package/.eslintrc.json +9 -0
  2. package/.github/workflows/ci.yml +73 -0
  3. package/.github/workflows/deploy-pages.yml +56 -0
  4. package/.prettierrc +9 -0
  5. package/.releaserc.js +29 -0
  6. package/CHANGELOG.md +21 -0
  7. package/README.md +288 -0
  8. package/commitlint.config.js +36 -0
  9. package/dist/cli/cli-entry.js +1694 -0
  10. package/dist/cli/cli-entry.js.map +1 -0
  11. package/dist/index.d.ts +275 -0
  12. package/dist/index.js +1405 -0
  13. package/dist/index.js.map +1 -0
  14. package/docs/.vitepress/config.ts +89 -0
  15. package/docs/.vitepress/sidebar-generator.ts +73 -0
  16. package/docs/cli.md +117 -0
  17. package/docs/code-of-conduct.md +65 -0
  18. package/docs/configuration.md +151 -0
  19. package/docs/contributing.md +107 -0
  20. package/docs/demo.md +46 -0
  21. package/docs/deployment.md +132 -0
  22. package/docs/development.md +168 -0
  23. package/docs/getting-started.md +106 -0
  24. package/docs/index.md +40 -0
  25. package/docs/portapack-transparent.png +0 -0
  26. package/docs/portapack.jpg +0 -0
  27. package/docs/troubleshooting.md +107 -0
  28. package/examples/main.ts +118 -0
  29. package/examples/sample-project/index.html +12 -0
  30. package/examples/sample-project/logo.png +1 -0
  31. package/examples/sample-project/script.js +1 -0
  32. package/examples/sample-project/styles.css +1 -0
  33. package/jest.config.ts +124 -0
  34. package/jest.setup.cjs +211 -0
  35. package/nodemon.json +11 -0
  36. package/output.html +1 -0
  37. package/package.json +161 -0
  38. package/site-packed.html +1 -0
  39. package/src/cli/cli-entry.ts +28 -0
  40. package/src/cli/cli.ts +139 -0
  41. package/src/cli/options.ts +151 -0
  42. package/src/core/bundler.ts +201 -0
  43. package/src/core/extractor.ts +618 -0
  44. package/src/core/minifier.ts +233 -0
  45. package/src/core/packer.ts +191 -0
  46. package/src/core/parser.ts +115 -0
  47. package/src/core/web-fetcher.ts +292 -0
  48. package/src/index.ts +262 -0
  49. package/src/types.ts +163 -0
  50. package/src/utils/font.ts +41 -0
  51. package/src/utils/logger.ts +139 -0
  52. package/src/utils/meta.ts +100 -0
  53. package/src/utils/mime.ts +90 -0
  54. package/src/utils/slugify.ts +70 -0
  55. package/test-output.html +0 -0
  56. package/tests/__fixtures__/sample-project/index.html +5 -0
  57. package/tests/unit/cli/cli-entry.test.ts +104 -0
  58. package/tests/unit/cli/cli.test.ts +230 -0
  59. package/tests/unit/cli/options.test.ts +316 -0
  60. package/tests/unit/core/bundler.test.ts +287 -0
  61. package/tests/unit/core/extractor.test.ts +1129 -0
  62. package/tests/unit/core/minifier.test.ts +414 -0
  63. package/tests/unit/core/packer.test.ts +193 -0
  64. package/tests/unit/core/parser.test.ts +540 -0
  65. package/tests/unit/core/web-fetcher.test.ts +374 -0
  66. package/tests/unit/index.test.ts +339 -0
  67. package/tests/unit/utils/font.test.ts +81 -0
  68. package/tests/unit/utils/logger.test.ts +275 -0
  69. package/tests/unit/utils/meta.test.ts +70 -0
  70. package/tests/unit/utils/mime.test.ts +96 -0
  71. package/tests/unit/utils/slugify.test.ts +71 -0
  72. package/tsconfig.build.json +11 -0
  73. package/tsconfig.jest.json +17 -0
  74. package/tsconfig.json +20 -0
  75. package/tsup.config.ts +71 -0
  76. package/typedoc.json +28 -0
@@ -0,0 +1,233 @@
1
+ /**
2
+ * @file src/core/minifier.ts
3
+ * @description
4
+ * Provides the core functionality for minifying HTML, CSS, and JavaScript content
5
+ * within the PortaPack bundling process. Uses `html-minifier-terser`, `clean-css`,
6
+ * and `terser` libraries. Handles errors gracefully by logging warnings and returning
7
+ * original content for the specific asset that failed minification.
8
+ * Includes workarounds for apparent issues in @types/clean-css definitions.
9
+ */
10
+
11
+ // --- Imports ---
12
+ import { minify as htmlMinify } from 'html-minifier-terser';
13
+ import type { Options as HtmlMinifyOptions } from 'html-minifier-terser';
14
+ import CleanCSS from 'clean-css';
15
+ // Import specific types from clean-css. Note: Using these directly caused issues.
16
+ import type { Options as CleanCSSOptions } from 'clean-css';
17
+ import { minify as jsMinify } from 'terser';
18
+ import type { MinifyOptions, MinifyOutput } from 'terser';
19
+ // Import necessary types from project - ensure these paths are correct and use .js extension
20
+ import type { ParsedHTML, BundleOptions, Asset } from '../types.js';
21
+ import { Logger } from '../utils/logger.js';
22
+
23
+ // --- Helper Interface for Workaround ---
24
+
25
/**
 * Shape this module expects back from a synchronous `CleanCSS#minify` call.
 *
 * The return value of `minify` is cast to this interface rather than relying
 * on the types shipped with `@types/clean-css`. Every field is optional, so
 * consumers must check for presence before reading a value.
 *
 * Must stay exported.
 */
export interface CleanCSSSyncResult {
  /** Minified CSS text; used as the asset's new content when non-empty. */
  styles?: string;

  /** Error messages; a non-empty list is treated as a failed minification. */
  errors?: string[];

  /** Warning messages; these are logged but do not fail the minification. */
  warnings?: string[];

  /** Size figures before and after minification (units are not shown here). */
  stats?: {
    originalSize: number;
    minifiedSize: number;
  };
}
38
+
39
+ // --- Default Minification Options Constants ---
40
+
41
/**
 * Baseline configuration passed to html-minifier-terser.
 *
 * `minifyCSS` and `minifyJS` are off in this baseline; `minifyAssets` spreads
 * this object and overrides both per call from the bundle's minification flags.
 */
const HTML_MINIFY_OPTIONS: HtmlMinifyOptions = {
  // Whitespace and comments
  collapseWhitespace: true,
  removeComments: true,
  conservativeCollapse: true,

  // Embedded CSS/JS: disabled by default, overridden at the call site
  minifyCSS: false,
  minifyJS: false,

  // Attribute and doctype clean-up (attribute quotes are kept)
  removeAttributeQuotes: false,
  removeRedundantAttributes: true,
  removeScriptTypeAttributes: true,
  removeStyleLinkTypeAttributes: true,
  useShortDoctype: true,
};
56
+
57
/**
 * Baseline configuration passed to the clean-css constructor.
 *
 * `returnPromise` is pinned to `false` so that `minify` is used synchronously;
 * the caller casts the result to `CleanCSSSyncResult` on that assumption.
 * Note: type checking against `@types/clean-css` has been problematic for this
 * option set, which is why the constructor call site suppresses a type error.
 */
const CSS_MINIFY_OPTIONS: CleanCSSOptions = {
  returnPromise: false,
  level: {
    // Level 1: per-property optimizations
    1: {
      optimizeBackground: true,
      optimizeBorderRadius: true,
      optimizeFilter: true,
      optimizeFontWeight: true,
      optimizeOutline: true,
    },
    // Level 2: rule-level merging, de-duplication and restructuring
    2: {
      mergeMedia: true,
      mergeNonAdjacentRules: true,
      removeDuplicateFontRules: true,
      removeDuplicateMediaBlocks: true,
      removeDuplicateRules: true,
      restructureRules: true,
    },
  },
};
82
+
83
/**
 * Baseline configuration passed to terser for JavaScript assets.
 *
 * Class and function names are kept in both the compress and mangle stages,
 * `console` calls are kept, `debugger` statements are dropped, and comments
 * are omitted from the output.
 */
const JS_MINIFY_OPTIONS: MinifyOptions = {
  compress: {
    dead_code: true,
    drop_console: false,
    drop_debugger: true,
    ecma: 2020,
    keep_classnames: true,
    keep_fnames: true,
  },
  mangle: {
    keep_classnames: true,
    keep_fnames: true,
  },
  format: {
    comments: false,
  },
};
101
+
102
+ // --- Main Minification Function ---
103
+
104
/**
 * Produces a minified copy of a parsed document and its assets.
 *
 * Each of HTML, CSS and JS minification is enabled unless the matching option
 * is explicitly `false`. CSS assets go through clean-css (synchronously), JS
 * assets through terser, and the main HTML through html-minifier-terser.
 * A failure for one asset, or for the HTML, is logged as a warning and the
 * original content of that item is kept; this function does not reject because
 * of a minifier error.
 *
 * The input is not mutated: every asset is shallow-copied before its content
 * is replaced, and a new result object is returned.
 *
 * @param parsed - Parsed document; missing `htmlContent`/`assets` are treated as empty.
 * @param options - Bundle options; only `minifyHtml`, `minifyCss` and `minifyJs` are read.
 * @param logger - Optional logger for debug and warning output.
 * @returns A new `ParsedHTML` holding the (possibly) minified HTML and asset copies.
 */
export async function minifyAssets(
  parsed: ParsedHTML,
  options: BundleOptions = {},
  logger?: Logger
): Promise<ParsedHTML> {
  const sourceHtml = parsed.htmlContent ?? '';
  const sourceAssets = parsed.assets ?? [];

  // Nothing to process when there is neither markup nor any asset.
  if (!sourceHtml && sourceAssets.length === 0) {
    logger?.debug('Minification skipped: No content.');
    return { htmlContent: sourceHtml, assets: sourceAssets };
  }

  // Key order is kept stable because the object is serialized into the log line.
  const minifyFlags = {
    minifyHtml: options.minifyHtml !== false,
    minifyCss: options.minifyCss !== false,
    minifyJs: options.minifyJs !== false,
  };
  logger?.debug(`Minification flags: ${JSON.stringify(minifyFlags)}`);

  /** Minifies a single asset and returns a shallow copy; never throws for minifier errors. */
  const minifyOne = async (asset: Asset): Promise<Asset> => {
    const copy = { ...asset };
    const original = copy.content;

    // Only non-empty string content can be minified; everything else passes through.
    if (typeof original !== 'string' || original.length === 0) {
      return copy;
    }

    const label = copy.url || `inline ${copy.type}`;
    let minified = original;

    try {
      if (minifyFlags.minifyCss && copy.type === 'css') {
        logger?.debug(`Minifying CSS: ${label}`);

        // @ts-ignore - Suppresses TS2769 from the @types/clean-css constructor overloads in sync mode.
        const cleaner = new CleanCSS(CSS_MINIFY_OPTIONS);
        // The sync result is cast because the published typings do not model it reliably.
        const cssOutput = cleaner.minify(original) as CleanCSSSyncResult;

        const cssErrors = cssOutput.errors ?? [];
        if (cssErrors.length > 0) {
          logger?.warn(`⚠️ CleanCSS failed for ${label}: ${cssErrors.join(', ')}`);
        } else {
          const cssWarnings = cssOutput.warnings ?? [];
          if (cssWarnings.length > 0) {
            logger?.debug(`CleanCSS warnings for ${label}: ${cssWarnings.join(', ')}`);
          }
          if (cssOutput.styles) {
            minified = cssOutput.styles;
            logger?.debug(`CSS minified successfully: ${label}`);
          } else {
            logger?.warn(`⚠️ CleanCSS produced no styles but reported no errors for ${label}. Keeping original.`);
          }
        }
      }

      if (minifyFlags.minifyJs && copy.type === 'js') {
        logger?.debug(`Minifying JS: ${label}`);
        const jsOutput: MinifyOutput = await jsMinify(original, JS_MINIFY_OPTIONS);
        if (jsOutput.code) {
          minified = jsOutput.code;
          logger?.debug(`JS minified successfully: ${label}`);
        } else {
          // Some terser results may carry an `error` field that the typings do not declare.
          // eslint-disable-next-line @typescript-eslint/no-explicit-any
          const terserError = (jsOutput as any).error;
          if (terserError) {
            logger?.warn(`⚠️ Terser failed for ${label}: ${terserError.message || terserError}`);
          } else {
            logger?.warn(`⚠️ Terser produced no code but reported no errors for ${label}. Keeping original.`);
          }
        }
      }
    } catch (failure: unknown) {
      // A thrown minifier error leaves `minified` at whatever it held before the throw.
      const reason = failure instanceof Error ? failure.message : String(failure);
      logger?.warn(`⚠️ Failed to minify asset ${label} (${copy.type}): ${reason}`);
    }

    copy.content = minified;
    return copy;
  };

  const minifiedAssets: Asset[] = await Promise.all(sourceAssets.map(asset => minifyOne(asset)));

  // Main HTML: minify when enabled, otherwise just note that it was skipped.
  let finalHtml = sourceHtml;
  if (finalHtml.length > 0) {
    if (minifyFlags.minifyHtml) {
      logger?.debug('Minifying HTML content...');
      try {
        finalHtml = await htmlMinify(finalHtml, {
          ...HTML_MINIFY_OPTIONS,
          minifyCSS: minifyFlags.minifyCss,
          minifyJS: minifyFlags.minifyJs,
        });
        logger?.debug('HTML minified successfully.');
      } catch (failure: unknown) {
        const reason = failure instanceof Error ? failure.message : String(failure);
        logger?.warn(`⚠️ HTML minification failed: ${reason}`);
      }
    } else {
      logger?.debug('HTML minification skipped (disabled).');
    }
  }

  return {
    htmlContent: finalHtml,
    assets: minifiedAssets,
  };
}
@@ -0,0 +1,191 @@
1
+ /**
2
+ * @file src/core/packer.ts
3
+ * @description Inlines CSS, JS, and images into an HTML document for full portability.
4
+ * Uses Cheerio for safe DOM manipulation.
5
+ */
6
+
7
+ import * as cheerio from 'cheerio';
8
+ // Import CheerioAPI type
9
+ import type { CheerioAPI } from 'cheerio';
10
+ import type { ParsedHTML, Asset } from '../types'; // Assuming correct path
11
+ import { Logger } from '../utils/logger'; // Assuming correct path
12
+ import { guessMimeType } from '../utils/mime'; // Assuming correct path
13
+
14
/**
 * Makes JavaScript source safe to place inside an inline `<script>` element by
 * rewriting every `</script` (any letter case) to `<\/script`, so the text
 * cannot close the surrounding tag early. The original casing of the tag name
 * is preserved.
 */
function escapeScriptContent(code: string): string {
  return code.replace(/<\/(script)/gi, (_match: string, tagName: string) => '<\\/' + tagName);
}
20
+
21
/**
 * Guarantees that the document's `<head>` contains a `<base>` element with an
 * `href`, inserting `<base href="./">` as the first child of `<head>` if none
 * is present.
 *
 * When the document has no `<head>`, one is created. When it also has no
 * `<html>`, the existing root markup is moved into a new `<body>` inside a
 * freshly created `<html>`.
 *
 * @param $ - Cheerio instance for the document; it is modified in place.
 * @param logger - Optional logger for debug output.
 */
function ensureBaseTag($: CheerioAPI, logger?: Logger): void {
  let headSelection = $('head');

  if (headSelection.length === 0) {
    logger?.debug('No <head> tag found. Creating <head> and ensuring <html> exists.');
    const existingHtml = $('html');

    if (existingHtml.length > 0) {
      // <html> is present: put the new <head> in front of its other children.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      headSelection = $('<head>').prependTo(existingHtml) as any;
    } else {
      // No <html> at all: rebuild the skeleton around the existing markup.
      logger?.debug('No <html> tag found. Wrapping content in <html><body>...');
      const previousMarkup = $.root().html() || '';
      $.root().empty();
      const createdHtml = $('<html>').appendTo($.root());
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      headSelection = $('<head>').appendTo(createdHtml) as any;
      $('<body>').html(previousMarkup).appendTo(createdHtml);
    }
  }

  // Defensive: bail out if a usable <head> selection still could not be obtained.
  if (!headSelection || headSelection.length === 0) {
    return;
  }
  // An existing <base href> is left untouched.
  if (headSelection.find('base[href]').length !== 0) {
    return;
  }

  logger?.debug('Prepending <base href="./"> to <head>.');
  headSelection.prepend('<base href="./">');
}
61
+
62
+
63
/**
 * Rewrites the document so that references to known assets are replaced by the
 * asset content itself.
 *
 * Assets are looked up by the exact attribute value found in the markup
 * (`asset.url` must equal the `href`/`src`/`poster`/`srcset` URL verbatim).
 *
 * - Stylesheet links become `<style>` elements. If the asset content is
 *   itself a `data:` URI, the `<style>` holds an `@import url(...)` of it.
 * - External scripts become inline `<script>` elements that keep every
 *   attribute except `src`; the code is escaped so it cannot close the tag.
 * - Images, video posters, image inputs, `srcset` candidates and audio/video
 *   sources are rewritten only when the asset content is a `data:` URI.
 *
 * A warning is logged for a stylesheet, script or image reference that could
 * not be inlined. References that are already `data:` URIs are skipped
 * silently: they are self-contained, and warning about them would be
 * misleading and would dump the whole URI into the log.
 *
 * @param $ - Cheerio instance for the document; it is modified in place.
 * @param assets - Assets whose `content` (when present) is used for inlining.
 * @param logger - Optional logger for debug and warning output.
 */
function inlineAssets($: CheerioAPI, assets: Asset[], logger?: Logger): void {
  logger?.debug(`Inlining ${assets.filter(a => a.content).length} assets with content...`);
  const assetMap = new Map<string, Asset>(assets.map(asset => [asset.url, asset]));

  /** True when the reference in the markup is already an embedded data URI. */
  const isAlreadyInline = (ref: string): boolean => ref.startsWith('data:');

  /** Returns the asset's content for `ref` only when it is a string data URI. */
  const dataUriFor = (ref?: string): string | undefined => {
    const content = ref ? assetMap.get(ref)?.content : undefined;
    return typeof content === 'string' && content.startsWith('data:') ? content : undefined;
  };

  // 1. Stylesheets: <link rel="stylesheet" href="..."> -> <style>
  $('link[rel="stylesheet"][href]').each((_, el) => {
    const link = $(el);
    const href = link.attr('href');
    const asset = href ? assetMap.get(href) : undefined;
    if (asset?.content && typeof asset.content === 'string') {
      if (asset.content.startsWith('data:')) {
        logger?.debug(`Replacing link with style tag using existing data URI: ${asset.url}`);
        link.replaceWith($('<style>').text(`@import url("${asset.content}");`));
      } else {
        logger?.debug(`Inlining CSS: ${asset.url}`);
        link.replaceWith($('<style>').text(asset.content));
      }
    } else if (href && !isAlreadyInline(href)) {
      logger?.warn(`Could not inline CSS: ${href}. Content missing or invalid.`);
    }
  });

  // 2. Scripts: <script src="..."> -> inline <script>, keeping all other attributes
  $('script[src]').each((_, el) => {
    const script = $(el);
    const src = script.attr('src');
    const asset = src ? assetMap.get(src) : undefined;
    if (asset?.content && typeof asset.content === 'string') {
      logger?.debug(`Inlining JS: ${asset.url}`);
      const inlineScript = $('<script>');
      inlineScript.text(escapeScriptContent(asset.content));
      for (const [key, value] of Object.entries(script.attr() || {})) {
        if (key.toLowerCase() !== 'src') inlineScript.attr(key, value);
      }
      script.replaceWith(inlineScript);
    } else if (src && !isAlreadyInline(src)) {
      logger?.warn(`Could not inline JS: ${src}. Content missing or not string.`);
    }
  });

  // 3. Images: <img src>, <video poster>, <input type="image" src>
  $('img[src], video[poster], input[type="image"][src]').each((_, el) => {
    const element = $(el);
    const srcAttr = element.is('video') ? 'poster' : 'src';
    const src = element.attr(srcAttr);
    const dataUri = dataUriFor(src);
    if (dataUri) {
      // `src` equals the asset's url here because the map is keyed by url.
      logger?.debug(`Inlining image via ${srcAttr}: ${src}`);
      element.attr(srcAttr, dataUri);
    } else if (src && !isAlreadyInline(src)) {
      logger?.warn(`Could not inline image via ${srcAttr}: ${src}. Content missing or not a data URI.`);
    }
  });

  // 4. srcset candidates: <img srcset>, <source srcset>
  // NOTE: candidates are split on commas, so URLs that themselves contain
  // commas are not supported by this step.
  $('img[srcset], source[srcset]').each((_, el) => {
    const element = $(el);
    const srcset = element.attr('srcset');
    if (!srcset) return;

    let changed = false;
    const rewritten = srcset.split(',').map(part => {
      const candidate = part.trim();
      const [url, descriptor] = candidate.split(/\s+/, 2);
      const dataUri = dataUriFor(url);
      if (!dataUri) return candidate;
      changed = true;
      return `${dataUri}${descriptor ? ' ' + descriptor : ''}`;
    });

    // Leave the attribute byte-for-byte untouched unless something was inlined.
    if (changed) {
      element.attr('srcset', rewritten.join(', '));
    }
  });

  // 5. Media sources: <video src>, <audio src> and their <source src> children
  $('video[src], audio[src], video > source[src], audio > source[src]').each((_, el) => {
    const element = $(el);
    const src = element.attr('src');
    const dataUri = dataUriFor(src);
    if (dataUri) {
      logger?.debug(`Inlining media source: ${src}`);
      element.attr('src', dataUri);
    }
  });

  logger?.debug('Asset inlining process complete.');
}
158
+
159
+
160
/**
 * Packs a parsed document into a single self-contained HTML string.
 *
 * The HTML is loaded into Cheerio, a `<base href="./">` is added if the
 * document has no `<base href>`, and every asset that has content is inlined.
 *
 * Invalid input is tolerated rather than rejected: empty or non-string
 * `htmlContent` yields a minimal HTML shell, and a missing `assets` list is
 * treated as empty (previously it caused a TypeError during inlining).
 *
 * @param parsed - Parsed document with its assets (which may carry content).
 * @param logger - Optional logger for debug and warning output.
 * @returns The packed HTML, or a minimal HTML shell when `htmlContent` is unusable.
 */
export function packHTML(parsed: ParsedHTML, logger?: Logger): string {
  const { htmlContent, assets } = parsed;
  if (!htmlContent || typeof htmlContent !== 'string') {
    logger?.warn('Packer received empty or invalid htmlContent. Returning minimal HTML shell.');
    return '<!DOCTYPE html><html><head><base href="./"></head><body></body></html>';
  }

  logger?.debug('Loading HTML content into Cheerio for packing...');
  const $ = cheerio.load(htmlContent);

  logger?.debug('Ensuring <base> tag exists...');
  ensureBaseTag($, logger);

  logger?.debug('Starting asset inlining...');
  // Default to an empty list so a document without assets still packs.
  inlineAssets($, assets ?? [], logger);

  logger?.debug('Generating final packed HTML string...');
  const finalHtml = $.html();

  logger?.debug(`Packing complete. Final size: ${Buffer.byteLength(finalHtml)} bytes.`);
  return finalHtml;
}
@@ -0,0 +1,115 @@
1
+ /**
2
+ * @file src/core/parser.ts
3
+ * @description
4
+ * Parses an HTML file using Cheerio to extract the basic structure
5
+ * and identify top-level linked assets (CSS, JS, images, fonts, video, audio etc.).
6
+ * It relies on tag names, link relations, and file extensions to guess asset types.
7
+ * It does *not* fetch or analyze the content of linked assets. Inline styles/scripts
8
+ * and data URIs are ignored. Duplicate asset URLs are ignored.
9
+ */
10
+
11
+ // FIX: Use only the named import for readFile
12
+ import { readFile } from 'fs/promises';
13
+ // NOTE: 'path' module was imported but not used, so removed. Add back if needed later.
14
+ // import path from 'path';
15
+ import * as cheerio from 'cheerio';
16
+ import type { CheerioAPI } from 'cheerio';
17
+ import type { Asset, ParsedHTML } from '../types.js';
18
+ import { Logger } from '../utils/logger.js';
19
+ import { guessMimeType } from '../utils/mime.js';
20
+
21
/**
 * Reads an HTML file and lists the external assets referenced by its markup.
 *
 * Looks at `<link>`, `<script>`, `<img>`, `<input type="image">`, `srcset`
 * attributes, `<video>`, `<audio>` and their `<source>` children, icon and
 * manifest links, and preloaded fonts. Asset content is not fetched, and
 * references inside CSS are not followed. Empty URLs and `data:` URIs are
 * skipped. A URL is recorded once only; the type assigned by the first match
 * (in the order of the checks below) is the one that is kept.
 *
 * @param entryFilePath - Absolute or relative path to the input HTML file.
 * @param logger - Optional logger instance.
 * @returns The raw HTML text and the discovered assets (`type` and `url` only).
 * @throws {Error} If the file cannot be read; the underlying failure is attached as `cause`.
 */
export async function parseHTML(entryFilePath: string, logger?: Logger): Promise<ParsedHTML> {
  logger?.debug(`Parsing HTML file: ${entryFilePath}`);
  let htmlContent: string;
  try {
    htmlContent = await readFile(entryFilePath, 'utf-8');
    logger?.debug(`Successfully read HTML file (${Buffer.byteLength(htmlContent)} bytes).`);
  } catch (err: unknown) {
    // Narrow before reading `.message`: a thrown value is not guaranteed to be an Error.
    const reason = err instanceof Error ? err.message : String(err);
    logger?.error(`Failed to read HTML file "${entryFilePath}": ${reason}`);
    throw new Error(`Could not read input HTML file: ${entryFilePath}`, { cause: err });
  }

  const $: CheerioAPI = cheerio.load(htmlContent);
  const assets: Asset[] = [];
  const addedUrls = new Set<string>();

  /** Records `url` once; the type is `forcedType` or, failing that, guessed from the URL. */
  const addAsset = (url?: string, forcedType?: Asset['type']): void => {
    if (!url || url.trim() === '' || url.startsWith('data:')) {
      return;
    }
    if (addedUrls.has(url)) {
      logger?.debug(`Skipping duplicate asset URL: ${url}`);
      return;
    }
    addedUrls.add(url);
    const mimeInfo = guessMimeType(url);
    const type = forcedType ?? mimeInfo.assetType;
    assets.push({ type, url });
    logger?.debug(`Discovered asset: Type='${type}', URL='${url}'`);
  };

  /** Adds the given attribute of every element matching `selector` as an asset of `type`. */
  const collect = (selector: string, attribute: string, type: Asset['type']): void => {
    $(selector).each((_, el) => {
      addAsset($(el).attr(attribute), type);
    });
  };

  logger?.debug('Extracting assets from HTML tags...');

  // The order of these checks decides which type wins for a URL referenced more than once.
  collect('link[rel="stylesheet"][href]', 'href', 'css');
  collect('script[src]', 'src', 'js');
  collect('img[src]', 'src', 'image');
  collect('input[type="image"][src]', 'src', 'image');

  // srcset: every comma-separated candidate contributes its URL (the first token).
  $('img[srcset], picture source[srcset]').each((_, el) => {
    const srcset = $(el).attr('srcset');
    for (const entry of srcset?.split(',') ?? []) {
      const [url] = entry.trim().split(/\s+/);
      addAsset(url, 'image');
    }
  });

  collect('video[src]', 'src', 'video');
  collect('video[poster]', 'poster', 'image');
  collect('audio[src]', 'src', 'audio');
  collect('video > source[src]', 'src', 'video');
  collect('audio > source[src]', 'src', 'audio');

  // Icons are forced to 'image'; the manifest type is guessed from its URL.
  const iconRels = ['icon', 'shortcut icon', 'apple-touch-icon'];
  $('link[href]').each((_, el) => {
    const link = $(el);
    const rel = link.attr('rel')?.toLowerCase() ?? '';
    if (iconRels.includes(rel)) {
      addAsset(link.attr('href'), 'image');
    } else if (rel === 'manifest') {
      addAsset(link.attr('href'), undefined);
    }
  });

  collect('link[rel="preload"][as="font"][href]', 'href', 'font');

  logger?.info(`HTML parsing complete. Discovered ${assets.length} unique asset links.`);
  return { htmlContent, assets };
}