design-clone 2.1.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. package/README.md +13 -34
  2. package/SKILL.md +69 -45
  3. package/bin/cli.js +22 -4
  4. package/bin/commands/clone-site.js +31 -171
  5. package/bin/commands/help.js +19 -6
  6. package/bin/commands/init.js +9 -86
  7. package/bin/commands/uninstall.js +105 -0
  8. package/bin/commands/update.js +70 -0
  9. package/bin/commands/verify.js +7 -14
  10. package/bin/utils/paths.js +28 -0
  11. package/bin/utils/validate.js +2 -22
  12. package/bin/utils/version.js +23 -0
  13. package/docs/code-standards.md +789 -0
  14. package/docs/codebase-summary.md +533 -286
  15. package/docs/index.md +74 -0
  16. package/docs/project-overview-pdr.md +797 -0
  17. package/docs/system-architecture.md +718 -0
  18. package/package.json +14 -17
  19. package/src/ai/prompts/design-tokens/basic.md +80 -0
  20. package/src/ai/prompts/design-tokens/section-with-css.md +41 -0
  21. package/src/ai/prompts/design-tokens/section.md +48 -0
  22. package/src/ai/prompts/design-tokens/with-css.md +87 -0
  23. package/src/ai/prompts/structure-analysis/basic.md +55 -0
  24. package/src/ai/prompts/structure-analysis/with-context.md +59 -0
  25. package/src/ai/prompts/structure-analysis/with-dimensions.md +63 -0
  26. package/src/ai/prompts/structure-analysis/with-hierarchy.md +73 -0
  27. package/src/ai/prompts/ux-audit/aggregation.md +42 -0
  28. package/src/ai/prompts/ux-audit/desktop.md +92 -0
  29. package/src/ai/prompts/ux-audit/mobile.md +93 -0
  30. package/src/ai/prompts/ux-audit/tablet.md +92 -0
  31. package/src/core/animation/animation-extractor-ast.js +183 -0
  32. package/src/core/animation/animation-extractor-output.js +152 -0
  33. package/src/core/animation/animation-extractor.js +178 -0
  34. package/src/core/animation/state-capture-detection.js +200 -0
  35. package/src/core/animation/state-capture.js +193 -0
  36. package/src/core/capture/browser-context-pool.js +96 -0
  37. package/src/core/capture/multi-page-screenshot-page.js +110 -0
  38. package/src/core/capture/multi-page-screenshot.js +208 -0
  39. package/src/core/capture/screenshot-extraction.js +186 -0
  40. package/src/core/capture/screenshot-helpers.js +175 -0
  41. package/src/core/capture/screenshot-orchestrator.js +174 -0
  42. package/src/core/capture/screenshot-viewport.js +93 -0
  43. package/src/core/capture/screenshot.js +192 -0
  44. package/src/core/content/content-counter-dom.js +191 -0
  45. package/src/core/content/content-counter.js +76 -0
  46. package/src/core/css/breakpoint-detector.js +66 -0
  47. package/src/core/css/chromium-defaults.json +23 -0
  48. package/src/core/css/computed-style-extractor.js +102 -0
  49. package/src/core/css/css-chunker.js +103 -0
  50. package/src/core/css/filter-css-dead-code.js +120 -0
  51. package/src/core/css/filter-css-html-analyzer.js +110 -0
  52. package/src/core/css/filter-css-selector-matcher.js +172 -0
  53. package/src/core/css/filter-css.js +206 -0
  54. package/src/core/css/merge-css-atrule-processor.js +158 -0
  55. package/src/core/css/merge-css-file-io.js +68 -0
  56. package/src/core/css/merge-css.js +148 -0
  57. package/src/core/detection/framework-detector-routing.js +68 -0
  58. package/src/core/detection/framework-detector-signals.js +65 -0
  59. package/src/core/detection/framework-detector.js +198 -0
  60. package/src/core/dimension/dimension-extractor-card-detector.js +82 -0
  61. package/src/core/dimension/dimension-extractor.js +317 -0
  62. package/src/core/dimension/dimension-output-ai-summary.js +111 -0
  63. package/src/core/dimension/dimension-output.js +173 -0
  64. package/src/core/dimension/dom-tree-analyzer-tree-builders.js +95 -0
  65. package/src/core/dimension/dom-tree-analyzer.js +191 -0
  66. package/src/core/discovery/app-state-snapshot-capture.js +195 -0
  67. package/src/core/discovery/app-state-snapshot-utils.js +178 -0
  68. package/src/core/discovery/app-state-snapshot.js +131 -0
  69. package/src/core/discovery/discover-pages-routes.js +84 -0
  70. package/src/core/discovery/discover-pages-utils.js +177 -0
  71. package/src/core/discovery/discover-pages.js +191 -0
  72. package/src/core/html/html-extractor-inline-styler.js +70 -0
  73. package/src/core/html/html-extractor.js +147 -0
  74. package/src/core/html/semantic-enhancer-mappings.js +200 -0
  75. package/src/core/html/semantic-enhancer-page.js +148 -0
  76. package/src/core/html/semantic-enhancer.js +135 -0
  77. package/src/core/links/rewrite-links-css-rewriter.js +53 -0
  78. package/src/core/links/rewrite-links.js +173 -0
  79. package/src/core/media/asset-validator.js +118 -0
  80. package/src/core/media/extract-assets-downloader.js +187 -0
  81. package/src/core/media/extract-assets-page-scraper.js +115 -0
  82. package/src/core/media/extract-assets.js +159 -0
  83. package/src/core/media/video-capture-convert.js +200 -0
  84. package/src/core/media/video-capture.js +201 -0
  85. package/src/core/{lazy-loader.js → page-prep/lazy-loader.js} +37 -39
  86. package/src/core/section/section-cropper-helpers.js +43 -0
  87. package/src/core/{section-cropper.js → section/section-cropper.js} +11 -88
  88. package/src/core/section/section-detector-strategies.js +139 -0
  89. package/src/core/section/section-detector-utils.js +100 -0
  90. package/src/core/section/section-detector.js +88 -0
  91. package/src/core/tests/test-section-cropper.js +2 -2
  92. package/src/core/tests/test-section-detector.js +2 -2
  93. package/src/post-process/enhance-assets.js +29 -4
  94. package/src/post-process/fetch-images-unsplash-client.js +123 -0
  95. package/src/post-process/fetch-images.js +60 -263
  96. package/src/post-process/inject-gosnap.js +88 -0
  97. package/src/post-process/inject-icons-svg-replacer.js +76 -0
  98. package/src/post-process/inject-icons.js +47 -200
  99. package/src/route-discoverers/base-discoverer-utils.js +137 -0
  100. package/src/route-discoverers/base-discoverer.js +29 -118
  101. package/src/route-discoverers/index.js +1 -1
  102. package/src/shared/config.js +38 -0
  103. package/src/shared/error-codes.js +31 -0
  104. package/src/shared/viewports.js +46 -0
  105. package/src/utils/browser.js +0 -7
  106. package/src/utils/helpers.js +4 -0
  107. package/src/utils/log.js +12 -0
  108. package/src/utils/playwright-loader.js +76 -0
  109. package/src/utils/playwright.js +3 -69
  110. package/src/utils/progress.js +32 -0
  111. package/src/verification/generate-audit-report-css-fixes.js +52 -0
  112. package/src/verification/generate-audit-report-sections.js +158 -0
  113. package/src/verification/generate-audit-report.js +5 -281
  114. package/src/verification/quality-scorer.js +92 -0
  115. package/src/verification/verify-footer-checks.js +103 -0
  116. package/src/verification/verify-footer-helpers.js +178 -0
  117. package/src/verification/verify-footer.js +23 -381
  118. package/src/verification/verify-header-checks.js +104 -0
  119. package/src/verification/verify-header-helpers.js +156 -0
  120. package/src/verification/verify-header.js +23 -365
  121. package/src/verification/verify-layout-report.js +101 -0
  122. package/src/verification/verify-layout.js +13 -259
  123. package/src/verification/verify-menu-checks.js +104 -0
  124. package/src/verification/verify-menu-helpers.js +112 -0
  125. package/src/verification/verify-menu.js +17 -285
  126. package/src/verification/verify-slider-checks.js +115 -0
  127. package/src/verification/verify-slider-constants.js +65 -0
  128. package/src/verification/verify-slider-helpers.js +164 -0
  129. package/src/verification/verify-slider.js +23 -414
  130. package/.env.example +0 -14
  131. package/docs/basic-clone.md +0 -63
  132. package/docs/cli-reference.md +0 -316
  133. package/docs/design-clone-architecture.md +0 -492
  134. package/docs/pixel-perfect.md +0 -117
  135. package/docs/project-roadmap.md +0 -382
  136. package/docs/troubleshooting.md +0 -170
  137. package/requirements.txt +0 -5
  138. package/src/ai/__pycache__/analyze-structure.cpython-313.pyc +0 -0
  139. package/src/ai/__pycache__/extract-design-tokens.cpython-313.pyc +0 -0
  140. package/src/ai/analyze-structure.py +0 -375
  141. package/src/ai/extract-design-tokens.py +0 -782
  142. package/src/ai/prompts/__init__.py +0 -2
  143. package/src/ai/prompts/__pycache__/__init__.cpython-313.pyc +0 -0
  144. package/src/ai/prompts/__pycache__/design_tokens.cpython-313.pyc +0 -0
  145. package/src/ai/prompts/__pycache__/structure_analysis.cpython-313.pyc +0 -0
  146. package/src/ai/prompts/__pycache__/ux_audit.cpython-313.pyc +0 -0
  147. package/src/ai/prompts/design_tokens.py +0 -316
  148. package/src/ai/prompts/structure_analysis.py +0 -592
  149. package/src/ai/prompts/ux_audit.py +0 -198
  150. package/src/ai/ux-audit.js +0 -596
  151. package/src/core/animation-extractor.js +0 -526
  152. package/src/core/app-state-snapshot.js +0 -511
  153. package/src/core/content-counter.js +0 -342
  154. package/src/core/design-tokens.js +0 -103
  155. package/src/core/dimension-extractor.js +0 -438
  156. package/src/core/dimension-output.js +0 -305
  157. package/src/core/discover-pages.js +0 -542
  158. package/src/core/dom-tree-analyzer.js +0 -298
  159. package/src/core/extract-assets.js +0 -468
  160. package/src/core/filter-css.js +0 -499
  161. package/src/core/framework-detector.js +0 -538
  162. package/src/core/html-extractor.js +0 -212
  163. package/src/core/merge-css.js +0 -407
  164. package/src/core/multi-page-screenshot.js +0 -380
  165. package/src/core/rewrite-links.js +0 -226
  166. package/src/core/screenshot.js +0 -701
  167. package/src/core/section-detector.js +0 -386
  168. package/src/core/semantic-enhancer.js +0 -492
  169. package/src/core/state-capture.js +0 -598
  170. package/src/core/video-capture.js +0 -546
  171. package/src/utils/__init__.py +0 -16
  172. package/src/utils/__pycache__/__init__.cpython-313.pyc +0 -0
  173. package/src/utils/__pycache__/env.cpython-313.pyc +0 -0
  174. package/src/utils/env.py +0 -134
  175. /package/src/core/{css-extractor.js → css/css-extractor.js} +0 -0
  176. /package/src/core/{cookie-handler.js → page-prep/cookie-handler.js} +0 -0
  177. /package/src/core/{page-readiness.js → page-prep/page-readiness.js} +0 -0
@@ -0,0 +1,173 @@
1
+ /**
2
+ * Link Rewriting Module
3
+ *
4
+ * Rewrites internal links in HTML to point to local .html files.
5
+ * Preserves external links unchanged.
6
+ * CSS link rewriting lives in rewrite-links-css-rewriter.js.
7
+ *
8
+ * Usage:
9
+ * import { rewriteLinks, createPageManifest } from '../links/rewrite-links.js';
10
+ * const rewritten = rewriteLinks(html, manifest, { baseUrl });
11
+ */
12
+
13
+ import fs from 'fs/promises';
14
+ import path from 'path';
15
+ import { normalizeUrl } from '../discovery/discover-pages-utils.js';
16
+ import { rewriteCssLinks } from './rewrite-links-css-rewriter.js';
17
+
18
/**
 * Convert a URL path to a flat local HTML filename.
 *
 * Leading and trailing slashes are dropped, every remaining character outside
 * [a-z0-9-] (including inner slashes) becomes '-', runs of '-' collapse to a
 * single '-', and the result is lower-cased. A missing, empty or slash-only
 * path maps to 'index.html'.
 *
 * @param {string} urlPath - e.g. '/about', '/services/consulting'
 * @returns {string} e.g. 'about.html', 'services-consulting.html'
 */
export function pathToFilename(urlPath) {
  if (!urlPath) return 'index.html';

  // Strip *all* edge slashes: removing only one left '//' as '' (-> '.html')
  // and '/about//' as 'about-' (-> 'about-.html').
  const trimmed = urlPath.replace(/^\/+|\/+$/g, '');
  if (trimmed === '') return 'index.html';

  const name = trimmed
    .replace(/[^a-z0-9-]/gi, '-')
    .replace(/-+/g, '-')
    .toLowerCase();

  return `${name}.html`;
}
36
+
37
/**
 * Assemble the manifest object for a set of discovered pages.
 *
 * The manifest's base URL is the origin of the first page's `url` (empty
 * string when there is no first page or it has no `url`). Each page entry
 * records its path, display name, local filename and original URL.
 *
 * @param {Array<{ path, name, url }>} pages
 * @param {Object} options
 * @param {boolean} [options.hasTokens] - when truthy, assets.tokens is 'tokens.css'
 * @param {Object} [options.stats] - copied by reference into manifest.stats
 * @returns {Object} Page manifest
 */
export function createPageManifest(pages, options = {}) {
  const firstUrl = pages[0]?.url;
  let baseUrl = '';
  if (firstUrl) {
    baseUrl = new URL(firstUrl).origin;
  }

  const capturedAt = new Date().toISOString();

  const pageEntries = pages.map((entry) => {
    return {
      path: entry.path,
      name: entry.name,
      file: pathToFilename(entry.path),
      originalUrl: entry.url
    };
  });

  let tokens = null;
  if (options.hasTokens) {
    tokens = 'tokens.css';
  }

  return {
    baseUrl,
    capturedAt,
    pages: pageEntries,
    assets: { css: 'styles.css', tokens },
    stats: options.stats || {}
  };
}
62
+
63
/**
 * Build URL-to-filename mapping from manifest (path + full URL variants).
 *
 * For every page, both the original URL and the path are registered, each
 * with and without a single trailing slash (the root path '/' is kept as-is
 * so it never collapses to an empty key).
 *
 * @param {Object} manifest
 * @returns {Map<string, string>}
 */
function buildUrlMap(manifest) {
  const urlMap = new Map();

  for (const page of manifest.pages) {
    if (page.originalUrl) {
      urlMap.set(page.originalUrl, page.file);
      urlMap.set(page.originalUrl.replace(/\/$/, ''), page.file);
    }
    if (page.path) {
      urlMap.set(page.path, page.file);
      if (page.path !== '/') urlMap.set(page.path.replace(/\/$/, ''), page.file);
    }
  }

  return urlMap;
}

/**
 * Rewrite links in HTML to point to local files.
 *
 * Only `<a href>` values that resolve to a page in the manifest are changed;
 * `javascript:`, `mailto:`, `tel:` and pure-fragment links, as well as links
 * that do not match any manifest page, are left untouched. A `#fragment` on
 * a matched link is carried over to the local filename.
 *
 * @param {string} html
 * @param {Object} manifest
 * @param {Object} options
 * @param {string} [options.baseUrl] - enables lookup of normalized URLs
 * @param {boolean} [options.rewriteCss=true]
 * @param {boolean} [options.injectTokensCss=false]
 * @returns {string} HTML with rewritten links
 */
export function rewriteLinks(html, manifest, options = {}) {
  const { baseUrl, rewriteCss = true, injectTokensCss = false } = options;
  const urlMap = buildUrlMap(manifest);

  // Exact match first, then (when a base URL is known) the normalized form.
  const lookup = (candidate) => {
    if (urlMap.has(candidate)) return urlMap.get(candidate);
    if (baseUrl) {
      const normalized = normalizeUrl(baseUrl, candidate);
      if (normalized && urlMap.has(normalized)) return urlMap.get(normalized);
    }
    return null;
  };

  // Rewrite <a href="..."> internal links
  let result = html.replace(
    /(<a\s[^>]*href=["'])([^"']+)(["'][^>]*>)/gi,
    (match, prefix, href, suffix) => {
      if (!href ||
          href.startsWith('javascript:') ||
          href.startsWith('mailto:') ||
          href.startsWith('tel:') ||
          href.startsWith('#')) {
        return match;
      }

      const hashIndex = href.indexOf('#');
      const fragment = hashIndex === -1 ? '' : href.slice(hashIndex);
      const bareHref = hashIndex === -1 ? href : href.slice(0, hashIndex);

      // The manifest keys carry no fragments, so '/about#team' can only be
      // found by retrying with the fragment removed.
      const filename = lookup(href) || (fragment ? lookup(bareHref) : null);
      if (!filename) return match;

      return `${prefix}${filename}${fragment}${suffix}`;
    }
  );

  if (rewriteCss) {
    result = rewriteCssLinks(result, injectTokensCss);
  }

  return result;
}
138
+
139
/**
 * Apply rewriteLinks() in place to every HTML file named in the manifest.
 *
 * Files are handled one after another. A failure to read, rewrite or write a
 * file is recorded (file name + error message) and does not stop the
 * remaining files from being processed.
 *
 * @param {string} htmlDir - directory containing the manifest's page files
 * @param {Object} manifest
 * @param {Object} options - passed through to rewriteLinks()
 * @returns {Promise<{ processed: string[], errors: Array }>}
 */
export async function rewriteAllLinks(htmlDir, manifest, options = {}) {
  const processed = [];
  const errors = [];

  for (const entry of manifest.pages) {
    const target = path.join(htmlDir, entry.file);
    try {
      const original = await fs.readFile(target, 'utf-8');
      const updated = rewriteLinks(original, manifest, options);
      await fs.writeFile(target, updated, 'utf-8');
      processed.push(entry.file);
    } catch (err) {
      errors.push({ file: entry.file, error: err.message });
    }
  }

  return { processed, errors };
}
163
+
164
// CLI stub
// Only treat this file as the entry point when the script that Node was asked
// to run is literally named 'rewrite-links.js'. A substring test would also
// fire for unrelated entry scripts whose path merely contains 'rewrite-links'
// (for example a test runner file or a parent directory name).
const isMainModule = Boolean(process.argv[1]) &&
  path.basename(process.argv[1]) === 'rewrite-links.js';

if (isMainModule) {
  console.log('rewrite-links.js - Use as module, not CLI');
  console.log('Exports: rewriteLinks, createPageManifest, pathToFilename, rewriteAllLinks');
}
@@ -0,0 +1,118 @@
1
+ /**
2
+ * Asset Validator
3
+ *
4
+ * Validates downloaded assets via magic bytes and size thresholds.
5
+ * Sanitizes SVG files by stripping script tags and event handlers.
6
+ */
7
+
8
+ import fs from 'fs/promises';
9
+ import path from 'path';
10
+
11
// Byte sequences expected at offset 0 of each supported binary format.
const MAGIC_BYTES = {
  png: [0x89, 0x50, 0x4E, 0x47],
  jpeg: [0xFF, 0xD8, 0xFF],
  gif: [0x47, 0x49, 0x46],
  woff2: [0x77, 0x4F, 0x46, 0x32],
  woff: [0x77, 0x4F, 0x46, 0x46],
};

// Minimum file sizes in bytes; anything smaller is reported as an issue.
const MIN_SIZES = { images: 100, fonts: 1024 };

/**
 * Validate a single asset file.
 *
 * Checks, in order: that the file exists, that it is not smaller than the
 * minimum for its kind (fonts vs. everything else), and then either
 *  - for `.svg`: scans for scripts, event handlers, `javascript:` URIs and
 *    iframe/object/embed elements; if any are found the file is rewritten
 *    with sanitizeSvg() and the result carries `sanitized: true`. SVG files
 *    are always reported as `valid: true`, including when undersized;
 *  - for extensions listed in MAGIC_BYTES: compares the leading bytes.
 *
 * @param {string} filePath - Path to the file
 * @returns {Promise<{valid: boolean, type: string, issues: string[], sanitized?: boolean}>}
 */
export async function validateAsset(filePath) {
  const issues = [];
  const ext = path.extname(filePath).toLowerCase().slice(1);

  let stat;
  try {
    stat = await fs.stat(filePath);
  } catch {
    return { valid: false, type: ext, issues: ['File not found'] };
  }

  // Size check
  const isFont = ['woff', 'woff2', 'ttf', 'otf', 'eot'].includes(ext);
  const minSize = isFont ? MIN_SIZES.fonts : MIN_SIZES.images;
  if (stat.size < minSize) {
    issues.push(`File too small: ${stat.size} bytes (min: ${minSize})`);
  }

  // SVG check
  if (ext === 'svg') {
    const content = await fs.readFile(filePath, 'utf-8');

    // Kept separate from the size issue so that an undersized but harmless
    // SVG is not rewritten and not reported as "sanitized".
    const unsafe = [];
    if (/<script[\s>]/i.test(content)) unsafe.push('SVG contains script tags');
    if (/\son\w+\s*=/i.test(content)) unsafe.push('SVG contains event handlers');
    if (/javascript\s*:/i.test(content)) unsafe.push('SVG contains javascript: URIs');
    if (/<(iframe|object|embed)[\s>]/i.test(content)) unsafe.push('SVG contains unsafe elements');
    issues.push(...unsafe);

    if (unsafe.length > 0) {
      const sanitized = sanitizeSvg(content);
      await fs.writeFile(filePath, sanitized, 'utf-8');
      return { valid: true, type: 'svg', issues, sanitized: true };
    }
    return { valid: true, type: 'svg', issues };
  }

  // Magic byte check for known types
  const magicKey = ext === 'jpg' ? 'jpeg' : ext;
  const expected = MAGIC_BYTES[magicKey];
  if (expected && stat.size >= expected.length) {
    const buf = Buffer.alloc(expected.length);
    const fh = await fs.open(filePath, 'r');
    try {
      await fh.read(buf, 0, expected.length, 0);
    } finally {
      // Release the descriptor even when the read fails.
      await fh.close();
    }
    const matches = expected.every((b, i) => buf[i] === b);
    if (!matches) issues.push(`Magic bytes mismatch for ${ext}`);
  }

  return { valid: issues.length === 0, type: ext, issues };
}
73
+
74
/**
 * Run validateAsset() over every entry of the 'images', 'fonts' and 'icons'
 * sub-folders of an assets directory and tally the outcomes.
 *
 * Sub-folders that cannot be listed are skipped. Entries are validated one
 * at a time. Only invalid entries are added to `details`; sanitized files
 * are counted separately and also counted as valid or invalid.
 *
 * @param {string} assetsDir - Assets root directory
 * @returns {Promise<{valid: number, invalid: number, sanitized: number, details: Array}>}
 */
export async function validateBatch(assetsDir) {
  const summary = { valid: 0, invalid: 0, sanitized: 0, details: [] };

  for (const folder of ['images', 'fonts', 'icons']) {
    const folderPath = path.join(assetsDir, folder);

    let entries;
    try {
      entries = await fs.readdir(folderPath);
    } catch {
      continue;
    }

    for (const entry of entries) {
      const outcome = await validateAsset(path.join(folderPath, entry));

      if (outcome.sanitized) summary.sanitized += 1;

      if (!outcome.valid) {
        summary.invalid += 1;
        summary.details.push({ file: `${folder}/${entry}`, ...outcome });
        continue;
      }
      summary.valid += 1;
    }
  }

  return summary;
}
100
+
101
/**
 * Strip dangerous content from SVG markup.
 *
 * Removes, via regular expressions (this is not a full XML parser):
 *  - `<script>` elements, both paired and self-closing;
 *  - `<iframe>`, `<object>` and `<embed>` elements, paired and self-closing;
 *  - `href` / `xlink:href` / `src` attributes whose value is a `javascript:` URI;
 *  - `on*` event-handler attributes with double-quoted, single-quoted,
 *    brace-wrapped or unquoted values.
 *
 * @param {string} content - SVG content
 * @returns {string} Sanitized SVG
 */
export function sanitizeSvg(content) {
  return content
    // Self-closing forms go first: otherwise the lazy paired pattern would
    // span from a self-closing tag to a later closing tag and delete
    // everything in between.
    .replace(/<script\b[^>]*\/>/gi, '')
    // Strip <script> blocks
    .replace(/<script[\s\S]*?<\/script>/gi, '')
    // Strip dangerous elements: <iframe>, <object>, <embed>
    .replace(/<(iframe|object|embed)(\s[^>]*)?\s*\/>/gi, '')
    .replace(/<(iframe|object|embed)[\s\S]*?<\/\1>/gi, '')
    // Strip javascript: URIs from href, xlink:href, src attributes. Each
    // quote style is matched to its own closing quote so a value may contain
    // the other quote character.
    .replace(/((?:xlink:)?href|src)\s*=\s*(?:"\s*javascript\s*:[^"]*"|'\s*javascript\s*:[^']*')/gi, '')
    // Strip on* event handler attributes: "...", '...', {...} or unquoted.
    // The unquoted form stops before whitespace, '>' or '/>' so the tag's
    // own terminator is preserved.
    .replace(/\son\w+\s*=\s*(?:"[^"]*"|'[^']*'|\{[^}]*\}|[^\s>]+?(?=\s|\/?>|$))/gi, '');
}
@@ -0,0 +1,187 @@
1
+ /**
2
+ * Asset Downloader Utilities
3
+ *
4
+ * Handles file downloading with timeout/retry, batch rate-limiting,
5
+ * safe filename generation, and asset-type classification.
6
+ */
7
+
8
import crypto from 'crypto';
import fs from 'fs/promises';
import http from 'http';
import https from 'https';
import path from 'path';
import { URL } from 'url';
13
+
14
// Asset categories: the file extensions that belong to each one and the
// output sub-folder name used for it. Note that '.svg' is listed under both
// `images` and `icons`.
export const ASSET_TYPES = {
  images: {
    extensions: [
      '.jpg',
      '.jpeg',
      '.png',
      '.gif',
      '.webp',
      '.svg',
      '.ico',
      '.avif'
    ],
    folder: 'images'
  },
  fonts: {
    extensions: [
      '.woff',
      '.woff2',
      '.ttf',
      '.otf',
      '.eot'
    ],
    folder: 'fonts'
  },
  icons: {
    extensions: ['.svg'],
    folder: 'icons'
  }
};

// Default throttling for batch downloads: how many downloads run at once and
// how long (in milliseconds) to pause between consecutive batches.
export const RATE_LIMIT = { maxConcurrent: 10, delayBetweenBatches: 50 };
35
+
36
// Upper bound on consecutive redirects followed for one download attempt.
const MAX_DOWNLOAD_REDIRECTS = 5;

/**
 * Perform one download attempt, following redirects, and write the body to disk.
 *
 * Rejects with an Error on timeout, network failure, redirect problems or any
 * non-200 final status. For HTTP 429 the rejected error carries the suggested
 * wait in `retryAfterMs`.
 *
 * @param {string} url
 * @param {string} destPath
 * @param {number} timeout - ms, applied separately to every hop
 * @param {number} attempt - zero-based attempt index (drives the 429 backoff)
 * @param {number} redirectsLeft - how many more redirects may be followed
 * @returns {Promise<{ size: number }>}
 */
function fetchToFile(url, destPath, timeout, attempt, redirectsLeft) {
  return new Promise((resolve, reject) => {
    const client = url.startsWith('https') ? https : http;
    let timeoutId;
    const fail = (err) => {
      clearTimeout(timeoutId);
      reject(err);
    };

    const request = client.get(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
        'Accept': '*/*'
      }
    }, (response) => {
      const { statusCode, headers } = response;

      if (statusCode >= 300 && statusCode < 400 && headers.location) {
        clearTimeout(timeoutId);
        response.resume(); // discard the redirect body so the socket is released
        if (redirectsLeft <= 0) {
          reject(new Error('Too many redirects'));
          return;
        }
        let target;
        try {
          // Location may be relative; resolve it against the current URL.
          target = new URL(headers.location, url).href;
        } catch {
          reject(new Error(`Invalid redirect location: ${headers.location}`));
          return;
        }
        // Propagate both outcomes: a failed redirect target must fail this
        // attempt instead of being reported as a success.
        fetchToFile(target, destPath, timeout, attempt, redirectsLeft - 1).then(resolve, reject);
        return;
      }

      if (statusCode === 429) {
        response.resume();
        const retryAfter = Number.parseInt(headers['retry-after'] || '0', 10);
        const backoffMs = retryAfter > 0
          ? retryAfter * 1000
          : Math.min(1000 * Math.pow(2, attempt), 8000);
        const err = new Error(`HTTP 429 (retry after ${backoffMs}ms)`);
        err.retryAfterMs = backoffMs;
        fail(err);
        return;
      }

      if (statusCode !== 200) {
        response.resume();
        fail(new Error(`HTTP ${statusCode}`));
        return;
      }

      const chunks = [];
      response.on('data', chunk => chunks.push(chunk));
      response.on('end', async () => {
        clearTimeout(timeoutId);
        try {
          const buffer = Buffer.concat(chunks);
          await fs.mkdir(path.dirname(destPath), { recursive: true });
          await fs.writeFile(destPath, buffer);
          resolve({ size: buffer.length });
        } catch (err) {
          reject(err);
        }
      });
      response.on('error', fail);
    });

    request.on('error', fail);

    timeoutId = setTimeout(() => {
      // Reject first so the caller sees the timeout message, then tear down
      // the request so a late response can no longer write the file.
      reject(new Error('Download timeout'));
      request.destroy();
    }, timeout);
  });
}

/**
 * Download a file with timeout and retry.
 *
 * Each attempt follows up to MAX_DOWNLOAD_REDIRECTS redirects and succeeds
 * only on a final HTTP 200, after which the body is written to `destPath`
 * (parent directories are created). Between failed attempts the function
 * waits: the server-suggested / exponential backoff for HTTP 429, otherwise
 * 500 ms times the attempt number. Never rejects; failures are reported in
 * the returned object.
 *
 * @param {string} url
 * @param {string} destPath
 * @param {number} timeout - ms
 * @param {number} retries - additional attempts after the first one
 * @returns {Promise<{ success: boolean, error?: string }>}
 */
export async function downloadFile(url, destPath, timeout = 30000, retries = 2) {
  for (let attempt = 0; attempt <= retries; attempt++) {
    try {
      await fetchToFile(url, destPath, timeout, attempt, MAX_DOWNLOAD_REDIRECTS);
      return { success: true };
    } catch (err) {
      if (attempt === retries) return { success: false, error: err.message };
      const delay = err.retryAfterMs ?? 500 * (attempt + 1);
      await new Promise(r => setTimeout(r, delay));
    }
  }

  // Only reachable when `retries` is negative and no attempt was made.
  return { success: false, error: 'No download attempts made' };
}
110
+
111
/**
 * Download files in batches with rate limiting.
 *
 * Downloads whose `destPath` already exists are counted as skipped and not
 * fetched again. Within a batch all downloads run concurrently; batches run
 * one after another with a pause in between.
 *
 * @param {Array<{ url: string, destPath: string, type: string }>} downloads
 * @param {boolean} verbose - log one line per finished download to stderr
 * @param {Object} [options]
 * @param {number} [options.maxConcurrent=RATE_LIMIT.maxConcurrent] - batch size;
 *   values below 1 (or non-numeric) are treated as 1
 * @param {number} [options.delayBetween=RATE_LIMIT.delayBetweenBatches] - ms
 *   to wait between batches
 * @returns {Promise<{ success: number, failed: number, skipped: number, errors: Array }>}
 */
export async function downloadBatch(downloads, verbose = false, options = {}) {
  const { maxConcurrent = RATE_LIMIT.maxConcurrent, delayBetween = RATE_LIMIT.delayBetweenBatches } = options;
  const results = { success: 0, failed: 0, skipped: 0, errors: [] };

  // The loop advances by the batch size, so zero, negative or NaN would
  // never terminate.
  const batchSize = Math.max(1, Math.floor(maxConcurrent) || 1);

  for (let i = 0; i < downloads.length; i += batchSize) {
    const batch = downloads.slice(i, i + batchSize);

    await Promise.all(batch.map(async ({ url, destPath, type }) => {
      try {
        await fs.access(destPath);
        results.skipped++;
        return;
      } catch { /* file doesn't exist, continue */ }

      const result = await downloadFile(url, destPath);
      if (result.success) {
        results.success++;
        if (verbose) console.error(`  ✓ ${type}: ${path.basename(destPath)}`);
      } else {
        results.failed++;
        results.errors.push({ url, error: result.error });
        if (verbose) console.error(`  ✗ ${type}: ${path.basename(url)} - ${result.error}`);
      }
    }));

    if (i + batchSize < downloads.length) {
      await new Promise(r => setTimeout(r, delayBetween));
    }
  }

  return results;
}
149
+
150
/**
 * Generate safe filename from URL.
 *
 * Uses the last segment of the URL path ('asset' when the path has none).
 * When the URL has a query string, an 8-hex-digit SHA-256 digest of the whole
 * query is inserted before the extension so that different queries on the
 * same path get different filenames. Characters outside [a-zA-Z0-9._-] become
 * '_', and '.bin' is appended when no extension is present. If the URL cannot
 * be parsed, a timestamp-based 'asset-<ms>.bin' name is returned.
 *
 * @param {string} url
 * @returns {string}
 */
export function getSafeFilename(url) {
  try {
    const urlObj = new URL(url);
    let filename = path.basename(urlObj.pathname) || 'asset';

    if (urlObj.search) {
      // Hash the complete query: a truncated encoding of the raw text only
      // reflects its first few bytes, so '?w=100' and '?w=1000' would clash.
      const hash = crypto.createHash('sha256').update(urlObj.search).digest('hex').slice(0, 8);
      const ext = path.extname(filename);
      const base = path.basename(filename, ext);
      filename = `${base}-${hash}${ext}`;
    }

    filename = filename.replace(/[^a-zA-Z0-9._-]/g, '_');
    if (!path.extname(filename)) filename += '.bin';

    return filename;
  } catch {
    return `asset-${Date.now()}.bin`;
  }
}
175
+
176
/**
 * Determine asset type from URL extension.
 *
 * The extension of the URL's path (query and fragment are ignored) is matched
 * against ASSET_TYPES in the order fonts, icons, images, so '.svg' is always
 * classified as 'icons'. URLs that cannot be parsed are classified as 'other'
 * instead of throwing.
 *
 * @param {string} url
 * @returns {'fonts'|'icons'|'images'|'other'}
 */
export function getAssetType(url) {
  let ext;
  try {
    ext = path.extname(new URL(url).pathname).toLowerCase();
  } catch {
    // Unparsable URL: no extension can be determined.
    return 'other';
  }

  if (ASSET_TYPES.fonts.extensions.includes(ext)) return 'fonts';
  if (ASSET_TYPES.icons.extensions.includes(ext)) return 'icons';
  if (ASSET_TYPES.images.extensions.includes(ext)) return 'images';
  return 'other';
}
@@ -0,0 +1,115 @@
1
+ /**
2
+ * Page Asset Scraper
3
+ *
4
+ * Extracts asset URLs from a live Playwright page (images, inline SVGs,
5
+ * CSS stylesheet links) and from raw CSS content (background URLs, font-face).
6
+ */
7
+
8
+ import { URL } from 'url';
9
+
10
/**
 * Parse CSS text for asset URLs.
 *
 * Every `url(...)` reference in the stylesheet is collected, which covers
 * background images as well as `@font-face` sources. `data:` URIs, empty
 * references and fragment-only references such as `url(#gradient)` are
 * skipped. Remaining references are resolved against `baseUrl`; ones that
 * cannot be resolved are ignored. The result is de-duplicated and keeps the
 * order of first appearance.
 *
 * @param {string} cssContent
 * @param {string} baseUrl
 * @returns {string[]} Absolute asset URLs
 */
export function extractCssUrls(cssContent, baseUrl) {
  const urls = new Set();

  // Three alternatives so a quoted URL may contain spaces or the other quote
  // character: "..." | '...' | bare token.
  const urlPattern = /url\s*\(\s*(?:"([^"]*)"|'([^']*)'|([^'")\s]+))\s*\)/gi;

  for (const match of String(cssContent ?? '').matchAll(urlPattern)) {
    const raw = (match[1] ?? match[2] ?? match[3]).trim();

    // '#id' refers to an element of the current document and '' would resolve
    // to the stylesheet itself; neither is a downloadable asset.
    if (raw === '' || raw.startsWith('#') || /^data:/i.test(raw)) continue;

    try {
      urls.add(new URL(raw, baseUrl).href);
    } catch { /* ignore */ }
  }

  return Array.from(urls);
}
+
44
/**
 * Extract all asset URLs and inline SVGs from a Playwright page.
 *
 * Runs inside the page via `page.evaluate`, so the callback must stay
 * self-contained (it cannot reference anything from this module's scope).
 * Collected image URLs come from `img[src]`, `srcset` candidates, every
 * `url(...)` in inline `style` attributes mentioning "background", and
 * `link[rel*="icon"]`. `data:` URIs are skipped, relative URLs are resolved
 * against `baseUrl`, and unresolvable values are ignored. Inline `<svg>`
 * elements shorter than 50000 characters are returned with their markup.
 *
 * @param {import('playwright').Page} page
 * @param {string} baseUrl
 * @returns {Promise<{ images: string[], cssUrls: string[], inlineSvgs: Array<{id: string, content: string}> }>}
 */
export async function extractAssetsFromPage(page, baseUrl) {
  return await page.evaluate((url) => {
    const imageSet = new Set();
    const cssUrls = [];

    // Resolve and remember one image reference (no-op for empty / data: URIs).
    const addImage = (raw) => {
      if (!raw || raw.startsWith('data:')) return;
      try { imageSet.add(new URL(raw, url).href); } catch { /* ignore */ }
    };

    // img[src]
    document.querySelectorAll('img[src]').forEach(img => {
      addImage(img.getAttribute('src'));
    });

    // srcset
    document.querySelectorAll('[srcset]').forEach(el => {
      const srcset = el.getAttribute('srcset');
      if (srcset) {
        srcset.split(',').forEach(part => {
          addImage(part.trim().split(/\s+/)[0]);
        });
      }
    });

    // Inline background images. A style may layer several backgrounds
    // ("url(a), url(b)"), so every url(...) is collected, not just the first.
    document.querySelectorAll('[style*="background"]').forEach(el => {
      const style = el.getAttribute('style') || '';
      for (const urlMatch of style.matchAll(/url\s*\(\s*['"]?([^'")\s]+)['"]?\s*\)/gi)) {
        addImage(urlMatch[1]);
      }
    });

    // Favicon and touch icons
    document.querySelectorAll('link[rel*="icon"]').forEach(link => {
      addImage(link.getAttribute('href'));
    });

    // Inline SVGs
    const inlineSvgs = [];
    document.querySelectorAll('svg').forEach((svg, index) => {
      const svgContent = svg.outerHTML;
      if (svgContent.length < 50000) {
        inlineSvgs.push({
          id: svg.id || `inline-svg-${index}`,
          content: svgContent
        });
      }
    });

    // External CSS stylesheets (for font extraction)
    document.querySelectorAll('link[rel="stylesheet"]').forEach(link => {
      const href = link.getAttribute('href');
      if (href) {
        try { cssUrls.push(new URL(href, url).href); } catch { /* ignore */ }
      }
    });

    return { images: Array.from(imageSet), cssUrls, inlineSvgs };
  }, baseUrl);
}