design-clone 2.1.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -34
- package/SKILL.md +69 -45
- package/bin/cli.js +22 -4
- package/bin/commands/clone-site.js +31 -171
- package/bin/commands/help.js +19 -6
- package/bin/commands/init.js +9 -86
- package/bin/commands/uninstall.js +105 -0
- package/bin/commands/update.js +70 -0
- package/bin/commands/verify.js +7 -14
- package/bin/utils/paths.js +28 -0
- package/bin/utils/validate.js +2 -22
- package/bin/utils/version.js +23 -0
- package/docs/code-standards.md +789 -0
- package/docs/codebase-summary.md +533 -286
- package/docs/index.md +74 -0
- package/docs/project-overview-pdr.md +797 -0
- package/docs/system-architecture.md +718 -0
- package/package.json +14 -17
- package/src/ai/prompts/design-tokens/basic.md +80 -0
- package/src/ai/prompts/design-tokens/section-with-css.md +41 -0
- package/src/ai/prompts/design-tokens/section.md +48 -0
- package/src/ai/prompts/design-tokens/with-css.md +87 -0
- package/src/ai/prompts/structure-analysis/basic.md +55 -0
- package/src/ai/prompts/structure-analysis/with-context.md +59 -0
- package/src/ai/prompts/structure-analysis/with-dimensions.md +63 -0
- package/src/ai/prompts/structure-analysis/with-hierarchy.md +73 -0
- package/src/ai/prompts/ux-audit/aggregation.md +42 -0
- package/src/ai/prompts/ux-audit/desktop.md +92 -0
- package/src/ai/prompts/ux-audit/mobile.md +93 -0
- package/src/ai/prompts/ux-audit/tablet.md +92 -0
- package/src/core/animation/animation-extractor-ast.js +183 -0
- package/src/core/animation/animation-extractor-output.js +152 -0
- package/src/core/animation/animation-extractor.js +178 -0
- package/src/core/animation/state-capture-detection.js +200 -0
- package/src/core/animation/state-capture.js +193 -0
- package/src/core/capture/browser-context-pool.js +96 -0
- package/src/core/capture/multi-page-screenshot-page.js +110 -0
- package/src/core/capture/multi-page-screenshot.js +208 -0
- package/src/core/capture/screenshot-extraction.js +186 -0
- package/src/core/capture/screenshot-helpers.js +175 -0
- package/src/core/capture/screenshot-orchestrator.js +174 -0
- package/src/core/capture/screenshot-viewport.js +93 -0
- package/src/core/capture/screenshot.js +192 -0
- package/src/core/content/content-counter-dom.js +191 -0
- package/src/core/content/content-counter.js +76 -0
- package/src/core/css/breakpoint-detector.js +66 -0
- package/src/core/css/chromium-defaults.json +23 -0
- package/src/core/css/computed-style-extractor.js +102 -0
- package/src/core/css/css-chunker.js +103 -0
- package/src/core/css/filter-css-dead-code.js +120 -0
- package/src/core/css/filter-css-html-analyzer.js +110 -0
- package/src/core/css/filter-css-selector-matcher.js +172 -0
- package/src/core/css/filter-css.js +206 -0
- package/src/core/css/merge-css-atrule-processor.js +158 -0
- package/src/core/css/merge-css-file-io.js +68 -0
- package/src/core/css/merge-css.js +148 -0
- package/src/core/detection/framework-detector-routing.js +68 -0
- package/src/core/detection/framework-detector-signals.js +65 -0
- package/src/core/detection/framework-detector.js +198 -0
- package/src/core/dimension/dimension-extractor-card-detector.js +82 -0
- package/src/core/dimension/dimension-extractor.js +317 -0
- package/src/core/dimension/dimension-output-ai-summary.js +111 -0
- package/src/core/dimension/dimension-output.js +173 -0
- package/src/core/dimension/dom-tree-analyzer-tree-builders.js +95 -0
- package/src/core/dimension/dom-tree-analyzer.js +191 -0
- package/src/core/discovery/app-state-snapshot-capture.js +195 -0
- package/src/core/discovery/app-state-snapshot-utils.js +178 -0
- package/src/core/discovery/app-state-snapshot.js +131 -0
- package/src/core/discovery/discover-pages-routes.js +84 -0
- package/src/core/discovery/discover-pages-utils.js +177 -0
- package/src/core/discovery/discover-pages.js +191 -0
- package/src/core/html/html-extractor-inline-styler.js +70 -0
- package/src/core/html/html-extractor.js +147 -0
- package/src/core/html/semantic-enhancer-mappings.js +200 -0
- package/src/core/html/semantic-enhancer-page.js +148 -0
- package/src/core/html/semantic-enhancer.js +135 -0
- package/src/core/links/rewrite-links-css-rewriter.js +53 -0
- package/src/core/links/rewrite-links.js +173 -0
- package/src/core/media/asset-validator.js +118 -0
- package/src/core/media/extract-assets-downloader.js +187 -0
- package/src/core/media/extract-assets-page-scraper.js +115 -0
- package/src/core/media/extract-assets.js +159 -0
- package/src/core/media/video-capture-convert.js +200 -0
- package/src/core/media/video-capture.js +201 -0
- package/src/core/{lazy-loader.js → page-prep/lazy-loader.js} +37 -39
- package/src/core/section/section-cropper-helpers.js +43 -0
- package/src/core/{section-cropper.js → section/section-cropper.js} +11 -88
- package/src/core/section/section-detector-strategies.js +139 -0
- package/src/core/section/section-detector-utils.js +100 -0
- package/src/core/section/section-detector.js +88 -0
- package/src/core/tests/test-section-cropper.js +2 -2
- package/src/core/tests/test-section-detector.js +2 -2
- package/src/post-process/enhance-assets.js +29 -4
- package/src/post-process/fetch-images-unsplash-client.js +123 -0
- package/src/post-process/fetch-images.js +60 -263
- package/src/post-process/inject-gosnap.js +88 -0
- package/src/post-process/inject-icons-svg-replacer.js +76 -0
- package/src/post-process/inject-icons.js +47 -200
- package/src/route-discoverers/base-discoverer-utils.js +137 -0
- package/src/route-discoverers/base-discoverer.js +29 -118
- package/src/route-discoverers/index.js +1 -1
- package/src/shared/config.js +38 -0
- package/src/shared/error-codes.js +31 -0
- package/src/shared/viewports.js +46 -0
- package/src/utils/browser.js +0 -7
- package/src/utils/helpers.js +4 -0
- package/src/utils/log.js +12 -0
- package/src/utils/playwright-loader.js +76 -0
- package/src/utils/playwright.js +3 -69
- package/src/utils/progress.js +32 -0
- package/src/verification/generate-audit-report-css-fixes.js +52 -0
- package/src/verification/generate-audit-report-sections.js +158 -0
- package/src/verification/generate-audit-report.js +5 -281
- package/src/verification/quality-scorer.js +92 -0
- package/src/verification/verify-footer-checks.js +103 -0
- package/src/verification/verify-footer-helpers.js +178 -0
- package/src/verification/verify-footer.js +23 -381
- package/src/verification/verify-header-checks.js +104 -0
- package/src/verification/verify-header-helpers.js +156 -0
- package/src/verification/verify-header.js +23 -365
- package/src/verification/verify-layout-report.js +101 -0
- package/src/verification/verify-layout.js +13 -259
- package/src/verification/verify-menu-checks.js +104 -0
- package/src/verification/verify-menu-helpers.js +112 -0
- package/src/verification/verify-menu.js +17 -285
- package/src/verification/verify-slider-checks.js +115 -0
- package/src/verification/verify-slider-constants.js +65 -0
- package/src/verification/verify-slider-helpers.js +164 -0
- package/src/verification/verify-slider.js +23 -414
- package/.env.example +0 -14
- package/docs/basic-clone.md +0 -63
- package/docs/cli-reference.md +0 -316
- package/docs/design-clone-architecture.md +0 -492
- package/docs/pixel-perfect.md +0 -117
- package/docs/project-roadmap.md +0 -382
- package/docs/troubleshooting.md +0 -170
- package/requirements.txt +0 -5
- package/src/ai/__pycache__/analyze-structure.cpython-313.pyc +0 -0
- package/src/ai/__pycache__/extract-design-tokens.cpython-313.pyc +0 -0
- package/src/ai/analyze-structure.py +0 -375
- package/src/ai/extract-design-tokens.py +0 -782
- package/src/ai/prompts/__init__.py +0 -2
- package/src/ai/prompts/__pycache__/__init__.cpython-313.pyc +0 -0
- package/src/ai/prompts/__pycache__/design_tokens.cpython-313.pyc +0 -0
- package/src/ai/prompts/__pycache__/structure_analysis.cpython-313.pyc +0 -0
- package/src/ai/prompts/__pycache__/ux_audit.cpython-313.pyc +0 -0
- package/src/ai/prompts/design_tokens.py +0 -316
- package/src/ai/prompts/structure_analysis.py +0 -592
- package/src/ai/prompts/ux_audit.py +0 -198
- package/src/ai/ux-audit.js +0 -596
- package/src/core/animation-extractor.js +0 -526
- package/src/core/app-state-snapshot.js +0 -511
- package/src/core/content-counter.js +0 -342
- package/src/core/design-tokens.js +0 -103
- package/src/core/dimension-extractor.js +0 -438
- package/src/core/dimension-output.js +0 -305
- package/src/core/discover-pages.js +0 -542
- package/src/core/dom-tree-analyzer.js +0 -298
- package/src/core/extract-assets.js +0 -468
- package/src/core/filter-css.js +0 -499
- package/src/core/framework-detector.js +0 -538
- package/src/core/html-extractor.js +0 -212
- package/src/core/merge-css.js +0 -407
- package/src/core/multi-page-screenshot.js +0 -380
- package/src/core/rewrite-links.js +0 -226
- package/src/core/screenshot.js +0 -701
- package/src/core/section-detector.js +0 -386
- package/src/core/semantic-enhancer.js +0 -492
- package/src/core/state-capture.js +0 -598
- package/src/core/video-capture.js +0 -546
- package/src/utils/__init__.py +0 -16
- package/src/utils/__pycache__/__init__.cpython-313.pyc +0 -0
- package/src/utils/__pycache__/env.cpython-313.pyc +0 -0
- package/src/utils/env.py +0 -134
- /package/src/core/{css-extractor.js → css/css-extractor.js} +0 -0
- /package/src/core/{cookie-handler.js → page-prep/cookie-handler.js} +0 -0
- /package/src/core/{page-readiness.js → page-prep/page-readiness.js} +0 -0
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Link Rewriting Module
|
|
3
|
+
*
|
|
4
|
+
* Rewrites internal links in HTML to point to local .html files.
|
|
5
|
+
* Preserves external links unchanged.
|
|
6
|
+
* CSS link rewriting lives in rewrite-links-css-rewriter.js.
|
|
7
|
+
*
|
|
8
|
+
* Usage:
|
|
9
|
+
* import { rewriteLinks, createPageManifest } from '../links/rewrite-links.js';
|
|
10
|
+
* const rewritten = rewriteLinks(html, manifest, { baseUrl });
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import fs from 'fs/promises';
|
|
14
|
+
import path from 'path';
|
|
15
|
+
import { normalizeUrl } from '../discovery/discover-pages-utils.js';
|
|
16
|
+
import { rewriteCssLinks } from './rewrite-links-css-rewriter.js';
|
|
17
|
+
|
|
18
|
+
/**
 * Convert a URL path to a flat local HTML filename.
 *
 * Leading and trailing slashes are dropped, inner slashes and every character
 * outside [a-z0-9-] become '-', runs of '-' collapse to one, and the result is
 * lower-cased.
 *
 * @param {string} urlPath - e.g. '/about', '/services/consulting'
 * @returns {string} e.g. 'about.html', 'services-consulting.html';
 *   'index.html' for an empty/root path or when nothing usable remains.
 */
export function pathToFilename(urlPath) {
  if (!urlPath || urlPath === '/') return 'index.html';

  const name = urlPath
    .replace(/^\//, '')
    .replace(/\/$/, '')
    .replace(/\//g, '-')
    .replace(/[^a-z0-9-]/gi, '-')
    .replace(/-+/g, '-')
    .toLowerCase();

  // A path such as '//' reduces to an empty name; fall back to the index page
  // instead of emitting the extension-only file '.html'.
  return `${name || 'index'}.html`;
}
|
|
36
|
+
|
|
37
|
+
/**
 * Build the page manifest describing a captured site.
 *
 * The manifest's base URL is the origin of the first page's URL (empty string
 * when the first page has no URL). Each page entry records its path, name,
 * local filename and original URL.
 *
 * @param {Array<{ path, name, url }>} pages - Discovered pages
 * @param {Object} options
 * @param {boolean} [options.hasTokens] - When truthy, lists 'tokens.css' as an asset
 * @param {Object} [options.stats] - Copied into the manifest; defaults to {}
 * @returns {Object} Page manifest ({ baseUrl, capturedAt, pages, assets, stats })
 */
export function createPageManifest(pages, options = {}) {
  const firstUrl = pages[0]?.url;
  const baseUrl = firstUrl ? new URL(firstUrl).origin : '';
  const capturedAt = new Date().toISOString();

  const entries = pages.map((page) => {
    const file = pathToFilename(page.path);
    return { path: page.path, name: page.name, file, originalUrl: page.url };
  });

  let tokens = null;
  if (options.hasTokens) {
    tokens = 'tokens.css';
  }

  return {
    baseUrl,
    capturedAt,
    pages: entries,
    assets: { css: 'styles.css', tokens },
    stats: options.stats || {}
  };
}
|
|
62
|
+
|
|
63
|
+
/**
 * Index every manifest page by the URL spellings a link may use.
 *
 * For each page the full original URL and the site-relative path are
 * registered, each both as-is and with one trailing slash removed (the bare
 * root path '/' is never reduced to an empty key).
 *
 * @param {Object} manifest - Manifest whose `pages` carry originalUrl, path and file
 * @returns {Map<string, string>} URL or path -> local filename
 */
function buildUrlMap(manifest) {
  const withoutTrailingSlash = (value) => value.replace(/\/$/, '');
  const entries = [];

  for (const { originalUrl, path: pagePath, file } of manifest.pages) {
    if (originalUrl) {
      entries.push([originalUrl, file], [withoutTrailingSlash(originalUrl), file]);
    }
    if (pagePath) {
      entries.push([pagePath, file]);
      if (pagePath !== '/') {
        entries.push([withoutTrailingSlash(pagePath), file]);
      }
    }
  }

  return new Map(entries);
}
|
|
84
|
+
|
|
85
|
+
/**
 * Rewrite internal <a href> links in HTML so they point to local files.
 *
 * Links that are not found in the manifest (external links), as well as
 * `javascript:`, `mailto:`, `tel:` and fragment-only links, are returned
 * unchanged. A trailing `#fragment` on a rewritten link is preserved.
 *
 * @param {string} html
 * @param {Object} manifest - Manifest with a `pages` array
 * @param {Object} options
 * @param {string} [options.baseUrl] - Enables lookup of hrefs after normalizeUrl()
 * @param {boolean} [options.rewriteCss=true] - Also run rewriteCssLinks() on the result
 * @param {boolean} [options.injectTokensCss=false] - Forwarded to rewriteCssLinks()
 * @returns {string} HTML with rewritten links
 */
export function rewriteLinks(html, manifest, options = {}) {
  const { baseUrl, rewriteCss = true, injectTokensCss = false } = options;
  const urlMap = buildUrlMap(manifest);
  const skippedPrefixes = ['javascript:', 'mailto:', 'tel:', '#'];

  // Look a link target up directly, then (when a base URL is known) in its
  // normalized form. Returns null when the target is not a captured page.
  const lookup = (target) => {
    if (!target) return null;
    if (urlMap.has(target)) return urlMap.get(target);
    if (baseUrl) {
      const normalized = normalizeUrl(baseUrl, target);
      if (normalized && urlMap.has(normalized)) return urlMap.get(normalized);
    }
    return null;
  };

  let result = html.replace(
    /(<a\s[^>]*href=["'])([^"']+)(["'][^>]*>)/gi,
    (match, prefix, href, suffix) => {
      if (!href || skippedPrefixes.some((scheme) => href.startsWith(scheme))) {
        return match;
      }

      // Manifest keys never contain a fragment, so '/about#team' must also be
      // looked up as '/about'; otherwise such links stay pointed at the
      // original site whenever no base URL is supplied.
      const hashIndex = href.indexOf('#');
      const bareHref = hashIndex === -1 ? href : href.slice(0, hashIndex);
      const filename = lookup(href) || (bareHref !== href ? lookup(bareHref) : null);

      if (!filename) return match;

      const fragmentMatch = href.match(/#[^#]*$/);
      const fragment = fragmentMatch ? fragmentMatch[0] : '';
      return `${prefix}${filename}${fragment}${suffix}`;
    }
  );

  if (rewriteCss) {
    result = rewriteCssLinks(result, injectTokensCss);
  }

  return result;
}
|
|
138
|
+
|
|
139
|
+
/**
 * Rewrite links in place for every HTML file the manifest lists.
 *
 * Files are handled one after another. A read, rewrite or write failure for
 * one file is recorded and does not stop the remaining files.
 *
 * @param {string} htmlDir - Directory containing the manifest's page files
 * @param {Object} manifest - Manifest with a `pages` array
 * @param {Object} options - Forwarded to rewriteLinks()
 * @returns {Promise<{ processed: string[], errors: Array }>} Filenames that
 *   were rewritten, and `{ file, error }` records for those that failed
 */
export async function rewriteAllLinks(htmlDir, manifest, options = {}) {
  const processed = [];
  const errors = [];

  for (const page of manifest.pages) {
    const target = path.join(htmlDir, page.file);

    try {
      const original = await fs.readFile(target, 'utf-8');
      await fs.writeFile(target, rewriteLinks(original, manifest, options), 'utf-8');
      processed.push(page.file);
    } catch (failure) {
      errors.push({ file: page.file, error: failure.message });
    }
  }

  return { processed, errors };
}
|
|
163
|
+
|
|
164
|
+
// CLI stub: this file is a library. When it is executed directly it only
// prints a usage hint. The check compares the exact script basename (with or
// without the .js extension) so that other scripts whose path merely contains
// "rewrite-links" do not trigger the hint when they import this module.
const isMainModule = Boolean(process.argv[1]) &&
  path.basename(process.argv[1], '.js') === 'rewrite-links';

if (isMainModule) {
  console.log('rewrite-links.js - Use as module, not CLI');
  console.log('Exports: rewriteLinks, createPageManifest, pathToFilename, rewriteAllLinks');
}
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Asset Validator
|
|
3
|
+
*
|
|
4
|
+
* Validates downloaded assets via magic bytes and size thresholds.
|
|
5
|
+
* Sanitizes SVG files by stripping script tags and event handlers.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import fs from 'fs/promises';
|
|
9
|
+
import path from 'path';
|
|
10
|
+
|
|
11
|
+
// Leading file-signature bytes for the formats that are checked by content.
const MAGIC_BYTES = {
  png: [0x89, 0x50, 0x4E, 0x47],
  jpeg: [0xFF, 0xD8, 0xFF],
  gif: [0x47, 0x49, 0x46],
  woff2: [0x77, 0x4F, 0x46, 0x32],
  woff: [0x77, 0x4F, 0x46, 0x46],
};

// Minimum plausible file size in bytes; anything smaller is reported as an issue.
const MIN_SIZES = { images: 100, fonts: 1024 };

/**
 * Validate a single asset file.
 *
 * Checks the file size against a per-category minimum, then:
 * - for SVG: looks for script tags, event handlers, javascript: URIs and
 *   iframe/object/embed elements, and rewrites the file through sanitizeSvg()
 *   when any of those are found. SVG files are always reported as valid.
 * - for formats listed in MAGIC_BYTES: compares the leading bytes.
 *
 * @param {string} filePath - Path to the file
 * @returns {Promise<{valid: boolean, type: string, issues: string[], sanitized?: boolean}>}
 *   `sanitized: true` is present only when an SVG was rewritten.
 */
export async function validateAsset(filePath) {
  const issues = [];
  const ext = path.extname(filePath).toLowerCase().slice(1);

  let stat;
  try {
    stat = await fs.stat(filePath);
  } catch {
    return { valid: false, type: ext, issues: ['File not found'] };
  }

  // Size check
  const isFont = ['woff', 'woff2', 'ttf', 'otf', 'eot'].includes(ext);
  const minSize = isFont ? MIN_SIZES.fonts : MIN_SIZES.images;
  if (stat.size < minSize) {
    issues.push(`File too small: ${stat.size} bytes (min: ${minSize})`);
  }

  // SVG check
  if (ext === 'svg') {
    const content = await fs.readFile(filePath, 'utf-8');

    // Security findings are tracked separately from the size issue: only they
    // justify rewriting the file, so a small but clean SVG is left untouched
    // and is not reported as sanitized.
    const securityIssues = [];
    if (/<script[\s>]/i.test(content)) securityIssues.push('SVG contains script tags');
    if (/\son\w+\s*=/i.test(content)) securityIssues.push('SVG contains event handlers');
    if (/javascript\s*:/i.test(content)) securityIssues.push('SVG contains javascript: URIs');
    if (/<(iframe|object|embed)[\s>]/i.test(content)) securityIssues.push('SVG contains unsafe elements');

    if (securityIssues.length > 0) {
      await fs.writeFile(filePath, sanitizeSvg(content), 'utf-8');
      return { valid: true, type: 'svg', issues: [...issues, ...securityIssues], sanitized: true };
    }
    return { valid: true, type: 'svg', issues };
  }

  // Magic byte check for known types
  const magicKey = ext === 'jpg' ? 'jpeg' : ext;
  const expected = MAGIC_BYTES[magicKey];
  if (expected && stat.size >= expected.length) {
    const buf = Buffer.alloc(expected.length);
    const fh = await fs.open(filePath, 'r');
    try {
      await fh.read(buf, 0, expected.length, 0);
    } finally {
      // Always release the handle, even when the read fails.
      await fh.close();
    }
    const matches = expected.every((b, i) => buf[i] === b);
    if (!matches) issues.push(`Magic bytes mismatch for ${ext}`);
  }

  return { valid: issues.length === 0, type: ext, issues };
}
|
|
73
|
+
|
|
74
|
+
/**
 * Validate every file found in the images/, fonts/ and icons/ subdirectories
 * of an assets directory.
 *
 * Subdirectories that cannot be listed are skipped. Files are validated one
 * at a time; only invalid files are added to `details`.
 *
 * @param {string} assetsDir - Assets root directory
 * @returns {Promise<{valid: number, invalid: number, sanitized: number, details: Array}>}
 */
export async function validateBatch(assetsDir) {
  const summary = { valid: 0, invalid: 0, sanitized: 0, details: [] };

  for (const sub of ['images', 'fonts', 'icons']) {
    const dir = path.join(assetsDir, sub);

    let entries;
    try {
      entries = await fs.readdir(dir);
    } catch {
      continue;
    }

    for (const file of entries) {
      const outcome = await validateAsset(path.join(dir, file));

      if (outcome.sanitized) summary.sanitized += 1;

      if (!outcome.valid) {
        summary.invalid += 1;
        summary.details.push({ file: `${sub}/${file}`, ...outcome });
        continue;
      }
      summary.valid += 1;
    }
  }

  return summary;
}
|
|
100
|
+
|
|
101
|
+
/**
 * Strip dangerous content from SVG markup.
 *
 * Removes <script> elements (paired or self-closing), <iframe>/<object>/<embed>
 * elements, href/xlink:href/src attributes holding a javascript: URI, and
 * on* event-handler attributes (double-quoted, single-quoted, brace-wrapped
 * or unquoted values).
 *
 * The removal is repeated until the markup stops changing, because deleting
 * one fragment can splice the surrounding text into a new dangerous tag
 * (e.g. '<scr<script></script>ipt>').
 *
 * @param {string} content - SVG content
 * @returns {string} Sanitized SVG
 */
export function sanitizeSvg(content) {
  // Upper bound on repeat passes so pathological input cannot loop for long.
  const MAX_PASSES = 10;

  const stripOnce = (svg) => svg
    // Self-closing forms first, so a paired-tag match cannot start at one of
    // them and swallow unrelated markup up to a later closing tag.
    .replace(/<script(\s[^>]*)?\/>/gi, '')
    .replace(/<(iframe|object|embed)(\s[^>]*)?\s*\/>/gi, '')
    // Paired <script> blocks and dangerous elements
    .replace(/<script[\s\S]*?<\/script>/gi, '')
    .replace(/<(iframe|object|embed)[\s\S]*?<\/\1>/gi, '')
    // javascript: URIs in href, xlink:href, src. Each quote style is matched
    // separately so a value containing the other quote character is removed
    // whole instead of being cut off in the middle.
    .replace(/((?:xlink:)?href|src)\s*=\s*(?:"\s*javascript\s*:[^"]*"|'\s*javascript\s*:[^']*')/gi, '')
    // on* event handlers. The unquoted form stops before '/>' so that a
    // self-closing tag stays self-closing.
    .replace(/\son\w+\s*=\s*(?:"[^"]*"|'[^']*'|\{[^}]*\}|(?:[^\s"'>\/]|\/(?!>))+)/gi, '');

  let current = content;
  for (let pass = 0; pass < MAX_PASSES; pass++) {
    const next = stripOnce(current);
    if (next === current) break;
    current = next;
  }
  return current;
}
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Asset Downloader Utilities
|
|
3
|
+
*
|
|
4
|
+
* Handles file downloading with timeout/retry, batch rate-limiting,
|
|
5
|
+
* safe filename generation, and asset-type classification.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { createHash } from 'crypto';
import fs from 'fs/promises';
import path from 'path';
import https from 'https';
import http from 'http';
import { URL } from 'url';
|
|
13
|
+
|
|
14
|
+
// Asset categories: the file extensions each one covers and the output
// folder name associated with it.
export const ASSET_TYPES = {
  images: {
    folder: 'images',
    extensions: ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico', '.avif'],
  },
  fonts: {
    folder: 'fonts',
    extensions: ['.woff', '.woff2', '.ttf', '.otf', '.eot'],
  },
  icons: {
    folder: 'icons',
    extensions: ['.svg'],
  },
};

// Batch download defaults: how many downloads run together, and the pause
// in milliseconds between one batch and the next.
export const RATE_LIMIT = {
  maxConcurrent: 10,
  delayBetweenBatches: 50,
};
|
|
35
|
+
|
|
36
|
+
/**
 * Perform one download attempt, following redirects.
 *
 * Resolves once the body has been written to `destPath`; rejects on timeout,
 * network error, any final status other than 200, an unusable redirect target
 * or when the redirect budget is used up. A 429 rejection carries the
 * suggested wait in `error.retryAfterMs`.
 *
 * @param {string} url
 * @param {string} destPath
 * @param {number} timeout - ms allowed for each request in the redirect chain
 * @param {number} attempt - Zero-based attempt number, used for 429 backoff
 * @param {number} [redirectsLeft=5] - Remaining redirects that may be followed
 * @returns {Promise<{ size: number }>}
 */
function fetchToFile(url, destPath, timeout, attempt, redirectsLeft = 5) {
  return new Promise((resolve, reject) => {
    const client = url.startsWith('https') ? https : http;
    let settled = false;
    let request;

    // Settle the promise exactly once and stop the timeout timer with it.
    const settle = (fn, value) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      fn(value);
    };
    const fail = (err) => settle(reject, err);

    const timer = setTimeout(() => {
      fail(new Error('Download timeout'));
      // Abort the in-flight request so it cannot keep running, or write the
      // file, after the caller has already been told the download failed.
      request?.destroy();
    }, timeout);

    try {
      request = client.get(url, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
          'Accept': '*/*'
        }
      }, (response) => {
        const { statusCode, headers } = response;

        if (statusCode >= 300 && statusCode < 400 && headers.location) {
          response.resume(); // discard the redirect body
          if (redirectsLeft <= 0) {
            fail(new Error('Too many redirects'));
            return;
          }
          let target;
          try {
            // Location may be relative; resolve it against the current URL.
            target = new URL(headers.location, url).href;
          } catch {
            fail(new Error(`Invalid redirect location: ${headers.location}`));
            return;
          }
          // Adopt the redirected attempt: its failure becomes this failure.
          settle(resolve, fetchToFile(target, destPath, timeout, attempt, redirectsLeft - 1));
          return;
        }

        if (statusCode === 429) {
          response.resume();
          const retryAfter = Number.parseInt(headers['retry-after'] || '0', 10);
          const backoffMs = retryAfter > 0
            ? retryAfter * 1000
            : Math.min(1000 * Math.pow(2, attempt), 8000);
          const err = new Error(`HTTP 429 (retry after ${backoffMs}ms)`);
          err.retryAfterMs = backoffMs;
          fail(err);
          return;
        }

        if (statusCode !== 200) {
          response.resume();
          fail(new Error(`HTTP ${statusCode}`));
          return;
        }

        const chunks = [];
        response.on('data', (chunk) => chunks.push(chunk));
        response.on('error', fail);
        response.on('end', async () => {
          if (settled) return; // already timed out or failed
          clearTimeout(timer); // body received; the disk write is not timed
          try {
            const buffer = Buffer.concat(chunks);
            await fs.mkdir(path.dirname(destPath), { recursive: true });
            await fs.writeFile(destPath, buffer);
            settle(resolve, { size: buffer.length });
          } catch (err) {
            fail(err);
          }
        });
      });

      request.on('error', fail);
    } catch (err) {
      // get() throws synchronously for an invalid URL; going through fail()
      // also clears the timer, which would otherwise stay pending.
      fail(err);
    }
  });
}

/**
 * Download a file with timeout and retry.
 *
 * Redirects (up to 5 per attempt) are followed, including relative Location
 * headers. A failure anywhere in the redirect chain counts as a failed
 * attempt. Between attempts the function waits 500ms * attempt number, or the
 * server-suggested / exponential backoff after an HTTP 429.
 *
 * @param {string} url
 * @param {string} destPath - Parent directories are created as needed
 * @param {number} timeout - ms
 * @param {number} retries - Extra attempts after the first one
 * @returns {Promise<{ success: boolean, error?: string }>}
 */
export async function downloadFile(url, destPath, timeout = 30000, retries = 2) {
  for (let attempt = 0; attempt <= retries; attempt++) {
    try {
      await fetchToFile(url, destPath, timeout, attempt);
      return { success: true };
    } catch (err) {
      if (attempt === retries) return { success: false, error: err.message };
      const delay = err.retryAfterMs ?? 500 * (attempt + 1);
      await new Promise((r) => setTimeout(r, delay));
    }
  }

  // Only reachable when `retries` is negative, so that no attempt ran.
  return { success: false, error: 'No download attempts were made' };
}
|
|
110
|
+
|
|
111
|
+
/**
 * Download files in batches with rate limiting.
 *
 * Downloads whose destination file already exists are counted as skipped.
 * Each batch runs concurrently; batches run one after another with a pause
 * in between.
 *
 * @param {Array<{ url: string, destPath: string, type: string }>} downloads
 * @param {boolean} verbose - Log one line per download to stderr
 * @param {Object} [options]
 * @param {number} [options.maxConcurrent] - Batch size; defaults to
 *   RATE_LIMIT.maxConcurrent. Values below 1 or non-numeric are treated as 1.
 * @param {number} [options.delayBetween] - Pause between batches in ms;
 *   defaults to RATE_LIMIT.delayBetweenBatches
 * @returns {Promise<{ success: number, failed: number, skipped: number, errors: Array }>}
 */
export async function downloadBatch(downloads, verbose = false, options = {}) {
  const {
    maxConcurrent = RATE_LIMIT.maxConcurrent,
    delayBetween = RATE_LIMIT.delayBetweenBatches
  } = options;
  const results = { success: 0, failed: 0, skipped: 0, errors: [] };

  // The loop advances by the batch size: 0 would never advance and NaN would
  // end the loop before any download ran, so force a whole number >= 1.
  const batchSize = Math.max(1, Math.floor(Number(maxConcurrent)) || 1);

  for (let i = 0; i < downloads.length; i += batchSize) {
    const batch = downloads.slice(i, i + batchSize);

    await Promise.all(batch.map(async ({ url, destPath, type }) => {
      try {
        await fs.access(destPath);
        results.skipped++;
        return;
      } catch { /* file doesn't exist, continue */ }

      const result = await downloadFile(url, destPath);
      if (result.success) {
        results.success++;
        if (verbose) console.error(`  ✓ ${type}: ${path.basename(destPath)}`);
      } else {
        results.failed++;
        results.errors.push({ url, error: result.error });
        if (verbose) console.error(`  ✗ ${type}: ${path.basename(url)} - ${result.error}`);
      }
    }));

    if (i + batchSize < downloads.length) {
      await new Promise((r) => setTimeout(r, delayBetween));
    }
  }

  return results;
}
|
|
149
|
+
|
|
150
|
+
/**
 * Generate a filesystem-safe filename from a URL.
 *
 * The name is the last segment of the URL path. When the URL has a query
 * string, an 8-character digest of the whole query is appended to the base
 * name so that different variants of one resource (e.g. '?width=100' and
 * '?width=200') get different files. Characters outside [a-zA-Z0-9._-]
 * become '_', an empty base name becomes 'asset', and '.bin' is appended
 * when there is no extension.
 *
 * @param {string} url - Absolute URL
 * @returns {string} Safe filename; `asset-<timestamp>.bin` if the URL cannot be parsed
 */
export function getSafeFilename(url) {
  try {
    const urlObj = new URL(url);
    let filename = path.basename(urlObj.pathname);

    if (urlObj.search) {
      // Digest the entire query string: a prefix of its encoding would be
      // identical for queries that share their first few characters.
      const hash = createHash('sha256').update(urlObj.search).digest('hex').slice(0, 8);
      const ext = path.extname(filename);
      const base = path.basename(filename, ext) || 'asset';
      filename = `${base}-${hash}${ext}`;
    }

    filename = filename.replace(/[^a-zA-Z0-9._-]/g, '_');
    // A URL ending in '/' has no last path segment.
    if (!filename) filename = 'asset';
    if (!path.extname(filename)) filename += '.bin';

    return filename;
  } catch {
    return `asset-${Date.now()}.bin`;
  }
}
|
|
175
|
+
|
|
176
|
+
/**
 * Determine asset type from the extension of the URL's path.
 *
 * '.svg' is always classified as 'icons', although it is also listed among
 * the image extensions. A URL that cannot be parsed is classified as 'other'
 * rather than throwing, matching how getSafeFilename() tolerates bad URLs.
 *
 * @param {string} url - Absolute URL
 * @returns {'fonts'|'icons'|'images'|'other'}
 */
export function getAssetType(url) {
  let ext;
  try {
    ext = path.extname(new URL(url).pathname).toLowerCase();
  } catch {
    return 'other';
  }

  if (ext === '.svg') return 'icons';
  if (ASSET_TYPES.fonts.extensions.includes(ext)) return 'fonts';
  if (ASSET_TYPES.images.extensions.includes(ext)) return 'images';
  return 'other';
}
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Page Asset Scraper
|
|
3
|
+
*
|
|
4
|
+
* Extracts asset URLs from a live Playwright page (images, inline SVGs,
|
|
5
|
+
* CSS stylesheet links) and from raw CSS content (background URLs, font-face).
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { URL } from 'url';
|
|
9
|
+
|
|
10
|
+
/**
 * Parse CSS text for asset URLs.
 *
 * Every `url(...)` reference in the stylesheet is collected, which covers
 * background images as well as @font-face `src` entries. `data:` URIs and
 * fragment-only references such as `url(#gradient)` (which point at an
 * element in the current document, not at a downloadable file) are skipped.
 * References that cannot be resolved against `baseUrl` are ignored.
 *
 * @param {string} cssContent
 * @param {string} baseUrl - URL used to resolve relative references
 * @returns {string[]} Unique absolute asset URLs, in order of first appearance
 */
export function extractCssUrls(cssContent, baseUrl) {
  const urls = new Set();
  const urlPattern = /url\s*\(\s*['"]?([^'")\s]+)['"]?\s*\)/gi;

  for (const [, reference] of cssContent.matchAll(urlPattern)) {
    if (reference.startsWith('data:') || reference.startsWith('#')) continue;
    try {
      urls.add(new URL(reference, baseUrl).href);
    } catch { /* ignore */ }
  }

  return Array.from(urls);
}
|
|
43
|
+
|
|
44
|
+
/**
 * Extract all asset URLs and inline SVGs from a Playwright page.
 *
 * Collected as images: img[src], every srcset candidate, every url(...) in
 * inline `style` attributes that mention "background", and link[rel*="icon"]
 * hrefs. `data:` URIs are skipped and relative URLs are resolved against
 * `baseUrl`. Inline <svg> elements shorter than 50000 characters are returned
 * with their markup, and link[rel="stylesheet"] hrefs are returned as cssUrls.
 *
 * @param {import('playwright').Page} page
 * @param {string} baseUrl - URL used to resolve relative references
 * @returns {Promise<{ images: string[], cssUrls: string[], inlineSvgs: Array<{id: string, content: string}> }>}
 */
export async function extractAssetsFromPage(page, baseUrl) {
  // The callback is handed to page.evaluate() and reads `document`, so every
  // helper it needs is defined inside it rather than in the enclosing module.
  return await page.evaluate((url) => {
    const imageSet = new Set();
    const cssUrls = [];
    const inlineSvgs = [];

    // Resolve a reference against the base URL and record it as an image;
    // empty values, data: URIs and unresolvable references are ignored.
    const addImage = (reference) => {
      if (!reference || reference.startsWith('data:')) return;
      try { imageSet.add(new URL(reference, url).href); } catch { /* ignore */ }
    };

    // img[src]
    document.querySelectorAll('img[src]').forEach((img) => {
      addImage(img.getAttribute('src'));
    });

    // srcset: comma-separated candidates, the URL is the first token of each
    document.querySelectorAll('[srcset]').forEach((el) => {
      const srcset = el.getAttribute('srcset');
      if (!srcset) return;
      srcset.split(',').forEach((part) => {
        addImage(part.trim().split(/\s+/)[0]);
      });
    });

    // Inline background images. A declaration may stack several layers
    // ("url(a.png), url(b.png)"), so every url(...) is collected.
    document.querySelectorAll('[style*="background"]').forEach((el) => {
      const style = el.getAttribute('style') || '';
      for (const [, reference] of style.matchAll(/url\s*\(\s*['"]?([^'")\s]+)['"]?\s*\)/gi)) {
        addImage(reference);
      }
    });

    // Favicon and touch icons
    document.querySelectorAll('link[rel*="icon"]').forEach((link) => {
      addImage(link.getAttribute('href'));
    });

    // Inline SVGs
    document.querySelectorAll('svg').forEach((svg, index) => {
      const svgContent = svg.outerHTML;
      if (svgContent.length < 50000) {
        inlineSvgs.push({
          id: svg.id || `inline-svg-${index}`,
          content: svgContent
        });
      }
    });

    // External CSS stylesheets (for font extraction)
    document.querySelectorAll('link[rel="stylesheet"]').forEach((link) => {
      const href = link.getAttribute('href');
      if (href) {
        try { cssUrls.push(new URL(href, url).href); } catch { /* ignore */ }
      }
    });

    return { images: Array.from(imageSet), cssUrls, inlineSvgs };
  }, baseUrl);
}
|