design-clone 1.0.2 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/SKILL.md +53 -0
- package/bin/cli.js +16 -0
- package/bin/commands/clone-site.js +324 -0
- package/bin/commands/help.js +16 -4
- package/bin/commands/init.js +29 -1
- package/commands/design/clone-site.md +135 -0
- package/docs/troubleshooting.md +72 -0
- package/package.json +2 -1
- package/src/core/css-extractor.js +38 -13
- package/src/core/design-tokens.js +103 -0
- package/src/core/discover-pages.js +314 -0
- package/src/core/html-extractor.js +72 -3
- package/src/core/merge-css.js +407 -0
- package/src/core/multi-page-screenshot.js +377 -0
- package/src/core/rewrite-links.js +226 -0
- package/src/core/screenshot.js +18 -1
package/docs/troubleshooting.md
CHANGED
|
@@ -95,3 +95,75 @@ GEMINI_API_KEY=your-api-key-here
|
|
|
95
95
|
```bash
|
|
96
96
|
--viewports '[{"width":1440,"height":900,"name":"custom"}]'
|
|
97
97
|
```
|
|
98
|
+
|
|
99
|
+
## clone-site Issues
|
|
100
|
+
|
|
101
|
+
### No pages discovered
|
|
102
|
+
|
|
103
|
+
**Symptom:** Only homepage cloned, other pages not found.
|
|
104
|
+
|
|
105
|
+
**Causes:**
|
|
106
|
+
- Site uses JS-rendered navigation (React/Vue/Angular)
|
|
107
|
+
- Navigation not in standard selectors (header nav, footer nav)
|
|
108
|
+
|
|
109
|
+
**Solutions:**
|
|
110
|
+
```bash
|
|
111
|
+
# Specify pages manually
|
|
112
|
+
design-clone clone-site https://example.com --pages /,/about,/contact,/services
|
|
113
|
+
|
|
114
|
+
# Increase max pages if hitting limit
|
|
115
|
+
design-clone clone-site https://example.com --max-pages 20
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### Links not working in cloned pages
|
|
119
|
+
|
|
120
|
+
**Symptom:** Internal links point to original URLs.
|
|
121
|
+
|
|
122
|
+
**Causes:**
|
|
123
|
+
- Page not in discovered list
|
|
124
|
+
- HTML file not found for rewriting
|
|
125
|
+
|
|
126
|
+
**Solutions:**
|
|
127
|
+
1. Check `manifest.json` for page list
|
|
128
|
+
2. Ensure all pages captured successfully (check `capture-results.json`)
|
|
129
|
+
3. Re-run with manual `--pages` flag including missing pages
|
|
130
|
+
|
|
131
|
+
### CSS broken on some pages
|
|
132
|
+
|
|
133
|
+
**Symptom:** Styling differs between cloned pages.
|
|
134
|
+
|
|
135
|
+
**Causes:**
|
|
136
|
+
- Page-specific CSS not merged
|
|
137
|
+
- CSS extraction failed for some pages
|
|
138
|
+
|
|
139
|
+
**Solutions:**
|
|
140
|
+
1. Check `css/` folder for per-page CSS files
|
|
141
|
+
2. Review merge stats in output
|
|
142
|
+
3. Try with fewer pages to isolate issue
|
|
143
|
+
|
|
144
|
+
### Timeout during capture
|
|
145
|
+
|
|
146
|
+
**Error:** `Navigation timeout`
|
|
147
|
+
|
|
148
|
+
**Causes:**
|
|
149
|
+
- Large pages
|
|
150
|
+
- Slow server
|
|
151
|
+
- Too many pages
|
|
152
|
+
|
|
153
|
+
**Solutions:**
|
|
154
|
+
```bash
|
|
155
|
+
# Reduce pages
|
|
156
|
+
design-clone clone-site https://example.com --max-pages 5
|
|
157
|
+
|
|
158
|
+
# Use specific viewports only
|
|
159
|
+
design-clone clone-site https://example.com --viewports desktop
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### Memory issues
|
|
163
|
+
|
|
164
|
+
**Symptom:** Process crashes or hangs.
|
|
165
|
+
|
|
166
|
+
**Solutions:**
|
|
167
|
+
1. Reduce `--max-pages` to 5 or fewer
|
|
168
|
+
2. Clone in batches
|
|
169
|
+
3. Close other applications
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "design-clone",
|
|
3
|
-
"version": "1.0.2",
|
|
3
|
+
"version": "1.1.1",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Claude Code skill for cloning website designs via multi-viewport screenshots, HTML/CSS extraction, and Gemini AI analysis",
|
|
6
6
|
"bin": {
|
|
@@ -9,6 +9,7 @@
|
|
|
9
9
|
"files": [
|
|
10
10
|
"bin/",
|
|
11
11
|
"src/",
|
|
12
|
+
"commands/",
|
|
12
13
|
"templates/",
|
|
13
14
|
"docs/",
|
|
14
15
|
"SKILL.md",
|
|
@@ -9,6 +9,27 @@
|
|
|
9
9
|
export const MAX_CSS_SIZE = 5 * 1024 * 1024; // 5MB limit
|
|
10
10
|
export const MAX_CSS_RULES_WARN = 5000; // Warn on large stylesheets
|
|
11
11
|
|
|
12
|
+
// CSS properties (camelCase, as exposed on CSSStyleDeclaration) that are
// captured from computed styles, grouped by the layout concern they cover.
export const LAYOUT_PROPERTIES = {
  // Display mode and flexbox behaviour
  display: [
    'display',
    'flexDirection',
    'flexWrap',
    'justifyContent',
    'alignItems',
    'alignContent',
    'gap',
    'rowGap',
    'columnGap'
  ],
  // Grid tracks and flow
  grid: [
    'gridTemplateColumns',
    'gridTemplateRows',
    'gridGap',
    'gridAutoFlow'
  ],
  // Positioning scheme and offsets
  position: [
    'position',
    'top',
    'right',
    'bottom',
    'left',
    'zIndex'
  ],
  // Dimensions and their constraints
  sizing: [
    'width',
    'height',
    'minWidth',
    'maxWidth',
    'minHeight',
    'maxHeight'
  ],
  // Box model (order matters: consumers slice the first entries of this group)
  box: [
    'boxSizing',
    'overflow',
    'overflowX',
    'overflowY',
    'borderWidth',
    'borderStyle'
  ],
  // Visual properties (existing set)
  visual: [
    'color',
    'backgroundColor',
    'fontFamily',
    'fontSize',
    'fontWeight',
    'lineHeight',
    'padding',
    'margin',
    'borderRadius'
  ]
};

// Single flat list of every property above, in group declaration order
export const ALL_PROPERTIES = [].concat(...Object.values(LAYOUT_PROPERTIES));
|
|
32
|
+
|
|
12
33
|
/**
|
|
13
34
|
* Extract all CSS from page
|
|
14
35
|
* @param {Page} page - Puppeteer page
|
|
@@ -16,7 +37,7 @@ export const MAX_CSS_RULES_WARN = 5000; // Warn on large stylesheets
|
|
|
16
37
|
* @returns {Promise<{cssBlocks: Array, corsBlocked: Array, computedStyles: Object, totalRules: number, warnings: Array}>}
|
|
17
38
|
*/
|
|
18
39
|
export async function extractAllCss(page, baseUrl) {
|
|
19
|
-
return await page.evaluate((url) => {
|
|
40
|
+
return await page.evaluate((url, allProps) => {
|
|
20
41
|
const cssBlocks = [];
|
|
21
42
|
const corsBlocked = [];
|
|
22
43
|
const warnings = [];
|
|
@@ -77,17 +98,21 @@ export async function extractAllCss(page, baseUrl) {
|
|
|
77
98
|
const el = document.querySelector(selector);
|
|
78
99
|
if (el) {
|
|
79
100
|
const style = getComputedStyle(el);
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
101
|
+
const styles = {};
|
|
102
|
+
|
|
103
|
+
// Extract all layout + visual properties
|
|
104
|
+
allProps.forEach(prop => {
|
|
105
|
+
const value = style[prop];
|
|
106
|
+
// Skip empty/default values to reduce payload (except display)
|
|
107
|
+
if (prop === 'display') {
|
|
108
|
+
styles[prop] = value; // Always include display for inline strategy
|
|
109
|
+
} else if (value && value !== 'none' && value !== 'auto' &&
|
|
110
|
+
value !== 'normal' && value !== '0px' && value !== 'static') {
|
|
111
|
+
styles[prop] = value;
|
|
112
|
+
}
|
|
113
|
+
});
|
|
114
|
+
|
|
115
|
+
computedStyles[selector] = styles;
|
|
91
116
|
}
|
|
92
117
|
} catch (e) {
|
|
93
118
|
// Ignore invalid selectors
|
|
@@ -103,5 +128,5 @@ export async function extractAllCss(page, baseUrl) {
|
|
|
103
128
|
totalRules,
|
|
104
129
|
warnings
|
|
105
130
|
};
|
|
106
|
-
}, baseUrl);
|
|
131
|
+
}, baseUrl, ALL_PROPERTIES);
|
|
107
132
|
}
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Design Tokens Extraction Wrapper
|
|
3
|
+
*
|
|
4
|
+
* Wraps the Python script for extracting design tokens from screenshots.
|
|
5
|
+
*
|
|
6
|
+
* Usage:
|
|
7
|
+
* import { extractDesignTokens } from './design-tokens.js';
|
|
8
|
+
* const result = await extractDesignTokens(outputDir, cssPath);
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { spawn } from 'child_process';
|
|
12
|
+
import path from 'path';
|
|
13
|
+
import { fileURLToPath } from 'url';
|
|
14
|
+
|
|
15
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
16
|
+
const __dirname = path.dirname(__filename);
|
|
17
|
+
|
|
18
|
+
/**
 * Run the bundled Python token-extraction script and return its JSON result.
 *
 * The script is located at ../ai/extract-design-tokens.py relative to this
 * module and is launched with `python3`. It is expected to print a single
 * JSON document on stdout (per this module's header it analyses screenshots
 * with the Gemini Vision API).
 *
 * This function never rejects: every failure is reported as a resolved
 * object of the form { success: false, error, hint? }.
 *
 * @param {string} outputDir - Output directory (screenshots are read from
 *   <outputDir>/analysis/desktop and results are written to outputDir)
 * @param {string|null} [cssPath=null] - Optional path to the merged CSS file,
 *   forwarded to the script as --css
 * @returns {Promise<Object>} Parsed script output on success, otherwise
 *   { success: false, error: string, hint?: string|null }
 */
export async function extractDesignTokens(outputDir, cssPath = null) {
  const scriptPath = path.resolve(__dirname, '../ai/extract-design-tokens.py');
  const screenshotsDir = path.join(outputDir, 'analysis', 'desktop');

  // Build args
  const args = [
    scriptPath,
    '--screenshots', screenshotsDir,
    '--output', outputDir
  ];

  if (cssPath) {
    args.push('--css', cssPath);
  }

  return new Promise((resolve) => {
    const proc = spawn('python3', args, {
      stdio: ['ignore', 'pipe', 'pipe'],
      env: { ...process.env }
    });

    // Decode at the stream level so a multi-byte UTF-8 character that is
    // split across two chunks is not corrupted (per-chunk toString() would
    // turn each half into a replacement character).
    proc.stdout.setEncoding('utf8');
    proc.stderr.setEncoding('utf8');

    let stdout = '';
    let stderr = '';

    proc.stdout.on('data', (chunk) => {
      stdout += chunk;
    });

    proc.stderr.on('data', (chunk) => {
      stderr += chunk;
    });

    proc.on('close', (code, signal) => {
      if (code !== 0) {
        // The script reports its own failures as JSON on stdout; prefer that
        // over raw stderr when it is parseable.
        try {
          const errorResult = JSON.parse(stdout);
          resolve({
            success: false,
            error: errorResult.error || 'Unknown error',
            hint: errorResult.hint || null
          });
        } catch {
          // code is null when the child was terminated by a signal
          const exitReason = code === null
            ? `Process terminated by signal ${signal}`
            : `Process exited with code ${code}`;
          resolve({
            success: false,
            error: stderr || exitReason
          });
        }
        return;
      }

      // Parse success result
      try {
        resolve(JSON.parse(stdout));
      } catch (err) {
        resolve({
          success: false,
          error: `Failed to parse output: ${err.message}`
        });
      }
    });

    // Spawn-level failures (e.g. interpreter missing). Resolving here first
    // makes any later 'close' resolution a no-op.
    proc.on('error', (err) => {
      if (err.code === 'ENOENT') {
        resolve({
          success: false,
          error: 'Python3 not found',
          hint: 'Install Python 3 to enable AI token extraction'
        });
      } else {
        resolve({
          success: false,
          error: err.message
        });
      }
    });
  });
}
|
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Page Discovery Module
|
|
3
|
+
*
|
|
4
|
+
* Extracts navigation links from a website to discover cloneable pages.
|
|
5
|
+
* Handles SPA hydration, filters external links, and normalizes URLs.
|
|
6
|
+
*
|
|
7
|
+
* Usage:
|
|
8
|
+
* import { discoverPages } from './discover-pages.js';
|
|
9
|
+
* const result = await discoverPages('https://example.com', { maxPages: 10 });
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import { getBrowser, getPage, disconnectBrowser } from '../utils/browser.js';
|
|
13
|
+
import { waitForDomStable, waitForPageReady } from './page-readiness.js';
|
|
14
|
+
import { dismissCookieBanner } from './cookie-handler.js';
|
|
15
|
+
|
|
16
|
+
// Anchor selectors used to find navigation links, highest priority first
const NAV_SELECTORS = [
  // header area
  'header nav a',
  'header a',
  // generic navigation containers
  'nav a',
  '[role="navigation"] a',
  '.navbar a',
  '.nav-menu a',
  '.navigation a',
  // footer area
  'footer nav a',
  'footer a'
];

// Hosts of social networks whose links are never treated as site pages
const SOCIAL_HOSTS = ['facebook', 'twitter', 'instagram', 'linkedin', 'youtube', 'tiktok'];

// Any href matching one of these patterns is dropped from the discovered links
const EXCLUDE_PATTERNS = [
  // non-page schemes and in-page anchors
  /^mailto:/i,
  /^tel:/i,
  /^javascript:/i,
  /^#/,
  // downloadable / media assets
  /\.(pdf|jpg|jpeg|png|gif|svg|webp|ico|zip|tar|gz|mp3|mp4|avi|mov)$/i,
  // social networks (yields /facebook\.com/i, /twitter\.com/i, ...)
  ...SOCIAL_HOSTS.map(host => new RegExp(`${host}\\.com`, 'i'))
];

// Option defaults applied when the caller omits a value
const DEFAULT_OPTIONS = {
  maxPages: 10,
  // null means "fall back to NAV_SELECTORS"
  selectors: null,
  includeSubdomains: false,
  timeout: 30000
};
|
|
51
|
+
|
|
52
|
+
/**
 * Normalize a URL for comparison and deduplication.
 *
 * The href is resolved against baseUrl and reduced to origin + pathname:
 * query string and fragment are discarded, and a trailing slash is removed
 * from every path except the root.
 *
 * @param {string} baseUrl - Base URL for resolving relative paths
 * @param {string} href - URL to normalize
 * @returns {string|null} Normalized URL, or null when href is empty, not a
 *   string, unparseable, or not an http(s) URL
 */
export function normalizeUrl(baseUrl, href) {
  if (!href || typeof href !== 'string') return null;

  let url;
  try {
    url = new URL(href, baseUrl);
  } catch {
    return null;
  }

  // Exact match on the scheme: a prefix test would also accept schemes such
  // as "httpx:", whose origin serializes to the string "null".
  if (url.protocol !== 'http:' && url.protocol !== 'https:') return null;

  // Build normalized URL: origin + pathname (no hash, no query)
  const normalized = url.origin + url.pathname;

  // Remove trailing slash (except for root)
  if (url.pathname !== '/' && normalized.endsWith('/')) {
    return normalized.slice(0, -1);
  }

  return normalized;
}
|
|
80
|
+
|
|
81
|
+
/**
 * Decide whether a URL's host belongs to the base domain.
 *
 * Hostnames are compared case-insensitively. An exact match always counts;
 * a subdomain ("<anything>.<baseDomain>") counts only when includeSubdomains
 * is set. Any parsing or type error yields false.
 *
 * @param {string} url - Absolute URL to check
 * @param {string} baseDomain - Hostname to compare against
 * @param {boolean} includeSubdomains - Whether subdomains are accepted
 * @returns {boolean}
 */
export function isSameDomain(url, baseDomain, includeSubdomains = false) {
  let host;
  let expected;

  try {
    host = new URL(url).hostname.toLowerCase();
    expected = baseDomain.toLowerCase();
  } catch {
    // Unparseable URL or non-string base domain
    return false;
  }

  if (host === expected) {
    return true;
  }

  return includeSubdomains ? host.endsWith(`.${expected}`) : false;
}
|
|
105
|
+
|
|
106
|
+
/**
 * Derive a human-readable page name from link text or, failing that, from
 * the last segment of the URL path.
 *
 * Link text is used verbatim when it is non-empty and shorter than 50
 * characters. Otherwise the last path segment is percent-decoded,
 * hyphens/underscores become spaces, and each word is capitalized.
 * An empty or root path yields 'Home'.
 *
 * @param {string} text - Link text
 * @param {string} path - URL path
 * @returns {string} Page name
 */
export function extractPageName(text, path) {
  // Use link text if available and meaningful
  if (text && text.length > 0 && text.length < 50) {
    return text;
  }

  // Extract from path
  if (!path || path === '/') return 'Home';

  // Get last segment of path
  const segments = path.split('/').filter(Boolean);
  if (segments.length === 0) return 'Home';

  let lastSegment = segments[segments.length - 1];

  // URL pathnames keep percent-encoding (e.g. "%C3%BCber"); decode so the
  // name shows the real characters. Malformed escapes keep the raw segment.
  try {
    lastSegment = decodeURIComponent(lastSegment);
  } catch {
    // leave lastSegment untouched
  }

  // Convert kebab-case/snake_case to Title Case. The word-start test is
  // Unicode-aware (ASCII-only \b\w would capitalize the letter *after* a
  // leading non-ASCII character); for pure-ASCII input it matches \b\w.
  return lastSegment
    .replace(/[-_]/g, ' ')
    .replace(/(?<![\p{L}\p{M}\p{N}_])[\p{L}\p{N}_]/gu, c => c.toUpperCase());
}
|
|
132
|
+
|
|
133
|
+
/**
 * Check if href should be excluded from page discovery.
 *
 * An href is excluded when it is empty or matches any EXCLUDE_PATTERNS
 * entry. Each pattern is tested against the full href and against the href
 * with its query string / fragment removed, so end-anchored patterns (the
 * file-extension filter) still catch links such as "/doc.pdf?download=1".
 *
 * @param {string} href - URL to check
 * @returns {boolean} true when the link must be skipped
 */
function shouldExclude(href) {
  if (!href) return true;

  const full = String(href);
  // Everything before the first "?" or "#"
  const withoutQuery = full.split(/[?#]/, 1)[0];

  return EXCLUDE_PATTERNS.some(
    pattern => pattern.test(full) || pattern.test(withoutQuery)
  );
}
|
|
142
|
+
|
|
143
|
+
/**
 * Discover pages of a website by extracting its navigation links.
 *
 * Loads baseUrl in a headless browser, collects anchors matching the
 * configured selectors, filters out excluded / off-domain / duplicate links
 * and returns the remaining pages with the homepage always listed first.
 *
 * The function does not throw: on any failure it returns a result with
 * success: false, the error message, and a page list containing only the
 * homepage.
 *
 * @param {string} baseUrl - Starting URL to discover from
 * @param {Object} options - Discovery options (merged over DEFAULT_OPTIONS)
 * @param {number} [options.maxPages] - Upper bound on returned pages,
 *   homepage included (the homepage is always returned)
 * @param {string[]|null} [options.selectors] - Anchor selectors; falls back
 *   to NAV_SELECTORS when null
 * @param {boolean} [options.includeSubdomains] - Accept links on subdomains
 * @param {number} [options.timeout] - Navigation timeout in milliseconds
 * @returns {Promise<Object>} { success, baseUrl, baseDomain?, pages, stats, error? }
 *   where pages is an array of { path, name, url }
 */
export async function discoverPages(baseUrl, options = {}) {
  const opts = { ...DEFAULT_OPTIONS, ...options };
  const startTime = Date.now();

  let browser = null;

  try {
    // Parse base URL (throws on invalid input, handled by the catch below)
    const baseUrlObj = new URL(baseUrl);
    const baseDomain = baseUrlObj.hostname;

    // Launch browser
    browser = await getBrowser({ headless: true });
    const page = await getPage(browser);

    // Navigate to page
    await page.goto(baseUrl, {
      waitUntil: ['load', 'networkidle0'],
      timeout: opts.timeout
    });

    // Wait for SPA hydration; best effort, a site without nav links is fine
    await page.waitForSelector('nav a, header a, [role="navigation"] a', {
      visible: true,
      timeout: 5000
    }).catch(() => {});

    await waitForDomStable(page, 500, 5000);

    // Dismiss cookie banner if present
    await dismissCookieBanner(page);

    // Wait a bit more for any dynamic content
    await new Promise(r => setTimeout(r, 1000));

    // Extract links using selectors
    const selectors = opts.selectors || NAV_SELECTORS;
    const selectorString = selectors.join(', ');

    // el.href is the browser-resolved absolute URL; failures yield no links
    const rawLinks = await page.$$eval(selectorString, (elements) => {
      return elements.map(el => ({
        href: el.href,
        text: el.textContent?.trim() || '',
        tagName: el.tagName
      }));
    }).catch(() => []);

    // Process and filter links
    const seenUrls = new Set();
    const pages = [];

    // Always include homepage first
    const homeUrl = normalizeUrl(baseUrl, '/');
    if (homeUrl) {
      seenUrls.add(homeUrl);
      pages.push({
        path: '/',
        name: 'Home',
        url: homeUrl
      });
    }

    for (const link of rawLinks) {
      // Enforce the limit BEFORE adding: checking after the push let
      // maxPages: 1 return the homepage plus one extra page.
      if (pages.length >= opts.maxPages) break;

      // Skip excluded patterns
      if (shouldExclude(link.href)) continue;

      // Normalize URL
      const normalized = normalizeUrl(baseUrl, link.href);
      if (!normalized) continue;

      // Skip if already seen
      if (seenUrls.has(normalized)) continue;

      // Check same domain
      if (!isSameDomain(normalized, baseDomain, opts.includeSubdomains)) continue;

      // Extract path
      const path = new URL(normalized).pathname;

      // Skip homepage (already added)
      if (path === '/') continue;

      // Add to results
      seenUrls.add(normalized);
      pages.push({
        path,
        name: extractPageName(link.text, path),
        url: normalized
      });
    }

    // Sort by path depth (shallow first), homepage pinned to the front
    pages.sort((a, b) => {
      if (a.path === '/') return -1;
      if (b.path === '/') return 1;
      const depthA = (a.path.match(/\//g) || []).length;
      const depthB = (b.path.match(/\//g) || []).length;
      return depthA - depthB;
    });

    return {
      success: true,
      baseUrl: baseUrlObj.origin,
      baseDomain,
      pages,
      stats: {
        totalLinksFound: rawLinks.length,
        pagesDiscovered: pages.length,
        durationMs: Date.now() - startTime
      }
    };
  } catch (error) {
    // Degrade to a homepage-only result so callers can still proceed
    return {
      success: false,
      baseUrl,
      pages: [{
        path: '/',
        name: 'Home',
        url: normalizeUrl(baseUrl, '/') || baseUrl
      }],
      error: error.message,
      stats: {
        totalLinksFound: 0,
        pagesDiscovered: 1,
        durationMs: Date.now() - startTime
      }
    };
  } finally {
    // Only release the browser if it was actually acquired
    if (browser) {
      await disconnectBrowser();
    }
  }
}
|
|
289
|
+
|
|
290
|
+
// CLI support: run discovery when this file is the script passed to node.
// Usage: node discover-pages.js <url> [maxPages]
// The check is a substring match on the script path (an endsWith test on
// 'discover-pages.js' would be implied by it, so it is not repeated).
const isMainModule = Boolean(process.argv[1]) &&
  process.argv[1].includes('discover-pages');

if (isMainModule) {
  const url = process.argv[2];
  // Explicit radix so the argument is always read as decimal; missing,
  // non-numeric or zero values fall back to 10.
  const maxPages = Number.parseInt(process.argv[3], 10) || 10;

  if (!url) {
    console.error('Usage: node discover-pages.js <url> [maxPages]');
    process.exit(1);
  }

  // Print the result as JSON; exit status reflects success
  discoverPages(url, { maxPages })
    .then(result => {
      console.log(JSON.stringify(result, null, 2));
      process.exit(result.success ? 0 : 1);
    })
    .catch(err => {
      console.error(JSON.stringify({ success: false, error: err.message }));
      process.exit(1);
    });
}
|
|
@@ -5,6 +5,8 @@
|
|
|
5
5
|
* event handlers, and framework-specific attributes.
|
|
6
6
|
*/
|
|
7
7
|
|
|
8
|
+
import { LAYOUT_PROPERTIES } from './css-extractor.js';
|
|
9
|
+
|
|
8
10
|
// Size limits
|
|
9
11
|
export const MAX_HTML_SIZE = 10 * 1024 * 1024; // 10MB limit
|
|
10
12
|
export const MAX_DOM_ELEMENTS = 50000; // Warn on large DOMs
|
|
@@ -16,6 +18,20 @@ export const JS_FRAMEWORK_PATTERNS = [
|
|
|
16
18
|
/^data-alpine/i, /^wire:/i, /^@/
|
|
17
19
|
];
|
|
18
20
|
|
|
21
|
+
// Layout-only (non-visual) properties written as inline styles on critical
// elements. Built from the shared LAYOUT_PROPERTIES groups in css-extractor
// so the two modules stay in sync.
export const INLINE_LAYOUT_PROPS = [].concat(
  LAYOUT_PROPERTIES.display,
  LAYOUT_PROPERTIES.grid,
  LAYOUT_PROPERTIES.position,
  LAYOUT_PROPERTIES.sizing,
  // first two box entries only: boxSizing and overflow
  // (overflowX/overflowY and the border properties are left out)
  LAYOUT_PROPERTIES.box.slice(0, 2)
);

// An element is "critical" when its computed display or position is listed
// below. Sticky positioning is deliberately absent to avoid scroll-context
// side effects.
export const CRITICAL_DISPLAY = [
  'flex',
  'inline-flex',
  'grid',
  'inline-grid'
];
export const CRITICAL_POSITION = [
  'absolute',
  'fixed'
];
|
|
34
|
+
|
|
19
35
|
/**
|
|
20
36
|
* Extract and clean HTML from page
|
|
21
37
|
* @param {Page} page - Puppeteer page
|
|
@@ -23,7 +39,7 @@ export const JS_FRAMEWORK_PATTERNS = [
|
|
|
23
39
|
* @returns {Promise<{html: string, warnings: string[], elementCount: number}>}
|
|
24
40
|
*/
|
|
25
41
|
export async function extractCleanHtml(page, frameworkPatterns = JS_FRAMEWORK_PATTERNS) {
|
|
26
|
-
return await page.evaluate((patterns) => {
|
|
42
|
+
return await page.evaluate((patterns, inlineProps, criticalDisplay, criticalPosition) => {
|
|
27
43
|
const warnings = [];
|
|
28
44
|
|
|
29
45
|
// Check DOM size
|
|
@@ -72,6 +88,58 @@ export async function extractCleanHtml(page, frameworkPatterns = JS_FRAMEWORK_PA
|
|
|
72
88
|
});
|
|
73
89
|
});
|
|
74
90
|
|
|
91
|
+
// Inline computed styles on critical elements (flex/grid/positioned)
|
|
92
|
+
// Using index-based matching for reliability
|
|
93
|
+
const inlineStyles = [];
|
|
94
|
+
let inlinedCount = 0;
|
|
95
|
+
|
|
96
|
+
document.querySelectorAll('*').forEach((liveEl, idx) => {
|
|
97
|
+
const style = getComputedStyle(liveEl);
|
|
98
|
+
const display = style.display;
|
|
99
|
+
const position = style.position;
|
|
100
|
+
|
|
101
|
+
// Only critical elements (flex/grid containers, absolute/fixed positioned)
|
|
102
|
+
if (criticalDisplay.includes(display) || criticalPosition.includes(position)) {
|
|
103
|
+
const props = [];
|
|
104
|
+
inlineProps.forEach(prop => {
|
|
105
|
+
const val = style[prop];
|
|
106
|
+
// Skip defaults/empty values
|
|
107
|
+
if (val && val !== 'auto' && val !== 'none' && val !== 'normal' &&
|
|
108
|
+
val !== '0px' && val !== 'static' && val !== 'visible' &&
|
|
109
|
+
val !== 'content-box') {
|
|
110
|
+
// Convert camelCase to kebab-case
|
|
111
|
+
const cssProp = prop.replace(/([A-Z])/g, '-$1').toLowerCase();
|
|
112
|
+
props.push(`${cssProp}: ${val}`);
|
|
113
|
+
}
|
|
114
|
+
});
|
|
115
|
+
|
|
116
|
+
// Always include display for critical elements
|
|
117
|
+
if (!props.some(p => p.startsWith('display:'))) {
|
|
118
|
+
props.unshift(`display: ${display}`);
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
if (props.length > 0) {
|
|
122
|
+
inlineStyles.push({ idx, style: props.join('; ') });
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
});
|
|
126
|
+
|
|
127
|
+
// Apply to cloned doc using index matching
|
|
128
|
+
const clonedElements = doc.querySelectorAll('*');
|
|
129
|
+
inlineStyles.forEach(({ idx, style }) => {
|
|
130
|
+
if (clonedElements[idx]) {
|
|
131
|
+
const existing = clonedElements[idx].getAttribute('style') || '';
|
|
132
|
+
clonedElements[idx].setAttribute('style',
|
|
133
|
+
existing ? `${existing}; ${style}` : style);
|
|
134
|
+
inlinedCount++;
|
|
135
|
+
}
|
|
136
|
+
});
|
|
137
|
+
|
|
138
|
+
// Track for warnings
|
|
139
|
+
if (inlinedCount > 100) {
|
|
140
|
+
warnings.push(`Inlined ${inlinedCount} critical elements`);
|
|
141
|
+
}
|
|
142
|
+
|
|
75
143
|
// Remove hidden elements
|
|
76
144
|
doc.querySelectorAll('[hidden], [style*="display: none"], [style*="display:none"]')
|
|
77
145
|
.forEach(el => el.remove());
|
|
@@ -97,6 +165,7 @@ export async function extractCleanHtml(page, frameworkPatterns = JS_FRAMEWORK_PA
|
|
|
97
165
|
(document.documentElement.lang || 'en') + '">\n' +
|
|
98
166
|
doc.innerHTML + '\n</html>';
|
|
99
167
|
|
|
100
|
-
return { html, warnings, elementCount };
|
|
101
|
-
}, frameworkPatterns.map(r => ({ source: r.source, flags: r.flags }))
|
|
168
|
+
return { html, warnings, elementCount, inlinedCount };
|
|
169
|
+
}, frameworkPatterns.map(r => ({ source: r.source, flags: r.flags })),
|
|
170
|
+
INLINE_LAYOUT_PROPS, CRITICAL_DISPLAY, CRITICAL_POSITION);
|
|
102
171
|
}
|