design-clone 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,103 @@
1
+ /**
2
+ * Design Tokens Extraction Wrapper
3
+ *
4
+ * Wraps the Python script for extracting design tokens from screenshots.
5
+ *
6
+ * Usage:
7
+ * import { extractDesignTokens } from './design-tokens.js';
8
+ * const result = await extractDesignTokens(outputDir, cssPath);
9
+ */
10
+
11
+ import { spawn } from 'child_process';
12
+ import path from 'path';
13
+ import { fileURLToPath } from 'url';
14
+
15
// ES modules have no built-in __dirname; derive it from this module's URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/**
 * Run the design-token extraction Python script and return its JSON result.
 *
 * Spawns `python3 ../ai/extract-design-tokens.py` (resolved relative to this
 * module) with `--screenshots <outputDir>/analysis/desktop`, `--output <outputDir>`
 * and, when given, `--css <cssPath>`. The returned promise never rejects:
 * every failure is reported as `{ success: false, error, hint? }`.
 *
 * @param {string} outputDir - Output directory (screenshots are read from analysis/desktop inside it)
 * @param {string|null} [cssPath=null] - Optional path to a merged CSS file passed as `--css`
 * @returns {Promise<Object>} On exit code 0, the JSON object printed by the script
 *   on stdout; otherwise `{ success: false, error, hint? }`
 */
export async function extractDesignTokens(outputDir, cssPath = null) {
  const scriptPath = path.resolve(__dirname, '../ai/extract-design-tokens.py');
  const screenshotsDir = path.join(outputDir, 'analysis', 'desktop');

  const args = [
    scriptPath,
    '--screenshots', screenshotsDir,
    '--output', outputDir
  ];

  if (cssPath) {
    args.push('--css', cssPath);
  }

  return new Promise((resolve) => {
    const proc = spawn('python3', args, {
      stdio: ['ignore', 'pipe', 'pipe'],
      env: { ...process.env }
    });

    let stdout = '';
    let stderr = '';

    // Decode at the stream level: converting each Buffer chunk on its own
    // corrupts multi-byte UTF-8 characters that straddle a chunk boundary.
    proc.stdout.setEncoding('utf8');
    proc.stderr.setEncoding('utf8');

    proc.stdout.on('data', (chunk) => {
      stdout += chunk;
    });

    proc.stderr.on('data', (chunk) => {
      stderr += chunk;
    });

    proc.on('close', (code, signal) => {
      if (code !== 0) {
        // The script reports its own failures as JSON on stdout; prefer that.
        try {
          const errorResult = JSON.parse(stdout);
          resolve({
            success: false,
            error: errorResult.error || 'Unknown error',
            hint: errorResult.hint || null
          });
        } catch {
          // `code` is null when the child was killed by a signal.
          const reason = signal
            ? `Process terminated by signal ${signal}`
            : `Process exited with code ${code}`;
          resolve({
            success: false,
            error: stderr || reason
          });
        }
        return;
      }

      // Exit code 0: stdout must be the JSON result.
      try {
        resolve(JSON.parse(stdout));
      } catch (err) {
        resolve({
          success: false,
          error: `Failed to parse output: ${err.message}`
        });
      }
    });

    // Spawn failures (e.g. interpreter missing) arrive here, not via exit code.
    proc.on('error', (err) => {
      if (err.code === 'ENOENT') {
        resolve({
          success: false,
          error: 'Python3 not found',
          hint: 'Install Python 3 to enable AI token extraction'
        });
      } else {
        resolve({
          success: false,
          error: err.message
        });
      }
    });
  });
}
@@ -0,0 +1,314 @@
1
+ /**
2
+ * Page Discovery Module
3
+ *
4
+ * Extracts navigation links from a website to discover cloneable pages.
5
+ * Handles SPA hydration, filters external links, and normalizes URLs.
6
+ *
7
+ * Usage:
8
+ * import { discoverPages } from './discover-pages.js';
9
+ * const result = await discoverPages('https://example.com', { maxPages: 10 });
10
+ */
11
+
12
+ import { getBrowser, getPage, disconnectBrowser } from '../utils/browser.js';
13
+ import { waitForDomStable, waitForPageReady } from './page-readiness.js';
14
+ import { dismissCookieBanner } from './cookie-handler.js';
15
+
16
// Navigation selectors in priority order
const NAV_SELECTORS = [
  'header nav a',
  'header a',
  'nav a',
  '[role="navigation"] a',
  '.navbar a',
  '.nav-menu a',
  '.navigation a',
  'footer nav a',
  'footer a'
];

// Patterns to exclude from discovered links
const EXCLUDE_PATTERNS = [
  /^mailto:/i,
  /^tel:/i,
  /^javascript:/i,
  /^#/,
  // Static assets and downloads. The optional tail lets the extension match
  // even when a query string or fragment follows it (e.g. "report.pdf?dl=1").
  /\.(pdf|jpg|jpeg|png|gif|svg|webp|ico|zip|tar|gz|mp3|mp4|avi|mov)(?:[?#].*)?$/i,
  // Social networks
  /facebook\.com/i,
  /twitter\.com/i,
  /instagram\.com/i,
  /linkedin\.com/i,
  /youtube\.com/i,
  /tiktok\.com/i
];

// Default options
const DEFAULT_OPTIONS = {
  maxPages: 10,
  selectors: null, // Use default NAV_SELECTORS if null
  includeSubdomains: false,
  timeout: 30000
};
51
+
52
+ /**
53
+ * Normalize URL for comparison and deduplication
54
+ * @param {string} baseUrl - Base URL for resolving relative paths
55
+ * @param {string} href - URL to normalize
56
+ * @returns {string|null} Normalized URL or null if invalid
57
+ */
58
+ export function normalizeUrl(baseUrl, href) {
59
+ if (!href || typeof href !== 'string') return null;
60
+
61
+ try {
62
+ const url = new URL(href, baseUrl);
63
+
64
+ // Skip non-http(s) protocols
65
+ if (!url.protocol.startsWith('http')) return null;
66
+
67
+ // Build normalized URL: origin + pathname (no hash, no query)
68
+ let normalized = url.origin + url.pathname;
69
+
70
+ // Remove trailing slash (except for root)
71
+ if (normalized.endsWith('/') && normalized !== url.origin + '/') {
72
+ normalized = normalized.slice(0, -1);
73
+ }
74
+
75
+ return normalized;
76
+ } catch {
77
+ return null;
78
+ }
79
+ }
80
+
81
+ /**
82
+ * Check if URL is same domain as base
83
+ * @param {string} url - URL to check
84
+ * @param {string} baseDomain - Base domain to compare against
85
+ * @param {boolean} includeSubdomains - Whether to include subdomains
86
+ * @returns {boolean}
87
+ */
88
+ export function isSameDomain(url, baseDomain, includeSubdomains = false) {
89
+ try {
90
+ const urlObj = new URL(url);
91
+ const hostname = urlObj.hostname.toLowerCase();
92
+ const base = baseDomain.toLowerCase();
93
+
94
+ if (hostname === base) return true;
95
+
96
+ if (includeSubdomains) {
97
+ return hostname.endsWith('.' + base);
98
+ }
99
+
100
+ return false;
101
+ } catch {
102
+ return false;
103
+ }
104
+ }
105
+
106
+ /**
107
+ * Extract page name from link text or URL path
108
+ * @param {string} text - Link text
109
+ * @param {string} path - URL path
110
+ * @returns {string} Page name
111
+ */
112
+ export function extractPageName(text, path) {
113
+ // Use link text if available and meaningful
114
+ if (text && text.length > 0 && text.length < 50) {
115
+ return text;
116
+ }
117
+
118
+ // Extract from path
119
+ if (!path || path === '/') return 'Home';
120
+
121
+ // Get last segment of path
122
+ const segments = path.split('/').filter(Boolean);
123
+ if (segments.length === 0) return 'Home';
124
+
125
+ const lastSegment = segments[segments.length - 1];
126
+
127
+ // Convert kebab-case/snake_case to Title Case
128
+ return lastSegment
129
+ .replace(/[-_]/g, ' ')
130
+ .replace(/\b\w/g, c => c.toUpperCase());
131
+ }
132
+
133
+ /**
134
+ * Check if href should be excluded
135
+ * @param {string} href - URL to check
136
+ * @returns {boolean}
137
+ */
138
+ function shouldExclude(href) {
139
+ if (!href) return true;
140
+ return EXCLUDE_PATTERNS.some(pattern => pattern.test(href));
141
+ }
142
+
143
+ /**
144
+ * Discover pages from a website by extracting navigation links
145
+ * @param {string} baseUrl - Starting URL to discover from
146
+ * @param {Object} options - Discovery options
147
+ * @returns {Promise<Object>} Discovery result
148
+ */
149
+ export async function discoverPages(baseUrl, options = {}) {
150
+ const opts = { ...DEFAULT_OPTIONS, ...options };
151
+ const startTime = Date.now();
152
+
153
+ let browser = null;
154
+ let page = null;
155
+
156
+ try {
157
+ // Parse base URL
158
+ const baseUrlObj = new URL(baseUrl);
159
+ const baseDomain = baseUrlObj.hostname;
160
+
161
+ // Launch browser
162
+ browser = await getBrowser({ headless: true });
163
+ page = await getPage(browser);
164
+
165
+ // Navigate to page
166
+ await page.goto(baseUrl, {
167
+ waitUntil: ['load', 'networkidle0'],
168
+ timeout: opts.timeout
169
+ });
170
+
171
+ // Wait for SPA hydration
172
+ await page.waitForSelector('nav a, header a, [role="navigation"] a', {
173
+ visible: true,
174
+ timeout: 5000
175
+ }).catch(() => {});
176
+
177
+ await waitForDomStable(page, 500, 5000);
178
+
179
+ // Dismiss cookie banner if present
180
+ await dismissCookieBanner(page);
181
+
182
+ // Wait a bit more for any dynamic content
183
+ await new Promise(r => setTimeout(r, 1000));
184
+
185
+ // Extract links using selectors
186
+ const selectors = opts.selectors || NAV_SELECTORS;
187
+ const selectorString = selectors.join(', ');
188
+
189
+ const rawLinks = await page.$$eval(selectorString, (elements) => {
190
+ return elements.map(el => ({
191
+ href: el.href,
192
+ text: el.textContent?.trim() || '',
193
+ tagName: el.tagName
194
+ }));
195
+ }).catch(() => []);
196
+
197
+ // Process and filter links
198
+ const seenUrls = new Set();
199
+ const pages = [];
200
+
201
+ // Always include homepage first
202
+ const homeUrl = normalizeUrl(baseUrl, '/');
203
+ if (homeUrl) {
204
+ seenUrls.add(homeUrl);
205
+ pages.push({
206
+ path: '/',
207
+ name: 'Home',
208
+ url: homeUrl
209
+ });
210
+ }
211
+
212
+ for (const link of rawLinks) {
213
+ // Skip excluded patterns
214
+ if (shouldExclude(link.href)) continue;
215
+
216
+ // Normalize URL
217
+ const normalized = normalizeUrl(baseUrl, link.href);
218
+ if (!normalized) continue;
219
+
220
+ // Skip if already seen
221
+ if (seenUrls.has(normalized)) continue;
222
+
223
+ // Check same domain
224
+ if (!isSameDomain(normalized, baseDomain, opts.includeSubdomains)) continue;
225
+
226
+ // Extract path
227
+ const urlObj = new URL(normalized);
228
+ const path = urlObj.pathname;
229
+
230
+ // Skip homepage (already added)
231
+ if (path === '/') continue;
232
+
233
+ // Add to results
234
+ seenUrls.add(normalized);
235
+ pages.push({
236
+ path,
237
+ name: extractPageName(link.text, path),
238
+ url: normalized
239
+ });
240
+
241
+ // Check max pages limit
242
+ if (pages.length >= opts.maxPages) break;
243
+ }
244
+
245
+ // Sort by path depth (shallow first)
246
+ pages.sort((a, b) => {
247
+ if (a.path === '/') return -1;
248
+ if (b.path === '/') return 1;
249
+ const depthA = (a.path.match(/\//g) || []).length;
250
+ const depthB = (b.path.match(/\//g) || []).length;
251
+ return depthA - depthB;
252
+ });
253
+
254
+ const duration = Date.now() - startTime;
255
+
256
+ return {
257
+ success: true,
258
+ baseUrl: baseUrlObj.origin,
259
+ baseDomain,
260
+ pages,
261
+ stats: {
262
+ totalLinksFound: rawLinks.length,
263
+ pagesDiscovered: pages.length,
264
+ durationMs: duration
265
+ }
266
+ };
267
+ } catch (error) {
268
+ return {
269
+ success: false,
270
+ baseUrl,
271
+ pages: [{
272
+ path: '/',
273
+ name: 'Home',
274
+ url: normalizeUrl(baseUrl, '/') || baseUrl
275
+ }],
276
+ error: error.message,
277
+ stats: {
278
+ totalLinksFound: 0,
279
+ pagesDiscovered: 1,
280
+ durationMs: Date.now() - startTime
281
+ }
282
+ };
283
+ } finally {
284
+ if (browser) {
285
+ await disconnectBrowser();
286
+ }
287
+ }
288
+ }
289
+
290
// CLI support: `node discover-pages.js <url> [maxPages]`
// Prints the discovery result as JSON on stdout and exits 0 on success,
// 1 on failure or bad usage.
// The script path is matched by substring, so an extensionless invocation
// still counts as running this module directly.
const isMainModule = Boolean(process.argv[1]) &&
  process.argv[1].includes('discover-pages');

if (isMainModule) {
  const url = process.argv[2];

  // Parse as base 10 and fall back to 10 for missing, non-numeric, or
  // non-positive values.
  const requestedMax = Number.parseInt(process.argv[3], 10);
  const maxPages = Number.isInteger(requestedMax) && requestedMax > 0
    ? requestedMax
    : 10;

  if (!url) {
    console.error('Usage: node discover-pages.js <url> [maxPages]');
    process.exit(1);
  }

  discoverPages(url, { maxPages })
    .then(result => {
      console.log(JSON.stringify(result, null, 2));
      process.exit(result.success ? 0 : 1);
    })
    .catch(err => {
      console.error(JSON.stringify({ success: false, error: err.message }));
      process.exit(1);
    });
}
@@ -5,6 +5,8 @@
5
5
  * event handlers, and framework-specific attributes.
6
6
  */
7
7
 
8
+ import { LAYOUT_PROPERTIES } from './css-extractor.js';
9
+
8
10
  // Size limits
9
11
  export const MAX_HTML_SIZE = 10 * 1024 * 1024; // 10MB limit
10
12
  export const MAX_DOM_ELEMENTS = 50000; // Warn on large DOMs
@@ -16,6 +18,20 @@ export const JS_FRAMEWORK_PATTERNS = [
16
18
  /^data-alpine/i, /^wire:/i, /^@/
17
19
  ];
18
20
 
21
// Layout-only (non-visual) properties that get inlined on critical elements.
// Built from the shared LAYOUT_PROPERTIES groups of css-extractor (DRY).
export const INLINE_LAYOUT_PROPS = ['display', 'grid', 'position', 'sizing']
  .flatMap((group) => [...LAYOUT_PROPERTIES[group]])
  // Only the first two `box` entries: boxSizing, overflow (skip overflowX/Y, border)
  .concat([...LAYOUT_PROPERTIES.box.slice(0, 2)]);

// What makes an element "critical": flex/grid containers and absolute/fixed
// positioning (sticky is left out to avoid scroll context side effects).
export const CRITICAL_DISPLAY = [
  'flex',
  'inline-flex',
  'grid',
  'inline-grid'
];
export const CRITICAL_POSITION = [
  'absolute',
  'fixed'
];
34
+
19
35
  /**
20
36
  * Extract and clean HTML from page
21
37
  * @param {Page} page - Puppeteer page
@@ -23,7 +39,7 @@ export const JS_FRAMEWORK_PATTERNS = [
23
39
  * @returns {Promise<{html: string, warnings: string[], elementCount: number}>}
24
40
  */
25
41
  export async function extractCleanHtml(page, frameworkPatterns = JS_FRAMEWORK_PATTERNS) {
26
- return await page.evaluate((patterns) => {
42
+ return await page.evaluate((patterns, inlineProps, criticalDisplay, criticalPosition) => {
27
43
  const warnings = [];
28
44
 
29
45
  // Check DOM size
@@ -72,6 +88,58 @@ export async function extractCleanHtml(page, frameworkPatterns = JS_FRAMEWORK_PA
72
88
  });
73
89
  });
74
90
 
91
+ // Inline computed styles on critical elements (flex/grid/positioned)
92
+ // Using index-based matching for reliability
93
+ const inlineStyles = [];
94
+ let inlinedCount = 0;
95
+
96
+ document.querySelectorAll('*').forEach((liveEl, idx) => {
97
+ const style = getComputedStyle(liveEl);
98
+ const display = style.display;
99
+ const position = style.position;
100
+
101
+ // Only critical elements (flex/grid containers, absolute/fixed positioned)
102
+ if (criticalDisplay.includes(display) || criticalPosition.includes(position)) {
103
+ const props = [];
104
+ inlineProps.forEach(prop => {
105
+ const val = style[prop];
106
+ // Skip defaults/empty values
107
+ if (val && val !== 'auto' && val !== 'none' && val !== 'normal' &&
108
+ val !== '0px' && val !== 'static' && val !== 'visible' &&
109
+ val !== 'content-box') {
110
+ // Convert camelCase to kebab-case
111
+ const cssProp = prop.replace(/([A-Z])/g, '-$1').toLowerCase();
112
+ props.push(`${cssProp}: ${val}`);
113
+ }
114
+ });
115
+
116
+ // Always include display for critical elements
117
+ if (!props.some(p => p.startsWith('display:'))) {
118
+ props.unshift(`display: ${display}`);
119
+ }
120
+
121
+ if (props.length > 0) {
122
+ inlineStyles.push({ idx, style: props.join('; ') });
123
+ }
124
+ }
125
+ });
126
+
127
+ // Apply to cloned doc using index matching
128
+ const clonedElements = doc.querySelectorAll('*');
129
+ inlineStyles.forEach(({ idx, style }) => {
130
+ if (clonedElements[idx]) {
131
+ const existing = clonedElements[idx].getAttribute('style') || '';
132
+ clonedElements[idx].setAttribute('style',
133
+ existing ? `${existing}; ${style}` : style);
134
+ inlinedCount++;
135
+ }
136
+ });
137
+
138
+ // Track for warnings
139
+ if (inlinedCount > 100) {
140
+ warnings.push(`Inlined ${inlinedCount} critical elements`);
141
+ }
142
+
75
143
  // Remove hidden elements
76
144
  doc.querySelectorAll('[hidden], [style*="display: none"], [style*="display:none"]')
77
145
  .forEach(el => el.remove());
@@ -97,6 +165,7 @@ export async function extractCleanHtml(page, frameworkPatterns = JS_FRAMEWORK_PA
97
165
  (document.documentElement.lang || 'en') + '">\n' +
98
166
  doc.innerHTML + '\n</html>';
99
167
 
100
- return { html, warnings, elementCount };
101
- }, frameworkPatterns.map(r => ({ source: r.source, flags: r.flags })));
168
+ return { html, warnings, elementCount, inlinedCount };
169
+ }, frameworkPatterns.map(r => ({ source: r.source, flags: r.flags })),
170
+ INLINE_LAYOUT_PROPS, CRITICAL_DISPLAY, CRITICAL_POSITION);
102
171
  }