design-clone 2.1.0 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. package/README.md +13 -34
  2. package/SKILL.md +69 -45
  3. package/bin/cli.js +22 -4
  4. package/bin/commands/clone-site.js +31 -171
  5. package/bin/commands/help.js +19 -6
  6. package/bin/commands/init.js +9 -86
  7. package/bin/commands/uninstall.js +105 -0
  8. package/bin/commands/update.js +70 -0
  9. package/bin/commands/verify.js +7 -14
  10. package/bin/utils/paths.js +28 -0
  11. package/bin/utils/validate.js +2 -22
  12. package/bin/utils/version.js +23 -0
  13. package/docs/code-standards.md +789 -0
  14. package/docs/codebase-summary.md +533 -286
  15. package/docs/index.md +74 -0
  16. package/docs/project-overview-pdr.md +797 -0
  17. package/docs/system-architecture.md +718 -0
  18. package/package.json +14 -17
  19. package/src/ai/prompts/design-tokens/basic.md +80 -0
  20. package/src/ai/prompts/design-tokens/section-with-css.md +41 -0
  21. package/src/ai/prompts/design-tokens/section.md +48 -0
  22. package/src/ai/prompts/design-tokens/with-css.md +87 -0
  23. package/src/ai/prompts/structure-analysis/basic.md +55 -0
  24. package/src/ai/prompts/structure-analysis/with-context.md +59 -0
  25. package/src/ai/prompts/structure-analysis/with-dimensions.md +63 -0
  26. package/src/ai/prompts/structure-analysis/with-hierarchy.md +73 -0
  27. package/src/ai/prompts/ux-audit/aggregation.md +42 -0
  28. package/src/ai/prompts/ux-audit/desktop.md +92 -0
  29. package/src/ai/prompts/ux-audit/mobile.md +93 -0
  30. package/src/ai/prompts/ux-audit/tablet.md +92 -0
  31. package/src/core/animation/animation-extractor-ast.js +183 -0
  32. package/src/core/animation/animation-extractor-output.js +152 -0
  33. package/src/core/animation/animation-extractor.js +178 -0
  34. package/src/core/animation/state-capture-detection.js +200 -0
  35. package/src/core/animation/state-capture.js +193 -0
  36. package/src/core/capture/browser-context-pool.js +96 -0
  37. package/src/core/capture/multi-page-screenshot-page.js +110 -0
  38. package/src/core/capture/multi-page-screenshot.js +208 -0
  39. package/src/core/capture/screenshot-extraction.js +186 -0
  40. package/src/core/capture/screenshot-helpers.js +175 -0
  41. package/src/core/capture/screenshot-orchestrator.js +174 -0
  42. package/src/core/capture/screenshot-viewport.js +93 -0
  43. package/src/core/capture/screenshot.js +192 -0
  44. package/src/core/content/content-counter-dom.js +191 -0
  45. package/src/core/content/content-counter.js +76 -0
  46. package/src/core/css/breakpoint-detector.js +66 -0
  47. package/src/core/css/chromium-defaults.json +23 -0
  48. package/src/core/css/computed-style-extractor.js +102 -0
  49. package/src/core/css/css-chunker.js +103 -0
  50. package/src/core/css/filter-css-dead-code.js +120 -0
  51. package/src/core/css/filter-css-html-analyzer.js +110 -0
  52. package/src/core/css/filter-css-selector-matcher.js +172 -0
  53. package/src/core/css/filter-css.js +206 -0
  54. package/src/core/css/merge-css-atrule-processor.js +158 -0
  55. package/src/core/css/merge-css-file-io.js +68 -0
  56. package/src/core/css/merge-css.js +148 -0
  57. package/src/core/detection/framework-detector-routing.js +68 -0
  58. package/src/core/detection/framework-detector-signals.js +65 -0
  59. package/src/core/detection/framework-detector.js +198 -0
  60. package/src/core/dimension/dimension-extractor-card-detector.js +82 -0
  61. package/src/core/dimension/dimension-extractor.js +317 -0
  62. package/src/core/dimension/dimension-output-ai-summary.js +111 -0
  63. package/src/core/dimension/dimension-output.js +173 -0
  64. package/src/core/dimension/dom-tree-analyzer-tree-builders.js +95 -0
  65. package/src/core/dimension/dom-tree-analyzer.js +191 -0
  66. package/src/core/discovery/app-state-snapshot-capture.js +195 -0
  67. package/src/core/discovery/app-state-snapshot-utils.js +178 -0
  68. package/src/core/discovery/app-state-snapshot.js +131 -0
  69. package/src/core/discovery/discover-pages-routes.js +84 -0
  70. package/src/core/discovery/discover-pages-utils.js +177 -0
  71. package/src/core/discovery/discover-pages.js +191 -0
  72. package/src/core/html/html-extractor-inline-styler.js +70 -0
  73. package/src/core/html/html-extractor.js +147 -0
  74. package/src/core/html/semantic-enhancer-mappings.js +200 -0
  75. package/src/core/html/semantic-enhancer-page.js +148 -0
  76. package/src/core/html/semantic-enhancer.js +135 -0
  77. package/src/core/links/rewrite-links-css-rewriter.js +53 -0
  78. package/src/core/links/rewrite-links.js +173 -0
  79. package/src/core/media/asset-validator.js +118 -0
  80. package/src/core/media/extract-assets-downloader.js +187 -0
  81. package/src/core/media/extract-assets-page-scraper.js +115 -0
  82. package/src/core/media/extract-assets.js +159 -0
  83. package/src/core/media/video-capture-convert.js +200 -0
  84. package/src/core/media/video-capture.js +201 -0
  85. package/src/core/{lazy-loader.js → page-prep/lazy-loader.js} +37 -39
  86. package/src/core/section/section-cropper-helpers.js +43 -0
  87. package/src/core/{section-cropper.js → section/section-cropper.js} +11 -88
  88. package/src/core/section/section-detector-strategies.js +139 -0
  89. package/src/core/section/section-detector-utils.js +100 -0
  90. package/src/core/section/section-detector.js +88 -0
  91. package/src/core/tests/test-section-cropper.js +2 -2
  92. package/src/core/tests/test-section-detector.js +2 -2
  93. package/src/post-process/enhance-assets.js +29 -4
  94. package/src/post-process/fetch-images-unsplash-client.js +123 -0
  95. package/src/post-process/fetch-images.js +60 -263
  96. package/src/post-process/inject-gosnap.js +88 -0
  97. package/src/post-process/inject-icons-svg-replacer.js +76 -0
  98. package/src/post-process/inject-icons.js +47 -200
  99. package/src/route-discoverers/base-discoverer-utils.js +137 -0
  100. package/src/route-discoverers/base-discoverer.js +29 -118
  101. package/src/route-discoverers/index.js +1 -1
  102. package/src/shared/config.js +38 -0
  103. package/src/shared/error-codes.js +31 -0
  104. package/src/shared/viewports.js +46 -0
  105. package/src/utils/browser.js +0 -7
  106. package/src/utils/helpers.js +4 -0
  107. package/src/utils/log.js +12 -0
  108. package/src/utils/playwright-loader.js +76 -0
  109. package/src/utils/playwright.js +3 -69
  110. package/src/utils/progress.js +32 -0
  111. package/src/verification/generate-audit-report-css-fixes.js +52 -0
  112. package/src/verification/generate-audit-report-sections.js +158 -0
  113. package/src/verification/generate-audit-report.js +5 -281
  114. package/src/verification/quality-scorer.js +92 -0
  115. package/src/verification/verify-footer-checks.js +103 -0
  116. package/src/verification/verify-footer-helpers.js +178 -0
  117. package/src/verification/verify-footer.js +23 -381
  118. package/src/verification/verify-header-checks.js +104 -0
  119. package/src/verification/verify-header-helpers.js +156 -0
  120. package/src/verification/verify-header.js +23 -365
  121. package/src/verification/verify-layout-report.js +101 -0
  122. package/src/verification/verify-layout.js +13 -259
  123. package/src/verification/verify-menu-checks.js +104 -0
  124. package/src/verification/verify-menu-helpers.js +112 -0
  125. package/src/verification/verify-menu.js +17 -285
  126. package/src/verification/verify-slider-checks.js +115 -0
  127. package/src/verification/verify-slider-constants.js +65 -0
  128. package/src/verification/verify-slider-helpers.js +164 -0
  129. package/src/verification/verify-slider.js +23 -414
  130. package/.env.example +0 -14
  131. package/docs/basic-clone.md +0 -63
  132. package/docs/cli-reference.md +0 -316
  133. package/docs/design-clone-architecture.md +0 -492
  134. package/docs/pixel-perfect.md +0 -117
  135. package/docs/project-roadmap.md +0 -382
  136. package/docs/troubleshooting.md +0 -170
  137. package/requirements.txt +0 -5
  138. package/src/ai/__pycache__/analyze-structure.cpython-313.pyc +0 -0
  139. package/src/ai/__pycache__/extract-design-tokens.cpython-313.pyc +0 -0
  140. package/src/ai/analyze-structure.py +0 -375
  141. package/src/ai/extract-design-tokens.py +0 -782
  142. package/src/ai/prompts/__init__.py +0 -2
  143. package/src/ai/prompts/__pycache__/__init__.cpython-313.pyc +0 -0
  144. package/src/ai/prompts/__pycache__/design_tokens.cpython-313.pyc +0 -0
  145. package/src/ai/prompts/__pycache__/structure_analysis.cpython-313.pyc +0 -0
  146. package/src/ai/prompts/__pycache__/ux_audit.cpython-313.pyc +0 -0
  147. package/src/ai/prompts/design_tokens.py +0 -316
  148. package/src/ai/prompts/structure_analysis.py +0 -592
  149. package/src/ai/prompts/ux_audit.py +0 -198
  150. package/src/ai/ux-audit.js +0 -596
  151. package/src/core/animation-extractor.js +0 -526
  152. package/src/core/app-state-snapshot.js +0 -511
  153. package/src/core/content-counter.js +0 -342
  154. package/src/core/design-tokens.js +0 -103
  155. package/src/core/dimension-extractor.js +0 -438
  156. package/src/core/dimension-output.js +0 -305
  157. package/src/core/discover-pages.js +0 -542
  158. package/src/core/dom-tree-analyzer.js +0 -298
  159. package/src/core/extract-assets.js +0 -468
  160. package/src/core/filter-css.js +0 -499
  161. package/src/core/framework-detector.js +0 -538
  162. package/src/core/html-extractor.js +0 -212
  163. package/src/core/merge-css.js +0 -407
  164. package/src/core/multi-page-screenshot.js +0 -380
  165. package/src/core/rewrite-links.js +0 -226
  166. package/src/core/screenshot.js +0 -701
  167. package/src/core/section-detector.js +0 -386
  168. package/src/core/semantic-enhancer.js +0 -492
  169. package/src/core/state-capture.js +0 -598
  170. package/src/core/video-capture.js +0 -546
  171. package/src/utils/__init__.py +0 -16
  172. package/src/utils/__pycache__/__init__.cpython-313.pyc +0 -0
  173. package/src/utils/__pycache__/env.cpython-313.pyc +0 -0
  174. package/src/utils/env.py +0 -134
  175. /package/src/core/{css-extractor.js → css/css-extractor.js} +0 -0
  176. /package/src/core/{cookie-handler.js → page-prep/cookie-handler.js} +0 -0
  177. /package/src/core/{page-readiness.js → page-prep/page-readiness.js} +0 -0
@@ -0,0 +1,84 @@
1
+ /**
2
+ * Route merging logic for page discovery.
3
+ *
4
+ * Merges framework-discovered routes (higher quality) with
5
+ * link-scraped pages (fallback), deduplicating by normalized path.
6
+ * Used by discover-pages.js (main orchestrator).
7
+ */
8
+
9
+ import { normalizeUrl, extractPageName, normalizePath, logWarning } from './discover-pages-utils.js';
10
+
11
/**
 * Combine framework-discovered routes and link-scraped pages into one list.
 *
 * Framework routes are processed first, so when both sources contain the same
 * normalized path the framework entry is the one that is kept. Link-scraped
 * pages only contribute paths that have not been seen yet.
 *
 * @param {Array|null} frameworkRoutes - Routes from the framework discoverer
 * @param {Array|null} linkScrapedPages - Pages collected by link scraping
 * @param {string} baseDomain - Base domain (validated here; a warning is logged when unusable)
 * @param {string} baseUrl - Base URL used to resolve paths into absolute URLs
 * @returns {Array} Entries shaped as { path, name, url, source, dynamic }
 *
 * @example
 * const merged = mergeRoutes(
 *   [{ path: '/about', name: 'About' }],
 *   [{ path: '/contact', name: 'Contact' }],
 *   'example.com',
 *   'https://example.com'
 * );
 */
export function mergeRoutes(frameworkRoutes, linkScrapedPages, baseDomain, baseUrl) {
  // Unusable base values are replaced by empty strings after logging a warning.
  if (typeof baseDomain !== 'string' || baseDomain === '') {
    logWarning('mergeRoutes: Invalid baseDomain');
    baseDomain = '';
  }
  if (typeof baseUrl !== 'string' || baseUrl === '') {
    logWarning('mergeRoutes: Invalid baseUrl');
    baseUrl = '';
  }

  const claimedPaths = new Set();
  const result = [];

  // Normalizes the entry's path and reserves it; yields null when already taken.
  const claimPath = (entry) => {
    const key = normalizePath(entry.path || '/');
    if (claimedPaths.has(key)) return null;
    claimedPaths.add(key);
    return key;
  };

  const isRecord = (value) => Boolean(value) && typeof value === 'object';

  // Pass 1: framework routes (considered the more accurate source).
  if (Array.isArray(frameworkRoutes)) {
    frameworkRoutes.forEach((route) => {
      if (!isRecord(route)) return;
      const path = claimPath(route);
      if (path === null) return;

      result.push({
        path,
        name: route.name || extractPageName('', path),
        url: normalizeUrl(baseUrl, path) || route.url || '',
        source: route.source || 'framework',
        dynamic: Boolean(route.dynamic)
      });
    });
  }

  // Pass 2: link-scraped pages fill in whatever paths are still unclaimed.
  if (Array.isArray(linkScrapedPages)) {
    linkScrapedPages.forEach((page) => {
      if (!isRecord(page)) return;
      const path = claimPath(page);
      if (path === null) return;

      result.push({
        path,
        name: page.name || extractPageName('', path),
        url: page.url || normalizeUrl(baseUrl, path) || '',
        source: 'link-scrape',
        dynamic: false
      });
    });
  }

  return result;
}
@@ -0,0 +1,177 @@
1
+ /**
2
+ * URL utility helpers for page discovery.
3
+ *
4
+ * Provides URL normalization, domain checking, page name extraction,
5
+ * exclusion filtering, and path normalization, plus shared discovery constants (route merging lives in discover-pages-routes.js).
6
+ * Used by discover-pages.js (main orchestrator).
7
+ */
8
+
9
+ import { logWarn } from '../../utils/log.js';
10
+
11
// Navigation selectors in priority order
export const NAV_SELECTORS = [
  'header nav a',
  'header a',
  'nav a',
  '[role="navigation"] a',
  '.navbar a',
  '.nav-menu a',
  '.navigation a',
  'footer nav a',
  'footer a'
];

// Patterns to exclude from discovered links.
// Each pattern is tested against the raw href string.
export const EXCLUDE_PATTERNS = [
  /^mailto:/i,
  /^tel:/i,
  /^javascript:/i,
  /^#/,
  // Binary/media downloads. The extension may be followed by a query string or
  // fragment (e.g. "report.pdf?dl=1"), so match up to "?", "#" or end of input
  // instead of anchoring only at the end.
  /\.(pdf|jpg|jpeg|png|gif|svg|webp|ico|zip|tar|gz|mp3|mp4|avi|mov)(?:[?#]|$)/i,
  /facebook\.com/i,
  /twitter\.com/i,
  /instagram\.com/i,
  /linkedin\.com/i,
  /youtube\.com/i,
  /tiktok\.com/i
];

// Valid framework names for validation
export const VALID_FRAMEWORKS = ['next', 'nuxt', 'vue', 'react', 'angular', 'svelte', 'astro'];

// Default options
export const DEFAULT_OPTIONS = {
  maxPages: 10,
  selectors: null,        // Use default NAV_SELECTORS if null
  includeSubdomains: false,
  timeout: 30000,
  // SPA/Framework options (v1.3)
  spaMode: true,          // Enable SPA detection and route discovery
  framework: null,        // Force specific framework (skip detection)
  noSpaDetect: false,     // Disable SPA/framework detection entirely
  captureState: false     // Capture app state (Redux/Vuex/Pinia/Zustand)
};
54
+
55
/**
 * Emit a warning tagged with the "[discover-pages]" prefix.
 *
 * The message is forwarded to logWarn from utils/log.js; any TTY gating
 * presumably happens inside that helper (not visible from this module).
 *
 * @param {string} message - Warning text (without prefix)
 */
export function logWarning(message) {
  const tagged = `[discover-pages] ${message}`;
  logWarn(tagged);
}
62
+
63
/**
 * Check a user-supplied framework name against VALID_FRAMEWORKS.
 *
 * The input is stringified, lower-cased and trimmed before comparison.
 * Unknown names produce a warning and are treated as "not specified".
 *
 * @param {string|null} framework - Framework name to validate
 * @returns {string|null} The normalized name, or null when empty/unknown
 */
export function validateFramework(framework) {
  if (!framework) return null;

  const candidate = String(framework).toLowerCase().trim();
  const isKnown = VALID_FRAMEWORKS.includes(candidate);

  if (!isKnown) {
    logWarning(`Invalid framework "${framework}". Valid options: ${VALID_FRAMEWORKS.join(', ')}`);
  }

  return isKnown ? candidate : null;
}
77
+
78
/**
 * Normalize URL for comparison and deduplication.
 *
 * The result is `origin + pathname`: query string and hash are dropped, and a
 * single trailing slash is removed unless the path is the root ("/").
 *
 * @param {string} baseUrl - Base URL for resolving relative paths. When empty,
 *   only absolute hrefs can be resolved.
 * @param {string} href - URL (absolute or relative) to normalize
 * @returns {string|null} Normalized URL, or null when href is empty,
 *   not a string, unparseable, or not http(s)
 */
export function normalizeUrl(baseUrl, href) {
  if (!href || typeof href !== 'string') return null;

  let url;
  try {
    // Passing an empty string as base makes the URL constructor throw even for
    // absolute hrefs, so only supply the base when there actually is one.
    url = baseUrl ? new URL(href, baseUrl) : new URL(href);
  } catch {
    return null;
  }

  // Exact scheme match: a prefix test would also accept schemes such as
  // "httpx:", whose origin serializes to the literal string "null".
  if (url.protocol !== 'http:' && url.protocol !== 'https:') return null;

  // Build normalized URL: origin + pathname (no hash, no query)
  const normalized = url.origin + url.pathname;

  // Remove trailing slash (except for root)
  if (url.pathname !== '/' && normalized.endsWith('/')) {
    return normalized.slice(0, -1);
  }
  return normalized;
}
106
+
107
/**
 * Check if URL is on the same domain as the base.
 *
 * Host names are compared case-insensitively and a single trailing dot
 * (fully-qualified form, e.g. "example.com.") is ignored on both sides.
 *
 * @param {string} url - Absolute URL to check
 * @param {string} baseDomain - Base domain (host name) to compare against
 * @param {boolean} includeSubdomains - Whether subdomains of baseDomain count as a match
 * @returns {boolean} True on a match; false otherwise, including when the URL
 *   cannot be parsed or baseDomain is empty / not a string
 */
export function isSameDomain(url, baseDomain, includeSubdomains = false) {
  if (typeof baseDomain !== 'string') return false;

  const canonicalHost = (host) => host.toLowerCase().replace(/\.$/, '');

  // An empty base would otherwise make the subdomain test ("." + base) match
  // any host that merely ends with a dot.
  const base = canonicalHost(baseDomain);
  if (base === '') return false;

  let hostname;
  try {
    hostname = canonicalHost(new URL(url).hostname);
  } catch {
    return false;
  }

  if (hostname === base) return true;
  if (!includeSubdomains) return false;

  // Leading dot prevents "notexample.com" from matching "example.com".
  return hostname.endsWith(`.${base}`);
}
131
+
132
/**
 * Extract page name from link text or URL path.
 *
 * Link text wins when it is non-empty and shorter than 50 characters.
 * Otherwise the last path segment is percent-decoded, its "-" / "_"
 * separators become spaces, and each word is title-cased.
 *
 * @param {string} text - Link text
 * @param {string} path - URL path
 * @returns {string} Page name ("Home" for the root or a missing path)
 */
export function extractPageName(text, path) {
  // Use link text if available and meaningful
  if (text && text.length > 0 && text.length < 50) {
    return text;
  }

  // Extract from path
  if (!path || typeof path !== 'string' || path === '/') return 'Home';

  // Get last segment of path
  const segments = path.split('/').filter(Boolean);
  if (segments.length === 0) return 'Home';

  const lastSegment = segments[segments.length - 1];

  // URL pathnames are percent-encoded ("my%20page"); decode for display and
  // keep the raw segment when the escape sequence is malformed.
  let readable = lastSegment;
  try {
    readable = decodeURIComponent(lastSegment);
  } catch {
    readable = lastSegment;
  }

  // Convert kebab-case/snake_case to Title Case. The match is Unicode-aware so
  // decoded non-ASCII words ("über") are capitalized at their first letter.
  return readable
    .replace(/[-_]/g, ' ')
    .replace(/(^|[^\p{L}\p{N}])(\p{L})/gu, (match, lead, letter) => lead + letter.toUpperCase());
}
158
+
159
/**
 * Decide whether a discovered href must be skipped.
 *
 * A missing/empty href is always excluded; otherwise the href is excluded as
 * soon as any entry of EXCLUDE_PATTERNS matches it.
 *
 * @param {string} href - URL to check
 * @returns {boolean} True when the link should be ignored
 */
export function shouldExclude(href) {
  if (!href) return true;

  for (const pattern of EXCLUDE_PATTERNS) {
    if (pattern.test(href)) return true;
  }
  return false;
}
168
+
169
/**
 * Canonicalize a path by dropping one trailing slash.
 *
 * The root path "/" is returned unchanged, and anything that is empty or not
 * a string collapses to "/".
 *
 * @param {string} path - Path to normalize
 * @returns {string} Normalized path
 */
export function normalizePath(path) {
  if (typeof path !== 'string' || path === '') return '/';
  if (path === '/' || !path.endsWith('/')) return path;
  return path.slice(0, -1);
}
@@ -0,0 +1,191 @@
1
+ /**
2
+ * Page Discovery Module
3
+ *
4
+ * Extracts navigation links from a website to discover cloneable pages.
5
+ * Handles SPA hydration, filters external links, and normalizes URLs.
6
+ *
7
+ * Enhanced with SPA/Framework support (v1.3):
8
+ * - Framework detection (Next.js, Nuxt, Vue, React, Angular, Svelte, Astro)
9
+ * - Framework-specific route discovery
10
+ * - App state capture (optional)
11
+ *
12
+ * Usage:
13
+ * import { discoverPages } from '../discovery/discover-pages.js';
14
+ * const result = await discoverPages('https://example.com', { maxPages: 10 });
15
+ */
16
+
17
+ import { getBrowser, getPage, disconnectBrowser } from '../../utils/browser.js';
18
+ import { waitForDomStable } from '../page-prep/page-readiness.js';
19
+ import { dismissCookieBanner } from '../page-prep/cookie-handler.js';
20
+ import { detectFramework } from '../detection/framework-detector.js';
21
+ import { discoverRoutes as discoverFrameworkRoutes } from '../../route-discoverers/index.js';
22
+ import { captureAppState } from './app-state-snapshot.js';
23
+ import {
24
+ NAV_SELECTORS, DEFAULT_OPTIONS, logWarning, validateFramework,
25
+ normalizeUrl, isSameDomain, extractPageName, shouldExclude
26
+ } from './discover-pages-utils.js';
27
+ import { mergeRoutes } from './discover-pages-routes.js';
28
+
29
+ export { normalizeUrl, isSameDomain, extractPageName } from './discover-pages-utils.js';
30
+
31
/**
 * Estimate capture time for discovered pages.
 *
 * Assumes a flat average of 6 seconds per (page, viewport) capture.
 *
 * @param {Array} pages - Discovered pages; anything that is not an array counts as zero pages
 * @param {string[]} viewports - Viewport names; falls back to desktop/tablet/mobile
 *   when omitted or not an array
 * @returns {{ pages: number, viewports: number, totalCaptures: number,
 *   estimatedSeconds: number, estimatedMinutes: number }} Estimate; minutes are rounded up
 */
export function estimateCapture(pages, viewports = ['desktop', 'tablet', 'mobile']) {
  const SECONDS_PER_CAPTURE = 6; // seconds per viewport capture (avg)
  const FALLBACK_VIEWPORT_COUNT = 3; // desktop, tablet, mobile

  // An explicit null bypasses the default parameter, so guard both inputs
  // rather than letting `.length` throw.
  const pageCount = Array.isArray(pages) ? pages.length : 0;
  const viewportCount = Array.isArray(viewports) ? viewports.length : FALLBACK_VIEWPORT_COUNT;

  const totalCaptures = pageCount * viewportCount;
  const estimatedSeconds = totalCaptures * SECONDS_PER_CAPTURE;

  return {
    pages: pageCount,
    viewports: viewportCount,
    totalCaptures,
    estimatedSeconds,
    estimatedMinutes: Math.ceil(estimatedSeconds / 60)
  };
}
48
+
49
/**
 * Scrape navigation links from the loaded page.
 *
 * The home page ("/") is always listed first; further entries are same-domain
 * links that pass the exclusion filter, deduplicated by normalized URL, until
 * opts.maxPages entries have been collected.
 *
 * @param {import('playwright').Page} page - Page that has already been navigated
 * @param {string} baseUrl - URL used to resolve relative hrefs
 * @param {string} baseDomain - Host name links must belong to
 * @param {Object} opts - Discovery options (selectors, includeSubdomains, maxPages)
 * @returns {Promise<{ pages: Array<{ path: string, name: string, url: string }>, rawCount: number }>}
 *   Collected pages plus the number of raw anchors matched by the selectors
 */
async function scrapeNavLinks(page, baseUrl, baseDomain, opts) {
  // Accept a single selector string as well as an array of selectors.
  const configured = opts.selectors;
  const selectorList = typeof configured === 'string' ? [configured] : (configured || NAV_SELECTORS);
  const selectorString = selectorList.join(', ');

  const rawLinks = await page.$$eval(selectorString, els =>
    els.map(el => ({ href: el.href, text: el.textContent?.trim() || '' }))
  ).catch((err) => {
    // Best effort: a failed query yields no links, but say so instead of
    // silently producing a home-only result.
    logWarning(`Link scraping failed: ${err?.message ?? err}`);
    return [];
  });

  const seenUrls = new Set();
  const pages = [];

  const homeUrl = normalizeUrl(baseUrl, '/');
  if (homeUrl) {
    seenUrls.add(homeUrl);
    pages.push({ path: '/', name: 'Home', url: homeUrl });
  }

  for (const link of rawLinks) {
    if (shouldExclude(link.href)) continue;

    const normalized = normalizeUrl(baseUrl, link.href);
    if (!normalized || seenUrls.has(normalized)) continue;
    if (!isSameDomain(normalized, baseDomain, opts.includeSubdomains)) continue;

    const path = new URL(normalized).pathname;
    if (path === '/') continue; // home is already listed

    seenUrls.add(normalized);
    pages.push({ path, name: extractPageName(link.text, path), url: normalized });
    if (pages.length >= opts.maxPages) break;
  }

  return { pages, rawCount: rawLinks.length };
}
83
+
84
/**
 * Discover pages from a website by extracting navigation links.
 *
 * Loads baseUrl in a headless browser, optionally detects the front-end
 * framework and asks the matching route discoverer for routes, scrapes
 * navigation anchors, then merges both sources. The result is ordered home
 * first, then by path depth, and capped at options.maxPages.
 *
 * Never rejects for navigation/browser failures: on error it resolves with
 * `success: false`, the error message, and a single fallback home page.
 *
 * @param {string} baseUrl - Starting URL to discover from
 * @param {Object} options - Discovery options (see DEFAULT_OPTIONS)
 * @returns {Promise<Object>} Discovery result: { success, baseUrl, baseDomain,
 *   framework, stateSnapshot, pages, stats } plus `error` on failure
 */
export async function discoverPages(baseUrl, options = {}) {
  const opts = { ...DEFAULT_OPTIONS, ...options };
  const startTime = Date.now();
  let browser = null;

  try {
    const baseUrlObj = new URL(baseUrl);
    const baseDomain = baseUrlObj.hostname;

    browser = await getBrowser({ headless: true });
    const page = await getPage(browser);

    await page.goto(baseUrl, { waitUntil: 'networkidle', timeout: opts.timeout });
    // Best effort: pages without a nav should still be processed, so a timeout
    // here is ignored on purpose.
    // NOTE(review): `visible: true` is the Puppeteer spelling; Playwright uses
    // `state: 'visible'` — confirm which API getPage() returns.
    await page.waitForSelector('nav a, header a, [role="navigation"] a', {
      visible: true, timeout: 5000
    }).catch(() => {});
    await waitForDomStable(page, 500, 5000);
    await dismissCookieBanner(page);
    await new Promise(r => setTimeout(r, 1000));

    // SPA/Framework detection
    let frameworkInfo = null;
    let frameworkRoutes = [];
    let stateSnapshot = null;

    if (!opts.noSpaDetect) {
      if (opts.framework) {
        // A user-forced framework skips detection entirely.
        const fw = validateFramework(opts.framework);
        if (fw) {
          frameworkInfo = {
            framework: fw, version: null, routingType: 'spa',
            confidence: 'forced', signals: ['user-specified']
          };
        }
      } else {
        try {
          frameworkInfo = await detectFramework(page);
        } catch (e) {
          logWarning(`Framework detection failed: ${e.message}`);
        }
      }

      if (frameworkInfo?.framework && opts.spaMode) {
        try {
          const r = await discoverFrameworkRoutes(page, baseUrl, frameworkInfo);
          frameworkRoutes = r.routes || [];
        } catch (e) {
          logWarning(`Route discovery failed for ${frameworkInfo.framework}: ${e.message}`);
        }
      }

      if (opts.captureState && frameworkInfo) {
        try {
          stateSnapshot = await captureAppState(page, frameworkInfo);
        } catch (e) {
          logWarning(`State capture failed: ${e.message}`);
        }
      }
    }

    // Link scraping
    const { pages: linkScrapedPages, rawCount } = await scrapeNavLinks(page, baseUrl, baseDomain, opts);

    // Merge
    let pages = frameworkRoutes.length > 0
      ? mergeRoutes(frameworkRoutes, linkScrapedPages, baseDomain, baseUrl)
      : linkScrapedPages.map(p => ({ ...p, source: 'link-scrape', dynamic: false }));

    // Sort BEFORE truncating: home first, then shallower paths. Truncating
    // first could drop the home page (or other top-level pages) whenever the
    // merged list starts with more than maxPages framework routes.
    const depthOf = (entry) => (entry.path.match(/\//g) || []).length;
    pages.sort((a, b) => {
      if (a.path === '/') return -1;
      if (b.path === '/') return 1;
      return depthOf(a) - depthOf(b);
    });

    if (pages.length > opts.maxPages) pages = pages.slice(0, opts.maxPages);

    return {
      success: true,
      baseUrl: baseUrlObj.origin,
      baseDomain,
      framework: frameworkInfo,
      stateSnapshot,
      pages,
      stats: {
        totalLinksFound: rawCount,
        frameworkRoutesFound: frameworkRoutes.length,
        pagesDiscovered: pages.length,
        durationMs: Date.now() - startTime
      }
    };
  } catch (error) {
    // Report as much of the base URL as can still be derived; an unparseable
    // baseUrl is echoed back unchanged.
    let normalizedBaseUrl = baseUrl;
    let errorBaseDomain = '';
    try {
      const u = new URL(baseUrl);
      normalizedBaseUrl = u.origin;
      errorBaseDomain = u.hostname;
    } catch { /* keep original */ }

    return {
      success: false,
      baseUrl: normalizedBaseUrl,
      baseDomain: errorBaseDomain,
      framework: null,
      stateSnapshot: null,
      pages: [{
        path: '/', name: 'Home', url: normalizeUrl(baseUrl, '/') || baseUrl,
        source: 'fallback', dynamic: false
      }],
      error: error.message,
      stats: { totalLinksFound: 0, frameworkRoutesFound: 0, pagesDiscovered: 1, durationMs: Date.now() - startTime }
    };
  } finally {
    if (browser) await disconnectBrowser();
  }
}
180
+
181
// CLI support: `node discover-pages.js <url> [maxPages]`
// Prints the discovery result as JSON; exit code 0 on success, 1 otherwise.
import { fileURLToPath } from 'url';
const __filename = fileURLToPath(import.meta.url);
if (process.argv[1] === __filename) {
  const url = process.argv[2];

  // Parse as base 10 and fall back to 10 for missing, non-numeric, zero or
  // negative values (a negative cap would turn into a slice-from-end later).
  const requestedMax = Number.parseInt(process.argv[3], 10);
  const maxPages = Number.isInteger(requestedMax) && requestedMax > 0 ? requestedMax : 10;

  if (!url) {
    console.error('Usage: node discover-pages.js <url> [maxPages]');
    process.exit(1);
  }

  discoverPages(url, { maxPages })
    .then(result => {
      console.log(JSON.stringify(result, null, 2));
      process.exit(result.success ? 0 : 1);
    })
    .catch(err => {
      console.error(JSON.stringify({ success: false, error: err.message }));
      process.exit(1);
    });
}
@@ -0,0 +1,70 @@
1
+ /**
2
+ * Inline Style Injector for HTML Extraction
3
+ *
4
+ * Computes and inlines critical layout styles (flex, grid, absolute, fixed)
5
+ * onto cloned DOM elements during HTML extraction to preserve visual layout
6
+ * without relying on external stylesheets.
7
+ *
8
+ * Designed to run inside page.evaluate — all functions are serialized as
9
+ * source strings and reconstructed in the browser context.
10
+ */
11
+
12
/**
 * Compute inline styles for critical elements and apply them to the cloned doc.
 *
 * An element is "critical" when its computed `display` is listed in
 * criticalDisplay or its computed `position` is listed in criticalPosition.
 * For each such element the non-default values of inlineProps are appended to
 * the `style` attribute of the matching element in the clone.
 *
 * Live and cloned elements are paired by walking both trees in document order,
 * starting at their root elements, and skipping live elements whose tag does
 * not match the next cloned element. This keeps the pairing correct when the
 * clone root is an element (so its own list would otherwise lack the root) and
 * when nodes such as <script> were already removed from the clone. The pairing
 * is a heuristic: it assumes the clone is the live tree minus some elements.
 *
 * This function is serialized with toString() and rebuilt in the browser, so
 * it must stay self-contained (no references to module-level names).
 *
 * @param {Document|Element} liveDocument - Live document or root element (read via getComputedStyle)
 * @param {Document|Element} clonedDoc - Cloned document or root element to mutate
 * @param {string[]} inlineProps - CSS property names (camelCase) to inline
 * @param {string[]} criticalDisplay - Display values that trigger inlining (e.g. 'flex')
 * @param {string[]} criticalPosition - Position values that trigger inlining (e.g. 'fixed')
 * @returns {{ inlinedCount: number, warnings: string[] }}
 */
export function computeAndApplyInlineStyles(
  liveDocument, clonedDoc, inlineProps, criticalDisplay, criticalPosition
) {
  const DOCUMENT_NODE = 9;
  // Values that carry no layout information and are therefore not inlined.
  const NON_INFORMATIVE = ['auto', 'none', 'normal', '0px', 'static', 'visible', 'content-box'];

  const warnings = [];
  let inlinedCount = 0;

  // Root element followed by all of its descendants, in document order.
  const listElements = (node) => {
    const root = node && node.nodeType === DOCUMENT_NODE ? node.documentElement : node;
    return root ? [root, ...root.querySelectorAll('*')] : [];
  };

  const liveElements = listElements(liveDocument);
  const clonedElements = listElements(clonedDoc);

  let cloneIdx = 0;
  for (const liveEl of liveElements) {
    if (cloneIdx >= clonedElements.length) break;

    const cloneEl = clonedElements[cloneIdx];
    // Tag mismatch: treat the live element as one that was removed from the clone.
    if (cloneEl.tagName !== liveEl.tagName) continue;
    cloneIdx++;

    const style = getComputedStyle(liveEl);
    const display = style.display;
    const position = style.position;
    if (!criticalDisplay.includes(display) && !criticalPosition.includes(position)) continue;

    const props = [];
    for (const prop of inlineProps) {
      const val = style[prop];
      if (!val || NON_INFORMATIVE.includes(val)) continue;
      // camelCase -> kebab-case (e.g. flexDirection -> flex-direction)
      const cssProp = prop.replace(/([A-Z])/g, '-$1').toLowerCase();
      props.push(`${cssProp}: ${val}`);
    }

    // Always include display for critical elements
    if (!props.some(p => p.startsWith('display:'))) {
      props.unshift(`display: ${display}`);
    }

    const declarations = props.join('; ');
    const existing = cloneEl.getAttribute('style') || '';
    cloneEl.setAttribute('style', existing ? `${existing}; ${declarations}` : declarations);
    inlinedCount++;
  }

  if (cloneIdx < clonedElements.length) {
    const unmatched = clonedElements.length - cloneIdx;
    warnings.push(`Inline styling could not align ${unmatched} cloned elements with the live DOM`);
  }

  if (inlinedCount > 100) {
    warnings.push(`Inlined ${inlinedCount} critical elements`);
  }

  return { inlinedCount, warnings };
}
@@ -0,0 +1,147 @@
1
+ /**
2
+ * HTML Extractor
3
+ *
4
+ * Extract and clean HTML from page, removing scripts,
5
+ * event handlers, and framework-specific attributes.
6
+ * Optionally enhances with WordPress-compatible semantic structure.
7
+ *
8
+ * Inline style computation lives in html-extractor-inline-styler.js
9
+ * and is serialized into the browser context via page.evaluate.
10
+ */
11
+
12
+ import { LAYOUT_PROPERTIES } from '../css/css-extractor.js';
13
+ import { enhanceSemanticHTMLInPage } from './semantic-enhancer.js';
14
+ import { computeAndApplyInlineStyles } from './html-extractor-inline-styler.js';
15
+
16
// Upper bounds used when judging extraction output.
export const MAX_HTML_SIZE = 10 * 1024 * 1024; // 10MB
export const MAX_DOM_ELEMENTS = 50000;

// Attribute-name patterns that identify JS-framework bindings to strip.
export const JS_FRAMEWORK_PATTERNS = [
  /^data-react/i,
  /^data-vue/i,
  /^data-ng/i,
  /^ng-/i,
  /^data-svelte/i,
  /^x-/i,
  /^hx-/i,
  /^v-/i,
  /^data-alpine/i,
  /^wire:/i,
  /^@/
];

// Layout-only (not visual) properties that get inlined on critical elements.
// From the box group only the first two entries are used (boxSizing, overflow).
export const INLINE_LAYOUT_PROPS = [
  LAYOUT_PROPERTIES.display,
  LAYOUT_PROPERTIES.grid,
  LAYOUT_PROPERTIES.position,
  LAYOUT_PROPERTIES.sizing,
  LAYOUT_PROPERTIES.box.slice(0, 2)
].flatMap(group => [...group]);

// Computed display / position values that mark an element as layout-critical.
export const CRITICAL_DISPLAY = ['flex', 'inline-flex', 'grid', 'inline-grid'];
export const CRITICAL_POSITION = ['absolute', 'fixed'];
38
+
39
/**
 * Extract and clean HTML from page.
 *
 * Runs inside the page: clones the document element, inlines critical layout
 * styles onto the clone, then strips scripts, unsafe stylesheet references,
 * event-handler and framework attributes, hidden elements, empty <style> tags
 * and comments before serializing.
 *
 * @param {import('playwright').Page} page
 * @param {Array<RegExp>} frameworkPatterns - Attribute-name patterns to remove
 * @returns {Promise<{ html: string, warnings: string[], elementCount: number, inlinedCount: number }>}
 */
export async function extractCleanHtml(page, frameworkPatterns = JS_FRAMEWORK_PATTERNS) {
  // Serialize browser-side helper for inline styling
  const inlineStylerSrc = computeAndApplyInlineStyles.toString();

  return await page.evaluate(
    ({ patterns, inlineProps, criticalDisplay, criticalPosition, inlineStylerSrc, maxDomElements }) => {
      const warnings = [];

      const elementCount = document.querySelectorAll('*').length;
      if (elementCount > maxDomElements) warnings.push(`Large DOM: ${elementCount} elements`);

      const doc = document.documentElement.cloneNode(true);

      // Inline critical layout styles FIRST, while the clone still mirrors the
      // live tree element-for-element. The helper pairs live and cloned
      // elements by position, so it must run before anything is removed, and
      // both roots must be the same kind of node (the <html> element).
      let inlinedCount = 0;
      try {
        // Browser-side helper deserialized here.
        // NOTE(review): a page CSP without 'unsafe-eval' is expected to reject
        // `new Function`; that case is reported as a warning instead of
        // failing the whole extraction.
        // eslint-disable-next-line no-new-func
        const computeAndApplyInlineStyles = new Function('return (' + inlineStylerSrc + ')')();
        const styled = computeAndApplyInlineStyles(
          document.documentElement, doc, inlineProps, criticalDisplay, criticalPosition
        );
        inlinedCount = styled.inlinedCount;
        warnings.push(...styled.warnings);
      } catch (err) {
        warnings.push(`Inline style injection skipped: ${err.message}`);
      }

      // Remove scripts and noscript
      doc.querySelectorAll('script, noscript').forEach(el => el.remove());
      doc.querySelectorAll('svg script, svg a[href^="javascript:"]').forEach(el => el.remove());

      // Sanitize CSS links
      doc.querySelectorAll('link[rel="stylesheet"]').forEach(link => {
        const href = link.getAttribute('href') || '';
        if (href.startsWith('javascript:') || href.startsWith('data:')) link.remove();
      });

      // Sanitize inline styles
      doc.querySelectorAll('style').forEach(style => {
        if ((style.textContent || '').match(/@import\s+url\s*\(\s*['"]?(javascript|data):/i)) {
          style.remove();
        }
      });

      // Remove event handlers and framework attributes.
      // RegExp objects cannot cross the evaluate boundary, so they arrive as
      // { source, flags } pairs and are rebuilt here.
      const patternRegexes = patterns.map(p => new RegExp(p.source, p.flags));
      doc.querySelectorAll('*').forEach(el => {
        [...el.attributes].forEach(attr => {
          const isHandler = attr.name.startsWith('on');
          if (isHandler || patternRegexes.some(p => p.test(attr.name))) {
            el.removeAttribute(attr.name);
          }
        });
      });

      // Remove hidden elements
      doc.querySelectorAll('[hidden], [style*="display: none"], [style*="display:none"]')
        .forEach(el => el.remove());

      // Remove empty style tags and HTML comments
      doc.querySelectorAll('style:empty').forEach(el => el.remove());

      const removeComments = (node) => {
        [...node.childNodes].forEach(child => {
          if (child.nodeType === 8) child.remove();        // comment node
          else if (child.nodeType === 1) removeComments(child); // element node
        });
      };
      removeComments(doc);

      // The lang value is written into an attribute, so escape it.
      const lang = (document.documentElement.lang || 'en')
        .replace(/&/g, '&amp;')
        .replace(/"/g, '&quot;');

      const html = '<!DOCTYPE html>\n<html lang="' + lang + '">\n' +
        doc.innerHTML + '\n</html>';

      return { html, warnings, elementCount, inlinedCount };
    },
    {
      patterns: frameworkPatterns.map(r => ({ source: r.source, flags: r.flags })),
      inlineProps: INLINE_LAYOUT_PROPS,
      criticalDisplay: CRITICAL_DISPLAY,
      criticalPosition: CRITICAL_POSITION,
      inlineStylerSrc,
      maxDomElements: MAX_DOM_ELEMENTS
    }
  );
}
122
+
123
/**
 * Run the clean-HTML extraction and, unless disabled, the semantic enhancer.
 *
 * When enhancement throws, the cleaned result is returned as-is with a
 * "Semantic enhancement failed" entry appended to its warnings.
 *
 * @param {import('playwright').Page} page
 * @param {Object} options
 * @param {boolean} [options.enhanceSemantic=true] - Set to false to skip enhancement
 * @param {Array<RegExp>} [options.frameworkPatterns] - Defaults to JS_FRAMEWORK_PATTERNS
 * @returns {Promise<{ html: string, warnings: string[], elementCount: number, semanticStats?: Object }>}
 */
export async function extractAndEnhanceHtml(page, options = {}) {
  const enhanceSemantic = options.enhanceSemantic === undefined
    ? true
    : options.enhanceSemantic;
  const frameworkPatterns = options.frameworkPatterns === undefined
    ? JS_FRAMEWORK_PATTERNS
    : options.frameworkPatterns;

  const cleaned = await extractCleanHtml(page, frameworkPatterns);
  if (!enhanceSemantic) return cleaned;

  try {
    const { html, stats } = await enhanceSemanticHTMLInPage(page, cleaned.html);
    return { ...cleaned, html, semanticStats: stats };
  } catch (err) {
    cleaned.warnings.push(`Semantic enhancement failed: ${err.message}`);
    return cleaned;
  }
}