design-clone 2.1.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -34
- package/SKILL.md +69 -45
- package/bin/cli.js +22 -4
- package/bin/commands/clone-site.js +31 -171
- package/bin/commands/help.js +19 -6
- package/bin/commands/init.js +9 -86
- package/bin/commands/uninstall.js +105 -0
- package/bin/commands/update.js +70 -0
- package/bin/commands/verify.js +7 -14
- package/bin/utils/paths.js +28 -0
- package/bin/utils/validate.js +2 -22
- package/bin/utils/version.js +23 -0
- package/docs/code-standards.md +789 -0
- package/docs/codebase-summary.md +533 -286
- package/docs/index.md +74 -0
- package/docs/project-overview-pdr.md +797 -0
- package/docs/system-architecture.md +718 -0
- package/package.json +14 -17
- package/src/ai/prompts/design-tokens/basic.md +80 -0
- package/src/ai/prompts/design-tokens/section-with-css.md +41 -0
- package/src/ai/prompts/design-tokens/section.md +48 -0
- package/src/ai/prompts/design-tokens/with-css.md +87 -0
- package/src/ai/prompts/structure-analysis/basic.md +55 -0
- package/src/ai/prompts/structure-analysis/with-context.md +59 -0
- package/src/ai/prompts/structure-analysis/with-dimensions.md +63 -0
- package/src/ai/prompts/structure-analysis/with-hierarchy.md +73 -0
- package/src/ai/prompts/ux-audit/aggregation.md +42 -0
- package/src/ai/prompts/ux-audit/desktop.md +92 -0
- package/src/ai/prompts/ux-audit/mobile.md +93 -0
- package/src/ai/prompts/ux-audit/tablet.md +92 -0
- package/src/core/animation/animation-extractor-ast.js +183 -0
- package/src/core/animation/animation-extractor-output.js +152 -0
- package/src/core/animation/animation-extractor.js +178 -0
- package/src/core/animation/state-capture-detection.js +200 -0
- package/src/core/animation/state-capture.js +193 -0
- package/src/core/capture/browser-context-pool.js +96 -0
- package/src/core/capture/multi-page-screenshot-page.js +110 -0
- package/src/core/capture/multi-page-screenshot.js +208 -0
- package/src/core/capture/screenshot-extraction.js +186 -0
- package/src/core/capture/screenshot-helpers.js +175 -0
- package/src/core/capture/screenshot-orchestrator.js +174 -0
- package/src/core/capture/screenshot-viewport.js +93 -0
- package/src/core/capture/screenshot.js +192 -0
- package/src/core/content/content-counter-dom.js +191 -0
- package/src/core/content/content-counter.js +76 -0
- package/src/core/css/breakpoint-detector.js +66 -0
- package/src/core/css/chromium-defaults.json +23 -0
- package/src/core/css/computed-style-extractor.js +102 -0
- package/src/core/css/css-chunker.js +103 -0
- package/src/core/css/filter-css-dead-code.js +120 -0
- package/src/core/css/filter-css-html-analyzer.js +110 -0
- package/src/core/css/filter-css-selector-matcher.js +172 -0
- package/src/core/css/filter-css.js +206 -0
- package/src/core/css/merge-css-atrule-processor.js +158 -0
- package/src/core/css/merge-css-file-io.js +68 -0
- package/src/core/css/merge-css.js +148 -0
- package/src/core/detection/framework-detector-routing.js +68 -0
- package/src/core/detection/framework-detector-signals.js +65 -0
- package/src/core/detection/framework-detector.js +198 -0
- package/src/core/dimension/dimension-extractor-card-detector.js +82 -0
- package/src/core/dimension/dimension-extractor.js +317 -0
- package/src/core/dimension/dimension-output-ai-summary.js +111 -0
- package/src/core/dimension/dimension-output.js +173 -0
- package/src/core/dimension/dom-tree-analyzer-tree-builders.js +95 -0
- package/src/core/dimension/dom-tree-analyzer.js +191 -0
- package/src/core/discovery/app-state-snapshot-capture.js +195 -0
- package/src/core/discovery/app-state-snapshot-utils.js +178 -0
- package/src/core/discovery/app-state-snapshot.js +131 -0
- package/src/core/discovery/discover-pages-routes.js +84 -0
- package/src/core/discovery/discover-pages-utils.js +177 -0
- package/src/core/discovery/discover-pages.js +191 -0
- package/src/core/html/html-extractor-inline-styler.js +70 -0
- package/src/core/html/html-extractor.js +147 -0
- package/src/core/html/semantic-enhancer-mappings.js +200 -0
- package/src/core/html/semantic-enhancer-page.js +148 -0
- package/src/core/html/semantic-enhancer.js +135 -0
- package/src/core/links/rewrite-links-css-rewriter.js +53 -0
- package/src/core/links/rewrite-links.js +173 -0
- package/src/core/media/asset-validator.js +118 -0
- package/src/core/media/extract-assets-downloader.js +187 -0
- package/src/core/media/extract-assets-page-scraper.js +115 -0
- package/src/core/media/extract-assets.js +159 -0
- package/src/core/media/video-capture-convert.js +200 -0
- package/src/core/media/video-capture.js +201 -0
- package/src/core/{lazy-loader.js → page-prep/lazy-loader.js} +37 -39
- package/src/core/section/section-cropper-helpers.js +43 -0
- package/src/core/{section-cropper.js → section/section-cropper.js} +11 -88
- package/src/core/section/section-detector-strategies.js +139 -0
- package/src/core/section/section-detector-utils.js +100 -0
- package/src/core/section/section-detector.js +88 -0
- package/src/core/tests/test-section-cropper.js +2 -2
- package/src/core/tests/test-section-detector.js +2 -2
- package/src/post-process/enhance-assets.js +29 -4
- package/src/post-process/fetch-images-unsplash-client.js +123 -0
- package/src/post-process/fetch-images.js +60 -263
- package/src/post-process/inject-gosnap.js +88 -0
- package/src/post-process/inject-icons-svg-replacer.js +76 -0
- package/src/post-process/inject-icons.js +47 -200
- package/src/route-discoverers/base-discoverer-utils.js +137 -0
- package/src/route-discoverers/base-discoverer.js +29 -118
- package/src/route-discoverers/index.js +1 -1
- package/src/shared/config.js +38 -0
- package/src/shared/error-codes.js +31 -0
- package/src/shared/viewports.js +46 -0
- package/src/utils/browser.js +0 -7
- package/src/utils/helpers.js +4 -0
- package/src/utils/log.js +12 -0
- package/src/utils/playwright-loader.js +76 -0
- package/src/utils/playwright.js +3 -69
- package/src/utils/progress.js +32 -0
- package/src/verification/generate-audit-report-css-fixes.js +52 -0
- package/src/verification/generate-audit-report-sections.js +158 -0
- package/src/verification/generate-audit-report.js +5 -281
- package/src/verification/quality-scorer.js +92 -0
- package/src/verification/verify-footer-checks.js +103 -0
- package/src/verification/verify-footer-helpers.js +178 -0
- package/src/verification/verify-footer.js +23 -381
- package/src/verification/verify-header-checks.js +104 -0
- package/src/verification/verify-header-helpers.js +156 -0
- package/src/verification/verify-header.js +23 -365
- package/src/verification/verify-layout-report.js +101 -0
- package/src/verification/verify-layout.js +13 -259
- package/src/verification/verify-menu-checks.js +104 -0
- package/src/verification/verify-menu-helpers.js +112 -0
- package/src/verification/verify-menu.js +17 -285
- package/src/verification/verify-slider-checks.js +115 -0
- package/src/verification/verify-slider-constants.js +65 -0
- package/src/verification/verify-slider-helpers.js +164 -0
- package/src/verification/verify-slider.js +23 -414
- package/.env.example +0 -14
- package/docs/basic-clone.md +0 -63
- package/docs/cli-reference.md +0 -316
- package/docs/design-clone-architecture.md +0 -492
- package/docs/pixel-perfect.md +0 -117
- package/docs/project-roadmap.md +0 -382
- package/docs/troubleshooting.md +0 -170
- package/requirements.txt +0 -5
- package/src/ai/__pycache__/analyze-structure.cpython-313.pyc +0 -0
- package/src/ai/__pycache__/extract-design-tokens.cpython-313.pyc +0 -0
- package/src/ai/analyze-structure.py +0 -375
- package/src/ai/extract-design-tokens.py +0 -782
- package/src/ai/prompts/__init__.py +0 -2
- package/src/ai/prompts/__pycache__/__init__.cpython-313.pyc +0 -0
- package/src/ai/prompts/__pycache__/design_tokens.cpython-313.pyc +0 -0
- package/src/ai/prompts/__pycache__/structure_analysis.cpython-313.pyc +0 -0
- package/src/ai/prompts/__pycache__/ux_audit.cpython-313.pyc +0 -0
- package/src/ai/prompts/design_tokens.py +0 -316
- package/src/ai/prompts/structure_analysis.py +0 -592
- package/src/ai/prompts/ux_audit.py +0 -198
- package/src/ai/ux-audit.js +0 -596
- package/src/core/animation-extractor.js +0 -526
- package/src/core/app-state-snapshot.js +0 -511
- package/src/core/content-counter.js +0 -342
- package/src/core/design-tokens.js +0 -103
- package/src/core/dimension-extractor.js +0 -438
- package/src/core/dimension-output.js +0 -305
- package/src/core/discover-pages.js +0 -542
- package/src/core/dom-tree-analyzer.js +0 -298
- package/src/core/extract-assets.js +0 -468
- package/src/core/filter-css.js +0 -499
- package/src/core/framework-detector.js +0 -538
- package/src/core/html-extractor.js +0 -212
- package/src/core/merge-css.js +0 -407
- package/src/core/multi-page-screenshot.js +0 -380
- package/src/core/rewrite-links.js +0 -226
- package/src/core/screenshot.js +0 -701
- package/src/core/section-detector.js +0 -386
- package/src/core/semantic-enhancer.js +0 -492
- package/src/core/state-capture.js +0 -598
- package/src/core/video-capture.js +0 -546
- package/src/utils/__init__.py +0 -16
- package/src/utils/__pycache__/__init__.cpython-313.pyc +0 -0
- package/src/utils/__pycache__/env.cpython-313.pyc +0 -0
- package/src/utils/env.py +0 -134
- /package/src/core/{css-extractor.js → css/css-extractor.js} +0 -0
- /package/src/core/{cookie-handler.js → page-prep/cookie-handler.js} +0 -0
- /package/src/core/{page-readiness.js → page-prep/page-readiness.js} +0 -0
|
@@ -1,542 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Page Discovery Module
|
|
3
|
-
*
|
|
4
|
-
* Extracts navigation links from a website to discover cloneable pages.
|
|
5
|
-
* Handles SPA hydration, filters external links, and normalizes URLs.
|
|
6
|
-
*
|
|
7
|
-
* Enhanced with SPA/Framework support (v1.3):
|
|
8
|
-
* - Framework detection (Next.js, Nuxt, Vue, React, Angular, Svelte, Astro)
|
|
9
|
-
* - Framework-specific route discovery
|
|
10
|
-
* - App state capture (optional)
|
|
11
|
-
*
|
|
12
|
-
* Usage:
|
|
13
|
-
* import { discoverPages } from './discover-pages.js';
|
|
14
|
-
* const result = await discoverPages('https://example.com', { maxPages: 10 });
|
|
15
|
-
*/
|
|
16
|
-
|
|
17
|
-
import { getBrowser, getPage, disconnectBrowser } from '../utils/browser.js';
|
|
18
|
-
import { waitForDomStable, waitForPageReady } from './page-readiness.js';
|
|
19
|
-
import { dismissCookieBanner } from './cookie-handler.js';
|
|
20
|
-
|
|
21
|
-
// SPA/Framework support imports
|
|
22
|
-
import { detectFramework, formatDetectionResult } from './framework-detector.js';
|
|
23
|
-
import { discoverRoutes as discoverFrameworkRoutes } from '../route-discoverers/index.js';
|
|
24
|
-
import { captureAppState, formatStateSnapshot } from './app-state-snapshot.js';
|
|
25
|
-
|
|
26
|
-
// Navigation selectors in priority order: primary header/nav links first,
// footer links last.
const NAV_SELECTORS = [
  'header nav a',
  'header a',
  'nav a',
  '[role="navigation"] a',
  '.navbar a',
  '.nav-menu a',
  '.navigation a',
  'footer nav a',
  'footer a'
];

// Patterns to exclude from discovered links. Each pattern is tested against
// the href string of a scraped anchor.
const EXCLUDE_PATTERNS = [
  /^mailto:/i,
  /^tel:/i,
  /^javascript:/i,
  /^#/,
  // Static assets and downloads. The extension must be followed by the end of
  // the string OR by a query string / fragment, so that URLs such as
  // "/brochure.pdf?download=1" are excluded as well.
  /\.(pdf|jpg|jpeg|png|gif|svg|webp|ico|zip|tar|gz|mp3|mp4|avi|mov)(?:[?#]|$)/i,
  // Social network links
  /facebook\.com/i,
  /twitter\.com/i,
  /instagram\.com/i,
  /linkedin\.com/i,
  /youtube\.com/i,
  /tiktok\.com/i
];

// Valid framework names for validation
const VALID_FRAMEWORKS = ['next', 'nuxt', 'vue', 'react', 'angular', 'svelte', 'astro'];

// Default options; caller-supplied options are spread over these.
const DEFAULT_OPTIONS = {
  maxPages: 10,
  selectors: null,        // Use default NAV_SELECTORS if null
  includeSubdomains: false,
  timeout: 30000,
  // SPA/Framework options (v1.3)
  spaMode: true,          // Enable SPA detection and route discovery
  framework: null,        // Force specific framework (skip detection)
  noSpaDetect: false,     // Disable SPA/framework detection entirely
  captureState: false     // Capture app state (Redux/Vuex/Pinia/Zustand)
};
|
|
69
|
-
|
|
70
|
-
/**
 * Write a warning to stderr, but only when stderr is attached to a TTY.
 * When stderr is not a TTY the message is dropped.
 * @param {string} message - Warning text
 */
function logWarning(message) {
  if (!process.stderr.isTTY) return;
  console.error(`[discover-pages] WARN: ${message}`);
}
|
|
79
|
-
|
|
80
|
-
/**
 * Check a user-supplied framework name against VALID_FRAMEWORKS.
 * The value is stringified, lower-cased and trimmed before comparison.
 * @param {string|null} framework - Framework name to validate
 * @returns {string|null} The normalized name, or null when the input is
 *   falsy or not a known framework (a warning is logged in the latter case)
 */
function validateFramework(framework) {
  if (!framework) return null;

  const candidate = String(framework).toLowerCase().trim();
  if (!VALID_FRAMEWORKS.includes(candidate)) {
    logWarning(`Invalid framework "${framework}". Valid options: ${VALID_FRAMEWORKS.join(', ')}`);
    return null;
  }
  return candidate;
}
|
|
94
|
-
|
|
95
|
-
/**
 * Normalize URL for comparison and deduplication.
 *
 * The href is resolved against baseUrl, then reduced to origin + pathname
 * (query string and hash are dropped). A trailing slash is removed except on
 * the root path.
 *
 * @param {string} baseUrl - Base URL for resolving relative paths
 * @param {string} href - URL to normalize
 * @returns {string|null} Normalized URL, or null if href is empty/non-string,
 *   cannot be parsed, or is not an http(s) URL
 */
export function normalizeUrl(baseUrl, href) {
  if (!href || typeof href !== 'string') return null;

  let url;
  try {
    url = new URL(href, baseUrl);
  } catch {
    return null;
  }

  // Accept exactly http: and https:. A prefix check would also let through
  // schemes such as "httpx:", whose origin serializes to the string "null".
  if (url.protocol !== 'http:' && url.protocol !== 'https:') return null;

  // Build normalized URL: origin + pathname (no hash, no query)
  const normalized = url.origin + url.pathname;

  // Remove trailing slash (except for root)
  if (url.pathname !== '/' && normalized.endsWith('/')) {
    return normalized.slice(0, -1);
  }
  return normalized;
}
|
|
123
|
-
|
|
124
|
-
/**
 * Tell whether a URL's hostname belongs to the given base domain.
 * Comparison is case-insensitive. Any failure while parsing the URL or
 * lower-casing the base domain yields false.
 * @param {string} url - Absolute URL to check
 * @param {string} baseDomain - Hostname to compare against
 * @param {boolean} includeSubdomains - Also accept hostnames ending in
 *   "." + baseDomain
 * @returns {boolean}
 */
export function isSameDomain(url, baseDomain, includeSubdomains = false) {
  let host;
  let root;
  try {
    host = new URL(url).hostname.toLowerCase();
    root = baseDomain.toLowerCase();
  } catch {
    return false;
  }

  if (host === root) return true;
  return includeSubdomains ? host.endsWith('.' + root) : false;
}
|
|
148
|
-
|
|
149
|
-
/**
 * Extract page name from link text or URL path.
 *
 * Link text wins when, after trimming, it is non-empty and shorter than 50
 * characters. Otherwise the last path segment is percent-decoded (when it is
 * validly encoded) and converted from kebab-case/snake_case to Title Case.
 * An empty or root path yields "Home".
 *
 * @param {string} text - Link text
 * @param {string} path - URL path
 * @returns {string} Page name
 */
export function extractPageName(text, path) {
  // Trim first so whitespace-only or padded link text does not become the name.
  const label = typeof text === 'string' ? text.trim() : '';
  if (label.length > 0 && label.length < 50) {
    return label;
  }

  // Extract from path
  if (!path || path === '/') return 'Home';

  // Get last segment of path
  const segments = path.split('/').filter(Boolean);
  if (segments.length === 0) return 'Home';

  let lastSegment = segments[segments.length - 1];
  try {
    // "getting%20started" -> "getting started"
    lastSegment = decodeURIComponent(lastSegment);
  } catch {
    // Malformed percent-encoding: keep the raw segment.
  }

  // Convert kebab-case/snake_case to Title Case
  return lastSegment
    .replace(/[-_]/g, ' ')
    .replace(/\b\w/g, c => c.toUpperCase());
}
|
|
175
|
-
|
|
176
|
-
/**
 * Decide whether a link should be dropped from discovery.
 * A falsy href is always excluded; otherwise the href is excluded when any
 * entry of EXCLUDE_PATTERNS matches it.
 * @param {string} href - URL to check
 * @returns {boolean} true when the link must be skipped
 */
function shouldExclude(href) {
  if (!href) return true;

  for (const pattern of EXCLUDE_PATTERNS) {
    if (pattern.test(href)) return true;
  }
  return false;
}
|
|
185
|
-
|
|
186
|
-
/**
 * Strip a single trailing slash from a path, leaving the root path intact.
 * Empty or non-string input is treated as the root path.
 * @param {string} path - Path to normalize
 * @returns {string} Normalized path
 */
function normalizePath(path) {
  if (typeof path !== 'string' || !path) return '/';
  if (path === '/' || !path.endsWith('/')) return path;
  return path.slice(0, -1);
}
|
|
195
|
-
|
|
196
|
-
/**
 * Combine framework-discovered routes and link-scraped pages into one list,
 * deduplicated by normalized path. Framework routes are added first, so on a
 * path collision the framework entry is the one kept.
 *
 * @param {Array|null} frameworkRoutes - Routes from framework discoverer
 * @param {Array|null} linkScrapedPages - Pages from link scraping
 * @param {string} baseDomain - Base domain (only validated here)
 * @param {string} baseUrl - Base URL used to resolve paths into absolute URLs
 * @returns {Array} Entries of shape { path, name, url, source, dynamic }
 *
 * @example
 * const merged = mergeRoutes(
 *   [{ path: '/about', name: 'About' }],
 *   [{ path: '/contact', name: 'Contact' }],
 *   'example.com',
 *   'https://example.com'
 * );
 */
function mergeRoutes(frameworkRoutes, linkScrapedPages, baseDomain, baseUrl) {
  // Invalid inputs are tolerated: warn and fall back to empty strings.
  if (!baseDomain || typeof baseDomain !== 'string') {
    logWarning('mergeRoutes: Invalid baseDomain');
    baseDomain = '';
  }
  if (!baseUrl || typeof baseUrl !== 'string') {
    logWarning('mergeRoutes: Invalid baseUrl');
    baseUrl = '';
  }

  const seenPaths = new Set();
  const merged = [];

  const isRecord = (value) => Boolean(value) && typeof value === 'object';

  // Returns the normalized path the first time it is seen, null afterwards.
  const claimPath = (rawPath) => {
    const candidate = normalizePath(rawPath || '/');
    if (seenPaths.has(candidate)) return null;
    seenPaths.add(candidate);
    return candidate;
  };

  // Framework routes go in first (higher quality, more accurate).
  const routes = Array.isArray(frameworkRoutes) ? frameworkRoutes : [];
  for (const route of routes) {
    if (!isRecord(route)) continue;

    const path = claimPath(route.path);
    if (path === null) continue;

    merged.push({
      path,
      name: route.name || extractPageName('', path),
      // Prefer a URL rebuilt from the base; fall back to the route's own URL.
      url: normalizeUrl(baseUrl, path) || route.url || '',
      source: route.source || 'framework',
      dynamic: Boolean(route.dynamic)
    });
  }

  // Link-scraped pages only fill the gaps left by the framework routes.
  const scraped = Array.isArray(linkScrapedPages) ? linkScrapedPages : [];
  for (const page of scraped) {
    if (!isRecord(page)) continue;

    const path = claimPath(page.path);
    if (path === null) continue;

    merged.push({
      path,
      name: page.name || extractPageName('', path),
      url: page.url || normalizeUrl(baseUrl, path) || '',
      source: 'link-scrape',
      dynamic: false
    });
  }

  return merged;
}
|
|
270
|
-
|
|
271
|
-
/**
 * Discover pages from a website by extracting navigation links.
 * Enhanced with SPA/Framework support (v1.3).
 *
 * Steps: open the page in a headless browser, wait for navigation links and
 * a stable DOM, dismiss any cookie banner, optionally detect the framework
 * and ask the framework-specific discoverer for routes, scrape anchor
 * elements, merge both sources, order the pages (homepage first, then by
 * path depth) and cap the list at `maxPages`.
 *
 * The function does not reject on discovery errors: on failure it resolves
 * to an object with `success: false`, an `error` message and a single
 * fallback "Home" page.
 *
 * @param {string} baseUrl - Starting URL to discover from
 * @param {Object} options - Discovery options (merged over DEFAULT_OPTIONS)
 * @param {number} [options.maxPages=10] - Maximum pages to discover
 * @param {string[]} [options.selectors] - Anchor selectors; defaults to NAV_SELECTORS
 * @param {boolean} [options.includeSubdomains=false] - Accept links to subdomains
 * @param {number} [options.timeout=30000] - Navigation timeout in ms
 * @param {boolean} [options.spaMode=true] - Enable SPA detection
 * @param {string} [options.framework] - Force specific framework
 * @param {boolean} [options.noSpaDetect=false] - Disable SPA detection
 * @param {boolean} [options.captureState=false] - Capture app state
 * @returns {Promise<Object>} Discovery result:
 *   { success, baseUrl, baseDomain, framework, stateSnapshot, pages, stats }
 *   plus `error` when success is false
 */
export async function discoverPages(baseUrl, options = {}) {
  const opts = { ...DEFAULT_OPTIONS, ...options };
  const startTime = Date.now();

  let browser = null;
  let page = null;

  try {
    // Parse base URL (throws on invalid input; handled by the catch below)
    const baseUrlObj = new URL(baseUrl);
    const baseDomain = baseUrlObj.hostname;

    // Launch browser
    browser = await getBrowser({ headless: true });
    page = await getPage(browser);

    // Navigate to page
    await page.goto(baseUrl, {
      waitUntil: 'networkidle',
      timeout: opts.timeout
    });

    // Wait for SPA hydration; a missing nav is not an error, so the
    // timeout rejection is deliberately ignored.
    await page.waitForSelector('nav a, header a, [role="navigation"] a', {
      visible: true,
      timeout: 5000
    }).catch(() => {});

    await waitForDomStable(page, 500, 5000);

    // Dismiss cookie banner if present
    await dismissCookieBanner(page);

    // Wait a bit more for any dynamic content
    await new Promise(r => setTimeout(r, 1000));

    // =========================================
    // SPA/Framework Detection (v1.3)
    // =========================================
    let frameworkInfo = null;
    let frameworkRoutes = [];
    let stateSnapshot = null;

    if (!opts.noSpaDetect) {
      // Framework detection
      if (opts.framework) {
        // User forced specific framework - validate it
        const validatedFramework = validateFramework(opts.framework);
        if (validatedFramework) {
          frameworkInfo = {
            framework: validatedFramework,
            version: null,
            routingType: 'spa',
            confidence: 'forced',
            signals: ['user-specified']
          };
        }
      } else {
        // Auto-detect framework
        try {
          frameworkInfo = await detectFramework(page);
        } catch (e) {
          logWarning(`Framework detection failed: ${e.message}`);
          frameworkInfo = null;
        }
      }

      // Framework-specific route discovery
      if (frameworkInfo?.framework && opts.spaMode) {
        try {
          const discoveryResult = await discoverFrameworkRoutes(page, baseUrl, frameworkInfo);
          frameworkRoutes = discoveryResult.routes || [];
        } catch (e) {
          logWarning(`Route discovery failed for ${frameworkInfo.framework}: ${e.message}`);
          frameworkRoutes = [];
        }
      }

      // Capture app state (optional)
      if (opts.captureState && frameworkInfo) {
        try {
          stateSnapshot = await captureAppState(page, frameworkInfo);
        } catch (e) {
          logWarning(`State capture failed: ${e.message}`);
          stateSnapshot = null;
        }
      }
    }

    // =========================================
    // Traditional Link Scraping (existing logic)
    // =========================================
    const selectors = opts.selectors || NAV_SELECTORS;
    const selectorString = selectors.join(', ');

    // Evaluation failures degrade to "no links found" rather than aborting.
    const rawLinks = await page.$$eval(selectorString, (elements) => {
      return elements.map(el => ({
        href: el.href,
        text: el.textContent?.trim() || '',
        tagName: el.tagName
      }));
    }).catch(() => []);

    // Process and filter links
    const seenUrls = new Set();
    const linkScrapedPages = [];

    // Always include homepage first
    const homeUrl = normalizeUrl(baseUrl, '/');
    if (homeUrl) {
      seenUrls.add(homeUrl);
      linkScrapedPages.push({
        path: '/',
        name: 'Home',
        url: homeUrl
      });
    }

    for (const link of rawLinks) {
      // Skip excluded patterns
      if (shouldExclude(link.href)) continue;

      // Normalize URL
      const normalized = normalizeUrl(baseUrl, link.href);
      if (!normalized) continue;

      // Skip if already seen
      if (seenUrls.has(normalized)) continue;

      // Check same domain
      if (!isSameDomain(normalized, baseDomain, opts.includeSubdomains)) continue;

      // Extract path
      const urlObj = new URL(normalized);
      const path = urlObj.pathname;

      // Skip homepage (already added)
      if (path === '/') continue;

      // Add to results
      seenUrls.add(normalized);
      linkScrapedPages.push({
        path,
        name: extractPageName(link.text, path),
        url: normalized
      });

      // Check max pages limit
      if (linkScrapedPages.length >= opts.maxPages) break;
    }

    // =========================================
    // Merge Routes (v1.3)
    // =========================================
    let pages;
    if (frameworkRoutes.length > 0) {
      // Merge framework routes with link-scraped pages
      pages = mergeRoutes(frameworkRoutes, linkScrapedPages, baseDomain, baseUrl);
    } else {
      // No framework routes, use link-scraped pages only
      pages = linkScrapedPages.map(p => ({ ...p, source: 'link-scrape', dynamic: false }));
    }

    // Sort by path depth (homepage first, then shallow before deep).
    // This runs BEFORE the maxPages cut so that a long list of framework
    // routes cannot push the homepage or shallow pages out of the result.
    pages.sort((a, b) => {
      const aIsHome = a.path === '/';
      const bIsHome = b.path === '/';
      if (aIsHome || bIsHome) return Number(bIsHome) - Number(aIsHome);
      const depthA = (a.path.match(/\//g) || []).length;
      const depthB = (b.path.match(/\//g) || []).length;
      return depthA - depthB;
    });

    // Apply max pages limit to the ordered results
    if (pages.length > opts.maxPages) {
      pages = pages.slice(0, opts.maxPages);
    }

    const duration = Date.now() - startTime;

    return {
      success: true,
      baseUrl: baseUrlObj.origin,
      baseDomain,
      // SPA/Framework data (v1.3)
      framework: frameworkInfo,
      stateSnapshot: stateSnapshot,
      // Page discovery results
      pages,
      stats: {
        totalLinksFound: rawLinks.length,
        frameworkRoutesFound: frameworkRoutes.length,
        pagesDiscovered: pages.length,
        durationMs: duration
      }
    };
  } catch (error) {
    // Normalize baseUrl in error case for consistency
    let normalizedBaseUrl = baseUrl;
    let errorBaseDomain = '';
    try {
      const urlObj = new URL(baseUrl);
      normalizedBaseUrl = urlObj.origin;
      errorBaseDomain = urlObj.hostname;
    } catch {
      // Keep original baseUrl if parsing fails
    }

    return {
      success: false,
      baseUrl: normalizedBaseUrl,
      baseDomain: errorBaseDomain,
      framework: null,
      stateSnapshot: null,
      pages: [{
        path: '/',
        name: 'Home',
        url: normalizeUrl(baseUrl, '/') || baseUrl,
        source: 'fallback',
        dynamic: false
      }],
      error: error.message,
      stats: {
        totalLinksFound: 0,
        frameworkRoutesFound: 0,
        pagesDiscovered: 1,
        durationMs: Date.now() - startTime
      }
    };
  } finally {
    if (browser) {
      // A rejection thrown from `finally` would replace the result object
      // returned above, so cleanup failures are reported but not rethrown.
      try {
        await disconnectBrowser();
      } catch (e) {
        logWarning(`Browser cleanup failed: ${e.message}`);
      }
    }
  }
}
|
|
518
|
-
|
|
519
|
-
// CLI support - use exact file match to avoid triggering when imported
import { fileURLToPath } from 'url';
const __filename = fileURLToPath(import.meta.url);
const isMainModule = process.argv[1] === __filename;

// Usage: node discover-pages.js <url> [maxPages]
// Prints the discovery result as JSON on stdout and exits with 0 on success,
// 1 on failure or missing URL.
if (isMainModule) {
  const url = process.argv[2];

  // Parse as base-10 and accept only positive integers; anything else
  // (missing, non-numeric, zero, negative) falls back to the default of 10.
  const requestedMax = Number.parseInt(process.argv[3], 10);
  const maxPages = Number.isInteger(requestedMax) && requestedMax > 0 ? requestedMax : 10;

  if (!url) {
    console.error('Usage: node discover-pages.js <url> [maxPages]');
    process.exit(1);
  }

  discoverPages(url, { maxPages })
    .then(result => {
      console.log(JSON.stringify(result, null, 2));
      process.exit(result.success ? 0 : 1);
    })
    .catch(err => {
      console.error(JSON.stringify({ success: false, error: err.message }));
      process.exit(1);
    });
}
|