design-clone 1.1.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +42 -20
- package/SKILL.md +74 -0
- package/bin/commands/clone-site.js +75 -10
- package/bin/commands/init.js +33 -1
- package/bin/commands/verify.js +5 -3
- package/bin/utils/validate.js +24 -8
- package/docs/cli-reference.md +224 -2
- package/docs/codebase-summary.md +309 -0
- package/docs/design-clone-architecture.md +290 -45
- package/docs/pixel-perfect.md +35 -4
- package/docs/project-roadmap.md +382 -0
- package/docs/troubleshooting.md +5 -4
- package/package.json +12 -6
- package/src/ai/__pycache__/analyze-structure.cpython-313.pyc +0 -0
- package/src/ai/__pycache__/extract-design-tokens.cpython-313.pyc +0 -0
- package/src/ai/analyze-structure.py +73 -3
- package/src/ai/extract-design-tokens.py +356 -13
- package/src/ai/prompts/__pycache__/__init__.cpython-313.pyc +0 -0
- package/src/ai/prompts/__pycache__/design_tokens.cpython-313.pyc +0 -0
- package/src/ai/prompts/__pycache__/structure_analysis.cpython-313.pyc +0 -0
- package/src/ai/prompts/__pycache__/ux_audit.cpython-313.pyc +0 -0
- package/src/ai/prompts/design_tokens.py +133 -0
- package/src/ai/prompts/structure_analysis.py +329 -10
- package/src/ai/prompts/ux_audit.py +198 -0
- package/src/ai/ux-audit.js +596 -0
- package/src/core/animation-extractor.js +526 -0
- package/src/core/app-state-snapshot.js +511 -0
- package/src/core/content-counter.js +342 -0
- package/src/core/cookie-handler.js +1 -1
- package/src/core/css-extractor.js +4 -4
- package/src/core/dimension-extractor.js +93 -21
- package/src/core/dimension-output.js +103 -6
- package/src/core/discover-pages.js +242 -14
- package/src/core/dom-tree-analyzer.js +298 -0
- package/src/core/extract-assets.js +1 -1
- package/src/core/framework-detector.js +538 -0
- package/src/core/html-extractor.js +45 -4
- package/src/core/lazy-loader.js +7 -7
- package/src/core/multi-page-screenshot.js +9 -6
- package/src/core/page-readiness.js +8 -8
- package/src/core/screenshot.js +311 -7
- package/src/core/section-cropper.js +209 -0
- package/src/core/section-detector.js +386 -0
- package/src/core/semantic-enhancer.js +492 -0
- package/src/core/state-capture.js +598 -0
- package/src/core/tests/test-section-cropper.js +177 -0
- package/src/core/tests/test-section-detector.js +55 -0
- package/src/core/video-capture.js +546 -0
- package/src/route-discoverers/angular-discoverer.js +157 -0
- package/src/route-discoverers/astro-discoverer.js +123 -0
- package/src/route-discoverers/base-discoverer.js +242 -0
- package/src/route-discoverers/index.js +106 -0
- package/src/route-discoverers/next-discoverer.js +130 -0
- package/src/route-discoverers/nuxt-discoverer.js +138 -0
- package/src/route-discoverers/react-discoverer.js +139 -0
- package/src/route-discoverers/svelte-discoverer.js +109 -0
- package/src/route-discoverers/universal-discoverer.js +227 -0
- package/src/route-discoverers/vue-discoverer.js +118 -0
- package/src/utils/__init__.py +1 -1
- package/src/utils/__pycache__/__init__.cpython-313.pyc +0 -0
- package/src/utils/__pycache__/env.cpython-313.pyc +0 -0
- package/src/utils/browser.js +11 -37
- package/src/utils/playwright.js +213 -0
- package/src/verification/generate-audit-report.js +398 -0
- package/src/verification/verify-footer.js +493 -0
- package/src/verification/verify-header.js +486 -0
- package/src/verification/verify-layout.js +2 -2
- package/src/verification/verify-menu.js +4 -20
- package/src/verification/verify-slider.js +533 -0
- package/src/utils/puppeteer.js +0 -281
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Nuxt Route Discoverer
|
|
3
|
+
*
|
|
4
|
+
* Extracts routes from Nuxt 2 and Nuxt 3 applications using:
|
|
5
|
+
* - window.__NUXT__ (Nuxt 2/3 state)
|
|
6
|
+
* - window.$nuxt.$router (Vue Router instance)
|
|
7
|
+
* - window.__NUXT_PATHS__ (Nuxt 3 prerendered paths)
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { BaseDiscoverer } from './base-discoverer.js';
|
|
11
|
+
|
|
12
|
+
export class NuxtDiscoverer extends BaseDiscoverer {
  /**
   * Discover routes from a Nuxt application.
   *
   * Routes are collected inside the page from four sources, in this order:
   *   1. `window.__NUXT__` (current route from the state / payload),
   *   2. `window.$nuxt.$router.options.routes` (Vue Router config, walked recursively),
   *   3. `window.__NUXT_PATHS__` (list of paths, when the page exposes one),
   *   4. `<a href>` elements that look like Nuxt links or sit in a navigation area.
   *
   * The raw entries are then given a name (when missing), normalised and
   * de-duplicated by the helpers inherited from BaseDiscoverer.
   *
   * @returns {Promise<import('./base-discoverer.js').DiscoveredRoute[]>}
   */
  async discover() {
    // NOTE: the callback is executed in the page, so every helper it needs
    // must be declared inside it.
    const rawRoutes = await this.page.evaluate(() => {
      const routes = [];

      // Classes put on active links: Nuxt 2 defaults first, then the
      // Vue Router defaults that Nuxt 3's <NuxtLink> falls back to.
      const ACTIVE_LINK_CLASSES = [
        'nuxt-link-active',
        'nuxt-link-exact-active',
        'router-link-active',
        'router-link-exact-active'
      ];

      /**
       * True for a site-internal absolute path. A leading "//" is a
       * protocol-relative URL pointing at another host, not a route.
       */
      const isInternalPath = (value) =>
        typeof value === 'string' && value.startsWith('/') && !value.startsWith('//');

      /** True when a route with this exact path was already collected. */
      const isKnown = (path) => routes.some((r) => r.path === path);

      /**
       * True when the element or one of its ancestors carries a Vue scoped
       * marker. Those attributes are named `data-v-<hash>`, so they have to be
       * matched by prefix; `[data-v-]` would only match an attribute literally
       * called "data-v-".
       */
      const isVueScoped = (el) => {
        for (let node = el; node; node = node.parentElement) {
          if (node.getAttributeNames().some((attr) => attr.startsWith('data-v-'))) {
            return true;
          }
        }
        return false;
      };

      /**
       * Recursively extract routes from Vue Router config
       * @param {Array} routeList - Array of route objects
       * @param {string} prefix - Path prefix for nested routes
       */
      function extractRoutes(routeList, prefix = '') {
        if (!Array.isArray(routeList)) return;

        routeList.forEach((r) => {
          if (!r || typeof r.path !== 'string' || !r.path) return;

          // Build full path: relative child paths hang off the parent prefix
          let path = r.path;
          if (!path.startsWith('/')) {
            path = prefix
              ? prefix + (prefix.endsWith('/') ? '' : '/') + path
              : '/' + path;
          }

          // Skip internal ("/_…") and top-level dynamic ("/:…") routes,
          // but still descend into their children
          if (path.startsWith('/_') || path.startsWith('/:')) {
            if (r.children) extractRoutes(r.children, path);
            return;
          }

          routes.push({
            path,
            name: r.name || '',
            component: r.name || r.component?.name || '',
            source: 'framework'
          });

          // Process nested routes
          if (r.children) {
            extractRoutes(r.children, path);
          }
        });
      }

      // Method 1: __NUXT__ state (both Nuxt 2 and 3)
      const nuxt = window.__NUXT__;
      if (nuxt) {
        // Current route path
        const statePath = nuxt.state?.route?.path;
        if (typeof statePath === 'string' && statePath) {
          routes.push({
            path: statePath,
            name: nuxt.state.route.name || 'Current Page',
            source: 'framework'
          });
        }

        // Nuxt 3: route from payload. Only string values are accepted so an
        // unrelated non-string `path` entry is never treated as a route.
        const payloadPath = [nuxt.data?.path, nuxt.path].find(
          (value) => typeof value === 'string' && value
        );
        if (payloadPath) {
          routes.push({
            path: payloadPath,
            name: 'Current Page',
            source: 'framework'
          });
        }
      }

      // Method 2: $nuxt.$router (Vue Router instance)
      if (window.$nuxt?.$router?.options?.routes) {
        extractRoutes(window.$nuxt.$router.options.routes);
      }

      // Method 3: window.__NUXT_PATHS__ path list
      if (Array.isArray(window.__NUXT_PATHS__)) {
        window.__NUXT_PATHS__.forEach((path) => {
          if (typeof path === 'string' && path && !isKnown(path)) {
            routes.push({
              path,
              source: 'framework'
            });
          }
        });
      }

      // Method 4: NuxtLink components in DOM
      document.querySelectorAll('a[href]').forEach((link) => {
        const href = link.getAttribute('href');
        if (!isInternalPath(href) || href.startsWith('/_')) return;

        // Check for Nuxt-specific attributes / classes
        const isNuxtLink =
          isVueScoped(link) ||
          ACTIVE_LINK_CLASSES.some((cls) => link.classList.contains(cls));
        const isNavLink = Boolean(link.closest('nav, header, [role="navigation"]'));

        if (!isNuxtLink && !isNavLink) return;
        if (isKnown(href)) return;

        routes.push({
          path: href,
          name: link.textContent?.trim() || '',
          source: isNuxtLink ? 'framework' : 'link-scrape'
        });
      });

      return routes;
    });

    // Process and deduplicate
    const processedRoutes = rawRoutes.map((route) => ({
      ...route,
      name: route.name || this.extractPageName(route.path, route.component),
      path: this.normalizeRoute(route.path)
    }));

    return this.deduplicateRoutes(processedRoutes);
  }
}
|
|
137
|
+
|
|
138
|
+
export default NuxtDiscoverer;
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* React Route Discoverer
|
|
3
|
+
*
|
|
4
|
+
* React Router doesn't expose routes globally, so we use:
|
|
5
|
+
* - Link component scraping from DOM
|
|
6
|
+
* - history.pushState interception
|
|
7
|
+
* - Navigation area link extraction
|
|
8
|
+
*
|
|
9
|
+
* This is the most challenging discoverer due to React's lack of global state.
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import { BaseDiscoverer } from './base-discoverer.js';
|
|
13
|
+
|
|
14
|
+
export class ReactDiscoverer extends BaseDiscoverer {
  /**
   * Discover routes from a React application.
   *
   * After installing the history interception, routes are collected inside
   * the page from three sources, in this order:
   *   1. `<a href>` elements that open in place and sit either in a
   *      navigation area or inside the React root,
   *   2. URLs recorded by the history interception,
   *   3. links marked active (`a.active`, `a[aria-current="page"]`).
   *
   * The raw entries are then given a name (when missing), normalised and
   * de-duplicated by the helpers inherited from BaseDiscoverer.
   *
   * @returns {Promise<import('./base-discoverer.js').DiscoveredRoute[]>}
   */
  async discover() {
    // First, inject pushState interception
    await this.injectInterception();

    // Get routes from various sources (callback runs in the page)
    const rawRoutes = await this.page.evaluate(() => {
      const routes = [];

      /**
       * True for a site-internal absolute path. A leading "//" is a
       * protocol-relative URL pointing at another host, not a route.
       */
      const isInternalPath = (value) =>
        typeof value === 'string' && value.startsWith('/') && !value.startsWith('//');

      /** True when a route with this exact path was already collected. */
      const isKnown = (path) => routes.some((r) => r.path === path);

      /** Trimmed link text, or '' when there is none. */
      const linkName = (link) => link.textContent?.trim() || '';

      // Detect React-specific patterns. Looked up once: the root element does
      // not change while the links are being scanned.
      const reactRoot =
        document.getElementById('root') || document.querySelector('[data-reactroot]');

      // Method 1: React Router Link components (they render as <a> tags)
      document.querySelectorAll('a[href]').forEach((link) => {
        const href = link.getAttribute('href');
        if (!isInternalPath(href)) return;

        // React Router Links typically don't have target="_blank"
        // and are within the app structure
        const opensInPlace =
          !link.hasAttribute('target') || link.getAttribute('target') === '_self';
        if (!opensInPlace) return;

        const isInNav = Boolean(
          link.closest('nav, header, [role="navigation"], [class*="nav"], [class*="menu"]')
        );
        const isInsideReact = Boolean(reactRoot && reactRoot.contains(link));
        if (!isInNav && !isInsideReact) return;

        // No in-page duplicate check here; duplicates are left to the
        // final deduplicateRoutes() pass.
        routes.push({
          path: href,
          name: linkName(link),
          source: isInsideReact ? 'framework' : 'link-scrape'
        });
      });

      // Method 2: Check for intercepted routes
      if (Array.isArray(window.__DISCOVERED_ROUTES__)) {
        // pushState/replaceState resolve relative URLs against the document
        // base URL, so the recorded values must be resolved the same way.
        const base = document.baseURI || window.location.href;

        window.__DISCOVERED_ROUTES__.forEach((url) => {
          try {
            const path = new URL(url, base).pathname;
            if (!isKnown(path)) {
              routes.push({
                path,
                source: 'interception'
              });
            }
          } catch {
            // Invalid URL
          }
        });
      }

      // Method 3: Look for NavLink active classes (React Router specific)
      document.querySelectorAll('a.active, a[aria-current="page"]').forEach((link) => {
        const href = link.getAttribute('href');
        if (!isInternalPath(href) || isKnown(href)) return;

        routes.push({
          path: href,
          name: linkName(link),
          source: 'framework'
        });
      });

      return routes;
    });

    const processedRoutes = rawRoutes.map((route) => ({
      ...route,
      name: route.name || this.extractPageName(route.path),
      path: this.normalizeRoute(route.path)
    }));

    return this.deduplicateRoutes(processedRoutes);
  }

  /**
   * Inject history.pushState / history.replaceState interception.
   *
   * Installs wrappers (once per page, guarded by
   * `window.__ROUTE_INTERCEPTION_ACTIVE__`) that record every URL passed to
   * pushState/replaceState, plus the pathname on each `popstate`, into
   * `window.__DISCOVERED_ROUTES__`. Failures are ignored on purpose.
   */
  async injectInterception() {
    try {
      await this.page.evaluate(() => {
        if (window.__ROUTE_INTERCEPTION_ACTIVE__) return;

        window.__DISCOVERED_ROUTES__ = [];
        window.__ROUTE_INTERCEPTION_ACTIVE__ = true;

        // Intercept pushState and replaceState with the same recording wrapper
        ['pushState', 'replaceState'].forEach((method) => {
          const original = history[method].bind(history);
          history[method] = function (state, title, url) {
            if (url) {
              window.__DISCOVERED_ROUTES__.push(url.toString());
            }
            return original(state, title, url);
          };
        });

        // Listen for popstate
        window.addEventListener('popstate', () => {
          window.__DISCOVERED_ROUTES__.push(window.location.pathname);
        });
      });
    } catch {
      // Interception may fail in some browser contexts, continue without it
    }
  }
}
|
|
138
|
+
|
|
139
|
+
export default ReactDiscoverer;
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Svelte/SvelteKit Route Discoverer
|
|
3
|
+
*
|
|
4
|
+
* Extracts routes from SvelteKit applications using:
|
|
5
|
+
* - SvelteKit internal routing state
|
|
6
|
+
* - data-sveltekit-* attributes
|
|
7
|
+
* - Standard link scraping for static Svelte apps
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { BaseDiscoverer } from './base-discoverer.js';
|
|
11
|
+
|
|
12
|
+
export class SvelteDiscoverer extends BaseDiscoverer {
  /**
   * Discover routes from a Svelte/SvelteKit application.
   *
   * Routes are collected inside the page from these sources, in this order:
   *   1. keys of `window.__sveltekit_routes__`, when that global exists,
   *   2. anchors carrying `data-sveltekit-preload-data`,
   *   3. anchors carrying `data-sveltekit-reload` or `data-sveltekit-noscroll`,
   *   4. links inside `nav`, `header` or `[role="navigation"]`.
   *
   * The raw entries are then given a name (when missing), normalised and
   * de-duplicated by the helpers inherited from BaseDiscoverer.
   *
   * @returns {Promise<import('./base-discoverer.js').DiscoveredRoute[]>}
   */
  async discover() {
    // NOTE: the callback is executed in the page, so every helper it needs
    // must be declared inside it.
    const rawRoutes = await this.page.evaluate(() => {
      const routes = [];

      const PRELOAD_ATTR = 'data-sveltekit-preload-data';
      const OTHER_SVELTEKIT_ATTRS = ['data-sveltekit-reload', 'data-sveltekit-noscroll'];

      /**
       * True for a site-internal absolute path. A leading "//" is a
       * protocol-relative URL pointing at another host, not a route.
       */
      const isInternalPath = (value) =>
        typeof value === 'string' && value.startsWith('/') && !value.startsWith('//');

      /** True when a route with this exact path was already collected. */
      const isKnown = (path) => routes.some((r) => r.path === path);

      /** Trimmed link text, or '' when there is none. */
      const linkName = (link) => link.textContent?.trim() || '';

      // Method 1: SvelteKit internal state
      if (window.__sveltekit_routes__) {
        // This global may exist in dev mode
        Object.keys(window.__sveltekit_routes__).forEach((path) => {
          routes.push({
            path,
            source: 'framework'
          });
        });
      }

      // Method 2: data-sveltekit-preload-data links (SvelteKit's prefetching).
      // No in-page duplicate check here, so a named link entry can coexist
      // with an unnamed entry from Method 1 until deduplicateRoutes() runs.
      document.querySelectorAll(`a[${PRELOAD_ATTR}]`).forEach((link) => {
        const href = link.getAttribute('href');
        if (!isInternalPath(href)) return;

        routes.push({
          path: href,
          name: linkName(link),
          source: 'framework'
        });
      });

      // Method 3: data-sveltekit-reload and data-sveltekit-noscroll links
      OTHER_SVELTEKIT_ATTRS.forEach((attr) => {
        document.querySelectorAll(`a[${attr}]`).forEach((link) => {
          const href = link.getAttribute('href');
          if (!isInternalPath(href) || isKnown(href)) return;

          routes.push({
            path: href,
            name: linkName(link),
            source: 'framework'
          });
        });
      });

      // Method 4: Standard navigation links (for all Svelte apps)
      document
        .querySelectorAll('nav a, header a, [role="navigation"] a')
        .forEach((link) => {
          const href = link.getAttribute('href');
          if (!isInternalPath(href) || isKnown(href)) return;

          // Check if it's a SvelteKit link
          const isSvelteKitLink = [PRELOAD_ATTR, ...OTHER_SVELTEKIT_ATTRS].some((attr) =>
            link.hasAttribute(attr)
          );

          routes.push({
            path: href,
            name: linkName(link),
            source: isSvelteKitLink ? 'framework' : 'link-scrape'
          });
        });

      return routes;
    });

    const processedRoutes = rawRoutes.map((route) => ({
      ...route,
      name: route.name || this.extractPageName(route.path),
      path: this.normalizeRoute(route.path)
    }));

    return this.deduplicateRoutes(processedRoutes);
  }
}
|
|
108
|
+
|
|
109
|
+
export default SvelteDiscoverer;
|
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Universal Route Discoverer
|
|
3
|
+
*
|
|
4
|
+
* Fallback discoverer for unknown frameworks or static sites.
|
|
5
|
+
* Uses comprehensive techniques:
|
|
6
|
+
* - history.pushState/replaceState interception
|
|
7
|
+
* - Exhaustive link scraping from navigation elements
|
|
8
|
+
* - Sitemap.xml parsing if available
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { BaseDiscoverer } from './base-discoverer.js';
|
|
12
|
+
|
|
13
|
+
export class UniversalDiscoverer extends BaseDiscoverer {
  /**
   * Discover routes using universal techniques.
   *
   * After installing the history interception, routes are collected inside
   * the page from four sources, in this order:
   *   1. URLs recorded by the history interception,
   *   2. links inside navigation-like containers (nav, header, footer, ...),
   *   3. every `a[href^="/"]` that is neither a download nor target="_blank",
   *   4. non-download links inside the main content containers.
   * Paths found in `/sitemap.xml` are appended afterwards. All entries are
   * then given a name (when missing), normalised and de-duplicated by the
   * helpers inherited from BaseDiscoverer.
   *
   * @returns {Promise<import('./base-discoverer.js').DiscoveredRoute[]>}
   */
  async discover() {
    // First, inject history interception
    await this.injectHistoryInterception();

    // Get routes from multiple sources (callback runs in the page)
    const rawRoutes = await this.page.evaluate(() => {
      const routes = [];
      const seenPaths = new Set();

      // Common non-page paths. Built once; none of the patterns is global or
      // sticky, so sharing them across calls is safe.
      const skipPatterns = [
        /\.(js|css|png|jpg|jpeg|gif|svg|ico|woff|woff2|ttf|eot|map)$/i,
        /^\/api\//,
        /^\/_next\//,
        /^\/_nuxt\//,
        /^\/static\//,
        /^\/assets\//,
      ];

      /** Trimmed link text, or '' when there is none. */
      const linkName = (link) => link.textContent?.trim() || '';

      /**
       * Add route if not already seen
       */
      function addRoute(path, name, source) {
        if (!path || seenPaths.has(path)) return;
        // Must be a site-internal absolute path; a leading "//" is a
        // protocol-relative URL pointing at another host.
        if (!path.startsWith('/') || path.startsWith('//')) return;

        // Match the skip patterns against the path alone so that a query
        // string or fragment ("/logo.png?v=2") cannot hide a file extension.
        const pathOnly = path.split(/[?#]/, 1)[0];
        if (skipPatterns.some((pattern) => pattern.test(pathOnly))) return;

        seenPaths.add(path);
        routes.push({
          path,
          name: name || '',
          source
        });
      }

      // Method 1: History interception results
      if (Array.isArray(window.__UNIVERSAL_DISCOVERED_ROUTES__)) {
        // pushState/replaceState resolve relative URLs against the document
        // base URL, so the recorded values must be resolved the same way.
        const base = document.baseURI || window.location.href;

        window.__UNIVERSAL_DISCOVERED_ROUTES__.forEach((url) => {
          try {
            addRoute(new URL(url, base).pathname, '', 'interception');
          } catch {
            // Invalid URL, skip
          }
        });
      }

      // Method 2: Navigation elements (high confidence)
      const navSelectors = [
        'nav a[href]',
        'header a[href]',
        '[role="navigation"] a[href]',
        '[class*="nav"] a[href]',
        '[class*="menu"] a[href]',
        '[class*="sidebar"] a[href]',
        'footer a[href]'
      ];

      // Queried selector by selector to keep the discovery order
      navSelectors.forEach((selector) => {
        document.querySelectorAll(selector).forEach((link) => {
          addRoute(link.getAttribute('href'), linkName(link), 'link-scrape');
        });
      });

      // Method 3: All internal links (lower confidence but comprehensive)
      document.querySelectorAll('a[href^="/"]').forEach((link) => {
        // Skip if has target="_blank" or download attribute
        if (link.hasAttribute('download')) return;
        if (link.getAttribute('target') === '_blank') return;

        addRoute(link.getAttribute('href'), linkName(link), 'link-scrape');
      });

      // Method 4: Links in main content area
      const mainSelectors = ['main', '[role="main"]', '#content', '.content', 'article'];
      mainSelectors.forEach((selector) => {
        const main = document.querySelector(selector);
        if (!main) return;

        main.querySelectorAll('a[href^="/"]').forEach((link) => {
          if (link.hasAttribute('download')) return;
          addRoute(link.getAttribute('href'), linkName(link), 'link-scrape');
        });
      });

      return routes;
    });

    // Try to fetch sitemap
    const sitemapRoutes = await this.fetchSitemapRoutes();

    // Combine all routes
    const allRoutes = [...rawRoutes, ...sitemapRoutes];

    const processedRoutes = allRoutes.map((route) => ({
      ...route,
      name: route.name || this.extractPageName(route.path),
      path: this.normalizeRoute(route.path)
    }));

    return this.deduplicateRoutes(processedRoutes);
  }

  /**
   * Inject history.pushState/replaceState interception.
   *
   * Installs wrappers (once per page, guarded by
   * `window.__UNIVERSAL_INTERCEPTION_ACTIVE__`) that record every URL passed
   * to pushState/replaceState, the pathname on each `popstate` and the full
   * href on each `hashchange` into `window.__UNIVERSAL_DISCOVERED_ROUTES__`.
   * Failures are ignored on purpose.
   */
  async injectHistoryInterception() {
    try {
      await this.page.evaluate(() => {
        if (window.__UNIVERSAL_INTERCEPTION_ACTIVE__) return;

        window.__UNIVERSAL_DISCOVERED_ROUTES__ = [];
        window.__UNIVERSAL_INTERCEPTION_ACTIVE__ = true;

        // Intercept pushState and replaceState with the same recording wrapper
        ['pushState', 'replaceState'].forEach((method) => {
          const original = history[method].bind(history);
          history[method] = function (state, title, url) {
            if (url) {
              window.__UNIVERSAL_DISCOVERED_ROUTES__.push(url.toString());
            }
            return original(state, title, url);
          };
        });

        // Listen for popstate
        window.addEventListener('popstate', () => {
          window.__UNIVERSAL_DISCOVERED_ROUTES__.push(window.location.pathname);
        });

        // Listen for hashchange (for hash-based routing)
        window.addEventListener('hashchange', () => {
          window.__UNIVERSAL_DISCOVERED_ROUTES__.push(window.location.href);
        });
      });
    } catch {
      // Interception may fail in some browser contexts, continue without it
    }
  }

  /**
   * Try to fetch and parse sitemap.xml.
   *
   * The sitemap is fetched from inside the page with a 5 second abort timer.
   * Every same-origin `<loc>` entry becomes a route with source "sitemap";
   * entries that point at further XML files (as in a sitemap index) are
   * ignored because they are not pages. Any failure yields an empty list.
   *
   * @returns {Promise<import('./base-discoverer.js').DiscoveredRoute[]>}
   */
  async fetchSitemapRoutes() {
    const routes = [];

    try {
      const baseOrigin = new URL(this.baseUrl).origin;
      const sitemapUrl = new URL('/sitemap.xml', this.baseUrl).href;

      const response = await this.page.evaluate(async (url) => {
        // Add timeout using AbortController
        const controller = new AbortController();
        const timeoutId = setTimeout(() => controller.abort(), 5000);

        try {
          const res = await fetch(url, { signal: controller.signal });
          if (!res.ok) return null;
          return await res.text();
        } catch {
          return null;
        } finally {
          // Cleared on every path, and only after the body has been read,
          // so the timeout also bounds a stalled download.
          clearTimeout(timeoutId);
        }
      }, sitemapUrl);

      if (response) {
        // Parse sitemap XML
        for (const match of response.matchAll(/<loc>([^<]+)<\/loc>/gi)) {
          try {
            const url = new URL(match[1].trim());

            // Only include paths from same origin
            if (url.origin !== baseOrigin) continue;
            // Sitemap-index entries reference other sitemaps, not pages
            if (/\.xml(\.gz)?$/i.test(url.pathname)) continue;

            routes.push({
              path: url.pathname,
              name: '',
              source: 'sitemap'
            });
          } catch {
            // Invalid URL in sitemap
          }
        }
      }
    } catch {
      // Sitemap fetch failed, continue without it
    }

    return routes;
  }
}
|
|
226
|
+
|
|
227
|
+
export default UniversalDiscoverer;
|