design-clone 2.1.0 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. package/README.md +13 -34
  2. package/SKILL.md +69 -45
  3. package/bin/cli.js +22 -4
  4. package/bin/commands/clone-site.js +31 -171
  5. package/bin/commands/help.js +19 -6
  6. package/bin/commands/init.js +9 -86
  7. package/bin/commands/uninstall.js +105 -0
  8. package/bin/commands/update.js +70 -0
  9. package/bin/commands/verify.js +7 -14
  10. package/bin/utils/paths.js +28 -0
  11. package/bin/utils/validate.js +2 -22
  12. package/bin/utils/version.js +23 -0
  13. package/docs/code-standards.md +789 -0
  14. package/docs/codebase-summary.md +533 -286
  15. package/docs/index.md +74 -0
  16. package/docs/project-overview-pdr.md +797 -0
  17. package/docs/system-architecture.md +718 -0
  18. package/package.json +14 -17
  19. package/src/ai/prompts/design-tokens/basic.md +80 -0
  20. package/src/ai/prompts/design-tokens/section-with-css.md +41 -0
  21. package/src/ai/prompts/design-tokens/section.md +48 -0
  22. package/src/ai/prompts/design-tokens/with-css.md +87 -0
  23. package/src/ai/prompts/structure-analysis/basic.md +55 -0
  24. package/src/ai/prompts/structure-analysis/with-context.md +59 -0
  25. package/src/ai/prompts/structure-analysis/with-dimensions.md +63 -0
  26. package/src/ai/prompts/structure-analysis/with-hierarchy.md +73 -0
  27. package/src/ai/prompts/ux-audit/aggregation.md +42 -0
  28. package/src/ai/prompts/ux-audit/desktop.md +92 -0
  29. package/src/ai/prompts/ux-audit/mobile.md +93 -0
  30. package/src/ai/prompts/ux-audit/tablet.md +92 -0
  31. package/src/core/animation/animation-extractor-ast.js +183 -0
  32. package/src/core/animation/animation-extractor-output.js +152 -0
  33. package/src/core/animation/animation-extractor.js +178 -0
  34. package/src/core/animation/state-capture-detection.js +200 -0
  35. package/src/core/animation/state-capture.js +193 -0
  36. package/src/core/capture/browser-context-pool.js +96 -0
  37. package/src/core/capture/multi-page-screenshot-page.js +110 -0
  38. package/src/core/capture/multi-page-screenshot.js +208 -0
  39. package/src/core/capture/screenshot-extraction.js +186 -0
  40. package/src/core/capture/screenshot-helpers.js +175 -0
  41. package/src/core/capture/screenshot-orchestrator.js +174 -0
  42. package/src/core/capture/screenshot-viewport.js +93 -0
  43. package/src/core/capture/screenshot.js +192 -0
  44. package/src/core/content/content-counter-dom.js +191 -0
  45. package/src/core/content/content-counter.js +76 -0
  46. package/src/core/css/breakpoint-detector.js +66 -0
  47. package/src/core/css/chromium-defaults.json +23 -0
  48. package/src/core/css/computed-style-extractor.js +102 -0
  49. package/src/core/css/css-chunker.js +103 -0
  50. package/src/core/css/filter-css-dead-code.js +120 -0
  51. package/src/core/css/filter-css-html-analyzer.js +110 -0
  52. package/src/core/css/filter-css-selector-matcher.js +172 -0
  53. package/src/core/css/filter-css.js +206 -0
  54. package/src/core/css/merge-css-atrule-processor.js +158 -0
  55. package/src/core/css/merge-css-file-io.js +68 -0
  56. package/src/core/css/merge-css.js +148 -0
  57. package/src/core/detection/framework-detector-routing.js +68 -0
  58. package/src/core/detection/framework-detector-signals.js +65 -0
  59. package/src/core/detection/framework-detector.js +198 -0
  60. package/src/core/dimension/dimension-extractor-card-detector.js +82 -0
  61. package/src/core/dimension/dimension-extractor.js +317 -0
  62. package/src/core/dimension/dimension-output-ai-summary.js +111 -0
  63. package/src/core/dimension/dimension-output.js +173 -0
  64. package/src/core/dimension/dom-tree-analyzer-tree-builders.js +95 -0
  65. package/src/core/dimension/dom-tree-analyzer.js +191 -0
  66. package/src/core/discovery/app-state-snapshot-capture.js +195 -0
  67. package/src/core/discovery/app-state-snapshot-utils.js +178 -0
  68. package/src/core/discovery/app-state-snapshot.js +131 -0
  69. package/src/core/discovery/discover-pages-routes.js +84 -0
  70. package/src/core/discovery/discover-pages-utils.js +177 -0
  71. package/src/core/discovery/discover-pages.js +191 -0
  72. package/src/core/html/html-extractor-inline-styler.js +70 -0
  73. package/src/core/html/html-extractor.js +147 -0
  74. package/src/core/html/semantic-enhancer-mappings.js +200 -0
  75. package/src/core/html/semantic-enhancer-page.js +148 -0
  76. package/src/core/html/semantic-enhancer.js +135 -0
  77. package/src/core/links/rewrite-links-css-rewriter.js +53 -0
  78. package/src/core/links/rewrite-links.js +173 -0
  79. package/src/core/media/asset-validator.js +118 -0
  80. package/src/core/media/extract-assets-downloader.js +187 -0
  81. package/src/core/media/extract-assets-page-scraper.js +115 -0
  82. package/src/core/media/extract-assets.js +159 -0
  83. package/src/core/media/video-capture-convert.js +200 -0
  84. package/src/core/media/video-capture.js +201 -0
  85. package/src/core/{lazy-loader.js → page-prep/lazy-loader.js} +37 -39
  86. package/src/core/section/section-cropper-helpers.js +43 -0
  87. package/src/core/{section-cropper.js → section/section-cropper.js} +11 -88
  88. package/src/core/section/section-detector-strategies.js +139 -0
  89. package/src/core/section/section-detector-utils.js +100 -0
  90. package/src/core/section/section-detector.js +88 -0
  91. package/src/core/tests/test-section-cropper.js +2 -2
  92. package/src/core/tests/test-section-detector.js +2 -2
  93. package/src/post-process/enhance-assets.js +29 -4
  94. package/src/post-process/fetch-images-unsplash-client.js +123 -0
  95. package/src/post-process/fetch-images.js +60 -263
  96. package/src/post-process/inject-gosnap.js +88 -0
  97. package/src/post-process/inject-icons-svg-replacer.js +76 -0
  98. package/src/post-process/inject-icons.js +47 -200
  99. package/src/route-discoverers/base-discoverer-utils.js +137 -0
  100. package/src/route-discoverers/base-discoverer.js +29 -118
  101. package/src/route-discoverers/index.js +1 -1
  102. package/src/shared/config.js +38 -0
  103. package/src/shared/error-codes.js +31 -0
  104. package/src/shared/viewports.js +46 -0
  105. package/src/utils/browser.js +0 -7
  106. package/src/utils/helpers.js +4 -0
  107. package/src/utils/log.js +12 -0
  108. package/src/utils/playwright-loader.js +76 -0
  109. package/src/utils/playwright.js +3 -69
  110. package/src/utils/progress.js +32 -0
  111. package/src/verification/generate-audit-report-css-fixes.js +52 -0
  112. package/src/verification/generate-audit-report-sections.js +158 -0
  113. package/src/verification/generate-audit-report.js +5 -281
  114. package/src/verification/quality-scorer.js +92 -0
  115. package/src/verification/verify-footer-checks.js +103 -0
  116. package/src/verification/verify-footer-helpers.js +178 -0
  117. package/src/verification/verify-footer.js +23 -381
  118. package/src/verification/verify-header-checks.js +104 -0
  119. package/src/verification/verify-header-helpers.js +156 -0
  120. package/src/verification/verify-header.js +23 -365
  121. package/src/verification/verify-layout-report.js +101 -0
  122. package/src/verification/verify-layout.js +13 -259
  123. package/src/verification/verify-menu-checks.js +104 -0
  124. package/src/verification/verify-menu-helpers.js +112 -0
  125. package/src/verification/verify-menu.js +17 -285
  126. package/src/verification/verify-slider-checks.js +115 -0
  127. package/src/verification/verify-slider-constants.js +65 -0
  128. package/src/verification/verify-slider-helpers.js +164 -0
  129. package/src/verification/verify-slider.js +23 -414
  130. package/.env.example +0 -14
  131. package/docs/basic-clone.md +0 -63
  132. package/docs/cli-reference.md +0 -316
  133. package/docs/design-clone-architecture.md +0 -492
  134. package/docs/pixel-perfect.md +0 -117
  135. package/docs/project-roadmap.md +0 -382
  136. package/docs/troubleshooting.md +0 -170
  137. package/requirements.txt +0 -5
  138. package/src/ai/__pycache__/analyze-structure.cpython-313.pyc +0 -0
  139. package/src/ai/__pycache__/extract-design-tokens.cpython-313.pyc +0 -0
  140. package/src/ai/analyze-structure.py +0 -375
  141. package/src/ai/extract-design-tokens.py +0 -782
  142. package/src/ai/prompts/__init__.py +0 -2
  143. package/src/ai/prompts/__pycache__/__init__.cpython-313.pyc +0 -0
  144. package/src/ai/prompts/__pycache__/design_tokens.cpython-313.pyc +0 -0
  145. package/src/ai/prompts/__pycache__/structure_analysis.cpython-313.pyc +0 -0
  146. package/src/ai/prompts/__pycache__/ux_audit.cpython-313.pyc +0 -0
  147. package/src/ai/prompts/design_tokens.py +0 -316
  148. package/src/ai/prompts/structure_analysis.py +0 -592
  149. package/src/ai/prompts/ux_audit.py +0 -198
  150. package/src/ai/ux-audit.js +0 -596
  151. package/src/core/animation-extractor.js +0 -526
  152. package/src/core/app-state-snapshot.js +0 -511
  153. package/src/core/content-counter.js +0 -342
  154. package/src/core/design-tokens.js +0 -103
  155. package/src/core/dimension-extractor.js +0 -438
  156. package/src/core/dimension-output.js +0 -305
  157. package/src/core/discover-pages.js +0 -542
  158. package/src/core/dom-tree-analyzer.js +0 -298
  159. package/src/core/extract-assets.js +0 -468
  160. package/src/core/filter-css.js +0 -499
  161. package/src/core/framework-detector.js +0 -538
  162. package/src/core/html-extractor.js +0 -212
  163. package/src/core/merge-css.js +0 -407
  164. package/src/core/multi-page-screenshot.js +0 -380
  165. package/src/core/rewrite-links.js +0 -226
  166. package/src/core/screenshot.js +0 -701
  167. package/src/core/section-detector.js +0 -386
  168. package/src/core/semantic-enhancer.js +0 -492
  169. package/src/core/state-capture.js +0 -598
  170. package/src/core/video-capture.js +0 -546
  171. package/src/utils/__init__.py +0 -16
  172. package/src/utils/__pycache__/__init__.cpython-313.pyc +0 -0
  173. package/src/utils/__pycache__/env.cpython-313.pyc +0 -0
  174. package/src/utils/env.py +0 -134
  175. /package/src/core/{css-extractor.js → css/css-extractor.js} +0 -0
  176. /package/src/core/{cookie-handler.js → page-prep/cookie-handler.js} +0 -0
  177. /package/src/core/{page-readiness.js → page-prep/page-readiness.js} +0 -0
@@ -1,542 +0,0 @@
1
- /**
2
- * Page Discovery Module
3
- *
4
- * Extracts navigation links from a website to discover cloneable pages.
5
- * Handles SPA hydration, filters external links, and normalizes URLs.
6
- *
7
- * Enhanced with SPA/Framework support (v1.3):
8
- * - Framework detection (Next.js, Nuxt, Vue, React, Angular, Svelte, Astro)
9
- * - Framework-specific route discovery
10
- * - App state capture (optional)
11
- *
12
- * Usage:
13
- * import { discoverPages } from './discover-pages.js';
14
- * const result = await discoverPages('https://example.com', { maxPages: 10 });
15
- */
16
-
17
- import { getBrowser, getPage, disconnectBrowser } from '../utils/browser.js';
18
- import { waitForDomStable, waitForPageReady } from './page-readiness.js';
19
- import { dismissCookieBanner } from './cookie-handler.js';
20
-
21
- // SPA/Framework support imports
22
- import { detectFramework, formatDetectionResult } from './framework-detector.js';
23
- import { discoverRoutes as discoverFrameworkRoutes } from '../route-discoverers/index.js';
24
- import { captureAppState, formatStateSnapshot } from './app-state-snapshot.js';
25
-
26
// Navigation selectors, listed from most specific (header nav) to least
// specific (footer links). They are joined into a single selector list before
// querying, so this order documents intent rather than result ordering.
const NAV_SELECTORS = [
  'header nav a',
  'header a',
  'nav a',
  '[role="navigation"] a',
  '.navbar a',
  '.nav-menu a',
  '.navigation a',
  'footer nav a',
  'footer a'
];

// Patterns for hrefs that should never be reported as cloneable pages.
// Each pattern is tested against the raw href string.
const EXCLUDE_PATTERNS = [
  /^mailto:/i,
  /^tel:/i,
  /^javascript:/i,
  /^#/,
  // Static assets and downloads. The optional `[?#]…` tail keeps the match
  // working when the file name is followed by a query string or a fragment
  // (e.g. `brochure.pdf?download=1`), which a bare `$` anchor would miss.
  /\.(pdf|jpg|jpeg|png|gif|svg|webp|ico|zip|tar|gz|mp3|mp4|avi|mov)(?:[?#].*)?$/i,
  // Social networks. NOTE(review): these match anywhere in the href, so an
  // on-site URL that merely mentions one of these domains is excluded too.
  /facebook\.com/i,
  /twitter\.com/i,
  /instagram\.com/i,
  /linkedin\.com/i,
  /youtube\.com/i,
  /tiktok\.com/i
];

// Framework names accepted for the `framework` option
const VALID_FRAMEWORKS = ['next', 'nuxt', 'vue', 'react', 'angular', 'svelte', 'astro'];

// Default options; caller-supplied options are spread over these
const DEFAULT_OPTIONS = {
  maxPages: 10,
  selectors: null,           // Use default NAV_SELECTORS if null
  includeSubdomains: false,
  timeout: 30000,
  // SPA/Framework options (v1.3)
  spaMode: true,             // Enable SPA detection and route discovery
  framework: null,           // Force specific framework (skip detection)
  noSpaDetect: false,        // Disable SPA/framework detection entirely
  captureState: false        // Capture app state (Redux/Vuex/Pinia/Zustand)
};
69
-
70
/**
 * Write a tagged warning to stderr.
 * Output is suppressed when stderr is not attached to a terminal.
 * @param {string} message - Warning text to print
 */
function logWarning(message) {
  if (!process.stderr.isTTY) return;
  console.error(`[discover-pages] WARN: ${message}`);
}
79
-
80
/**
 * Check a user-supplied framework name against the supported list.
 * The name is stringified, lower-cased and trimmed before the lookup.
 * @param {string|null} framework - Framework name to validate
 * @returns {string|null} The normalized name, or null when the input is
 *   empty or not a supported framework (a warning is logged in that case)
 */
function validateFramework(framework) {
  if (!framework) return null;

  const candidate = String(framework).toLowerCase().trim();
  if (!VALID_FRAMEWORKS.includes(candidate)) {
    logWarning(`Invalid framework "${framework}". Valid options: ${VALID_FRAMEWORKS.join(', ')}`);
    return null;
  }

  return candidate;
}
94
-
95
/**
 * Normalize URL for comparison and deduplication.
 *
 * The href is resolved against baseUrl, then reduced to `origin + pathname`:
 * query string and fragment are dropped, and a single trailing slash is
 * removed unless the path is the site root.
 *
 * @param {string} baseUrl - Base URL for resolving relative paths
 * @param {string} href - URL to normalize
 * @returns {string|null} Normalized URL, or null if href is empty, is not a
 *   string, cannot be parsed, or does not use http/https
 */
export function normalizeUrl(baseUrl, href) {
  if (!href || typeof href !== 'string') return null;

  try {
    const url = new URL(href, baseUrl);

    // Accept exactly http: and https:. A prefix check would also let through
    // schemes such as `httpx:`, whose origin serializes as the string "null"
    // and would yield results like "null/path".
    if (url.protocol !== 'http:' && url.protocol !== 'https:') return null;

    // Build normalized URL: origin + pathname (no hash, no query)
    const root = `${url.origin}/`;
    const normalized = url.origin + url.pathname;

    // Remove trailing slash (except for root)
    if (normalized !== root && normalized.endsWith('/')) {
      return normalized.slice(0, -1);
    }

    return normalized;
  } catch {
    // Unparseable href or base URL
    return null;
  }
}
123
-
124
/**
 * Check if URL is on the same domain as the base domain.
 *
 * Comparison is case-insensitive and ignores a trailing root dot, so
 * `example.com.` and `example.com` are treated as the same host.
 *
 * @param {string} url - Absolute URL to check
 * @param {string} baseDomain - Base hostname to compare against
 * @param {boolean} includeSubdomains - Whether subdomains of baseDomain count
 * @returns {boolean} true on a match; false otherwise, including when url
 *   cannot be parsed or baseDomain is empty or not a string
 */
export function isSameDomain(url, baseDomain, includeSubdomains = false) {
  try {
    // Lower-case and drop the optional fully-qualified trailing dot
    const canonicalHost = (host) => host.toLowerCase().replace(/\.$/, '');

    const hostname = canonicalHost(new URL(url).hostname);
    const base = canonicalHost(baseDomain);

    // An empty base must never match; otherwise the suffix test below would
    // degenerate into "hostname ends with a dot".
    if (base === '') return false;

    if (hostname === base) return true;

    if (includeSubdomains) {
      return hostname.endsWith(`.${base}`);
    }

    return false;
  } catch {
    // Invalid URL, or baseDomain is not a string
    return false;
  }
}
148
-
149
/**
 * Extract page name from link text or URL path.
 *
 * Link text wins when, after trimming, it is non-empty and shorter than 50
 * characters. Otherwise the last path segment is percent-decoded and
 * converted from kebab-case/snake_case to Title Case. An empty or root path
 * yields "Home".
 *
 * @param {string} text - Link text
 * @param {string} path - URL path
 * @returns {string} Page name
 */
export function extractPageName(text, path) {
  // Trim first so whitespace-only anchors (icon links, etc.) do not produce
  // a blank page name.
  const label = typeof text === 'string' ? text.trim() : '';
  if (label.length > 0 && label.length < 50) {
    return label;
  }

  // Extract from path
  if (!path || path === '/') return 'Home';

  // Get last segment of path
  const segments = path.split('/').filter(Boolean);
  if (segments.length === 0) return 'Home';

  let lastSegment = segments[segments.length - 1];
  try {
    // Turn "my%20post" into "my post" before title-casing
    lastSegment = decodeURIComponent(lastSegment);
  } catch {
    // Malformed escape sequence: fall back to the raw segment
  }

  // Convert kebab-case/snake_case to Title Case
  return lastSegment
    .replace(/[-_]/g, ' ')
    .replace(/\b\w/g, (c) => c.toUpperCase());
}
175
-
176
/**
 * Decide whether a link target must be skipped.
 * @param {string} href - URL to check
 * @returns {boolean} true when href is empty or matches any entry of
 *   EXCLUDE_PATTERNS
 */
function shouldExclude(href) {
  if (!href) return true;

  for (const pattern of EXCLUDE_PATTERNS) {
    if (pattern.test(href)) return true;
  }
  return false;
}
185
-
186
/**
 * Canonicalize a URL path by dropping one trailing slash.
 * The root path "/" is left untouched; empty or non-string input maps to "/".
 * @param {string} path - Path to normalize
 * @returns {string} Normalized path
 */
function normalizePath(path) {
  if (typeof path !== 'string' || path === '') return '/';

  const hasTrailingSlash = path !== '/' && path.endsWith('/');
  return hasTrailingSlash ? path.slice(0, -1) : path;
}
195
-
196
/**
 * Combine framework-discovered routes with link-scraped pages into one list.
 *
 * Framework routes are added first; link-scraped pages are appended only for
 * paths that have not been seen yet. Paths are compared after normalizePath,
 * and entries that are not objects are ignored.
 *
 * @param {Array|null} frameworkRoutes - Routes from framework discoverer
 * @param {Array|null} linkScrapedPages - Pages from link scraping
 * @param {string} baseDomain - Base domain (only validated here; a warning is
 *   logged when it is missing or not a string)
 * @param {string} baseUrl - Base URL used to resolve paths into full URLs
 * @returns {Array} Merged and deduplicated pages, each shaped as
 *   `{ path, name, url, source, dynamic }`
 *
 * @example
 * const merged = mergeRoutes(
 *   [{ path: '/about', name: 'About' }],
 *   [{ path: '/contact', name: 'Contact' }],
 *   'example.com',
 *   'https://example.com'
 * );
 */
function mergeRoutes(frameworkRoutes, linkScrapedPages, baseDomain, baseUrl) {
  // Input validation: warn, then continue with empty-string fallbacks
  if (typeof baseDomain !== 'string' || baseDomain === '') {
    logWarning('mergeRoutes: Invalid baseDomain');
  }
  let resolveBase = baseUrl;
  if (typeof baseUrl !== 'string' || baseUrl === '') {
    logWarning('mergeRoutes: Invalid baseUrl');
    resolveBase = '';
  }

  const claimedPaths = new Set();
  const result = [];

  // Returns the normalized path for an entry the first time that path is
  // seen; returns null for malformed entries and for duplicates.
  const claimPath = (entry) => {
    if (!entry || typeof entry !== 'object') return null;
    const candidate = normalizePath(entry.path || '/');
    if (claimedPaths.has(candidate)) return null;
    claimedPaths.add(candidate);
    return candidate;
  };

  // Framework routes go first (higher quality, more accurate)
  for (const route of Array.isArray(frameworkRoutes) ? frameworkRoutes : []) {
    const routePath = claimPath(route);
    if (routePath === null) continue;

    result.push({
      path: routePath,
      name: route.name || extractPageName('', routePath),
      // Prefer the URL rebuilt from the path; fall back to the route's own
      url: normalizeUrl(resolveBase, routePath) || route.url || '',
      source: route.source || 'framework',
      dynamic: Boolean(route.dynamic)
    });
  }

  // Link-scraped pages fill the remaining gaps
  for (const page of Array.isArray(linkScrapedPages) ? linkScrapedPages : []) {
    const pagePath = claimPath(page);
    if (pagePath === null) continue;

    result.push({
      path: pagePath,
      name: page.name || extractPageName('', pagePath),
      // Prefer the scraped URL; rebuild it from the path only if absent
      url: page.url || normalizeUrl(resolveBase, pagePath) || '',
      source: 'link-scrape',
      dynamic: false
    });
  }

  return result;
}
270
-
271
/**
 * Turn raw anchor data into a deduplicated list of same-site pages.
 *
 * The homepage is always the first entry. Collection stops as soon as the
 * list (homepage included) reaches opts.maxPages.
 *
 * @param {Array<{href: string, text: string}>} rawLinks - Anchors read from the page
 * @param {string} baseUrl - URL used to resolve and normalize hrefs
 * @param {string} baseDomain - Hostname that links must belong to
 * @param {Object} opts - Resolved options (reads includeSubdomains, maxPages)
 * @returns {Array<{path: string, name: string, url: string}>} Link-scraped pages
 */
function collectLinkScrapedPages(rawLinks, baseUrl, baseDomain, opts) {
  const seenUrls = new Set();
  const linkScrapedPages = [];

  // Always include homepage first
  const homeUrl = normalizeUrl(baseUrl, '/');
  if (homeUrl) {
    seenUrls.add(homeUrl);
    linkScrapedPages.push({ path: '/', name: 'Home', url: homeUrl });
  }

  for (const link of rawLinks) {
    // Skip excluded patterns (mailto:, assets, social networks, ...)
    if (shouldExclude(link.href)) continue;

    const normalized = normalizeUrl(baseUrl, link.href);
    if (!normalized) continue;

    // Skip duplicates and off-site links
    if (seenUrls.has(normalized)) continue;
    if (!isSameDomain(normalized, baseDomain, opts.includeSubdomains)) continue;

    const path = new URL(normalized).pathname;

    // Skip homepage (already added)
    if (path === '/') continue;

    seenUrls.add(normalized);
    linkScrapedPages.push({
      path,
      name: extractPageName(link.text, path),
      url: normalized
    });

    // Check max pages limit
    if (linkScrapedPages.length >= opts.maxPages) break;
  }

  return linkScrapedPages;
}

/**
 * Discover pages from a website by extracting navigation links.
 * Enhanced with SPA/Framework support (v1.3).
 *
 * Never rejects for navigation or scraping failures: any error is reported
 * through `success: false` plus an `error` message, with a single fallback
 * "Home" page in `pages`.
 *
 * @param {string} baseUrl - Starting URL to discover from
 * @param {Object} options - Discovery options (merged over DEFAULT_OPTIONS)
 * @param {number} [options.maxPages=10] - Maximum pages to discover
 * @param {string[]|null} [options.selectors=null] - Link selectors; NAV_SELECTORS when null
 * @param {boolean} [options.includeSubdomains=false] - Treat subdomains as same-site
 * @param {number} [options.timeout=30000] - Navigation timeout in milliseconds
 * @param {boolean} [options.spaMode=true] - Enable framework route discovery
 * @param {string} [options.framework] - Force specific framework
 * @param {boolean} [options.noSpaDetect=false] - Disable SPA detection
 * @param {boolean} [options.captureState=false] - Capture app state
 * @returns {Promise<Object>} Discovery result:
 *   `{ success, baseUrl, baseDomain, framework, stateSnapshot, pages, stats }`
 *   and additionally `error` when `success` is false
 */
export async function discoverPages(baseUrl, options = {}) {
  const opts = { ...DEFAULT_OPTIONS, ...options };
  const startTime = Date.now();

  let browser = null;
  let page = null;

  try {
    // Parse base URL (throws on invalid input; handled by the catch below)
    const baseUrlObj = new URL(baseUrl);
    const baseDomain = baseUrlObj.hostname;

    // Launch browser
    browser = await getBrowser({ headless: true });
    page = await getPage(browser);

    // Navigate to page
    await page.goto(baseUrl, {
      waitUntil: 'networkidle',
      timeout: opts.timeout
    });

    // Wait for SPA hydration. A timeout here is expected on pages without
    // navigation links, so the rejection is deliberately ignored.
    // NOTE(review): `visible` and `networkidle` look like option spellings
    // from two different browser drivers — confirm against utils/browser.js.
    await page.waitForSelector('nav a, header a, [role="navigation"] a', {
      visible: true,
      timeout: 5000
    }).catch(() => {});

    await waitForDomStable(page, 500, 5000);

    // Dismiss cookie banner if present
    await dismissCookieBanner(page);

    // Wait a bit more for any dynamic content
    await new Promise(r => setTimeout(r, 1000));

    // =========================================
    // SPA/Framework Detection (v1.3)
    // =========================================
    let frameworkInfo = null;
    let frameworkRoutes = [];
    let stateSnapshot = null;

    if (!opts.noSpaDetect) {
      if (opts.framework) {
        // User forced specific framework - validate it. An invalid name
        // leaves frameworkInfo null (no auto-detection fallback).
        const validatedFramework = validateFramework(opts.framework);
        if (validatedFramework) {
          frameworkInfo = {
            framework: validatedFramework,
            version: null,
            routingType: 'spa',
            confidence: 'forced',
            signals: ['user-specified']
          };
        }
      } else {
        // Auto-detect framework
        try {
          frameworkInfo = await detectFramework(page);
        } catch (e) {
          logWarning(`Framework detection failed: ${e.message}`);
          frameworkInfo = null;
        }
      }

      // Framework-specific route discovery
      if (frameworkInfo?.framework && opts.spaMode) {
        try {
          const discoveryResult = await discoverFrameworkRoutes(page, baseUrl, frameworkInfo);
          frameworkRoutes = discoveryResult.routes || [];
        } catch (e) {
          logWarning(`Route discovery failed for ${frameworkInfo.framework}: ${e.message}`);
          frameworkRoutes = [];
        }
      }

      // Capture app state (optional)
      if (opts.captureState && frameworkInfo) {
        try {
          stateSnapshot = await captureAppState(page, frameworkInfo);
        } catch (e) {
          logWarning(`State capture failed: ${e.message}`);
          stateSnapshot = null;
        }
      }
    }

    // =========================================
    // Traditional Link Scraping (existing logic)
    // =========================================
    const selectors = opts.selectors || NAV_SELECTORS;
    const selectorString = selectors.join(', ');

    // Best effort: a failed extraction degrades to "no links" but is now
    // reported instead of being swallowed silently.
    const rawLinks = await page.$$eval(selectorString, (elements) => {
      return elements.map(el => ({
        href: el.href,
        text: el.textContent?.trim() || '',
        tagName: el.tagName
      }));
    }).catch((e) => {
      logWarning(`Link extraction failed: ${e.message}`);
      return [];
    });

    const linkScrapedPages = collectLinkScrapedPages(rawLinks, baseUrl, baseDomain, opts);

    // =========================================
    // Merge Routes (v1.3)
    // =========================================
    let pages;
    if (frameworkRoutes.length > 0) {
      // Merge framework routes with link-scraped pages
      pages = mergeRoutes(frameworkRoutes, linkScrapedPages, baseDomain, baseUrl);
    } else {
      // No framework routes, use link-scraped pages only
      pages = linkScrapedPages.map(p => ({ ...p, source: 'link-scrape', dynamic: false }));
    }

    // Apply max pages limit to merged results
    if (pages.length > opts.maxPages) {
      // Framework routes precede link-scraped pages after merging, so the
      // homepage can sit beyond the cut-off. Move it to the front first so
      // the limit never drops it.
      const homeIndex = pages.findIndex(p => p.path === '/');
      if (opts.maxPages > 0 && homeIndex >= opts.maxPages) {
        const [home] = pages.splice(homeIndex, 1);
        pages.unshift(home);
      }
      pages = pages.slice(0, opts.maxPages);
    }

    // Sort by path depth (shallow first), homepage always on top
    pages.sort((a, b) => {
      if (a.path === '/') return -1;
      if (b.path === '/') return 1;
      const depthA = (a.path.match(/\//g) || []).length;
      const depthB = (b.path.match(/\//g) || []).length;
      return depthA - depthB;
    });

    const duration = Date.now() - startTime;

    return {
      success: true,
      baseUrl: baseUrlObj.origin,
      baseDomain,
      // SPA/Framework data (v1.3)
      framework: frameworkInfo,
      stateSnapshot: stateSnapshot,
      // Page discovery results
      pages,
      stats: {
        totalLinksFound: rawLinks.length,
        frameworkRoutesFound: frameworkRoutes.length,
        pagesDiscovered: pages.length,
        durationMs: duration
      }
    };
  } catch (error) {
    // Normalize baseUrl in error case for consistency
    let normalizedBaseUrl = baseUrl;
    let errorBaseDomain = '';
    try {
      const urlObj = new URL(baseUrl);
      normalizedBaseUrl = urlObj.origin;
      errorBaseDomain = urlObj.hostname;
    } catch {
      // Keep original baseUrl if parsing fails
    }

    // Fallback result: the homepage alone, flagged as a failure
    return {
      success: false,
      baseUrl: normalizedBaseUrl,
      baseDomain: errorBaseDomain,
      framework: null,
      stateSnapshot: null,
      pages: [{
        path: '/',
        name: 'Home',
        url: normalizeUrl(baseUrl, '/') || baseUrl,
        source: 'fallback',
        dynamic: false
      }],
      error: error.message,
      stats: {
        totalLinksFound: 0,
        frameworkRoutesFound: 0,
        pagesDiscovered: 1,
        durationMs: Date.now() - startTime
      }
    };
  } finally {
    // Release the browser on both the success and the failure path
    if (browser) {
      await disconnectBrowser();
    }
  }
}
518
-
519
// CLI support - use exact file match to avoid triggering when imported
import { fileURLToPath } from 'url';
const __filename = fileURLToPath(import.meta.url);
const isMainModule = process.argv[1] === __filename;

// Usage: node discover-pages.js <url> [maxPages]
// Prints the discovery result as JSON on stdout and exits 0 on success,
// 1 on failure or missing arguments.
if (isMainModule) {
  const url = process.argv[2];

  // Parse with an explicit radix; anything that is not a positive integer
  // (missing, non-numeric, zero, negative) falls back to the default of 10.
  const requestedMax = Number.parseInt(process.argv[3], 10);
  const maxPages = Number.isInteger(requestedMax) && requestedMax > 0 ? requestedMax : 10;

  if (!url) {
    console.error('Usage: node discover-pages.js <url> [maxPages]');
    process.exit(1);
  }

  discoverPages(url, { maxPages })
    .then(result => {
      console.log(JSON.stringify(result, null, 2));
      process.exit(result.success ? 0 : 1);
    })
    .catch(err => {
      console.error(JSON.stringify({ success: false, error: err.message }));
      process.exit(1);
    });
}