design-clone 1.2.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. package/README.md +26 -12
  2. package/bin/commands/clone-site.js +75 -10
  3. package/bin/commands/init.js +33 -1
  4. package/bin/commands/verify.js +5 -3
  5. package/bin/utils/validate.js +24 -8
  6. package/docs/cli-reference.md +200 -2
  7. package/docs/codebase-summary.md +309 -0
  8. package/docs/design-clone-architecture.md +259 -42
  9. package/docs/pixel-perfect.md +35 -4
  10. package/docs/project-roadmap.md +382 -0
  11. package/docs/troubleshooting.md +5 -4
  12. package/package.json +10 -8
  13. package/src/ai/__pycache__/analyze-structure.cpython-313.pyc +0 -0
  14. package/src/ai/__pycache__/extract-design-tokens.cpython-313.pyc +0 -0
  15. package/src/ai/analyze-structure.py +73 -3
  16. package/src/ai/extract-design-tokens.py +356 -13
  17. package/src/ai/prompts/__pycache__/design_tokens.cpython-313.pyc +0 -0
  18. package/src/ai/prompts/__pycache__/structure_analysis.cpython-313.pyc +0 -0
  19. package/src/ai/prompts/__pycache__/ux_audit.cpython-313.pyc +0 -0
  20. package/src/ai/prompts/design_tokens.py +133 -0
  21. package/src/ai/prompts/structure_analysis.py +329 -10
  22. package/src/ai/prompts/ux_audit.py +198 -0
  23. package/src/ai/ux-audit.js +596 -0
  24. package/src/core/app-state-snapshot.js +511 -0
  25. package/src/core/content-counter.js +342 -0
  26. package/src/core/cookie-handler.js +1 -1
  27. package/src/core/css-extractor.js +4 -4
  28. package/src/core/dimension-extractor.js +93 -21
  29. package/src/core/dimension-output.js +103 -6
  30. package/src/core/discover-pages.js +242 -14
  31. package/src/core/dom-tree-analyzer.js +298 -0
  32. package/src/core/extract-assets.js +1 -1
  33. package/src/core/framework-detector.js +538 -0
  34. package/src/core/html-extractor.js +45 -4
  35. package/src/core/lazy-loader.js +7 -7
  36. package/src/core/multi-page-screenshot.js +9 -6
  37. package/src/core/page-readiness.js +8 -8
  38. package/src/core/screenshot.js +138 -9
  39. package/src/core/section-cropper.js +209 -0
  40. package/src/core/section-detector.js +386 -0
  41. package/src/core/semantic-enhancer.js +492 -0
  42. package/src/core/state-capture.js +18 -22
  43. package/src/core/tests/test-section-cropper.js +177 -0
  44. package/src/core/tests/test-section-detector.js +55 -0
  45. package/src/core/video-capture.js +152 -146
  46. package/src/route-discoverers/angular-discoverer.js +157 -0
  47. package/src/route-discoverers/astro-discoverer.js +123 -0
  48. package/src/route-discoverers/base-discoverer.js +242 -0
  49. package/src/route-discoverers/index.js +106 -0
  50. package/src/route-discoverers/next-discoverer.js +130 -0
  51. package/src/route-discoverers/nuxt-discoverer.js +138 -0
  52. package/src/route-discoverers/react-discoverer.js +139 -0
  53. package/src/route-discoverers/svelte-discoverer.js +109 -0
  54. package/src/route-discoverers/universal-discoverer.js +227 -0
  55. package/src/route-discoverers/vue-discoverer.js +118 -0
  56. package/src/utils/__init__.py +1 -1
  57. package/src/utils/__pycache__/__init__.cpython-313.pyc +0 -0
  58. package/src/utils/browser.js +11 -37
  59. package/src/utils/playwright.js +213 -0
  60. package/src/verification/generate-audit-report.js +398 -0
  61. package/src/verification/verify-footer.js +493 -0
  62. package/src/verification/verify-header.js +486 -0
  63. package/src/verification/verify-layout.js +2 -2
  64. package/src/verification/verify-menu.js +4 -20
  65. package/src/verification/verify-slider.js +533 -0
  66. package/src/utils/puppeteer.js +0 -281
@@ -0,0 +1,538 @@
1
+ /**
2
+ * Framework Detector Module
3
+ *
4
+ * Detects JavaScript frameworks used on a page by checking:
5
+ * - Global objects (window.__NEXT_DATA__, etc.)
6
+ * - DOM attributes ([data-reactroot], [ng-version], etc.)
7
+ * - Script URL patterns (/_next/, /_nuxt/, etc.)
8
+ *
9
+ * Returns framework info with confidence scoring.
10
+ *
11
+ * Usage:
12
+ * import { detectFramework } from './framework-detector.js';
13
+ * const info = await detectFramework(page);
14
+ * // { framework: 'next', version: '14.0.0', confidence: 'high', ... }
15
+ */
16
+
17
+ /**
18
+ * @typedef {Object} FrameworkInfo
19
+ * @property {string|null} framework - 'next'|'nuxt'|'vue'|'react'|'angular'|'svelte'|'astro'|null
20
+ * @property {string|null} version - Framework version if detectable
21
+ * @property {'spa'|'ssr'|'ssg'|'unknown'} routingType - Routing/rendering strategy
22
+ * @property {'high'|'medium'|'low'} confidence - Detection confidence
23
+ * @property {string[]} signals - Matched detection signals
24
+ */
25
+
// Confidence thresholds: minimum summed signal weight required for each tier.
const CONFIDENCE_HIGH_THRESHOLD = 5;
const CONFIDENCE_MEDIUM_THRESHOLD = 3;

// Descriptor builders for the signal table below. Every descriptor is a plain
// object carrying: type, a type-specific lookup key (path/selector/pattern),
// weight (1-3) and signal (the label reported back to the caller).

/** Global-object signal; the label is the global's own name. */
const globalSignal = (name, weight) => ({
  type: 'global',
  path: [name],
  weight,
  signal: name
});

/** DOM signal; the label defaults to the selector when none is given. */
const domSignal = (selector, weight, signal = selector) => ({
  type: 'dom',
  selector,
  weight,
  signal
});

/** Script-URL signal; the label is the pattern prefixed with "script:". */
const scriptSignal = (pattern, weight) => ({
  type: 'script',
  pattern,
  weight,
  signal: `script:${pattern}`
});

/** Meta-tag signal; the label is "meta:<name>:<pattern>". */
const metaSignal = (name, pattern, weight) => ({
  type: 'meta',
  name,
  pattern,
  weight,
  signal: `meta:${name}:${pattern}`
});

/**
 * Detection signals for each framework, keyed by framework name.
 * Each entry is a list of descriptors produced by the builders above.
 */
const DETECTION_SIGNALS = {
  next: [
    globalSignal('__NEXT_DATA__', 3),
    globalSignal('__NEXT_LOADED_PAGES__', 2),
    globalSignal('__BUILD_MANIFEST', 2),
    domSignal('#__next', 2),
    scriptSignal('/_next/', 1)
  ],
  nuxt: [
    globalSignal('__NUXT__', 3),
    globalSignal('$nuxt', 2),
    globalSignal('__NUXT_PATHS__', 2),
    domSignal('#__nuxt', 2),
    domSignal('#__layout', 1),
    scriptSignal('/_nuxt/', 1)
  ],
  vue: [
    globalSignal('__VUE__', 3),
    globalSignal('Vue', 2),
    globalSignal('__VUE_DEVTOOLS_GLOBAL_HOOK__', 1),
    domSignal('[data-v-]', 2, 'data-v-*'),
    domSignal('[data-server-rendered]', 2, 'data-server-rendered')
  ],
  react: [
    globalSignal('__REACT_DEVTOOLS_GLOBAL_HOOK__', 1),
    domSignal('[data-reactroot]', 3, 'data-reactroot'),
    domSignal('[data-reactid]', 2, 'data-reactid'),
    domSignal('#root[data-reactroot], #root > div', 1, '#root')
  ],
  angular: [
    globalSignal('ng', 2),
    globalSignal('getAllAngularRootElements', 3),
    domSignal('[ng-version]', 3, 'ng-version'),
    domSignal('app-root', 2),
    domSignal('[_nghost-]', 2, '_nghost-*'),
    domSignal('[ng-app]', 2, 'ng-app')
  ],
  svelte: [
    globalSignal('__svelte__', 2),
    globalSignal('__sveltekit', 3),
    domSignal('[data-sveltekit-preload-data]', 3, 'data-sveltekit-preload-data'),
    domSignal('[data-sveltekit-reload]', 2, 'data-sveltekit-reload'),
    scriptSignal('/@svelte/', 1)
  ],
  astro: [
    domSignal('astro-island', 3),
    domSignal('[data-astro-cid-]', 2, 'data-astro-cid-*'),
    domSignal('[data-astro-source-file]', 2, 'data-astro-source-file'),
    metaSignal('generator', 'Astro', 3),
    scriptSignal('/@astrojs/', 1)
  ]
};
86
+
/**
 * Map an accumulated signal weight onto a confidence tier.
 * @param {number} totalWeight - Sum of matched signal weights
 * @returns {'high'|'medium'|'low'} 'high' at or above CONFIDENCE_HIGH_THRESHOLD,
 *   'medium' at or above CONFIDENCE_MEDIUM_THRESHOLD, otherwise 'low'
 */
function calculateConfidence(totalWeight) {
  if (totalWeight >= CONFIDENCE_HIGH_THRESHOLD) {
    return 'high';
  }
  return totalWeight >= CONFIDENCE_MEDIUM_THRESHOLD ? 'medium' : 'low';
}
97
+
/**
 * Walk a property path on an object without using eval().
 * @param {Object} obj - Object to traverse
 * @param {string[]} path - Property names, outermost first
 * @returns {*} The value at the end of the path, or undefined as soon as a
 *   null/undefined link is reached before the path is exhausted
 */
function safeGet(obj, path) {
  let cursor = obj;
  for (const segment of path) {
    const isMissing = cursor === undefined || cursor === null;
    if (isMissing) {
      return undefined;
    }
    cursor = cursor[segment];
  }
  return cursor;
}
112
+
/**
 * Report whether an element carries any attribute whose name begins with
 * the given prefix.
 * @param {Element} el - DOM element (its `attributes` collection is read)
 * @param {string} prefix - Attribute-name prefix to look for
 * @returns {boolean} true if at least one attribute name starts with prefix
 */
function hasAttributeWithPrefix(el, prefix) {
  for (const attribute of Array.from(el.attributes)) {
    if (attribute.name.startsWith(prefix)) {
      return true;
    }
  }
  return false;
}
122
+
/**
 * Detection logic that runs in browser context via page.evaluate().
 *
 * The function is written to be self-contained: it only uses its argument,
 * its own nested helpers and the page globals `window` / `document`, so it
 * can be serialized and executed inside the page.
 *
 * @param {Object} signals - DETECTION_SIGNALS-shaped object
 *   ({ [framework]: Array<{type, weight, signal, path?|selector?|pattern?|name?}> })
 * @returns {Object} Per-framework result: { weight, signals, version }
 */
function browserDetectionLogic(signals) {
  // Helper: safe property access without eval
  function safeGet(obj, path) {
    let current = obj;
    for (const key of path) {
      if (current === null || current === undefined) return undefined;
      current = current[key];
    }
    return current;
  }

  // Attribute names seen anywhere in the document. Collected on first use and
  // reused, so the three prefix checks share a single DOM walk. The cache is
  // only assigned after a complete walk; if the walk throws, the next call
  // retries from scratch.
  let attributeNames = null;

  // Helper: check if any element has an attribute whose name starts with prefix
  function hasAttrPrefix(prefix) {
    if (attributeNames === null) {
      const collected = new Set();
      for (const el of Array.from(document.querySelectorAll('*'))) {
        for (const attr of Array.from(el.attributes)) {
          collected.add(attr.name);
        }
      }
      attributeNames = collected;
    }
    for (const name of attributeNames) {
      if (name.startsWith(prefix)) return true;
    }
    return false;
  }

  // Selector tokens that cannot be expressed as a CSS attribute selector
  // (the attribute name itself is only partially known) and the attribute
  // prefix each one stands for. The first matching token wins.
  const prefixSelectors = [
    ['[data-v-]', 'data-v-'],
    ['[data-astro-cid-]', 'data-astro-cid-'],
    ['[_nghost-]', '_nghost-']
  ];

  // Evaluate a single signal descriptor; always returns a strict boolean.
  function matchesCheck(check) {
    switch (check.type) {
      case 'global':
        // Safe property traversal instead of eval()
        return safeGet(window, check.path) !== undefined;

      case 'dom': {
        const prefixed = prefixSelectors.find(([token]) => check.selector.includes(token));
        return prefixed
          ? hasAttrPrefix(prefixed[1])
          : Boolean(document.querySelector(check.selector));
      }

      case 'script': {
        // Match when any external script URL contains the pattern
        const scripts = Array.from(document.querySelectorAll('script[src]'));
        return scripts.some(s => s.src.includes(check.pattern));
      }

      case 'meta': {
        // Match when the named meta tag's content contains the pattern
        const meta = document.querySelector(`meta[name="${check.name}"]`);
        return Boolean(meta && meta.content && meta.content.includes(check.pattern));
      }

      default:
        return false;
    }
  }

  // Best-effort version lookup for a framework; null when nothing is exposed.
  function extractVersion(framework) {
    switch (framework) {
      case 'next': {
        const nextData = safeGet(window, ['__NEXT_DATA__']);
        if (!nextData) return null;
        let version = nextData.nextExport ? 'export' : (nextData.buildId || null);
        // A runtime config version, when present, takes precedence
        if (nextData.runtimeConfig?.version) {
          version = nextData.runtimeConfig.version;
        }
        return version;
      }
      case 'nuxt':
        return safeGet(window, ['__NUXT__', 'config', 'app', 'buildId']) || null;
      case 'vue':
        return safeGet(window, ['Vue', 'version']) ||
          safeGet(window, ['__VUE__', 'version']) || null;
      case 'react':
        return safeGet(window, ['React', 'version']) || null;
      case 'angular': {
        const ngVersion = document.querySelector('[ng-version]');
        return ngVersion ? ngVersion.getAttribute('ng-version') : null;
      }
      case 'astro': {
        const astroMeta = document.querySelector('meta[name="generator"]');
        if (astroMeta && astroMeta.content.includes('Astro')) {
          const match = astroMeta.content.match(/Astro v?([\d.]+)/);
          if (match) return match[1];
        }
        return null;
      }
      // 'svelte' has no version lookup here and falls through to null
      default:
        return null;
    }
  }

  const results = {};

  for (const [framework, checks] of Object.entries(signals)) {
    let totalWeight = 0;
    const matchedSignals = [];

    for (const check of checks) {
      let matched = false;
      try {
        matched = matchesCheck(check);
      } catch {
        // A failing probe (e.g. an invalid selector) counts as "not matched"
        matched = false;
      }

      if (matched) {
        totalWeight += check.weight;
        matchedSignals.push(check.signal);
      }
    }

    // Only look for a version when at least one signal matched
    let version = null;
    if (totalWeight > 0) {
      try {
        version = extractVersion(framework);
      } catch {
        // Version is optional; ignore extraction errors
      }
    }

    results[framework] = {
      weight: totalWeight,
      signals: matchedSignals,
      version
    };
  }

  return results;
}
252
+
/**
 * Infer routing type based on the detected framework and page-level markers.
 *
 * The heuristics run inside the page via page.evaluate(); any error raised
 * there is reported as 'unknown'.
 *
 * @param {import('playwright').Page} page - Playwright page object
 * @param {string} framework - Detected framework name (falsy → 'unknown')
 * @returns {Promise<'spa'|'ssr'|'ssg'|'unknown'>} Routing type
 */
async function inferRoutingType(page, framework) {
  if (!framework) return 'unknown';

  return await page.evaluate((fw) => {
    try {
      switch (fw) {
        case 'next':
          // A static export is the only case told apart from SSR; every
          // other Next.js page (with or without __NEXT_DATA__) is SSR.
          return window.__NEXT_DATA__?.nextExport ? 'ssg' : 'ssr';

        case 'nuxt':
          // Only an explicit serverRendered === false marks an SPA build
          return window.__NUXT__?.serverRendered === false ? 'spa' : 'ssr';

        case 'vue':
          if (window.$nuxt) return 'ssr'; // Actually Nuxt
          if (document.querySelector('[data-server-rendered="true"]')) return 'ssr';
          return 'spa';

        case 'react':
          if (window.__NEXT_DATA__) return 'ssr';
          if (window.___gatsby) return 'ssg';
          return 'spa';

        case 'angular':
          return document.querySelector('[ng-server-context]') ? 'ssr' : 'spa';

        case 'svelte':
          return window.__sveltekit ? 'ssr' : 'spa';

        case 'astro':
          return 'ssg';

        default:
          return 'unknown';
      }
    } catch {
      return 'unknown';
    }
  }, framework);
}
321
+
/**
 * Detect framework used on the current page.
 *
 * Runs browserDetectionLogic inside the page with DETECTION_SIGNALS as its
 * argument, picks the framework with the highest accumulated weight, then
 * derives a confidence tier and a routing type for it.
 *
 * @param {import('playwright').Page} page - Playwright page object
 * @returns {Promise<FrameworkInfo>} Framework detection result; `framework`
 *   is null (and `signals` empty) when no signal matched
 */
export async function detectFramework(page) {
  // browserDetectionLogic is self-contained, so it is handed to
  // page.evaluate() directly instead of keeping a second inline copy of the
  // same logic in this function.
  const results = await page.evaluate(browserDetectionLogic, DETECTION_SIGNALS);

  // Priority order: SSR frameworks first, then base frameworks
  const priorityOrder = ['next', 'nuxt', 'astro', 'svelte', 'angular', 'vue', 'react'];

  let bestFramework = null;
  let bestWeight = 0;
  let bestSignals = [];
  let bestVersion = null;

  for (const framework of priorityOrder) {
    const result = results?.[framework];
    // Skip frameworks that have no entry in the results instead of throwing
    if (!result) continue;

    // Strict ">" keeps the earlier (higher-priority) framework on a tie
    if (result.weight > bestWeight) {
      bestWeight = result.weight;
      bestFramework = framework;
      bestSignals = result.signals;
      bestVersion = result.version;
    }
  }

  // Calculate confidence
  const confidence = bestWeight > 0 ? calculateConfidence(bestWeight) : 'low';

  // Infer routing type
  const routingType = await inferRoutingType(page, bestFramework);

  return {
    framework: bestFramework,
    version: bestVersion,
    routingType,
    confidence,
    signals: bestSignals
  };
}
481
+
/**
 * Render a detection result as a single line for CLI output.
 * @param {FrameworkInfo} info - Detection result
 * @returns {string} " | "-separated summary, or a fixed message when no
 *   framework was detected; the version segment is omitted when falsy
 */
export function formatDetectionResult(info) {
  if (!info.framework) {
    return 'No framework detected (static HTML or unknown framework)';
  }

  const signalList = info.signals.join(', ');
  const segments = [`Framework: ${info.framework}`];

  if (info.version) {
    segments.push(`Version: ${info.version}`);
  }

  segments.push(
    `Routing: ${info.routingType}`,
    `Confidence: ${info.confidence}`,
    `Signals: ${signalList}`
  );

  return segments.join(' | ');
}
502
+
// CLI support - check if this is the main module being executed directly
// Use import.meta.url to compare with process.argv[1]
import { fileURLToPath } from 'url';
const __filename = fileURLToPath(import.meta.url);
const isMainModule = process.argv[1] === __filename;

if (isMainModule) {
  // Validate arguments first so a usage error does not need the browser helper
  const url = process.argv[2];
  if (!url) {
    console.error('Usage: node framework-detector.js <url>');
    process.exit(1);
  }

  const { getBrowser, getPage, disconnectBrowser } = await import('../utils/browser.js');

  let exitCode = 0;
  try {
    const browser = await getBrowser({ headless: true });
    const page = await getPage(browser);

    await page.goto(url, { waitUntil: 'networkidle', timeout: 30000 });

    // Wait for hydration
    await new Promise(r => setTimeout(r, 2000));

    const result = await detectFramework(page);

    // Machine-readable result on stdout, human-readable summary on stderr
    console.log(JSON.stringify(result, null, 2));
    console.error('\n' + formatDetectionResult(result));
  } catch (error) {
    console.error(JSON.stringify({ error: error.message }));
    exitCode = 1;
  } finally {
    // Release the browser on the failure path too, not only after success.
    // NOTE(review): assumes disconnectBrowser() tolerates being called when
    // getBrowser() itself failed - confirm against ../utils/browser.js.
    try {
      await disconnectBrowser();
    } catch (cleanupError) {
      console.error(JSON.stringify({ error: cleanupError.message }));
      exitCode = 1;
    }
  }

  // Exit only after cleanup has finished; process.exit() inside the try
  // block would terminate before the finally block could run.
  process.exit(exitCode);
}
@@ -3,9 +3,11 @@
3
3
  *
4
4
  * Extract and clean HTML from page, removing scripts,
5
5
  * event handlers, and framework-specific attributes.
6
+ * Optionally enhances with WordPress-compatible semantic structure.
6
7
  */
7
8
 
8
9
  import { LAYOUT_PROPERTIES } from './css-extractor.js';
10
+ import { enhanceSemanticHTMLInPage } from './semantic-enhancer.js';
9
11
 
10
12
  // Size limits
11
13
  export const MAX_HTML_SIZE = 10 * 1024 * 1024; // 10MB limit
@@ -34,12 +36,12 @@ export const CRITICAL_POSITION = ['absolute', 'fixed'];
34
36
 
35
37
  /**
36
38
  * Extract and clean HTML from page
37
- * @param {Page} page - Puppeteer page
39
+ * @param {Page} page - Playwright page
38
40
  * @param {Array} frameworkPatterns - Patterns to remove
39
41
  * @returns {Promise<{html: string, warnings: string[], elementCount: number}>}
40
42
  */
41
43
  export async function extractCleanHtml(page, frameworkPatterns = JS_FRAMEWORK_PATTERNS) {
42
- return await page.evaluate((patterns, inlineProps, criticalDisplay, criticalPosition) => {
44
+ return await page.evaluate(({ patterns, inlineProps, criticalDisplay, criticalPosition }) => {
43
45
  const warnings = [];
44
46
 
45
47
  // Check DOM size
@@ -166,6 +168,45 @@ export async function extractCleanHtml(page, frameworkPatterns = JS_FRAMEWORK_PA
166
168
  doc.innerHTML + '\n</html>';
167
169
 
168
170
  return { html, warnings, elementCount, inlinedCount };
169
- }, frameworkPatterns.map(r => ({ source: r.source, flags: r.flags })),
170
- INLINE_LAYOUT_PROPS, CRITICAL_DISPLAY, CRITICAL_POSITION);
171
+ }, {
172
+ patterns: frameworkPatterns.map(r => ({ source: r.source, flags: r.flags })),
173
+ inlineProps: INLINE_LAYOUT_PROPS,
174
+ criticalDisplay: CRITICAL_DISPLAY,
175
+ criticalPosition: CRITICAL_POSITION
176
+ });
177
+ }
178
+
/**
 * Extract clean HTML and, unless disabled, run semantic enhancement on it.
 *
 * Enhancement is best-effort: if it throws, the clean (un-enhanced) result is
 * returned with a warning appended instead of the error propagating.
 *
 * @param {Page} page - Playwright page
 * @param {Object} options - Configuration options
 * @param {boolean} [options.enhanceSemantic=true] - Add WordPress semantic IDs/classes/roles
 * @param {Array} [options.frameworkPatterns] - Custom framework patterns to remove
 * @returns {Promise<{html: string, warnings: string[], elementCount: number, semanticStats?: Object}>}
 */
export async function extractAndEnhanceHtml(page, options = {}) {
  const { enhanceSemantic = true, frameworkPatterns = JS_FRAMEWORK_PATTERNS } = options;

  // Step 1: plain extraction; this is also the fallback result
  const cleaned = await extractCleanHtml(page, frameworkPatterns);

  if (!enhanceSemantic) {
    return cleaned;
  }

  // Step 2: semantic enhancement layered on top of the clean HTML
  try {
    const enhanced = await enhanceSemanticHTMLInPage(page, cleaned.html);
    return {
      ...cleaned,
      html: enhanced.html,
      semanticStats: enhanced.stats
    };
  } catch (err) {
    cleaned.warnings.push(`Semantic enhancement failed: ${err.message}`);
    return cleaned;
  }
}
@@ -14,7 +14,7 @@ export const IMAGE_LOAD_TIMEOUT = 20000;
14
14
  * - Sets loading="eager" on all images
15
15
  * - Copies data-src to src if exists
16
16
  * - Triggers IntersectionObserver by scrolling
17
- * @param {Page} page - Puppeteer page
17
+ * @param {Page} page - Playwright page
18
18
  */
19
19
  export async function forceLazyImages(page) {
20
20
  return await page.evaluate(async () => {
@@ -51,7 +51,7 @@ export async function forceLazyImages(page) {
51
51
 
52
52
  /**
53
53
  * Force all hidden animated elements to be visible
54
- * @param {Page} page - Puppeteer page
54
+ * @param {Page} page - Playwright page
55
55
  */
56
56
  export async function forceAnimatedElementsVisible(page) {
57
57
  return await page.evaluate(() => {
@@ -78,12 +78,12 @@ export async function forceAnimatedElementsVisible(page) {
78
78
 
79
79
  /**
80
80
  * Trigger lazy loading by scrolling through entire page
81
- * @param {Page} page - Puppeteer page
81
+ * @param {Page} page - Playwright page
82
82
  * @param {number} maxIterations - Max scroll iterations
83
83
  * @param {number} scrollDelay - Pause time between scrolls
84
84
  */
85
85
  export async function triggerLazyLoad(page, maxIterations = 20, scrollDelay = 1500) {
86
- return await page.evaluate(async (maxIter, pauseMs) => {
86
+ return await page.evaluate(async ({ maxIter, pauseMs }) => {
87
87
  return new Promise(async (resolve) => {
88
88
  const viewportHeight = window.innerHeight;
89
89
  const totalHeight = document.body.scrollHeight;
@@ -128,12 +128,12 @@ export async function triggerLazyLoad(page, maxIterations = 20, scrollDelay = 15
128
128
  stableAt: iterations
129
129
  });
130
130
  });
131
- }, maxIterations, scrollDelay);
131
+ }, { maxIter: maxIterations, pauseMs: scrollDelay });
132
132
  }
133
133
 
134
134
  /**
135
135
  * Wait for all images to finish loading
136
- * @param {Page} page - Puppeteer page
136
+ * @param {Page} page - Playwright page
137
137
  * @param {number} timeout - Max wait time
138
138
  */
139
139
  export async function waitForAllImages(page, timeout = IMAGE_LOAD_TIMEOUT) {
@@ -178,7 +178,7 @@ export async function waitForAllImages(page, timeout = IMAGE_LOAD_TIMEOUT) {
178
178
  }, timeout);
179
179
 
180
180
  try {
181
- await page.waitForNetworkIdle({ timeout: Math.min(timeout, 10000) });
181
+ await page.waitForLoadState('networkidle', { timeout: Math.min(timeout, 10000) });
182
182
  } catch {
183
183
  // Network didn't become idle, continue anyway
184
184
  }