npm - design-clone - Versions diffs - 2.1.0 → 2.3.0 - Mend

design-clone 2.1.0 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (177) hide show

package/README.md +13 -34
package/SKILL.md +69 -45
package/bin/cli.js +22 -4
package/bin/commands/clone-site.js +31 -171
package/bin/commands/help.js +19 -6
package/bin/commands/init.js +9 -86
package/bin/commands/uninstall.js +105 -0
package/bin/commands/update.js +70 -0
package/bin/commands/verify.js +7 -14
package/bin/utils/paths.js +28 -0
package/bin/utils/validate.js +2 -22
package/bin/utils/version.js +23 -0
package/docs/code-standards.md +789 -0
package/docs/codebase-summary.md +533 -286
package/docs/index.md +74 -0
package/docs/project-overview-pdr.md +797 -0
package/docs/system-architecture.md +718 -0
package/package.json +14 -17
package/src/ai/prompts/design-tokens/basic.md +80 -0
package/src/ai/prompts/design-tokens/section-with-css.md +41 -0
package/src/ai/prompts/design-tokens/section.md +48 -0
package/src/ai/prompts/design-tokens/with-css.md +87 -0
package/src/ai/prompts/structure-analysis/basic.md +55 -0
package/src/ai/prompts/structure-analysis/with-context.md +59 -0
package/src/ai/prompts/structure-analysis/with-dimensions.md +63 -0
package/src/ai/prompts/structure-analysis/with-hierarchy.md +73 -0
package/src/ai/prompts/ux-audit/aggregation.md +42 -0
package/src/ai/prompts/ux-audit/desktop.md +92 -0
package/src/ai/prompts/ux-audit/mobile.md +93 -0
package/src/ai/prompts/ux-audit/tablet.md +92 -0
package/src/core/animation/animation-extractor-ast.js +183 -0
package/src/core/animation/animation-extractor-output.js +152 -0
package/src/core/animation/animation-extractor.js +178 -0
package/src/core/animation/state-capture-detection.js +200 -0
package/src/core/animation/state-capture.js +193 -0
package/src/core/capture/browser-context-pool.js +96 -0
package/src/core/capture/multi-page-screenshot-page.js +110 -0
package/src/core/capture/multi-page-screenshot.js +208 -0
package/src/core/capture/screenshot-extraction.js +186 -0
package/src/core/capture/screenshot-helpers.js +175 -0
package/src/core/capture/screenshot-orchestrator.js +174 -0
package/src/core/capture/screenshot-viewport.js +93 -0
package/src/core/capture/screenshot.js +192 -0
package/src/core/content/content-counter-dom.js +191 -0
package/src/core/content/content-counter.js +76 -0
package/src/core/css/breakpoint-detector.js +66 -0
package/src/core/css/chromium-defaults.json +23 -0
package/src/core/css/computed-style-extractor.js +102 -0
package/src/core/css/css-chunker.js +103 -0
package/src/core/css/filter-css-dead-code.js +120 -0
package/src/core/css/filter-css-html-analyzer.js +110 -0
package/src/core/css/filter-css-selector-matcher.js +172 -0
package/src/core/css/filter-css.js +206 -0
package/src/core/css/merge-css-atrule-processor.js +158 -0
package/src/core/css/merge-css-file-io.js +68 -0
package/src/core/css/merge-css.js +148 -0
package/src/core/detection/framework-detector-routing.js +68 -0
package/src/core/detection/framework-detector-signals.js +65 -0
package/src/core/detection/framework-detector.js +198 -0
package/src/core/dimension/dimension-extractor-card-detector.js +82 -0
package/src/core/dimension/dimension-extractor.js +317 -0
package/src/core/dimension/dimension-output-ai-summary.js +111 -0
package/src/core/dimension/dimension-output.js +173 -0
package/src/core/dimension/dom-tree-analyzer-tree-builders.js +95 -0
package/src/core/dimension/dom-tree-analyzer.js +191 -0
package/src/core/discovery/app-state-snapshot-capture.js +195 -0
package/src/core/discovery/app-state-snapshot-utils.js +178 -0
package/src/core/discovery/app-state-snapshot.js +131 -0
package/src/core/discovery/discover-pages-routes.js +84 -0
package/src/core/discovery/discover-pages-utils.js +177 -0
package/src/core/discovery/discover-pages.js +191 -0
package/src/core/html/html-extractor-inline-styler.js +70 -0
package/src/core/html/html-extractor.js +147 -0
package/src/core/html/semantic-enhancer-mappings.js +200 -0
package/src/core/html/semantic-enhancer-page.js +148 -0
package/src/core/html/semantic-enhancer.js +135 -0
package/src/core/links/rewrite-links-css-rewriter.js +53 -0
package/src/core/links/rewrite-links.js +173 -0
package/src/core/media/asset-validator.js +118 -0
package/src/core/media/extract-assets-downloader.js +187 -0
package/src/core/media/extract-assets-page-scraper.js +115 -0
package/src/core/media/extract-assets.js +159 -0
package/src/core/media/video-capture-convert.js +200 -0
package/src/core/media/video-capture.js +201 -0
package/src/core/{lazy-loader.js → page-prep/lazy-loader.js} +37 -39
package/src/core/section/section-cropper-helpers.js +43 -0
package/src/core/{section-cropper.js → section/section-cropper.js} +11 -88
package/src/core/section/section-detector-strategies.js +139 -0
package/src/core/section/section-detector-utils.js +100 -0
package/src/core/section/section-detector.js +88 -0
package/src/core/tests/test-section-cropper.js +2 -2
package/src/core/tests/test-section-detector.js +2 -2
package/src/post-process/enhance-assets.js +29 -4
package/src/post-process/fetch-images-unsplash-client.js +123 -0
package/src/post-process/fetch-images.js +60 -263
package/src/post-process/inject-gosnap.js +88 -0
package/src/post-process/inject-icons-svg-replacer.js +76 -0
package/src/post-process/inject-icons.js +47 -200
package/src/route-discoverers/base-discoverer-utils.js +137 -0
package/src/route-discoverers/base-discoverer.js +29 -118
package/src/route-discoverers/index.js +1 -1
package/src/shared/config.js +38 -0
package/src/shared/error-codes.js +31 -0
package/src/shared/viewports.js +46 -0
package/src/utils/browser.js +0 -7
package/src/utils/helpers.js +4 -0
package/src/utils/log.js +12 -0
package/src/utils/playwright-loader.js +76 -0
package/src/utils/playwright.js +3 -69
package/src/utils/progress.js +32 -0
package/src/verification/generate-audit-report-css-fixes.js +52 -0
package/src/verification/generate-audit-report-sections.js +158 -0
package/src/verification/generate-audit-report.js +5 -281
package/src/verification/quality-scorer.js +92 -0
package/src/verification/verify-footer-checks.js +103 -0
package/src/verification/verify-footer-helpers.js +178 -0
package/src/verification/verify-footer.js +23 -381
package/src/verification/verify-header-checks.js +104 -0
package/src/verification/verify-header-helpers.js +156 -0
package/src/verification/verify-header.js +23 -365
package/src/verification/verify-layout-report.js +101 -0
package/src/verification/verify-layout.js +13 -259
package/src/verification/verify-menu-checks.js +104 -0
package/src/verification/verify-menu-helpers.js +112 -0
package/src/verification/verify-menu.js +17 -285
package/src/verification/verify-slider-checks.js +115 -0
package/src/verification/verify-slider-constants.js +65 -0
package/src/verification/verify-slider-helpers.js +164 -0
package/src/verification/verify-slider.js +23 -414
package/.env.example +0 -14
package/docs/basic-clone.md +0 -63
package/docs/cli-reference.md +0 -316
package/docs/design-clone-architecture.md +0 -492
package/docs/pixel-perfect.md +0 -117
package/docs/project-roadmap.md +0 -382
package/docs/troubleshooting.md +0 -170
package/requirements.txt +0 -5
package/src/ai/__pycache__/analyze-structure.cpython-313.pyc +0 -0
package/src/ai/__pycache__/extract-design-tokens.cpython-313.pyc +0 -0
package/src/ai/analyze-structure.py +0 -375
package/src/ai/extract-design-tokens.py +0 -782
package/src/ai/prompts/__init__.py +0 -2
package/src/ai/prompts/__pycache__/__init__.cpython-313.pyc +0 -0
package/src/ai/prompts/__pycache__/design_tokens.cpython-313.pyc +0 -0
package/src/ai/prompts/__pycache__/structure_analysis.cpython-313.pyc +0 -0
package/src/ai/prompts/__pycache__/ux_audit.cpython-313.pyc +0 -0
package/src/ai/prompts/design_tokens.py +0 -316
package/src/ai/prompts/structure_analysis.py +0 -592
package/src/ai/prompts/ux_audit.py +0 -198
package/src/ai/ux-audit.js +0 -596
package/src/core/animation-extractor.js +0 -526
package/src/core/app-state-snapshot.js +0 -511
package/src/core/content-counter.js +0 -342
package/src/core/design-tokens.js +0 -103
package/src/core/dimension-extractor.js +0 -438
package/src/core/dimension-output.js +0 -305
package/src/core/discover-pages.js +0 -542
package/src/core/dom-tree-analyzer.js +0 -298
package/src/core/extract-assets.js +0 -468
package/src/core/filter-css.js +0 -499
package/src/core/framework-detector.js +0 -538
package/src/core/html-extractor.js +0 -212
package/src/core/merge-css.js +0 -407
package/src/core/multi-page-screenshot.js +0 -380
package/src/core/rewrite-links.js +0 -226
package/src/core/screenshot.js +0 -701
package/src/core/section-detector.js +0 -386
package/src/core/semantic-enhancer.js +0 -492
package/src/core/state-capture.js +0 -598
package/src/core/video-capture.js +0 -546
package/src/utils/__init__.py +0 -16
package/src/utils/__pycache__/__init__.cpython-313.pyc +0 -0
package/src/utils/__pycache__/env.cpython-313.pyc +0 -0
package/src/utils/env.py +0 -134
/package/src/core/{css-extractor.js → css/css-extractor.js} +0 -0
/package/src/core/{cookie-handler.js → page-prep/cookie-handler.js} +0 -0
/package/src/core/{page-readiness.js → page-prep/page-readiness.js} +0 -0

package/src/core/discovery/discover-pages-routes.js ADDED Viewed

@@ -0,0 +1,84 @@
+/**
+ * Route merging logic for page discovery.
+ *
+ * Merges framework-discovered routes (higher quality) with
+ * link-scraped pages (fallback), deduplicating by normalized path.
+ * Used by discover-pages.js (main orchestrator).
+ */
+import { normalizeUrl, extractPageName, normalizePath, logWarning } from './discover-pages-utils.js';
+/**
+ * Merge framework-discovered routes with link-scraped pages.
+ * Prioritizes framework routes (higher quality), fills gaps with link-scraped.
+ *
+ * @param {Array|null} frameworkRoutes - Routes from framework discoverer
+ * @param {Array|null} linkScrapedPages - Pages from link scraping
+ * @param {string} baseDomain - Base domain for URL normalization
+ * @param {string} baseUrl - Base URL for resolving paths
+ * @returns {Array} Merged and deduplicated pages
+ *
+ * @example
+ * const merged = mergeRoutes(
+ *   [{ path: '/about', name: 'About' }],
+ *   [{ path: '/contact', name: 'Contact' }],
+ *   'example.com',
+ *   'https://example.com'
+ * );
+ */
+export function mergeRoutes(frameworkRoutes, linkScrapedPages, baseDomain, baseUrl) {
+  // Input validation
+  if (!baseDomain || typeof baseDomain !== 'string') {
+    logWarning('mergeRoutes: Invalid baseDomain');
+    baseDomain = '';
+  }
+  if (!baseUrl || typeof baseUrl !== 'string') {
+    logWarning('mergeRoutes: Invalid baseUrl');
+    baseUrl = '';
+  }
+  const seenPaths = new Set();
+  const merged = [];
+  // Add framework routes first (higher quality, more accurate)
+  if (Array.isArray(frameworkRoutes)) {
+    for (const route of frameworkRoutes) {
+      if (!route || typeof route !== 'object') continue;
+      const normalizedPath = normalizePath(route.path || '/');
+      if (seenPaths.has(normalizedPath)) continue;
+      seenPaths.add(normalizedPath);
+      const url = normalizeUrl(baseUrl, normalizedPath) || route.url || '';
+      merged.push({
+        path: normalizedPath,
+        name: route.name || extractPageName('', normalizedPath),
+        url,
+        source: route.source || 'framework',
+        dynamic: Boolean(route.dynamic)
+      });
+    }
+  }
+  // Add link-scraped pages (fill gaps)
+  if (Array.isArray(linkScrapedPages)) {
+    for (const page of linkScrapedPages) {
+      if (!page || typeof page !== 'object') continue;
+      const normalizedPath = normalizePath(page.path || '/');
+      if (seenPaths.has(normalizedPath)) continue;
+      seenPaths.add(normalizedPath);
+      merged.push({
+        path: normalizedPath,
+        name: page.name || extractPageName('', normalizedPath),
+        url: page.url || normalizeUrl(baseUrl, normalizedPath) || '',
+        source: 'link-scrape',
+        dynamic: false
+      });
+    }
+  }
+  return merged;
+}

package/src/core/discovery/discover-pages-utils.js ADDED Viewed

@@ -0,0 +1,177 @@
+/**
+ * URL utility helpers for page discovery.
+ *
+ * Provides URL normalization, domain checking, page name extraction,
+ * exclusion filtering, and path normalization. Route merging lives in
+ * discover-pages-routes.js. Used by discover-pages.js (main orchestrator).
+ */
+import { logWarn } from '../../utils/log.js';
+// Navigation selectors in priority order
+// CSS selectors for anchors inside common navigation containers
+// (header, nav, ARIA navigation role, typical navbar classes, footer).
+export const NAV_SELECTORS = [
+  'header nav a',
+  'header a',
+  'nav a',
+  '[role="navigation"] a',
+  '.navbar a',
+  '.nav-menu a',
+  '.navigation a',
+  'footer nav a',
+  'footer a'
+];
+// Patterns to exclude from discovered links
+// Covers non-navigable schemes (mailto/tel/javascript), in-page anchors,
+// hrefs ending in a binary/media file extension, and social-media hosts.
+// The host patterns are unanchored, so they match anywhere in the href.
+export const EXCLUDE_PATTERNS = [
+  /^mailto:/i,
+  /^tel:/i,
+  /^javascript:/i,
+  /^#/,
+  /\.(pdf|jpg|jpeg|png|gif|svg|webp|ico|zip|tar|gz|mp3|mp4|avi|mov)$/i,
+  /facebook\.com/i,
+  /twitter\.com/i,
+  /instagram\.com/i,
+  /linkedin\.com/i,
+  /youtube\.com/i,
+  /tiktok\.com/i
+];
+// Valid framework names for validation
+// Compared against the lower-cased, trimmed value in validateFramework().
+export const VALID_FRAMEWORKS = ['next', 'nuxt', 'vue', 'react', 'angular', 'svelte', 'astro'];
+// Default options
+// Base values for page discovery options; timeout is in milliseconds
+// (presumably — it is a bare number here; confirm against the caller).
+export const DEFAULT_OPTIONS = {
+  maxPages: 10,
+  selectors: null,  // Use default NAV_SELECTORS if null
+  includeSubdomains: false,
+  timeout: 30000,
+  // SPA/Framework options (v1.3)
+  spaMode: true,         // Enable SPA detection and route discovery
+  framework: null,       // Force specific framework (skip detection)
+  noSpaDetect: false,    // Disable SPA/framework detection entirely
+  captureState: false    // Capture app state (Redux/Vuex/Pinia/Zustand)
+};
+/**
+ * Log warning message (only in TTY mode)
+ * @param {string} message - Warning message
+ */
+export function logWarning(message) {
+  logWarn(`[discover-pages] ${message}`);
+}
+/**
+ * Validate and normalize framework option
+ * @param {string|null} framework - Framework name to validate
+ * @returns {string|null} Validated framework name or null
+ */
+export function validateFramework(framework) {
+  if (!framework) return null;
+  const normalized = String(framework).toLowerCase().trim();
+  if (VALID_FRAMEWORKS.includes(normalized)) {
+    return normalized;
+  }
+  logWarning(`Invalid framework "${framework}". Valid options: ${VALID_FRAMEWORKS.join(', ')}`);
+  return null;
+}
+/**
+ * Normalize URL for comparison and deduplication
+ * @param {string} baseUrl - Base URL for resolving relative paths
+ * @param {string} href - URL to normalize
+ * @returns {string|null} Normalized URL or null if invalid
+ */
+export function normalizeUrl(baseUrl, href) {
+  if (!href || typeof href !== 'string') return null;
+  try {
+    const url = new URL(href, baseUrl);
+    // Skip non-http(s) protocols
+    if (!url.protocol.startsWith('http')) return null;
+    // Build normalized URL: origin + pathname (no hash, no query)
+    let normalized = url.origin + url.pathname;
+    // Remove trailing slash (except for root)
+    if (normalized.endsWith('/') && normalized !== url.origin + '/') {
+      normalized = normalized.slice(0, -1);
+    }
+    return normalized;
+  } catch {
+    return null;
+  }
+}
+/**
+ * Check if URL is same domain as base
+ * @param {string} url - URL to check
+ * @param {string} baseDomain - Base domain to compare against
+ * @param {boolean} includeSubdomains - Whether to include subdomains
+ * @returns {boolean}
+ */
+export function isSameDomain(url, baseDomain, includeSubdomains = false) {
+  try {
+    const urlObj = new URL(url);
+    const hostname = urlObj.hostname.toLowerCase();
+    const base = baseDomain.toLowerCase();
+    if (hostname === base) return true;
+    if (includeSubdomains) {
+      return hostname.endsWith('.' + base);
+    }
+    return false;
+  } catch {
+    return false;
+  }
+}
+/**
+ * Extract page name from link text or URL path
+ * @param {string} text - Link text
+ * @param {string} path - URL path
+ * @returns {string} Page name
+ */
+export function extractPageName(text, path) {
+  // Use link text if available and meaningful
+  if (text && text.length > 0 && text.length < 50) {
+    return text;
+  }
+  // Extract from path
+  if (!path || path === '/') return 'Home';
+  // Get last segment of path
+  const segments = path.split('/').filter(Boolean);
+  if (segments.length === 0) return 'Home';
+  const lastSegment = segments[segments.length - 1];
+  // Convert kebab-case/snake_case to Title Case
+  return lastSegment
+    .replace(/[-_]/g, ' ')
+    .replace(/\b\w/g, c => c.toUpperCase());
+}
+/**
+ * Check if href should be excluded
+ * @param {string} href - URL to check
+ * @returns {boolean}
+ */
+export function shouldExclude(href) {
+  if (!href) return true;
+  return EXCLUDE_PATTERNS.some(pattern => pattern.test(href));
+}
+/**
+ * Normalize a path (remove trailing slash except for root)
+ * @param {string} path - Path to normalize
+ * @returns {string} Normalized path
+ */
+export function normalizePath(path) {
+  if (!path || typeof path !== 'string') return '/';
+  return path.endsWith('/') && path !== '/' ? path.slice(0, -1) : path;
+}

package/src/core/discovery/discover-pages.js ADDED Viewed

@@ -0,0 +1,191 @@
+/**
+ * Page Discovery Module
+ *
+ * Extracts navigation links from a website to discover cloneable pages.
+ * Handles SPA hydration, filters external links, and normalizes URLs.
+ *
+ * Enhanced with SPA/Framework support (v1.3):
+ * - Framework detection (Next.js, Nuxt, Vue, React, Angular, Svelte, Astro)
+ * - Framework-specific route discovery
+ * - App state capture (optional)
+ *
+ * Usage:
+ *   import { discoverPages } from '../discovery/discover-pages.js';
+ *   const result = await discoverPages('https://example.com', { maxPages: 10 });
+ */
+import { getBrowser, getPage, disconnectBrowser } from '../../utils/browser.js';
+import { waitForDomStable } from '../page-prep/page-readiness.js';
+import { dismissCookieBanner } from '../page-prep/cookie-handler.js';
+import { detectFramework } from '../detection/framework-detector.js';
+import { discoverRoutes as discoverFrameworkRoutes } from '../../route-discoverers/index.js';
+import { captureAppState } from './app-state-snapshot.js';
+import {
+  NAV_SELECTORS, DEFAULT_OPTIONS, logWarning, validateFramework,
+  normalizeUrl, isSameDomain, extractPageName, shouldExclude
+} from './discover-pages-utils.js';
+import { mergeRoutes } from './discover-pages-routes.js';
+export { normalizeUrl, isSameDomain, extractPageName } from './discover-pages-utils.js';
+/**
+ * Estimate capture time for discovered pages
+ * @param {Array} pages - Discovered pages
+ * @param {string[]} viewports - Viewport names
+ * @returns {Object} Estimate with pages, viewports, totalCaptures, estimatedSeconds
+ */
+export function estimateCapture(pages, viewports = ['desktop', 'tablet', 'mobile']) {
+  const perCapture = 6; // seconds per viewport capture (avg)
+  const totalCaptures = pages.length * viewports.length;
+  return {
+    pages: pages.length,
+    viewports: viewports.length,
+    totalCaptures,
+    estimatedSeconds: totalCaptures * perCapture,
+    estimatedMinutes: Math.ceil((totalCaptures * perCapture) / 60)
+  };
+}
+/**
+ * Scrape navigation links from the loaded page.
+ * @param {import('playwright').Page} page
+ * @param {string} baseUrl
+ * @param {string} baseDomain
+ * @param {Object} opts
+ * @returns {Promise<Array>} linkScrapedPages
+ */
+async function scrapeNavLinks(page, baseUrl, baseDomain, opts) {
+  const selectorString = (opts.selectors || NAV_SELECTORS).join(', ');
+  const rawLinks = await page.$$eval(selectorString, els =>
+    els.map(el => ({ href: el.href, text: el.textContent?.trim() || '' }))
+  ).catch(() => []);
+  const seenUrls = new Set();
+  const pages = [];
+  const homeUrl = normalizeUrl(baseUrl, '/');
+  if (homeUrl) { seenUrls.add(homeUrl); pages.push({ path: '/', name: 'Home', url: homeUrl }); }
+  for (const link of rawLinks) {
+    if (shouldExclude(link.href)) continue;
+    const normalized = normalizeUrl(baseUrl, link.href);
+    if (!normalized || seenUrls.has(normalized)) continue;
+    if (!isSameDomain(normalized, baseDomain, opts.includeSubdomains)) continue;
+    const path = new URL(normalized).pathname;
+    if (path === '/') continue;
+    seenUrls.add(normalized);
+    pages.push({ path, name: extractPageName(link.text, path), url: normalized });
+    if (pages.length >= opts.maxPages) break;
+  }
+  return { pages, rawCount: rawLinks.length };
+}
+/**
+ * Discover pages from a website by extracting navigation links.
+ * @param {string} baseUrl - Starting URL to discover from
+ * @param {Object} options - Discovery options
+ * @returns {Promise<Object>} Discovery result
+ */
+export async function discoverPages(baseUrl, options = {}) {
+  const opts = { ...DEFAULT_OPTIONS, ...options };
+  const startTime = Date.now();
+  let browser = null;
+  try {
+    const baseUrlObj = new URL(baseUrl);
+    const baseDomain = baseUrlObj.hostname;
+    browser = await getBrowser({ headless: true });
+    const page = await getPage(browser);
+    await page.goto(baseUrl, { waitUntil: 'networkidle', timeout: opts.timeout });
+    await page.waitForSelector('nav a, header a, [role="navigation"] a', {
+      visible: true, timeout: 5000
+    }).catch(() => {});
+    await waitForDomStable(page, 500, 5000);
+    await dismissCookieBanner(page);
+    await new Promise(r => setTimeout(r, 1000));
+    // SPA/Framework detection
+    let frameworkInfo = null;
+    let frameworkRoutes = [];
+    let stateSnapshot = null;
+    if (!opts.noSpaDetect) {
+      if (opts.framework) {
+        const fw = validateFramework(opts.framework);
+        if (fw) frameworkInfo = { framework: fw, version: null, routingType: 'spa', confidence: 'forced', signals: ['user-specified'] };
+      } else {
+        try { frameworkInfo = await detectFramework(page); }
+        catch (e) { logWarning(`Framework detection failed: ${e.message}`); }
+      }
+      if (frameworkInfo?.framework && opts.spaMode) {
+        try {
+          const r = await discoverFrameworkRoutes(page, baseUrl, frameworkInfo);
+          frameworkRoutes = r.routes || [];
+        } catch (e) { logWarning(`Route discovery failed for ${frameworkInfo.framework}: ${e.message}`); }
+      }
+      if (opts.captureState && frameworkInfo) {
+        try { stateSnapshot = await captureAppState(page, frameworkInfo); }
+        catch (e) { logWarning(`State capture failed: ${e.message}`); }
+      }
+    }
+    // Link scraping
+    const { pages: linkScrapedPages, rawCount } = await scrapeNavLinks(page, baseUrl, baseDomain, opts);
+    // Merge and sort
+    let pages = frameworkRoutes.length > 0
+      ? mergeRoutes(frameworkRoutes, linkScrapedPages, baseDomain, baseUrl)
+      : linkScrapedPages.map(p => ({ ...p, source: 'link-scrape', dynamic: false }));
+    if (pages.length > opts.maxPages) pages = pages.slice(0, opts.maxPages);
+    pages.sort((a, b) => {
+      if (a.path === '/') return -1;
+      if (b.path === '/') return 1;
+      return (a.path.match(/\//g) || []).length - (b.path.match(/\//g) || []).length;
+    });
+    return {
+      success: true,
+      baseUrl: baseUrlObj.origin,
+      baseDomain,
+      framework: frameworkInfo,
+      stateSnapshot,
+      pages,
+      stats: { totalLinksFound: rawCount, frameworkRoutesFound: frameworkRoutes.length, pagesDiscovered: pages.length, durationMs: Date.now() - startTime }
+    };
+  } catch (error) {
+    let normalizedBaseUrl = baseUrl;
+    let errorBaseDomain = '';
+    try { const u = new URL(baseUrl); normalizedBaseUrl = u.origin; errorBaseDomain = u.hostname; } catch { /* keep original */ }
+    return {
+      success: false,
+      baseUrl: normalizedBaseUrl,
+      baseDomain: errorBaseDomain,
+      framework: null,
+      stateSnapshot: null,
+      pages: [{ path: '/', name: 'Home', url: normalizeUrl(baseUrl, '/') || baseUrl, source: 'fallback', dynamic: false }],
+      error: error.message,
+      stats: { totalLinksFound: 0, frameworkRoutesFound: 0, pagesDiscovered: 1, durationMs: Date.now() - startTime }
+    };
+  } finally {
+    if (browser) await disconnectBrowser();
+  }
+}
+// CLI support
+import { fileURLToPath } from 'url';
+const __filename = fileURLToPath(import.meta.url);
+if (process.argv[1] === __filename) {
+  const url = process.argv[2];
+  const maxPages = parseInt(process.argv[3]) || 10;
+  if (!url) { console.error('Usage: node discover-pages.js <url> [maxPages]'); process.exit(1); }
+  discoverPages(url, { maxPages })
+    .then(result => { console.log(JSON.stringify(result, null, 2)); process.exit(result.success ? 0 : 1); })
+    .catch(err => { console.error(JSON.stringify({ success: false, error: err.message })); process.exit(1); });
+}

package/src/core/html/html-extractor-inline-styler.js ADDED Viewed

@@ -0,0 +1,70 @@
+/**
+ * Inline Style Injector for HTML Extraction
+ *
+ * Computes and inlines critical layout styles (flex, grid, absolute, fixed)
+ * onto cloned DOM elements during HTML extraction to preserve visual layout
+ * without relying on external stylesheets.
+ *
+ * Designed to run inside page.evaluate — all functions are serialized as
+ * source strings and reconstructed in the browser context.
+ */
+/**
+ * Compute inline styles for critical elements and apply them to the cloned doc.
+ * Called inside page.evaluate with the live document and cloned doc in scope.
+ *
+ * @param {Document} liveDocument - The live page document (for getComputedStyle)
+ * @param {Document} clonedDoc - The cloned document to mutate
+ * @param {string[]} inlineProps - CSS property names (camelCase) to inline
+ * @param {string[]} criticalDisplay - Display values that trigger inlining (e.g. 'flex')
+ * @param {string[]} criticalPosition - Position values that trigger inlining (e.g. 'fixed')
+ * @returns {{ inlinedCount: number, warnings: string[] }}
+ */
+export function computeAndApplyInlineStyles(
+  liveDocument, clonedDoc, inlineProps, criticalDisplay, criticalPosition
+) {
+  const warnings    = [];
+  const inlineStyles = [];
+  let inlinedCount  = 0;
+  liveDocument.querySelectorAll('*').forEach((liveEl, idx) => {
+    const style    = getComputedStyle(liveEl);
+    const display  = style.display;
+    const position = style.position;
+    if (!criticalDisplay.includes(display) && !criticalPosition.includes(position)) return;
+    const props = [];
+    inlineProps.forEach(prop => {
+      const val = style[prop];
+      if (val && val !== 'auto' && val !== 'none' && val !== 'normal' &&
+          val !== '0px' && val !== 'static' && val !== 'visible' &&
+          val !== 'content-box') {
+        const cssProp = prop.replace(/([A-Z])/g, '-$1').toLowerCase();
+        props.push(`${cssProp}: ${val}`);
+      }
+    });
+    // Always include display for critical elements
+    if (!props.some(p => p.startsWith('display:'))) {
+      props.unshift(`display: ${display}`);
+    }
+    if (props.length > 0) inlineStyles.push({ idx, style: props.join('; ') });
+  });
+  // Apply to cloned doc by index
+  const clonedElements = clonedDoc.querySelectorAll('*');
+  inlineStyles.forEach(({ idx, style }) => {
+    if (!clonedElements[idx]) return;
+    const existing = clonedElements[idx].getAttribute('style') || '';
+    clonedElements[idx].setAttribute('style', existing ? `${existing}; ${style}` : style);
+    inlinedCount++;
+  });
+  if (inlinedCount > 100) {
+    warnings.push(`Inlined ${inlinedCount} critical elements`);
+  }
+  return { inlinedCount, warnings };
+}

package/src/core/html/html-extractor.js ADDED Viewed

@@ -0,0 +1,147 @@
+/**
+ * HTML Extractor
+ *
+ * Extract and clean HTML from page, removing scripts,
+ * event handlers, and framework-specific attributes.
+ * Optionally enhances with WordPress-compatible semantic structure.
+ *
+ * Inline style computation lives in html-extractor-inline-styler.js
+ * and is serialized into the browser context via page.evaluate.
+ */
+import { LAYOUT_PROPERTIES } from '../css/css-extractor.js';
+import { enhanceSemanticHTMLInPage } from './semantic-enhancer.js';
+import { computeAndApplyInlineStyles } from './html-extractor-inline-styler.js';
+// Size limits
+// Upper bounds for extracted HTML size (in bytes) and DOM element count.
+export const MAX_HTML_SIZE    = 10 * 1024 * 1024; // 10MB
+export const MAX_DOM_ELEMENTS = 50000;
+// JS framework attribute patterns to remove
+// Each pattern is tested against attribute NAMES (not values); this is the
+// default for extractCleanHtml's frameworkPatterns parameter.
+export const JS_FRAMEWORK_PATTERNS = [
+  /^data-react/i, /^data-vue/i, /^data-ng/i, /^ng-/i,
+  /^data-svelte/i, /^x-/i, /^hx-/i, /^v-/i,
+  /^data-alpine/i, /^wire:/i, /^@/
+];
+// Properties to inline on critical elements (layout only, not visual)
+// Built from the property groups exported by css-extractor.js.
+// NOTE(review): the slice assumes `box` starts with boxSizing and overflow —
+// confirm against LAYOUT_PROPERTIES in css-extractor.js.
+export const INLINE_LAYOUT_PROPS = [
+  ...LAYOUT_PROPERTIES.display,
+  ...LAYOUT_PROPERTIES.grid,
+  ...LAYOUT_PROPERTIES.position,
+  ...LAYOUT_PROPERTIES.sizing,
+  ...LAYOUT_PROPERTIES.box.slice(0, 2) // boxSizing, overflow only
+];
+// Computed display / position values that mark an element as layout-critical.
+export const CRITICAL_DISPLAY  = ['flex', 'inline-flex', 'grid', 'inline-grid'];
+export const CRITICAL_POSITION = ['absolute', 'fixed'];
+/**
+ * Extract and clean HTML from page.
+ * @param {import('playwright').Page} page
+ * @param {Array<RegExp>} frameworkPatterns - Patterns to remove
+ * @returns {Promise<{ html: string, warnings: string[], elementCount: number, inlinedCount: number }>}
+ */
+export async function extractCleanHtml(page, frameworkPatterns = JS_FRAMEWORK_PATTERNS) {
+  // Serialize browser-side helper for inline styling
+  const inlineStylerSrc = computeAndApplyInlineStyles.toString();
+  return await page.evaluate(
+    ({ patterns, inlineProps, criticalDisplay, criticalPosition, inlineStylerSrc }) => {
+      const warnings = [];
+      const elementCount = document.querySelectorAll('*').length;
+      if (elementCount > 50000) warnings.push(`Large DOM: ${elementCount} elements`);
+      const doc = document.documentElement.cloneNode(true);
+      // Remove scripts and noscript
+      doc.querySelectorAll('script, noscript').forEach(el => el.remove());
+      doc.querySelectorAll('svg script, svg a[href^="javascript:"]').forEach(el => el.remove());
+      // Sanitize CSS links
+      doc.querySelectorAll('link[rel="stylesheet"]').forEach(link => {
+        const href = link.getAttribute('href') || '';
+        if (href.startsWith('javascript:') || href.startsWith('data:')) link.remove();
+      });
+      // Sanitize inline styles
+      doc.querySelectorAll('style').forEach(style => {
+        if ((style.textContent || '').match(/@import\s+url\s*\(\s*['"]?(javascript|data):/i)) {
+          style.remove();
+        }
+      });
+      // Remove event handlers and framework attributes
+      const patternRegexes = patterns.map(p => new RegExp(p.source, p.flags));
+      doc.querySelectorAll('*').forEach(el => {
+        [...el.attributes].forEach(attr => {
+          if (attr.name.startsWith('on')) el.removeAttribute(attr.name);
+          if (patternRegexes.some(p => p.test(attr.name))) el.removeAttribute(attr.name);
+        });
+      });
+      // Inline critical layout styles (browser-side helper deserialized here)
+      // eslint-disable-next-line no-new-func
+      const computeAndApplyInlineStyles = new Function('return (' + inlineStylerSrc + ')')();
+      const { inlinedCount, warnings: styleWarnings } = computeAndApplyInlineStyles(
+        document, doc, inlineProps, criticalDisplay, criticalPosition
+      );
+      warnings.push(...styleWarnings);
+      // Remove hidden elements
+      doc.querySelectorAll('[hidden], [style*="display: none"], [style*="display:none"]')
+         .forEach(el => el.remove());
+      // Remove empty style tags and HTML comments
+      doc.querySelectorAll('style:empty').forEach(el => el.remove());
+      const removeComments = (node) => {
+        [...node.childNodes].forEach(child => {
+          if (child.nodeType === 8) child.remove();
+          else if (child.nodeType === 1) removeComments(child);
+        });
+      };
+      removeComments(doc);
+      const html = '<!DOCTYPE html>\n<html lang="' +
+                   (document.documentElement.lang || 'en') + '">\n' +
+                   doc.innerHTML + '\n</html>';
+      return { html, warnings, elementCount, inlinedCount };
+    },
+    {
+      patterns:         frameworkPatterns.map(r => ({ source: r.source, flags: r.flags })),
+      inlineProps:      INLINE_LAYOUT_PROPS,
+      criticalDisplay:  CRITICAL_DISPLAY,
+      criticalPosition: CRITICAL_POSITION,
+      inlineStylerSrc
+    }
+  );
+}
+/**
+ * Extract, clean, and optionally enhance HTML with semantic structure.
+ * @param {import('playwright').Page} page
+ * @param {Object} options
+ * @param {boolean} [options.enhanceSemantic=true]
+ * @param {Array<RegExp>} [options.frameworkPatterns]
+ * @returns {Promise<{ html: string, warnings: string[], elementCount: number, semanticStats?: Object }>}
+ */
+export async function extractAndEnhanceHtml(page, options = {}) {
+  const { enhanceSemantic = true, frameworkPatterns = JS_FRAMEWORK_PATTERNS } = options;
+  const result = await extractCleanHtml(page, frameworkPatterns);
+  if (enhanceSemantic) {
+    try {
+      const enhanced = await enhanceSemanticHTMLInPage(page, result.html);
+      return { ...result, html: enhanced.html, semanticStats: enhanced.stats };
+    } catch (err) {
+      result.warnings.push(`Semantic enhancement failed: ${err.message}`);
+      return result;
+    }
+  }
+  return result;
+}