designlang 9.0.0 → 10.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,131 @@
1
+ // Classify the imagery style of a site from the sampled <img> list. No image
2
+ // downloads — we work from the src URL, displayed dimensions, and styling
3
+ // already captured by the crawler. The goal is a fingerprint an LLM can use:
4
+ // "photography-heavy with abstract gradients" vs "flat illustration only".
5
+
6
// Every imagery label the classifier can emit. `scoreLabels` seeds its tally
// from this list, so the order here is also the tie-break order when ranking.
const LABELS = [
  'photography',
  '3d-render',
  'isometric',
  'flat-illustration',
  'gradient-mesh',
  'icon-only',
  'screenshot',
  'mixed',
  'none',
];
10
+
11
/**
 * Report whether a sampled image record is an SVG.
 *
 * @param {{src?: string, tag?: string}} img - Image record; only `src` and
 *   `tag` are read.
 * @returns {boolean} True for records tagged `svg`, SVG data URIs, and URLs
 *   whose path ends in `.svg` (a query string or fragment may follow).
 */
function isSvg(img) {
  const src = (img.src || '').toLowerCase();
  if (img.tag === 'svg' || src.startsWith('data:image/svg+xml')) return true;
  // Match the extension before any `?query` / `#fragment`: a plain
  // endsWith('.svg') misses cache-busted URLs such as `/logo.svg?v=3`.
  return /\.svg(?:[?#]|$)/.test(src);
}
15
+
16
/**
 * Bucket an image by its width/height ratio.
 *
 * @param {{width?: number, height?: number}} img - Image record.
 * @returns {string} 'unknown' when either dimension is missing or zero,
 *   otherwise one of 'ultra-wide', 'landscape', 'square-ish', 'portrait', 'tall'.
 */
function aspectBucket(img) {
  const { width, height } = img;
  if (!width || !height) return 'unknown';

  const ratio = width / height;
  // Exclusive lower bounds, checked from widest to narrowest.
  const lowerBounds = [
    [2.2, 'ultra-wide'],
    [1.4, 'landscape'],
    [0.85, 'square-ish'],
    [0.5, 'portrait'],
  ];
  const match = lowerBounds.find(([bound]) => ratio > bound);
  return match ? match[1] : 'tall';
}
25
+
26
/**
 * Derive coarse content hints from keywords in an image URL.
 *
 * @param {string} [src] - Image URL or path; falsy values yield no hints.
 * @returns {string[]} Any of 'screenshot', 'hero', '3d', 'iso', 'illustration',
 *   'photo', 'icon', 'mesh' whose keywords appear in the lower-cased URL.
 */
function filenameHint(src) {
  const lower = (src || '').toLowerCase();
  const rules = [
    ['screenshot', /\b(screenshot|screen-shot|dashboard|ui-|product-ui)\b/],
    ['hero', /\b(hero|cover|banner)\b/],
    ['3d', /\b(3d|render|gltf)\b/],
    // Isometric art gets its own hint: scoreLabels looks for 'iso' to score
    // the 'isometric' label, which could never fire while these keywords were
    // folded into '3d'.
    ['iso', /\b(iso|isometric)\b/],
    // `illust\w*` is a prefix match. The earlier `\b(illust|illustr)\b` needed
    // a word boundary right after the stem, so "illustration" never matched.
    ['illustration', /\b(illust\w*|character|mascot)\b/],
    ['photo', /\b(photo|portrait|team|headshot)\b/],
    ['icon', /\b(icon|symbol|logo)\b/],
    ['mesh', /\b(gradient|mesh|blob)\b/],
  ];
  return rules.filter(([, pattern]) => pattern.test(lower)).map(([hint]) => hint);
}
38
+
39
/**
 * Accumulate per-label evidence across the sampled images.
 *
 * Each image contributes filename-hint weights and extension/size heuristics;
 * ratio-based bonuses are applied once all images have been seen.
 *
 * @param {Array<{src?: string, width?: number, height?: number, tag?: string}>} images
 * @returns {{tally: Object<string, number>, reasons: string[], counts: {
 *   total: number, svgCount: number, iconCount: number, heroCount: number,
 *   screenshotCount: number, photoish: number}}}
 *   `counts.total` is at least 1 so it can be used as a divisor.
 */
function scoreLabels(images) {
  const tally = Object.fromEntries(LABELS.map((label) => [label, 0]));
  const reasons = [];
  let photoish = 0;
  let svgCount = 0;
  let iconCount = 0;
  let heroCount = 0;
  let screenshotCount = 0;
  let rasterPhotoCount = 0;

  for (const img of images) {
    const svg = isSvg(img);
    if (svg) svgCount++;
    const maxSide = Math.max(img.width || 0, img.height || 0);
    const src = (img.src || '').toLowerCase();
    const hints = filenameHint(img.src);

    if (hints.includes('screenshot')) { tally.screenshot += 0.4; screenshotCount++; }
    if (hints.includes('3d')) tally['3d-render'] += 0.6;
    if (hints.includes('iso')) tally.isometric += 0.6;
    if (hints.includes('illustration')) tally['flat-illustration'] += 0.5;
    if (hints.includes('mesh')) tally['gradient-mesh'] += 0.5;
    if (hints.includes('hero')) heroCount++;

    // Count each image at most once as an icon. Counting the size test and
    // the filename hint separately let iconCount exceed the image total and
    // skewed the icon-only ratio below.
    const tiny = maxSide > 0 && maxSide < 40;
    if (tiny || hints.includes('icon')) iconCount++;

    // Same rule for photo-like images: the weights stack, the count does not.
    const photoHint = hints.includes('photo');
    const rasterPhoto = /\.(jpe?g|webp|avif)(\?|$)/.test(src) && maxSide > 200;
    if (photoHint) tally.photography += 0.5;
    if (rasterPhoto) { tally.photography += 0.15; rasterPhotoCount++; }
    if (photoHint || rasterPhoto) photoish++;

    // Allow a query string after `.png`, matching the raster check above.
    if (/\.png(\?|$)/.test(src) && maxSide > 150 && !svg) {
      tally.photography += 0.05;
    }
    if (svg && maxSide > 100) {
      tally['flat-illustration'] += 0.15;
    }
  }

  // Report the raster reason once. The caller keeps only the first ten
  // reasons, so one entry per image pushed the summary reasons off the end.
  if (rasterPhotoCount > 0) reasons.push('raster photo-ext');

  // Normalization heuristics.
  const total = Math.max(1, images.length);
  if (svgCount / total > 0.6) { tally['flat-illustration'] += 0.4; reasons.push('svg-heavy'); }
  if (iconCount / total > 0.7) tally['icon-only'] += 0.6;
  if (photoish / total > 0.5) tally.photography += 0.3;
  if (screenshotCount > 0) reasons.push(`${screenshotCount} screenshot-like`);

  return { tally, reasons, counts: { total, svgCount, iconCount, heroCount, screenshotCount, photoish } };
}
82
+
83
/**
 * Find the aspect bucket shared by the most images.
 *
 * @param {Array<object>} images - Image records passed to `aspectBucket`.
 * @returns {string} The most frequent bucket (the first one seen wins a tie),
 *   or 'unknown' when `images` is empty.
 */
function dominantAspect(images) {
  const frequency = new Map();
  for (const image of images) {
    const bucket = aspectBucket(image);
    frequency.set(bucket, (frequency.get(bucket) || 0) + 1);
  }

  let topBucket = 'unknown';
  let topCount = 0;
  // Strict `>` keeps the earliest-seen bucket on ties.
  for (const [bucket, count] of frequency) {
    if (count > topCount) {
      topBucket = bucket;
      topCount = count;
    }
  }
  return topBucket;
}
92
+
93
/**
 * Summarise how rounded the sampled images are.
 *
 * @param {Array<{borderRadius?: string|number}>} images - Image records; a
 *   missing or unparseable `borderRadius` counts as 0.
 * @returns {'none'|'full'|'rounded'|'soft'|'square'} 'none' for an empty list,
 *   otherwise a bucket based on the average numeric radius.
 */
function borderRadiusProfile(images) {
  if (!images.length) return 'none';
  // `|| 0` already maps NaN to 0, so no separate NaN filter is needed.
  const radii = images.map((image) => parseFloat(image.borderRadius) || 0);
  const avg = radii.reduce((sum, radius) => sum + radius, 0) / radii.length;
  // `>=` so the conventional pill value `9999px` is classified as 'full';
  // a strict `> 9999` left that bucket unreachable for it.
  if (avg >= 9999) return 'full';
  if (avg > 20) return 'rounded';
  if (avg > 4) return 'soft';
  return 'square';
}
102
+
103
/**
 * Classify a site's imagery style from its sampled image records.
 *
 * @param {Array<object>} [images=[]] - Sampled image records. A non-array or
 *   empty value yields the 'none' result.
 * @returns {{label: string, confidence: number, counts: object,
 *   dominantAspect: string, radiusProfile: string,
 *   alternates: Array<{label: string, score: number}>, signals: string[]}}
 *   The empty result has the same keys as a populated one, plus the legacy
 *   `aspectRatios` key that earlier empty results carried.
 */
export function extractImageryStyle(images = []) {
  if (!Array.isArray(images) || images.length === 0) {
    return {
      label: 'none',
      confidence: 0,
      counts: { total: 0, svg: 0, icon: 0, hero: 0, screenshot: 0, photoLike: 0 },
      dominantAspect: 'unknown',
      aspectRatios: [], // legacy key, kept for existing consumers
      radiusProfile: 'none',
      alternates: [],
      signals: [],
    };
  }

  const { tally, reasons, counts } = scoreLabels(images);
  const ranked = Object.entries(tally).sort((a, b) => b[1] - a[1]);
  const [winner, winScore] = ranked[0];
  const [, second] = ranked[1] || [null, 0];

  // No evidence at all, or two labels within 0.2 of each other → 'mixed'.
  let label = winScore === 0 ? 'mixed' : winner;
  if (winScore > 0 && second > 0 && (winScore - second) < 0.2) label = 'mixed';

  // Scale the winning score by sample size so large samples need more evidence.
  const confidence = Math.min(1, winScore / Math.max(1, images.length * 0.3));

  return {
    label,
    confidence: Number(confidence.toFixed(3)),
    counts: {
      total: counts.total,
      svg: counts.svgCount,
      icon: counts.iconCount,
      hero: counts.heroCount,
      screenshot: counts.screenshotCount,
      photoLike: counts.photoish,
    },
    dominantAspect: dominantAspect(images),
    radiusProfile: borderRadiusProfile(images),
    alternates: ranked
      .filter(([, score]) => score > 0 && score !== winScore)
      .slice(0, 3)
      .map(([name, score]) => ({ label: name, score: Number(score.toFixed(3)) })),
    signals: reasons.slice(0, 10),
  };
}
@@ -0,0 +1,142 @@
1
+ // Pull the site's logo from the page. Uses a Playwright Page handle because we
2
+ // need the inline SVG source (or the <img> pixels) — computed-styles alone
3
+ // can't recover them. Writes the asset to disk and returns metadata.
4
+ //
5
+ // Strategy:
6
+ // 1) Candidate selectors, in priority order:
7
+ // a. header/nav [aria-label*="logo"] | [class*="logo"] | [id*="logo"]
8
+ // b. header a[href="/"] svg | header a[href="/"] img
9
+ //      c. first <svg> in <header>|<nav> with width 16-260 and height 12-120
10
+ // d. first <img> in <header>|<nav> with alt matching site name
11
+ // 2) For SVG: capture `outerHTML`, save as .svg.
12
+ // 3) For <img>: save the bytes via page.request (handles CORS), fallback to
13
+ // element.screenshot() if fetch fails.
14
+ // 4) Compute clearspace by sampling 4 directions (top, right, bottom, left) from the bounding box and
15
+ //    stopping at the first other element hit within 80px (very cheap).
16
+
17
+ import { writeFileSync } from 'fs';
18
+ import { join } from 'path';
19
+
20
// Selector groups tried in order; the first element that passes the size
// check in `findLogoHandle` wins.
const CANDIDATES = [
  // Home links in the header/nav (SVG first, then <img>).
  'header a[href="/"] svg, header [href="/"] svg, nav a[href="/"] svg',
  'header a[href="/"] img, nav a[href="/"] img',
  // Anything whose class or id mentions "logo".
  'header [class*="logo" i] svg, nav [class*="logo" i] svg, [id*="logo" i] svg',
  'header [class*="logo" i] img, nav [class*="logo" i] img, [id*="logo" i] img',
  // Last resort: any graphic in the header/nav.
  'header svg, nav svg',
  'header img, nav img',
];

/**
 * Find the first element that looks like the site logo.
 *
 * @param {object} page - Page handle exposing `$$(selector)`; each returned
 *   handle must expose `evaluate(fn)`.
 * @returns {Promise<{handle: object, info: object}|null>} The matching handle
 *   and its measurements, or null when no candidate passes the size check.
 *   `info` holds `tag`, `w`, `hh`, `x`, `y`, plus `outer` for SVGs or
 *   `src`/`alt` for images.
 */
async function findLogoHandle(page) {
  // Passed to `handle.evaluate`, so it must not close over outer variables.
  const describeElement = (el) => {
    const rect = el.getBoundingClientRect();
    const tag = el.tagName.toLowerCase();
    const { width: w, height: hh } = rect;
    switch (tag) {
      case 'svg':
        if (w < 16 || w > 260 || hh < 12 || hh > 120) return null;
        return { tag, w, hh, x: rect.x, y: rect.y, outer: el.outerHTML };
      case 'img':
        if (w < 20 || w > 320 || hh < 12 || hh > 120) return null;
        return {
          tag,
          w,
          hh,
          x: rect.x,
          y: rect.y,
          src: el.currentSrc || el.src,
          alt: el.getAttribute('alt') || '',
        };
      default:
        return null;
    }
  };

  for (const selector of CANDIDATES) {
    try {
      const handles = await page.$$(selector);
      for (const handle of handles) {
        const info = await handle.evaluate(describeElement);
        if (info) return { handle, info };
      }
    } catch {
      // Best effort: any failure for this selector group moves on to the next.
    }
  }
  return null;
}
54
+
55
/**
 * Download an image through the page's request context.
 *
 * @param {object} page - Page handle exposing `request.get(url, options)`.
 * @param {string} url - Image URL to fetch (10 s timeout).
 * @returns {Promise<*>} The response body, or null when the response is not
 *   OK or the request throws.
 */
async function fetchImgBytes(page, url) {
  try {
    const response = await page.request.get(url, { timeout: 10000 });
    return response.ok() ? await response.body() : null;
  } catch {
    return null;
  }
}
63
+
64
/**
 * Guess a file extension from an image URL.
 *
 * @param {string} [url] - Image URL; falsy values are treated as ''.
 * @returns {string} The recognised extension at the end of the URL or just
 *   before its query string ('jpeg' is normalised to 'jpg'), otherwise 'png'.
 */
function guessExt(url) {
  const lowered = (url || '').toLowerCase();
  const match = /\.(png|jpe?g|webp|gif|svg|ico|avif)(?:$|\?)/.exec(lowered);
  if (!match) return 'png';
  const [, ext] = match;
  return ext === 'jpeg' ? 'jpg' : ext;
}
68
+
69
/**
 * Locate the site's logo, write it under `outDir`, and describe it.
 *
 * SVG logos are saved from their `outerHTML`. Image logos are downloaded via
 * `fetchImgBytes`; if that fails, the element is screenshotted to a PNG.
 *
 * @param {object} page - Page handle, used for lookup, download and
 *   `evaluate`.
 * @param {string} outDir - Directory the asset is written to.
 * @param {string} prefix - File name prefix; the asset is `${prefix}-logo.<ext>`.
 * @returns {Promise<object>} `{ found: false }` when no logo is found;
 *   `{ found: true, saved: false, info }` when neither download nor screenshot
 *   worked; otherwise metadata including `path`, `file`, `kind`, `width`,
 *   `height`, `aspect`, `src`, `alt` and `clearspace` (null if probing failed).
 */
export async function extractLogo(page, outDir, prefix) {
  const found = await findLogoHandle(page);
  if (!found) {
    return { found: false };
  }
  const { handle, info } = found;

  let savedPath;
  let ext;

  if (info.tag === 'svg') {
    ext = 'svg';
    savedPath = join(outDir, `${prefix}-logo.svg`);
    writeFileSync(savedPath, info.outer, 'utf-8');
  } else {
    ext = guessExt(info.src);
    savedPath = join(outDir, `${prefix}-logo.${ext}`);
    const bytes = await fetchImgBytes(page, info.src);
    if (bytes) {
      writeFileSync(savedPath, bytes);
    } else {
      // Fallback: screenshot the element. Switch to the `.png` path BEFORE
      // taking the screenshot so the file on disk matches the returned
      // `path`/`file`. Renaming the variable afterwards pointed at a file
      // that was never written.
      ext = 'png';
      savedPath = join(outDir, `${prefix}-logo.png`);
      try {
        await handle.screenshot({ path: savedPath });
      } catch {
        return { found: true, saved: false, info };
      }
    }
  }

  // Cheap clearspace: walk outward from each edge of the bounding box (up to
  // 80px) until another element is hit. Runs in-page so no pixels are pulled
  // back. `info` stores the height as `hh`, so map it to `h` explicitly;
  // passing `info` directly left `h` undefined and every probe coordinate NaN.
  const box = { x: info.x, y: info.y, w: info.w, h: info.hh };
  const clearspace = await page.evaluate(({ x, y, w, h }) => {
    const body = document.body;
    const bg = getComputedStyle(body).backgroundColor;
    // Step outward 4px at a time; return the distance of the first foreign hit.
    const probe = (dx, dy, max) => {
      for (let d = 2; d <= max; d += 4) {
        const el = document.elementFromPoint(x + dx(d), y + dy(d));
        if (!el) return d;
        // If we're still inside the logo itself, keep going.
        if (el.closest('svg, img, [class*="logo" i]')) continue;
        const r = el.getBoundingClientRect();
        // Ignore tiny elements (smaller than 8x8).
        if (r.width < 8 && r.height < 8) continue;
        return d;
      }
      return max;
    };
    const margin = {
      top: probe(() => w / 2, (d) => -d, 80),
      right: probe((d) => w + d, () => h / 2, 80),
      bottom: probe(() => w / 2, (d) => h + d, 80),
      left: probe((d) => -d, () => h / 2, 80),
    };
    return { backgroundColor: bg, margin };
  }, box).catch(() => null);

  return {
    found: true,
    saved: true,
    path: savedPath,
    file: `${prefix}-logo.${ext}`,
    kind: info.tag,
    width: Number(info.w.toFixed(1)),
    height: Number(info.hh.toFixed(1)),
    aspect: Number((info.w / Math.max(1, info.hh)).toFixed(3)),
    src: info.src || null,
    alt: info.alt || null,
    clearspace,
  };
}
@@ -0,0 +1,152 @@
1
+ // Classify a site's "material language" — the visual tactility vocabulary it
2
+ // uses. Signals are the already-extracted shadows, borders, backdrop filters,
3
+ // saturation, and geometry. Output: one dominant label + secondary signals.
4
+
5
// Every material-language label the classifier can emit. Scores are seeded
// from this list, so the order here is also the tie-break order when ranking.
const LABELS = [
  'glassmorphism',
  'neumorphism',
  'flat',
  'brutalist',
  'skeuomorphic',
  'material-you',
  'soft-ui',
  'mixed',
];
9
+
10
/**
 * Parse a CSS hex colour into RGB channels.
 *
 * Accepts `#rgb`, `#rgba`, `#rrggbb` and `#rrggbbaa` (alpha is ignored).
 *
 * @param {*} hex - Candidate colour value.
 * @returns {{r: number, g: number, b: number}|null} Channels in 0-255, or
 *   null for non-strings and anything that is not a well-formed hex colour.
 */
function parseHexToRgb(hex) {
  // Callers pass `c.hex || c`, which can be a plain object; guard rather than
  // throw on `.startsWith`.
  if (typeof hex !== 'string') return null;
  const match = /^#([0-9a-f]{3,4}|[0-9a-f]{6}|[0-9a-f]{8})$/i.exec(hex.trim());
  if (!match) return null;

  let digits = match[1];
  // Expand shorthand (#rgb / #rgba) by doubling each digit.
  if (digits.length <= 4) digits = [...digits].map((c) => c + c).join('');

  return {
    r: parseInt(digits.slice(0, 2), 16),
    g: parseInt(digits.slice(2, 4), 16),
    b: parseInt(digits.slice(4, 6), 16),
  };
}
17
+
18
/**
 * Compute the saturation of an RGB colour as (max - min) / max over the
 * channels scaled to 0-1.
 *
 * @param {{r: number, g: number, b: number}|null} rgb - Channels in 0-255.
 * @returns {number} Saturation in 0-1; 0 for a missing colour or pure black.
 */
function rgbSaturation(rgb) {
  if (!rgb) return 0;
  const channels = [rgb.r, rgb.g, rgb.b];
  const brightest = Math.max(...channels) / 255;
  if (brightest === 0) return 0;
  const darkest = Math.min(...channels) / 255;
  return (brightest - darkest) / brightest;
}
25
+
26
/**
 * Average the saturation of every colour that parses as a hex value.
 *
 * @param {Array<string|{hex?: string}>} colors - Hex strings or objects with
 *   a `hex` field; entries that do not parse are skipped.
 * @returns {number} Mean saturation, or 0 when nothing could be parsed.
 */
function avgSaturation(colors) {
  if (!colors.length) return 0;

  const saturations = [];
  for (const entry of colors) {
    const rgb = parseHexToRgb(entry.hex || entry);
    if (rgb) saturations.push(rgbSaturation(rgb));
  }

  if (saturations.length === 0) return 0;
  return saturations.reduce((sum, value) => sum + value, 0) / saturations.length;
}
37
+
38
/**
 * Look for backdrop-blur evidence in pseudo-element samples and CSS variables.
 *
 * @param {object} [modernCss={}] - May carry `pseudoElements.samples`.
 * @param {object} [variables={}] - CSS variables; only string values are read.
 * @returns {boolean} True when any sample mentions `backdrop-filter`,
 *   `backdrop-blur`, or a `blur(<n>px)` call.
 */
function detectBackdropBlur(modernCss = {}, variables = {}) {
  const pseudoSamples = (modernCss.pseudoElements && modernCss.pseudoElements.samples) || [];
  const variableStrings = Object.values(variables || {}).filter((value) => typeof value === 'string');
  // Join everything into one haystack so a single regex pass covers it all.
  const haystack = [...pseudoSamples, ...variableStrings].join(' ');
  return /backdrop-filter|backdrop-blur|blur\(\s*\d+px\s*\)/i.test(haystack);
}
47
+
48
/**
 * Split a `box-shadow` value into its layers.
 *
 * Only commas outside parentheses separate layers, so the commas inside
 * `rgba(0, 0, 0, 0.1)` are left alone.
 *
 * @param {string} raw - Full box-shadow value.
 * @returns {string[]} Trimmed, non-empty layer strings.
 */
function splitShadowLayers(raw) {
  const layers = [];
  let depth = 0;
  let current = '';
  for (const ch of raw) {
    if (ch === '(') depth++;
    else if (ch === ')') depth = Math.max(0, depth - 1);
    if (ch === ',' && depth === 0) {
      layers.push(current);
      current = '';
    } else {
      current += ch;
    }
  }
  layers.push(current);
  return layers.map((layer) => layer.trim()).filter(Boolean);
}

/**
 * Summarise a list of box-shadow values.
 *
 * Soft-UI / neumorphism use pair-shadows (inset + outer) with low-saturation
 * grays; brutalism uses hard shadows with 0 blur.
 *
 * @param {Array<string|{value?: string}>} shadowValues - Raw shadow strings
 *   or objects carrying one in `value`.
 * @returns {{profile: 'none'|'soft'|'hard'|'diffuse', avgBlur: number,
 *   maxBlur: number, insetCount: number, hardShadowCount: number,
 *   hasPair: boolean}}
 *   `hasPair` is true when any value has two or more layers.
 */
function shadowComplexity(shadowValues) {
  if (!shadowValues.length) {
    return { profile: 'none', avgBlur: 0, maxBlur: 0, insetCount: 0, hardShadowCount: 0, hasPair: false };
  }

  // Colour functions (one level of nesting allowed) and hex colours are
  // removed before reading lengths. Otherwise `rgb(0 0 0 / 0.1)` or `#000 …`
  // can be misread as `offset-x offset-y blur`.
  const colorFunction = /[a-z-]+\((?:[^()]|\([^()]*\))*\)/gi;
  const hexColor = /#[0-9a-f]{3,8}\b/gi;
  // Blur is the third length in `offset-x offset-y blur [spread]`. The `px`
  // unit is common but optional.
  const lengthTriple = /(-?\d+(?:\.\d+)?)(?:px)?\s+(-?\d+(?:\.\d+)?)(?:px)?\s+(\d+(?:\.\d+)?)(?:px)?/;

  let insetCount = 0;
  let hardShadowCount = 0;
  let totalBlur = 0;
  let maxBlur = 0;
  let pairCount = 0;

  for (const entry of shadowValues) {
    const raw = String(typeof entry === 'string' ? entry : (entry?.value || ''));
    if (/inset/i.test(raw)) insetCount++;

    const layers = splitShadowLayers(raw);
    for (const layer of layers) {
      const lengthsOnly = layer.replace(colorFunction, ' ').replace(hexColor, ' ');
      const match = lengthsOnly.match(lengthTriple);
      if (!match) continue;
      const blur = parseFloat(match[3]);
      totalBlur += blur;
      if (blur > maxBlur) maxBlur = blur;
      if (blur === 0) hardShadowCount++;
    }
    // Count real layers. Counting raw commas flagged every single-layer
    // shadow with an rgba() colour as "paired".
    if (layers.length >= 2) pairCount++;
  }

  // NOTE(review): avgBlur divides the summed layer blurs by the number of
  // shadow VALUES, not layers, so multi-layer values weigh more. Left as-is
  // because the classifier thresholds were presumably chosen against this
  // behaviour — confirm before changing.
  const avgBlur = totalBlur / Math.max(1, shadowValues.length);
  let profile = 'soft';
  if (hardShadowCount > shadowValues.length * 0.5) profile = 'hard';
  else if (maxBlur > 40) profile = 'diffuse';
  return { profile, avgBlur, maxBlur, insetCount, hardShadowCount, hasPair: pairCount > 0 };
}
73
+
74
/**
 * Summarise a list of border radii.
 *
 * @param {Array<number|string|{value?: *, px?: *}>} [radii=[]] - Radii as
 *   numbers, CSS strings, or objects carrying the value in `value` or `px`.
 *   Unparseable entries count as 0.
 * @param {Array} [borderValues=[]] - Accepted for interface compatibility;
 *   not read.
 * @returns {{avg: number, max: number, pill: boolean, sharp: boolean}}
 *   `pill` is true when any radius is at least 500; `sharp` is true when
 *   there is at least one radius and all are below 4.
 */
function borderProfile(radii = [], borderValues = []) {
  const nums = radii
    .map((radius) => {
      if (typeof radius === 'number') return radius;
      if (typeof radius === 'string') return parseFloat(radius) || 0;
      if (radius && typeof radius === 'object') return parseFloat(radius.value || radius.px || 0) || 0;
      return 0;
    })
    .filter((n) => !Number.isNaN(n));

  const hasRadii = nums.length > 0;
  const avg = hasRadii ? nums.reduce((sum, n) => sum + n, 0) / nums.length : 0;
  // reduce() instead of Math.max(...nums) so very long lists cannot exceed
  // the engine's argument limit.
  const max = hasRadii ? nums.reduce((hi, n) => (n > hi ? n : hi), -Infinity) : 0;
  // `n >= 9999 || n >= 500` reduces to `n >= 500`.
  const pill = nums.some((n) => n >= 500);
  // Always a boolean; `nums.length && …` leaked the number 0 for empty input.
  const sharp = hasRadii && max < 4;
  return { avg, max, pill, sharp };
}
87
+
88
/**
 * Classify a design's "material language" from already-extracted tokens.
 *
 * Reads `design.colors.all`, `design.shadows.values`, `design.borders.radii`,
 * `design.variables`, `design.modernCss` and `design.gradients.count`; every
 * one of them is optional.
 *
 * @param {object} [design={}] - Extracted design tokens.
 * @returns {{label: string, confidence: number,
 *   signals: Array<{label: string, weight: number, detail: string}>,
 *   metrics: object, alternates: Array<{label: string, score: number}>}}
 *   `label` is 'flat' when nothing scored and 'mixed' when the top two scores
 *   are within 0.15 of each other.
 */
export function extractMaterialLanguage(design = {}) {
  const palette = design.colors?.all || [];
  const shadowList = design.shadows?.values || [];
  const radiusList = design.borders?.radii || [];
  const cssVariables = design.variables || {};
  const modernCss = design.modernCss || {};
  const gradientCount = design.gradients?.count || 0;

  const sat = avgSaturation(palette);
  const hasBackdropBlur = detectBackdropBlur(modernCss, cssVariables);
  const sh = shadowComplexity(shadowList);
  const br = borderProfile(radiusList);

  // Each rule adds its weight to one label when its condition holds. Rules
  // are applied in this order, which is also the order of `signals`.
  const rules = [
    {
      name: 'glassmorphism',
      weight: 0.6,
      detail: 'backdrop-filter present',
      applies: hasBackdropBlur,
    },
    {
      name: 'neumorphism',
      weight: 0.7,
      detail: 'paired blur + inset + low saturation',
      applies: sh.avgBlur > 30 && sat < 0.3 && sh.insetCount > 0 && sh.hasPair,
    },
    {
      name: 'brutalist',
      weight: 0.75,
      detail: 'hard shadows + sharp corners + saturated',
      applies: sh.profile === 'hard' && br.sharp && sat > 0.4,
    },
    {
      name: 'flat',
      weight: 0.55,
      detail: 'no shadows, simple radii',
      applies: shadowList.length === 0 && sh.insetCount === 0 && br.avg < 12 && gradientCount < 2,
    },
    {
      name: 'soft-ui',
      weight: 0.5,
      detail: 'soft diffuse shadows',
      applies: sh.avgBlur > 60 && sat < 0.4 && !hasBackdropBlur,
    },
    {
      name: 'material-you',
      weight: 0.45,
      detail: 'pill shapes + soft shadows',
      applies: br.pill && sh.profile === 'soft' && sat > 0.3,
    },
    {
      name: 'skeuomorphic',
      weight: 0.35,
      detail: 'heavy gradient usage',
      applies: gradientCount > 6 && sat > 0.5,
    },
  ];

  const scores = Object.fromEntries(LABELS.map((name) => [name, 0]));
  const signals = [];
  for (const { name, weight, detail, applies } of rules) {
    if (!applies) continue;
    scores[name] += weight;
    signals.push({ label: name, weight, detail });
  }

  const ranked = Object.entries(scores).sort(([, a], [, b]) => b - a);
  const [winner, winScore] = ranked[0];
  const second = ranked.length > 1 ? ranked[1][1] : 0;

  let label;
  if (winScore === 0) {
    label = 'flat';
  } else if (second > 0 && (winScore - second) < 0.15) {
    // If top two are close, it's "mixed".
    label = 'mixed';
  } else {
    label = winner;
  }
  const confidence = Math.min(1, winScore);

  const alternates = ranked
    .filter(([, score]) => score > 0 && score !== winScore)
    .slice(0, 3)
    .map(([name, score]) => ({ label: name, score: Number(score.toFixed(3)) }));

  return {
    label,
    confidence: Number(confidence.toFixed(3)),
    signals,
    metrics: {
      saturation: Number(sat.toFixed(3)),
      shadowProfile: sh.profile,
      avgShadowBlur: Number(sh.avgBlur.toFixed(1)),
      maxShadowBlur: Number(sh.maxBlur.toFixed(1)),
      insetShadows: sh.insetCount,
      avgRadius: Number(br.avg.toFixed(1)),
      maxRadius: Number(br.max.toFixed(1)),
      hasPill: br.pill,
      hasBackdropBlur,
      gradientCount,
    },
    alternates,
  };
}
@@ -0,0 +1,172 @@
1
+ // Classify a crawled URL into a canonical page type. Heuristic-only by default
2
+ // (zero deps, deterministic). Returns { type, confidence, signals, alternates }.
3
+ // Consumers can fall back to the optional --smart LLM pass when confidence is low.
4
+
5
// Every page type the classifier can emit. Scores are seeded from this list,
// so the order here is also the tie-break order when ranking.
const TYPES = [
  'landing',
  'pricing',
  'docs',
  'blog',
  'blog-post',
  'product',
  'about',
  'dashboard',
  'auth',
  'legal',
  'unknown',
];
9
+
10
// Path-based rules: each `re` is tested against the normalised URL path, and
// every match adds `weight` to the score of `type`.
const URL_RULES = [
  { re: /\/pricing(\/|$)/i, type: 'pricing', weight: 0.9 },
  { re: /\/plans?(\/|$)/i, type: 'pricing', weight: 0.75 },
  { re: /\/docs?(\/|$)|\/documentation|\/guide/i, type: 'docs', weight: 0.9 },
  { re: /\/api-reference|\/reference(\/|$)/i, type: 'docs', weight: 0.85 },
  { re: /\/blog(\/[\w-]+)+/i, type: 'blog-post', weight: 0.9 },
  // Blog INDEX only: the path must end at `/blog`. The earlier
  // `/\/blog(\/|$)/` also matched every `/blog/<slug>` URL, so posts scored
  // as both 'blog-post' and 'blog', and the tiny margin between the two
  // pulled confidence under the needsSmart threshold.
  { re: /\/blog\/?$/i, type: 'blog', weight: 0.85 },
  { re: /\/changelog(\/|$)/i, type: 'blog', weight: 0.6 },
  { re: /\/about(\/|$)|\/company|\/team/i, type: 'about', weight: 0.85 },
  { re: /\/product(\/|$)|\/products\//i, type: 'product', weight: 0.75 },
  { re: /\/features?(\/|$)|\/solutions?(\/|$)/i, type: 'product', weight: 0.7 },
  { re: /\/login|\/signin|\/sign-in|\/signup|\/sign-up|\/register/i, type: 'auth', weight: 0.95 },
  { re: /\/terms|\/privacy|\/legal|\/cookie-policy/i, type: 'legal', weight: 0.95 },
  { re: /\/app(\/|$)|\/dashboard|\/console|\/admin/i, type: 'dashboard', weight: 0.8 },
];
25
+
26
// Text patterns matched against the concatenated section text.

// Currency amounts or billing vocabulary.
const PRICING_TEXT = /(\$\s?\d|€\s?\d|£\s?\d|₹\s?\d)|\b(per\s?(month|user|seat)|\/mo\b|\/month|\/year|\/yr|billed (annually|monthly)|free (forever|plan|tier)|start (free|trial))\b/i;

// Documentation vocabulary. The `npx and "import {" alternatives sit outside
// the \b(...)\b group: they begin or end with non-word characters, so a word
// boundary on both sides could essentially never be satisfied.
const DOCS_TEXT = /\b(installation|getting started|api reference|parameters|return value|npm install|yarn add|pnpm add|quickstart)\b|`npx |import \{/i;

// Byline: "by Firstname Lastname". Deliberately case-sensitive on the name.
// Under the `i` flag `[A-Z][a-z]+` matched any two words, so ordinary phrases
// such as "by the way" counted as a byline.
const BLOG_BYLINE_TEXT = /\b[Bb]y\s+[A-Z][a-z]+\s+[A-Z][a-z]+\b/;
// Post metadata phrases (case-insensitive).
const BLOG_META_TEXT = /\b(posted on|published (on|in)|min read|\d+\s+min read)\b/i;
// Exposes only `.test(text)`, the one RegExp method this constant is used
// with, so the two patterns above can keep different case sensitivity.
const BLOG_POST_TEXT = {
  test: (text) => BLOG_BYLINE_TEXT.test(text) || BLOG_META_TEXT.test(text),
};

// Legal boilerplate.
const LEGAL_TEXT = /\b(privacy policy|terms of service|terms of use|cookie policy|data protection|gdpr|ccpa|effective date|last updated)\b/i;

// Sign-in / sign-up form vocabulary.
const AUTH_TEXT = /\b(sign in|log in|create (an )?account|forgot password|email address|password)\b/i;
31
+
32
/**
 * Extract the URL path without trailing slashes.
 *
 * @param {string} url - Absolute URL.
 * @returns {string} The pathname with trailing slashes removed; '/' for the
 *   site root or when `url` cannot be parsed.
 */
function rootPath(url) {
  let pathname;
  try {
    pathname = new URL(url).pathname;
  } catch {
    return '/';
  }
  const trimmed = (pathname || '/').replace(/\/+$/, '');
  return trimmed === '' ? '/' : trimmed;
}
38
+
39
/**
 * Approximate the number of form controls on a page.
 *
 * Section records do not include input tags, so this sums each section's
 * `buttonCount` as a stand-in.
 *
 * @param {Array<{buttonCount?: number}>} sections
 * @returns {number} Total of all `buttonCount` values (missing counts as 0).
 */
function countFormFields(sections) {
  let total = 0;
  for (const section of sections) {
    total += section.buttonCount || 0;
  }
  return total;
}
43
+
44
/**
 * Look for the typical docs layout: a sidebar plus a long article.
 *
 * @param {Array<{tag?: string, className?: string, text?: string}>} sections
 * @returns {{hasSidebar: boolean, articleLen: number}} `hasSidebar` is true
 *   when any section is an <aside> or has a sidebar-like class name;
 *   `articleLen` is the text length of the first <main> or <section> (0 if
 *   there is none).
 */
function detectDocsLayout(sections) {
  const sidebarClass = /sidebar|toc|nav-?docs|left-?nav/i;

  let hasSidebar = false;
  for (const section of sections) {
    if (section.tag === 'aside' || sidebarClass.test(section.className || '')) {
      hasSidebar = true;
      break;
    }
  }

  const article = sections.find(({ tag }) => tag === 'main' || tag === 'section');
  const articleLen = article ? (article.text || '').length : 0;
  return { hasSidebar, articleLen };
}
53
+
54
/**
 * Detect a pricing table: a section whose text matches PRICING_TEXT and that
 * holds between 2 and 6 cards.
 *
 * @param {Array<{text?: string, cardCount?: number}>} sections
 * @returns {boolean} True when at least one section qualifies.
 */
function detectPricingLayout(sections) {
  return sections.some((section) => {
    const cards = section.cardCount || 0;
    return PRICING_TEXT.test(section.text || '') && cards >= 2 && cards <= 6;
  });
}
62
+
63
/**
 * Classify a crawled page into one of the canonical page types.
 *
 * Evidence comes from the URL path, the og:type meta, title keywords,
 * section text, and the overall section shape. Each piece of evidence adds a
 * weight to one type and is recorded in `signals`.
 *
 * @param {object} [rawData={}] - Crawl data. Reads `url`, `title`, `sections`
 *   (or `light.sections`) and `light.stack.metas`.
 * @param {{url?: string, title?: string}} [opts={}] - Overrides for the URL
 *   and title found in `rawData`.
 * @returns {{type: string, confidence: number, path: string, title: string,
 *   description: string, signals: object[],
 *   alternates: Array<{type: string, score: number}>, needsSmart: boolean}}
 *   `type` is 'unknown' when nothing scored; `needsSmart` is true when the
 *   unrounded confidence is below 0.6.
 */
export function extractPageIntent(rawData = {}, opts = {}) {
  const rawTitle = opts.title || rawData.title || '';
  const path = rootPath(opts.url || rawData.url || '');
  const title = rawTitle.toLowerCase();
  const sections = rawData.light?.sections || rawData.sections || [];
  const metas = (rawData.light?.stack?.metas || []).map((meta) => ({
    name: (meta.name || '').toLowerCase(),
    content: (meta.content || ''),
  }));
  const metaContent = (key) => metas.find((meta) => meta.name === key)?.content || '';
  const description = metaContent('description');
  const ogType = metaContent('og:type');

  const scores = Object.fromEntries(TYPES.map((t) => [t, 0]));
  const signals = [];
  // Add `weight` to a type and log the evidence. `detail` is only attached
  // when the caller supplies one.
  const record = (kind, type, weight, detail) => {
    scores[type] += weight;
    signals.push(detail === undefined ? { kind, type, weight } : { kind, type, weight, detail });
  };

  // 1) URL rules (strongest signal).
  URL_RULES
    .filter((rule) => rule.re.test(path))
    .forEach((rule) => record('url', rule.type, rule.weight));

  // 2) og:type.
  if (ogType === 'article') record('meta', 'blog-post', 0.6, 'og:type=article');

  // 3) Title keywords.
  const titleRules = [
    [/\bpricing\b|\bplans?\b/, 'pricing', 0.4],
    [/\bdocs?\b|documentation|guide/, 'docs', 0.4],
    [/\bblog\b/, 'blog', 0.3],
    [/\bprivacy|\bterms\b/, 'legal', 0.5],
    [/\bsign.?in|\blog.?in|\bsign.?up\b/, 'auth', 0.5],
  ];
  for (const [pattern, type, weight] of titleRules) {
    if (pattern.test(title)) record('title', type, weight);
  }

  // 4) DOM text signals, over the first 8000 characters of section text.
  const bigText = sections.map((section) => section.text || '').join('\n').slice(0, 8000);
  if (PRICING_TEXT.test(bigText) && detectPricingLayout(sections)) {
    record('dom', 'pricing', 0.6, 'currency+card-grid');
  }
  if (DOCS_TEXT.test(bigText)) {
    const { hasSidebar, articleLen } = detectDocsLayout(sections);
    const weight = 0.3 + (hasSidebar ? 0.25 : 0) + (articleLen > 1500 ? 0.15 : 0);
    record('dom', 'docs', weight, `sidebar=${hasSidebar} article=${articleLen}`);
  }
  if (BLOG_POST_TEXT.test(bigText)) record('dom', 'blog-post', 0.35, 'byline|min-read');
  if (LEGAL_TEXT.test(bigText)) record('dom', 'legal', 0.4);
  if (AUTH_TEXT.test(bigText) && countFormFields(sections) < 8 && bigText.length < 3000) {
    record('dom', 'auth', 0.35, 'auth-form-shape');
  }

  // 5) Path="/" fallback → landing (weak prior).
  if (path === '/' || path === '') record('url', 'landing', 0.45, 'root-path');

  // 6) Generic "has hero + features + cta" shape → landing.
  const roles = new Set();
  for (const section of sections) {
    const blob = `${section.className || ''} ${section.id || ''}`.toLowerCase();
    if (/hero/.test(blob)) roles.add('hero');
    if ((section.cardCount || 0) >= 3) roles.add('features');
    if (/cta|get-?started/.test(blob)) roles.add('cta');
    if (section.tag === 'footer') roles.add('footer');
  }
  if (roles.has('hero') && (roles.has('features') || roles.has('cta'))) {
    record('shape', 'landing', 0.3, [...roles].join('+'));
  }

  // Pick winner. Confidence blends the winning score with its lead over the
  // runner-up.
  const ranked = Object.entries(scores).sort(([, a], [, b]) => b - a);
  const [winType, winScore] = ranked[0];
  const secondScore = ranked[1] ? ranked[1][1] : 0;
  const margin = winScore - secondScore;
  const confidence = winScore > 0 ? Math.min(1, winScore * 0.6 + margin * 0.4) : 0;
  const type = winScore === 0 ? 'unknown' : winType;

  const alternates = ranked
    .filter(([, score]) => score > 0 && score !== winScore)
    .slice(0, 3)
    .map(([t, score]) => ({ type: t, score: Number(score.toFixed(3)) }));

  return {
    type,
    confidence: Number(confidence.toFixed(3)),
    path,
    title: rawTitle,
    description: description.slice(0, 200),
    signals: signals.slice(0, 20),
    alternates,
    needsSmart: confidence < 0.6,
  };
}
171
+
172
+ export const PAGE_TYPES = TYPES; // Public alias of the internal TYPES list (same array instance, not a copy).