launchframe 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +61 -34
- package/package.json +64 -64
- package/packages/extract/dom-crawler.ts +521 -0
- package/packages/extract/emit.ts +2 -2
- package/packages/extract/extract.ts +66 -16
- package/packages/extract/mirror-emit.ts +522 -0
- package/packages/extract/types.ts +118 -0
|
@@ -0,0 +1,521 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* DOM layout crawler.
|
|
3
|
+
*
|
|
4
|
+
* Runs inside the rendered page via Playwright's `page.evaluate`. Walks the
|
|
5
|
+
* DOM, identifies top-level sections, classifies each section's role and
|
|
6
|
+
* composition, and counts the content slots it contains. Returns a
|
|
7
|
+
* `SiteLayout` structural model the emitter rebuilds into a Next.js page.
|
|
8
|
+
*
|
|
9
|
+
* What this records:
|
|
10
|
+
* - Section tree (geometry, role, composition, density)
|
|
11
|
+
* - Slot inventory per section: how many headings / body paragraphs /
|
|
12
|
+
* buttons / images / icons / logos / code blocks etc. it contains
|
|
13
|
+
* - Per-section style tokens: background, foreground, padding
|
|
14
|
+
* - Page-level tokens: fonts, primary surface colors, container width
|
|
15
|
+
*
|
|
16
|
+
* What this does NOT record:
|
|
17
|
+
* - Heading or body text content (slots are counts, not strings).
|
|
18
|
+
* - Raw HTML, CSS, or class names from the source.
|
|
19
|
+
* - Brand assets (logos, illustrations, product screenshots).
|
|
20
|
+
*
|
|
21
|
+
* The structural model is what the mirror emitter uses to reconstruct the
|
|
22
|
+
* page's section grammar with `<TextSlot>` / `<MediaSlot>` placeholders.
|
|
23
|
+
*/
|
|
24
|
+
|
|
25
|
+
import type { Page } from "playwright";
|
|
26
|
+
|
|
27
|
+
import type {
|
|
28
|
+
Composition,
|
|
29
|
+
SectionLayout,
|
|
30
|
+
SectionRole,
|
|
31
|
+
SiteLayout,
|
|
32
|
+
SiteTokens,
|
|
33
|
+
SlotCount,
|
|
34
|
+
SlotKind,
|
|
35
|
+
} from "./types.js";
|
|
36
|
+
|
|
37
|
+
/**
 * Crawl the already-rendered `page` and return its structural layout model.
 *
 * @param page - Playwright page to evaluate the in-browser crawler in.
 * @param url - URL recorded in the result; its `host` is derived from it.
 * @param viewport - Viewport dimensions, copied verbatim into the result.
 * @returns The in-page crawl result merged with `url`, `host`, `viewport`
 *   and a `capturedAt` ISO timestamp taken on the Node side.
 * @throws If `url` cannot be parsed by `new URL`, or if either
 *   `page.evaluate` call rejects.
 */
export async function crawlLayout(
  page: Page,
  url: string,
  viewport: { width: number; height: number },
): Promise<SiteLayout> {
  // Install an identity `__name` on the page's global object when it is
  // missing. NOTE(review): presumably this guards against a transpiler that
  // wraps serialized functions in `__name(...)` calls — confirm against the
  // build tooling.
  await page.evaluate(() => {
    const scope = globalThis as unknown as { __name?: (fn: unknown) => unknown };
    if (typeof scope.__name === "undefined") {
      scope.__name = (fn: unknown) => fn;
    }
  });

  const { host } = new URL(url);
  const layoutFields = await page.evaluate(crawlInPage);
  const capturedAt = new Date().toISOString();

  return { url, host, capturedAt, viewport, ...layoutFields };
}
|
|
58
|
+
|
|
59
|
+
/**
|
|
60
|
+
* Browser-context crawler. Dependency-free so Playwright can serialize it.
|
|
61
|
+
* Returns the layout-bearing fields of `SiteLayout` (url/host/capturedAt
|
|
62
|
+
* are added on the Node side).
|
|
63
|
+
*/
|
|
64
|
+
function crawlInPage(): Pick<
|
|
65
|
+
SiteLayout,
|
|
66
|
+
"pageHeightPx" | "sections" | "tokens"
|
|
67
|
+
> {
|
|
68
|
+
const VIEWPORT_W = window.innerWidth;
|
|
69
|
+
const PAGE_H = Math.max(
|
|
70
|
+
document.documentElement.scrollHeight,
|
|
71
|
+
document.body.scrollHeight,
|
|
72
|
+
);
|
|
73
|
+
|
|
74
|
+
/* ----- helpers ----- */
|
|
75
|
+
|
|
76
|
+
/**
 * Convert a CSS `rgb()` / `rgba()` colour string into `#rrggbb`.
 *
 * Accepts both the legacy comma-separated form (`rgba(1, 2, 3, 0.5)`) and
 * the space/slash form (`rgb(1 2 3 / 50%)`). Channels are clamped to 0–255
 * so the result is always exactly six hex digits.
 *
 * @param rgb - Colour string, typically a computed style value.
 * @returns The hex colour, or `null` when the input is empty,
 *   `"transparent"`, not an `rgb()`/`rgba()` value, has an unparsable
 *   channel, or has an alpha below 0.05 (treated as invisible).
 */
function toHex(rgb: string): string | null {
  if (!rgb || rgb === "transparent") return null;
  const m = rgb.match(/rgba?\(([^)]+)\)/);
  if (!m) return null;

  // Commas, whitespace and the alpha slash are all treated as separators.
  const parts = (m[1] ?? "").split(/[\s,\/]+/).filter((s) => s.length > 0);
  const [rawR, rawG, rawB, rawA] = parts;
  if (rawR === undefined || rawG === undefined || rawB === undefined) return null;

  // Percentages are scaled to 0–255; plain numbers keep integer truncation.
  const channel = (raw: string): number => {
    const n = raw.endsWith("%") ? Math.round(parseFloat(raw) * 2.55) : parseInt(raw, 10);
    return Number.isNaN(n) ? n : Math.min(255, Math.max(0, n));
  };

  let alpha = 1;
  if (rawA !== undefined) {
    alpha = parseFloat(rawA);
    if (rawA.endsWith("%")) alpha /= 100;
  }
  if (alpha < 0.05) return null;

  const channels = [channel(rawR), channel(rawG), channel(rawB)];
  if (channels.some((n) => Number.isNaN(n))) return null;

  return `#${channels.map((n) => n.toString(16).padStart(2, "0")).join("")}`;
}
|
|
90
|
+
|
|
91
|
+
/**
 * Report whether `el` is rendered: not `visibility: hidden`, not
 * `display: none`, computed opacity of at least 0.05, and a bounding box
 * with positive width and height.
 */
function isVisible(el: HTMLElement): boolean {
  const { visibility, display, opacity } = getComputedStyle(el);
  const hiddenByCss = visibility === "hidden" || display === "none";
  if (hiddenByCss || parseFloat(opacity) < 0.05) return false;

  const { width, height } = el.getBoundingClientRect();
  return width > 0 && height > 0;
}
|
|
98
|
+
|
|
99
|
+
/**
 * Sum the trimmed lengths of the text nodes that are direct children of
 * `el`. Text inside nested elements is not counted.
 */
function directTextLength(el: Element): number {
  const TEXT_NODE = 3;
  return Array.from(el.childNodes)
    .filter((node) => node.nodeType === TEXT_NODE)
    .reduce((sum, node) => sum + (node.nodeValue ?? "").trim().length, 0);
}
|
|
109
|
+
|
|
110
|
+
/**
 * Map a single element to the content-slot kind it represents.
 *
 * Media, form and button elements are classified by tag. Everything else
 * must carry direct text and is classified by heading tag, list item, or
 * computed font size / weight.
 *
 * Wrapper/child pairs that describe one piece of content are counted once:
 * a `<picture>` that contains an `<img>` defers to the `<img>`, and a
 * `<code>` nested in a `<pre>` defers to the `<pre>`.
 *
 * @param el - Element to classify.
 * @returns The slot kind, or `null` when the element is not a slot.
 */
function classifySlotForElement(el: HTMLElement): SlotKind | null {
  const tag = el.tagName;
  const style = getComputedStyle(el);
  const fontSize = parseFloat(style.fontSize) || 16;
  const fontWeight = parseInt(style.fontWeight, 10) || 400;

  // Avoid double-counting: the inner <img> is classified on its own visit.
  if (tag === "PICTURE" && el.querySelector("img") !== null) return null;

  if (tag === "IMG" || tag === "PICTURE") {
    // Heuristic: short images that are not extremely wide count as logos.
    const r = el.getBoundingClientRect();
    const ratio = r.width > 0 && r.height > 0 ? r.width / r.height : 0;
    if (r.height > 0 && r.height < 56 && ratio < 6) return "logo-mono";
    return "image";
  }
  // Both casings are accepted for the <svg> tag name.
  if (tag === "SVG" || tag === "svg") {
    const r = el.getBoundingClientRect();
    if (r.width < 32 && r.height < 32) return "icon";
    return "image";
  }
  if (tag === "VIDEO") return "video";

  // Avoid double-counting: <pre><code>…</code></pre> is one code block.
  if (tag === "CODE" && el.closest("pre") !== null) return null;
  if (tag === "PRE" || tag === "CODE") return "code";
  if (tag === "INPUT" || tag === "TEXTAREA" || tag === "SELECT") return "input";

  if (tag === "BUTTON" || (tag === "A" && el.getAttribute("role") === "button")) {
    // Primary when the button has its own background that differs from the
    // page (<body>) background; otherwise secondary.
    const bg = toHex(style.backgroundColor);
    if (bg && bg !== toHex(getComputedStyle(document.body).backgroundColor)) {
      return "button-primary";
    }
    return "button-secondary";
  }

  if (tag === "A" && directTextLength(el) > 0) {
    // Anchor styled like a CTA: horizontal padding plus a background or border.
    const padX = parseFloat(style.paddingLeft) + parseFloat(style.paddingRight);
    const anchorBg = toHex(style.backgroundColor);
    const hasBackdrop = anchorBg !== null || parseFloat(style.borderTopWidth) > 0;
    if (padX > 16 && hasBackdrop) {
      return anchorBg ? "button-primary" : "button-secondary";
    }
  }

  // From here on only elements with their own direct text are considered.
  if (directTextLength(el) === 0) return null;

  if (/^H[1-6]$/.test(tag)) {
    const level = parseInt(tag.slice(1), 10);
    if (level === 1) return "heading-1";
    if (level === 2) return "heading-2";
    return "heading-3"; // h3–h6 collapse into one bucket
  }

  if (tag === "LI") return "bullet";

  // Non-heading tags that are styled like headings.
  if (fontSize >= 36 && fontWeight >= 600) return "heading-1";
  if (fontSize >= 24 && fontWeight >= 500) return "heading-2";
  if (fontSize >= 18 && fontWeight >= 500) return "heading-3";

  if (fontSize <= 12 && /uppercase/i.test(style.textTransform)) return "eyebrow";
  if (fontSize <= 13 && fontWeight >= 600) return "badge";

  if (tag === "P" || tag === "SPAN" || tag === "DIV") return "body";

  return null;
}
|
|
174
|
+
|
|
175
|
+
/**
 * Tally the content slots found among the visible descendants of `root`.
 *
 * Every descendant is classified independently, so nested text wrappers can
 * inflate the coarse kinds; "body" is therefore capped at 12 and "bullet"
 * at 24.
 *
 * @returns One `{ kind, count }` entry per kind seen, sorted by kind name.
 */
function countSlots(root: HTMLElement): SlotCount[] {
  const tally = new Map<SlotKind, number>();

  root.querySelectorAll<HTMLElement>("*").forEach((node) => {
    if (!isVisible(node)) return;
    const slot = classifySlotForElement(node);
    if (!slot) return;
    tally.set(slot, (tally.get(slot) ?? 0) + 1);
  });

  // Clamp the kinds that are known to over-count.
  const caps: Array<[SlotKind, number]> = [
    ["body", 12],
    ["bullet", 24],
  ];
  for (const [kind, max] of caps) {
    if ((tally.get(kind) ?? 0) > max) tally.set(kind, max);
  }

  const result: SlotCount[] = Array.from(tally, ([kind, count]) => ({ kind, count }));
  result.sort((left, right) => left.kind.localeCompare(right.kind));
  return result;
}
|
|
192
|
+
|
|
193
|
+
/* ----- section discovery ----- */
|
|
194
|
+
|
|
195
|
+
/**
 * Discover the page's top-level sections.
 *
 * Candidates are every <header>, <main>, <section>, <article>, <footer> and
 * <nav>, plus the direct children of <body> and of the first <main>. A
 * candidate is accepted when it is visible, at least 70% of the viewport
 * wide and at least 80 px tall.
 *
 * Any accepted candidate nested inside another accepted candidate is then
 * dropped, whatever their vertical positions, so the result never contains
 * overlapping ancestor/descendant pairs.
 *
 * NOTE(review): because outer candidates win, an accepted <main> (or a
 * full-width wrapper under <body>) absorbs every section inside it —
 * confirm this is the intended granularity.
 *
 * @returns Accepted elements ordered by document-space top edge.
 */
function findSections(): HTMLElement[] {
  const candidates = new Set<HTMLElement>();
  const tagSet = ["HEADER", "MAIN", "SECTION", "ARTICLE", "FOOTER", "NAV"];
  for (const tag of tagSet) {
    for (const el of Array.from(document.getElementsByTagName(tag))) {
      candidates.add(el as HTMLElement);
    }
  }
  // Fallback: direct children of <body> and <main> may play a section role.
  for (const el of Array.from(document.body.children)) {
    candidates.add(el as HTMLElement);
  }
  const main = document.querySelector("main");
  if (main) {
    for (const el of Array.from(main.children)) {
      candidates.add(el as HTMLElement);
    }
  }

  // Measure each candidate once; the top edge is reused for sorting.
  const accepted: Array<{ el: HTMLElement; top: number }> = [];
  for (const el of candidates) {
    if (!isVisible(el)) continue;
    const r = el.getBoundingClientRect();
    if (r.width < VIEWPORT_W * 0.7) continue;
    if (r.height < 80) continue;
    accepted.push({ el, top: r.top + window.scrollY });
  }

  // Keep only outermost candidates. This check does not depend on sort
  // order, so a container that starts at the same y as (or below) one of
  // its descendants still supersedes it.
  const outermost = accepted.filter(
    (entry) => !accepted.some((other) => other.el !== entry.el && other.el.contains(entry.el)),
  );

  // Array.prototype.sort is stable, so equal tops keep discovery order.
  outermost.sort((a, b) => a.top - b.top);
  return outermost.map((entry) => entry.el);
}
|
|
245
|
+
|
|
246
|
+
/* ----- per-section classification ----- */
|
|
247
|
+
|
|
248
|
+
/**
 * Guess the role a section plays on the page from its tag, its position in
 * the section list, its height and its slot inventory. Rules are checked in
 * order and the first match wins; `"other"` is the fallback.
 *
 * @param el - The section element.
 * @param indexFromTop - Zero-based position counted from the first section.
 * @param indexFromBottom - Zero-based position counted from the last section.
 * @param slots - Slot counts for this section.
 */
function classifyRole(
  el: HTMLElement,
  indexFromTop: number,
  indexFromBottom: number,
  slots: SlotCount[],
): SectionRole {
  const { width, height } = el.getBoundingClientRect();
  const tagName = el.tagName;

  // First entry per kind wins, matching a linear search of `slots`.
  const byKind = new Map<SlotKind, number>();
  for (const slot of slots) {
    if (!byKind.has(slot.kind)) byKind.set(slot.kind, slot.count);
  }
  const n = (kind: SlotKind): number => byKind.get(kind) ?? 0;

  // Tag- and position-based roles.
  if (tagName === "NAV") return "nav";
  const isShortLastSection = indexFromBottom === 0 && height < 600;
  if (tagName === "FOOTER" || isShortLastSection) return "footer";
  if (tagName === "HEADER" && indexFromTop === 0 && height < 140) return "nav";

  const h1 = n("heading-1");
  const subHeadings = n("heading-2") + n("heading-3");
  const headings = h1 + subHeadings;
  const ctas = n("button-primary") + n("button-secondary");
  const imageCount = n("image");
  const logoCount = n("logo-mono");
  const bulletCount = n("bullet");

  // One of the first two sections with an H1 and a CTA → hero.
  if (indexFromTop <= 1 && h1 >= 1 && ctas >= 1) return "hero";

  // Wide, short band with several logo-sized images → logo strip.
  if (logoCount >= 4 && headings <= 1 && height < width * 0.25) {
    return "proof-logos";
  }

  // Many bullets and at least two buttons in a tall block → pricing.
  if (bulletCount >= 6 && ctas >= 2 && height > 360) return "pricing";

  // Three or more sub-headings with few images → feature grid.
  if (subHeadings >= 3 && imageCount <= 2) return "feature-grid";

  // A heading with media and few bullets in a tall block → deep dive.
  if (headings >= 1 && imageCount >= 1 && bulletCount <= 4 && height > 320) {
    return "feature-deep-dive";
  }

  // Short block with a CTA among the last three sections → conversion band.
  if (headings <= 2 && ctas >= 1 && height < 480 && indexFromBottom <= 2) {
    return "conversion";
  }

  // Body text only, with no media, bullets or buttons → quotes.
  if (n("body") >= 3 && imageCount === 0 && bulletCount === 0 && ctas === 0) {
    return "proof-quotes";
  }

  return "other";
}
|
|
299
|
+
|
|
300
|
+
/**
 * Classify how a section lays out its content.
 *
 * Scans up to the first 400 descendants of `el` and records the largest
 * column count found on any visible CSS grid (tracks in the computed
 * `grid-template-columns`) or row-direction flex container (visible
 * children, at most 12). A row-flex container with four or more uniformly
 * small children marks the section as a logo row, which takes precedence.
 *
 * @returns `"logo-row"`, `"single-column"`, `"split-2"`, `"grid-3"`,
 *   `"grid-4"` or `"list"` (five or more columns).
 */
function classifyComposition(el: HTMLElement): Composition {
  const candidates = el.querySelectorAll<HTMLElement>("*");
  let bestCols = 1;
  let bestKind: "grid" | "flex" | "none" = "none";
  let logoRowCols = 0;
  // Matches named grid lines such as `[content-start]`.
  const lineNames = /\[[^\]]*\]/g;

  for (const c of Array.from(candidates).slice(0, 400)) {
    if (!isVisible(c)) continue;
    const s = getComputedStyle(c);
    if (s.display === "grid") {
      // Named lines appear in the track list but are not tracks, so strip
      // them before counting.
      const cols = s.gridTemplateColumns
        .replace(lineNames, " ")
        .split(" ")
        .filter((x) => x.trim().length > 0).length;
      if (cols > bestCols) {
        bestCols = cols;
        bestKind = "grid";
      }
    } else if (s.display === "flex" && s.flexDirection.startsWith("row")) {
      const kids = Array.from(c.children) as HTMLElement[];
      const visibleKids = kids.filter(isVisible);
      if (visibleKids.length > bestCols && visibleKids.length <= 12) {
        bestCols = visibleKids.length;
        bestKind = "flex";
      }
      if (visibleKids.length >= 4) {
        // Four or more small, similarly sized children look like a logo strip.
        const allSmall = visibleKids.every((k) => {
          const kr = k.getBoundingClientRect();
          return kr.height < 80 && kr.width < 200;
        });
        if (allSmall) logoRowCols = Math.max(logoRowCols, visibleKids.length);
      }
    }
  }

  if (logoRowCols >= 4) return "logo-row";
  if (bestKind === "none" || bestCols <= 1) return "single-column";
  if (bestCols === 2) return "split-2";
  if (bestCols === 3) return "grid-3";
  if (bestCols === 4) return "grid-4";
  if (bestCols >= 5) return "list";
  // Not reachable for integer column counts; kept to satisfy the return type.
  return "unknown";
}
|
|
342
|
+
|
|
343
|
+
/**
 * Rate how tightly content is packed in a section: total slot count per
 * 100 px of height. Sections shorter than 100 px are treated as 100 px.
 *
 * @returns `"thin"` below 0.8, `"dense"` above 2.4, otherwise `"balanced"`.
 */
function classifyDensity(el: HTMLElement, slots: SlotCount[]): "thin" | "balanced" | "dense" {
  let slotTotal = 0;
  for (const { count } of slots) slotTotal += count;

  const hundredsOfPx = el.getBoundingClientRect().height / 100;
  const perHundredPx = slotTotal / Math.max(1, hundredsOfPx);

  if (perHundredPx < 0.8) return "thin";
  return perHundredPx > 2.4 ? "dense" : "balanced";
}
|
|
351
|
+
|
|
352
|
+
/**
 * Read a section's own computed background colour, text colour and
 * vertical padding.
 *
 * Colours are hex strings or `null` when `toHex` rejects them. Padding is
 * rounded to whole pixels; a value of zero or an unparsable value is `null`.
 */
function extractSectionStyles(el: HTMLElement): SectionLayout["styles"] {
  const computed = getComputedStyle(el);
  const wholePxOrNull = (value: string): number | null =>
    Math.round(parseFloat(value) || 0) || null;

  const backgroundHex = toHex(computed.backgroundColor);
  const foregroundHex = toHex(computed.color);
  const paddingTopPx = wholePxOrNull(computed.paddingTop);
  const paddingBottomPx = wholePxOrNull(computed.paddingBottom);

  return { backgroundHex, foregroundHex, paddingTopPx, paddingBottomPx };
}
|
|
361
|
+
|
|
362
|
+
/* ----- page-level token extraction ----- */
|
|
363
|
+
|
|
364
|
+
/**
 * Derive page-wide design tokens from computed styles.
 *
 * - Fonts: first family of <body>, and of the first h1/h2/h3 if present.
 * - Background / foreground: <body> colours (defaults `#ffffff` / `#0a0a0a`).
 * - Primary: most common background among visible buttons and links that
 *   differs from the page background (defaults to the foreground).
 * - Muted: most common non-page, non-primary background among all visible
 *   elements (defaults to the page background).
 * - Border: most common top border colour (default `#e5e7eb`).
 * - Radius: most common rounded top-left radius in (0, 64) px (default 8).
 * - Container: width of the largest-area visible layout block that is
 *   720–1600 px wide and at least 240 px tall, or `null` if none is found.
 *
 * Surface, border and radius statistics are gathered in one walk over
 * `body *`, so each element's computed style is read once for all three.
 * All helpers are nested so the enclosing function stays self-contained.
 */
function extractPageTokens(): SiteTokens {
  /** First family in a CSS font stack, with surrounding quotes removed. */
  const firstFamily = (stack: string): string =>
    (stack.split(",")[0] ?? "").trim().replace(/^["']|["']$/g, "");

  /** Increment the counter stored under `key`. */
  const bump = <K>(counts: Map<K, number>, key: K): void => {
    counts.set(key, (counts.get(key) ?? 0) + 1);
  };

  /**
   * Key with the highest count; on ties the earliest-inserted key wins.
   * Returns `fallback` when the map is empty; `skip` is never returned.
   */
  const mostFrequent = <K>(counts: Map<K, number>, fallback: K, skip?: K): K => {
    let best = fallback;
    let bestCount = 0;
    for (const [key, count] of counts) {
      if (skip !== undefined && key === skip) continue;
      if (count > bestCount) {
        best = key;
        bestCount = count;
      }
    }
    return best;
  };

  const bodyStyle = getComputedStyle(document.body);
  const bodyFontFamily = firstFamily(bodyStyle.fontFamily) || "system-ui";

  let headingFontFamily = bodyFontFamily;
  const h = document.querySelector("h1, h2, h3");
  if (h) {
    headingFontFamily = firstFamily(getComputedStyle(h).fontFamily) || bodyFontFamily;
  }

  const bg = toHex(bodyStyle.backgroundColor) ?? "#ffffff";
  const fg = toHex(bodyStyle.color) ?? "#0a0a0a";

  // Primary: the most-used button background that is not the page background.
  const buttonBgCounts = new Map<string, number>();
  for (const b of Array.from(document.querySelectorAll<HTMLElement>("button, a, [role='button']"))) {
    if (!isVisible(b)) continue;
    const sb = toHex(getComputedStyle(b).backgroundColor);
    if (!sb || sb === bg) continue;
    bump(buttonBgCounts, sb);
  }
  const primary = mostFrequent(buttonBgCounts, fg);

  // One walk collects surface, border and radius statistics together.
  const surfaceCounts = new Map<string, number>();
  const borderCounts = new Map<string, number>();
  const radiusCounts = new Map<number, number>();
  for (const el of Array.from(document.querySelectorAll<HTMLElement>("body *"))) {
    if (!isVisible(el)) continue;
    const s = getComputedStyle(el);

    const surface = toHex(s.backgroundColor);
    if (surface && surface !== bg) bump(surfaceCounts, surface);

    // Negated `<= 0` rather than `> 0`, so an unparsable width is not skipped.
    if (!(parseFloat(s.borderTopWidth) <= 0)) {
      const bh = toHex(s.borderTopColor);
      if (bh) bump(borderCounts, bh);
    }

    const r = parseFloat(s.borderTopLeftRadius);
    if (r > 0 && r < 64) bump(radiusCounts, Math.round(r));
  }
  const muted = mostFrequent(surfaceCounts, bg, primary);
  const border = mostFrequent(borderCounts, "#e5e7eb");
  const radius = mostFrequent(radiusCounts, 8);

  // Container: the qualifying layout block with the largest area.
  let containerPx: number | null = null;
  let containerArea = 0;
  for (const el of Array.from(
    document.querySelectorAll<HTMLElement>("main, section, header, footer, div"),
  )) {
    if (!isVisible(el)) continue;
    const r = el.getBoundingClientRect();
    if (r.width < 720 || r.width > 1600) continue;
    if (r.height < 240) continue;
    const area = r.width * r.height;
    if (area > containerArea) {
      containerArea = area;
      containerPx = Math.round(r.width);
    }
  }

  return {
    bodyFontFamily,
    headingFontFamily,
    backgroundHex: bg,
    foregroundHex: fg,
    primaryHex: primary,
    mutedHex: muted,
    borderHex: border,
    radiusPx: radius,
    containerPx,
  };
}
|
|
480
|
+
|
|
481
|
+
/* ----- main pass ----- */
|
|
482
|
+
|
|
483
|
+
// Build one SectionLayout per discovered section, in page order, then read
// the page-level tokens.
const sectionEls = findSections();
const lastIndex = sectionEls.length - 1;
const clamp01 = (value: number): number => Math.max(0, Math.min(1, value));

const sections: SectionLayout[] = sectionEls.map((el, i) => {
  const rect = el.getBoundingClientRect();
  const docTop = rect.top + window.scrollY;

  const slots = countSlots(el);
  const composition = classifyComposition(el);
  const density = classifyDensity(el, slots);
  const role = classifyRole(el, i, lastIndex - i, slots);
  const styles = extractSectionStyles(el);

  const notes: string[] = [];
  if (slots.length === 0) {
    notes.push("No content slots detected; rendering an empty wrapper.");
  }
  if (composition === "unknown") {
    notes.push("Composition was ambiguous; fell back to single-column.");
  }

  return {
    id: `s${i + 1}`,
    role,
    composition,
    density,
    // [left, top, width, height] as 0–1 fractions: horizontal values
    // relative to the viewport width, vertical ones to the page height.
    bbox: [
      clamp01(rect.left / VIEWPORT_W),
      clamp01(docTop / PAGE_H),
      clamp01(rect.width / VIEWPORT_W),
      clamp01(rect.height / PAGE_H),
    ],
    slots,
    styles,
    notes,
  };
});

const tokens = extractPageTokens();
return { pageHeightPx: PAGE_H, sections, tokens };
|
|
521
|
+
}
|
package/packages/extract/emit.ts
CHANGED
|
@@ -9,8 +9,8 @@
|
|
|
9
9
|
* globals.css — shadcn-compatible CSS variables (light + dark)
|
|
10
10
|
* theme-preview.tsx — a self-contained React component that renders
|
|
11
11
|
* every token so you can eyeball the system
|
|
12
|
-
* REPORT.md — what was extracted, from where,
|
|
13
|
-
*
|
|
12
|
+
* REPORT.md — what was extracted, from where, and how the
|
|
13
|
+
* output is meant to be used
|
|
14
14
|
*/
|
|
15
15
|
|
|
16
16
|
import { mkdirSync, writeFileSync } from "node:fs";
|
|
@@ -4,16 +4,23 @@
|
|
|
4
4
|
* npm run extract -- https://site-a.com https://site-b.com https://site-c.com
|
|
5
5
|
*
|
|
6
6
|
* For each URL: open in Chromium, screenshot, harvest computed design
|
|
7
|
-
* tokens via `browser-extract.ts
|
|
8
|
-
*
|
|
7
|
+
* tokens via `browser-extract.ts`, and crawl the rendered DOM into a
|
|
8
|
+
* typed `SiteLayout` model via `dom-crawler.ts`. After all sites:
|
|
9
|
+
* - Synthesize a drop-in shadcn-compatible design system from the
|
|
10
|
+
* aggregated tokens.
|
|
11
|
+
* - Emit a per-site **layout mirror**: a Next.js page that reconstructs
|
|
12
|
+
* the source's section structure from typed primitives, with
|
|
13
|
+
* `<TextSlot>` / `<MediaSlot>` placeholders for the user's copy and
|
|
14
|
+
* brand assets.
|
|
9
15
|
*
|
|
10
16
|
* Output goes to `output/<runId>/`.
|
|
11
17
|
*
|
|
12
|
-
*
|
|
13
|
-
* - Honor robots.txt
|
|
14
|
-
* - Per-domain rate limit defaults to 15 req/min.
|
|
15
|
-
* -
|
|
16
|
-
*
|
|
18
|
+
* Operational defaults (configurable via flags):
|
|
19
|
+
* - Honor robots.txt unless `--no-robots` is passed.
|
|
20
|
+
* - Per-domain rate limit defaults to 15 req/min (`--rate <n>`).
|
|
21
|
+
* - The crawler extracts a structured representation (section tree,
|
|
22
|
+
* computed style tokens, content kinds); it does not store raw HTML,
|
|
23
|
+
* copy text, or third-party assets in the output.
|
|
17
24
|
*/
|
|
18
25
|
|
|
19
26
|
import { mkdirSync, writeFileSync } from "node:fs";
|
|
@@ -23,9 +30,11 @@ import { fileURLToPath, pathToFileURL } from "node:url";
|
|
|
23
30
|
import { chromium, type Browser } from "playwright";
|
|
24
31
|
|
|
25
32
|
import { harvestTokens } from "./browser-extract.js";
|
|
33
|
+
import { crawlLayout } from "./dom-crawler.js";
|
|
26
34
|
import { emitAll } from "./emit.js";
|
|
35
|
+
import { emitMirror } from "./mirror-emit.js";
|
|
27
36
|
import { synthesize } from "./synthesize.js";
|
|
28
|
-
import type { ExtractionRun, RawTokens, SiteCapture } from "./types.js";
|
|
37
|
+
import type { ExtractionRun, RawTokens, SiteCapture, SiteLayout } from "./types.js";
|
|
29
38
|
|
|
30
39
|
const __filename = fileURLToPath(import.meta.url);
|
|
31
40
|
const __dirname = dirname(__filename);
|
|
@@ -90,9 +99,18 @@ function printHelp(): void {
|
|
|
90
99
|
"Writes to ./output/<runId>/ in your current working directory unless",
|
|
91
100
|
"you pass --out.",
|
|
92
101
|
"",
|
|
93
|
-
"
|
|
94
|
-
"
|
|
95
|
-
"
|
|
102
|
+
"For each URL the CLI:",
|
|
103
|
+
" 1. Renders the page at a desktop viewport in headless Chromium.",
|
|
104
|
+
" 2. Harvests computed design tokens (colors, type, spacing, radius,",
|
|
105
|
+
" shadow).",
|
|
106
|
+
" 3. Crawls the rendered DOM into a typed SiteLayout (section tree,",
|
|
107
|
+
" composition, slot counts, per-section style tokens).",
|
|
108
|
+
" 4. Emits a layout-mirror Next.js page at",
|
|
109
|
+
" output/<runId>/mirror/<host>/page.tsx with <TextSlot> /",
|
|
110
|
+
" <MediaSlot> placeholders for your own copy and imagery.",
|
|
111
|
+
"",
|
|
112
|
+
"After every URL, a drop-in shadcn-compatible design system is",
|
|
113
|
+
"synthesized from the aggregated tokens and written to output/<runId>/.",
|
|
96
114
|
"",
|
|
97
115
|
"Options:",
|
|
98
116
|
" --out <dir> Output directory (default: output/<runId>)",
|
|
@@ -178,11 +196,13 @@ async function captureOne(
|
|
|
178
196
|
url: string,
|
|
179
197
|
viewport: { width: number; height: number },
|
|
180
198
|
outDir: string,
|
|
181
|
-
): Promise<{ raw: RawTokens; capture: SiteCapture } | null> {
|
|
199
|
+
): Promise<{ raw: RawTokens; layout: SiteLayout | null; capture: SiteCapture } | null> {
|
|
182
200
|
const host = new URL(url).host;
|
|
183
201
|
const stamp = `${host}.png`;
|
|
184
202
|
const screenshotPath = join(outDir, "screenshots", stamp);
|
|
185
203
|
const rawPath = join(outDir, "raw", `${host}.tokens.json`);
|
|
204
|
+
const layoutPath = join(outDir, "raw", `${host}.layout.json`);
|
|
205
|
+
const mirrorDir = join(outDir, "mirror", host);
|
|
186
206
|
|
|
187
207
|
const ctx = await browser.newContext({
|
|
188
208
|
userAgent: USER_AGENT,
|
|
@@ -215,18 +235,32 @@ async function captureOne(
|
|
|
215
235
|
mkdirSync(dirname(rawPath), { recursive: true });
|
|
216
236
|
writeFileSync(rawPath, JSON.stringify(raw, null, 2));
|
|
217
237
|
|
|
238
|
+
let layout: SiteLayout | null = null;
|
|
239
|
+
let mirrorWritten: string[] = [];
|
|
240
|
+
try {
|
|
241
|
+
layout = await crawlLayout(page, url, viewport);
|
|
242
|
+
mkdirSync(dirname(layoutPath), { recursive: true });
|
|
243
|
+
writeFileSync(layoutPath, JSON.stringify(layout, null, 2));
|
|
244
|
+
mirrorWritten = emitMirror(layout, mirrorDir);
|
|
245
|
+
} catch (err) {
|
|
246
|
+
console.warn(` ! layout crawl failed for ${url}: ${(err as Error).message}`);
|
|
247
|
+
}
|
|
248
|
+
|
|
218
249
|
const capture: SiteCapture = {
|
|
219
250
|
url,
|
|
220
251
|
host,
|
|
221
252
|
capturedAt: raw.capturedAt,
|
|
222
253
|
screenshotPath,
|
|
223
254
|
rawTokensPath: rawPath,
|
|
255
|
+
...(layout ? { layoutPath } : {}),
|
|
256
|
+
...(mirrorWritten.length > 0 ? { mirrorDir } : {}),
|
|
224
257
|
status: "ok",
|
|
225
258
|
};
|
|
226
|
-
return { raw, capture };
|
|
259
|
+
return { raw, layout, capture };
|
|
227
260
|
} catch (err) {
|
|
228
261
|
return {
|
|
229
262
|
raw: emptyRaw(url, viewport),
|
|
263
|
+
layout: null,
|
|
230
264
|
capture: {
|
|
231
265
|
url,
|
|
232
266
|
host,
|
|
@@ -303,7 +337,11 @@ async function main(): Promise<void> {
|
|
|
303
337
|
captures.push(result.capture);
|
|
304
338
|
if (result.capture.status === "ok") {
|
|
305
339
|
rawList.push(result.raw);
|
|
306
|
-
|
|
340
|
+
const tag = result.layout ? "mirror" : "tokens-only";
|
|
341
|
+
const sectionCount = result.layout?.sections.length ?? 0;
|
|
342
|
+
console.log(
|
|
343
|
+
` ✓ ${url} → ${tag}${result.layout ? ` (${sectionCount} sections)` : ""}`,
|
|
344
|
+
);
|
|
307
345
|
} else {
|
|
308
346
|
console.log(` ✗ ${url} ${result.capture.reason ?? ""}`);
|
|
309
347
|
}
|
|
@@ -339,9 +377,21 @@ async function main(): Promise<void> {
|
|
|
339
377
|
console.log("[extract] wrote:");
|
|
340
378
|
for (const f of written) console.log(` → ${f}`);
|
|
341
379
|
console.log(` → ${join(outDir, "run.json")}`);
|
|
380
|
+
const mirrorDirs = captures.filter((c) => c.mirrorDir).map((c) => c.mirrorDir!);
|
|
381
|
+
if (mirrorDirs.length > 0) {
|
|
382
|
+
console.log("");
|
|
383
|
+
console.log("[extract] layout mirrors:");
|
|
384
|
+
for (const d of mirrorDirs) console.log(` → ${d}/page.tsx`);
|
|
385
|
+
}
|
|
342
386
|
console.log("");
|
|
343
|
-
console.log(`[extract] done. Open ${join(outDir, "REPORT.md")} for the summary.`);
|
|
344
|
-
|
|
387
|
+
console.log(`[extract] done. Open ${join(outDir, "REPORT.md")} for the design-system summary.`);
|
|
388
|
+
if (mirrorDirs.length > 0) {
|
|
389
|
+
console.log(
|
|
390
|
+
`[extract] each mirror folder ships a Next.js page.tsx + MIRROR_NOTES.md.`,
|
|
391
|
+
);
|
|
392
|
+
console.log(`[extract] fill the <TextSlot> / <MediaSlot> placeholders with your own content.`);
|
|
393
|
+
}
|
|
394
|
+
console.log(`[extract] AI handoff: ${join(outDir, "FOR_AI.md")}`);
|
|
345
395
|
}
|
|
346
396
|
|
|
347
397
|
function makeRunId(startedAt: string, name: string | undefined): string {
|