launchframe 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,521 @@
1
+ /**
2
+ * DOM layout crawler.
3
+ *
4
+ * Runs inside the rendered page via Playwright's `page.evaluate`. Walks the
5
+ * DOM, identifies top-level sections, classifies each section's role and
6
+ * composition, and counts the content slots it contains. Returns a
7
+ * `SiteLayout` structural model the emitter rebuilds into a Next.js page.
8
+ *
9
+ * What this records:
10
+ * - Section tree (geometry, role, composition, density)
11
+ * - Slot inventory per section: how many headings / body paragraphs /
12
+ * buttons / images / icons / logos / code blocks etc. it contains
13
+ * - Per-section style tokens: background, foreground, padding
14
+ * - Page-level tokens: fonts, primary surface colors, container width
15
+ *
16
+ * What this does NOT record:
17
+ * - Heading or body text content (slots are counts, not strings).
18
+ * - Raw HTML, CSS, or class names from the source.
19
+ * - Brand assets (logos, illustrations, product screenshots).
20
+ *
21
+ * The structural model is what the mirror emitter uses to reconstruct the
22
+ * page's section grammar with `<TextSlot>` / `<MediaSlot>` placeholders.
23
+ */
24
+
25
+ import type { Page } from "playwright";
26
+
27
+ import type {
28
+ Composition,
29
+ SectionLayout,
30
+ SectionRole,
31
+ SiteLayout,
32
+ SiteTokens,
33
+ SlotCount,
34
+ SlotKind,
35
+ } from "./types.js";
36
+
37
/**
 * Crawl the page currently loaded in `page` and return its structural model.
 *
 * The layout-bearing fields (`pageHeightPx`, `sections`, `tokens`) come from
 * running `crawlInPage` inside the page; `url`, `host`, `capturedAt` and
 * `viewport` are attached here on the Node side.
 *
 * @param page - Playwright page that has already navigated to `url`.
 * @param url - Source URL; must be parseable by `new URL`.
 * @param viewport - Viewport size recorded verbatim in the result.
 * @returns The assembled `SiteLayout`.
 * @throws If `url` cannot be parsed or either in-page evaluation rejects.
 */
export async function crawlLayout(
  page: Page,
  url: string,
  viewport: { width: number; height: number },
): Promise<SiteLayout> {
  // Make sure a global `__name` exists in the page before the crawler is
  // serialized into it. NOTE(review): presumably this guards against build
  // tooling that wraps functions in `__name(...)` calls — confirm against the
  // project's bundler settings.
  await page.evaluate(() => {
    const scope = globalThis as unknown as { __name?: (fn: unknown) => unknown };
    if (scope.__name === undefined) {
      scope.__name = (fn: unknown) => fn;
    }
  });

  const { host } = new URL(url);
  const layoutFields = await page.evaluate(crawlInPage);
  const capturedAt = new Date().toISOString();

  const layout: SiteLayout = { url, host, capturedAt, viewport, ...layoutFields };
  return layout;
}
58
+
59
+ /**
60
+ * Browser-context crawler. Dependency-free so Playwright can serialize it.
61
+ * Returns the layout-bearing fields of `SiteLayout` (url/host/capturedAt
62
+ * are added on the Node side).
63
+ */
64
+ function crawlInPage(): Pick<
65
+ SiteLayout,
66
+ "pageHeightPx" | "sections" | "tokens"
67
+ > {
68
+ const VIEWPORT_W = window.innerWidth;
69
+ const PAGE_H = Math.max(
70
+ document.documentElement.scrollHeight,
71
+ document.body.scrollHeight,
72
+ );
73
+
74
+ /* ----- helpers ----- */
75
+
76
/**
 * Convert a CSS `rgb()` / `rgba()` color string into a `#rrggbb` hex string.
 *
 * Accepts the legacy comma-separated form (`rgba(1, 2, 3, 0.5)`) as well as
 * the space / slash form (`rgb(1 2 3 / 50%)`). Alpha is only used to reject
 * near-transparent colors; it is not encoded in the returned value.
 *
 * @param rgb - Color string, typically a computed-style value.
 * @returns Lowercase `#rrggbb`, or `null` when the input is empty,
 *   `"transparent"`, not an `rgb()`/`rgba()` expression, is missing a channel,
 *   has a non-numeric channel, or has an alpha below 0.05.
 */
function toHex(rgb: string): string | null {
  if (!rgb || rgb === "transparent") return null;
  const match = rgb.match(/rgba?\(([^)]+)\)/);
  if (!match) return null;

  // Split on commas, whitespace and the `/` alpha separator so both syntaxes
  // produce the same token list: [r, g, b, alpha?].
  const tokens = match[1]!.split(/[\s,\/]+/).filter((t) => t.length > 0);

  const alphaToken = tokens[3];
  let alpha = 1;
  if (alphaToken !== undefined) {
    alpha = parseFloat(alphaToken);
    if (alphaToken.endsWith("%")) alpha /= 100;
  }
  // Treat (almost) fully transparent colors as "no color".
  if (alpha < 0.05) return null;

  const channels = tokens.slice(0, 3).map((token) =>
    token.endsWith("%")
      ? Math.round((parseFloat(token) * 255) / 100)
      : parseInt(token, 10),
  );
  if (channels.length < 3 || channels.some((n) => Number.isNaN(n))) return null;

  // Clamp so out-of-range channels can never yield a malformed hex string.
  const hex = (n: number): string =>
    Math.min(255, Math.max(0, n)).toString(16).padStart(2, "0");
  return `#${channels.map(hex).join("")}`;
}
90
+
91
/**
 * Report whether an element is rendered and occupies space.
 *
 * An element is considered invisible when its computed style has
 * `visibility: hidden`, `display: none`, or an opacity below 0.05, or when
 * its bounding box has zero width or height.
 */
function isVisible(el: HTMLElement): boolean {
  const { visibility, display, opacity } = getComputedStyle(el);
  const hiddenByStyle =
    visibility === "hidden" || display === "none" || parseFloat(opacity) < 0.05;
  if (hiddenByStyle) return false;

  const box = el.getBoundingClientRect();
  return box.width > 0 && box.height > 0;
}
98
+
99
/**
 * Sum the trimmed lengths of the text nodes that are direct children of `el`.
 * Text inside descendant elements is not counted.
 */
function directTextLength(el: Element): number {
  const TEXT_NODE = 3;
  return Array.from(el.childNodes).reduce((sum, node) => {
    if (node.nodeType !== TEXT_NODE) return sum;
    return sum + (node.nodeValue ?? "").trim().length;
  }, 0);
}
109
+
110
/**
 * Map a single element to the kind of content slot it represents, or `null`
 * when it does not represent a slot on its own.
 *
 * Decision order:
 *   1. Media and form controls, by tag (images are split into logo vs image
 *      and SVGs into icon vs image by rendered size).
 *   2. Buttons (`<button>` or `<a role="button">`), primary when they have a
 *      background that differs from the page background.
 *   3. Anchors with their own text that are padded and have a background or
 *      a top border — treated as buttons.
 *   4. Elements without direct text are ignored.
 *   5. Heading tags, list items, then typographic heuristics on the computed
 *      font size / weight, and finally plain body text for P / SPAN / DIV.
 */
function classifySlotForElement(el: HTMLElement): SlotKind | null {
  const tagName = el.tagName;
  const computed = getComputedStyle(el);

  switch (tagName) {
    case "IMG":
    case "PICTURE": {
      // Heuristic: short, not-too-wide images read as logos; the rest as photos.
      const box = el.getBoundingClientRect();
      const aspect = box.width > 0 && box.height > 0 ? box.width / box.height : 0;
      const looksLikeLogo = box.height > 0 && box.height < 56 && aspect < 6;
      return looksLikeLogo ? "logo-mono" : "image";
    }
    case "SVG":
    case "svg": {
      const box = el.getBoundingClientRect();
      return box.width < 32 && box.height < 32 ? "icon" : "image";
    }
    case "VIDEO":
      return "video";
    case "PRE":
    case "CODE":
      return "code";
    case "INPUT":
    case "TEXTAREA":
    case "SELECT":
      return "input";
    default:
      break;
  }

  const isAnchor = tagName === "A";
  const ownBackground = toHex(computed.backgroundColor);

  if (tagName === "BUTTON" || (isAnchor && el.getAttribute("role") === "button")) {
    // Primary vs secondary: does the button's fill stand out from the page?
    if (!ownBackground) return "button-secondary";
    const pageBackground = toHex(getComputedStyle(document.body).backgroundColor);
    return ownBackground !== pageBackground ? "button-primary" : "button-secondary";
  }

  const ownTextLength = directTextLength(el);

  if (isAnchor && ownTextLength > 0) {
    // Anchor styled like a CTA: horizontally padded with a fill or a border.
    const horizontalPadding =
      parseFloat(computed.paddingLeft) + parseFloat(computed.paddingRight);
    const hasBorder = parseFloat(computed.borderTopWidth) > 0;
    if (horizontalPadding > 16 && (ownBackground !== null || hasBorder)) {
      return ownBackground ? "button-primary" : "button-secondary";
    }
  }

  if (ownTextLength === 0) return null;

  if (/^H[1-6]$/.test(tagName)) {
    const level = parseInt(tagName.charAt(1), 10);
    if (level === 1) return "heading-1";
    return level === 2 ? "heading-2" : "heading-3";
  }

  if (tagName === "LI") return "bullet";

  const fontSize = parseFloat(computed.fontSize) || 16;
  const fontWeight = parseInt(computed.fontWeight, 10) || 400;

  // Non-heading tags that are styled like headings: [min size, min weight, kind].
  const headingTiers: Array<[number, number, SlotKind]> = [
    [36, 600, "heading-1"],
    [24, 500, "heading-2"],
    [18, 500, "heading-3"],
  ];
  for (const [minSize, minWeight, kind] of headingTiers) {
    if (fontSize >= minSize && fontWeight >= minWeight) return kind;
  }

  if (fontSize <= 12 && /uppercase/i.test(computed.textTransform)) return "eyebrow";
  if (fontSize <= 13 && fontWeight >= 600) return "badge";

  return tagName === "P" || tagName === "SPAN" || tagName === "DIV" ? "body" : null;
}
174
+
175
/**
 * Build the slot inventory for a section: how many visible descendants of
 * `root` fall into each slot kind.
 *
 * Headings are de-duplicated: once an element has been counted as a heading,
 * descendants of that element which also classify as headings (for example
 * styled `<span>`s inside an `<h1>`) are skipped so the outer heading counts
 * once. Other kinds are counted per element; `body` is capped at 12 and
 * `bullet` at 24 because those counts are coarse.
 *
 * @param root - Section element whose descendants are inspected.
 * @returns One `{ kind, count }` entry per kind found, sorted by kind name.
 */
function countSlots(root: HTMLElement): SlotCount[] {
  const buckets = new Map<SlotKind, number>();
  const countedHeadings: HTMLElement[] = [];
  const isHeadingKind = (kind: SlotKind): boolean =>
    kind === "heading-1" || kind === "heading-2" || kind === "heading-3";

  // querySelectorAll yields document order, so an ancestor is always visited
  // (and recorded) before any of its descendants.
  for (const el of Array.from(root.querySelectorAll<HTMLElement>("*"))) {
    if (!isVisible(el)) continue;
    const kind = classifySlotForElement(el);
    if (!kind) continue;

    if (isHeadingKind(kind)) {
      if (countedHeadings.some((heading) => heading.contains(el))) continue;
      countedHeadings.push(el);
    }
    buckets.set(kind, (buckets.get(kind) ?? 0) + 1);
  }

  // Body and bullet counts over-count nested markup; cap them.
  if ((buckets.get("body") ?? 0) > 12) buckets.set("body", 12);
  if ((buckets.get("bullet") ?? 0) > 24) buckets.set("bullet", 24);

  const out: SlotCount[] = [];
  for (const [kind, count] of buckets) out.push({ kind, count });
  return out.sort((a, b) => a.kind.localeCompare(b.kind));
}
192
+
193
+ /* ----- section discovery ----- */
194
+
195
/**
 * Collect the elements treated as top-level page sections, ordered from the
 * top of the document to the bottom.
 *
 * Candidates are every <header>, <main>, <section>, <article>, <footer> and
 * <nav>, plus the direct children of <body> and of the first <main>. A
 * candidate is kept when it is visible, spans at least 70% of the viewport
 * width and is at least 80 px tall. After sorting by document Y position,
 * a candidate is dropped when an element kept before it contains it; only
 * earlier-kept elements are considered as containers.
 */
function findSections(): HTMLElement[] {
  const pool = new Set<HTMLElement>();

  const semanticTags = ["HEADER", "MAIN", "SECTION", "ARTICLE", "FOOTER", "NAV"];
  semanticTags.forEach((name) => {
    Array.from(document.getElementsByTagName(name)).forEach((node) => {
      pool.add(node as HTMLElement);
    });
  });

  // Fallback candidates: whatever sits directly under <body> and <main>.
  (Array.from(document.body.children) as HTMLElement[]).forEach((child) => pool.add(child));
  const mainEl = document.querySelector("main");
  if (mainEl) {
    (Array.from(mainEl.children) as HTMLElement[]).forEach((child) => pool.add(child));
  }

  const minWidth = VIEWPORT_W * 0.7;
  const ranked = Array.from(pool)
    .filter((node) => {
      if (!isVisible(node)) return false;
      const box = node.getBoundingClientRect();
      if (box.width < minWidth) return false;
      if (box.height < 80) return false;
      return true;
    })
    // Pair each element with its document-space top so the sort is cheap.
    .map((node) => ({ node, y: node.getBoundingClientRect().top + window.scrollY }))
    .sort((a, b) => a.y - b.y);

  return ranked.reduce<HTMLElement[]>((kept, { node }) => {
    const nested = kept.some((outer) => outer !== node && outer.contains(node));
    if (!nested) kept.push(node);
    return kept;
  }, []);
}
245
+
246
+ /* ----- per-section classification ----- */
247
+
248
/**
 * Guess the role a section plays on the page from its tag, its position in
 * the section list, its rendered size and its slot inventory.
 *
 * Rules are evaluated top to bottom and the first match wins; a section
 * matching none of them is `"other"`.
 *
 * @param el - The section element.
 * @param indexFromTop - Zero-based position counted from the first section.
 * @param indexFromBottom - Zero-based position counted from the last section.
 * @param slots - Slot inventory for the section.
 */
function classifyRole(
  el: HTMLElement,
  indexFromTop: number,
  indexFromBottom: number,
  slots: SlotCount[],
): SectionRole {
  const box = el.getBoundingClientRect();
  const tagName = el.tagName;

  // Count for a slot kind, 0 when the kind is absent (first entry wins).
  const tally = (kind: SlotKind): number => {
    for (const slot of slots) {
      if (slot.kind === kind) return slot.count;
    }
    return 0;
  };

  if (tagName === "NAV") return "nav";

  const isShortLastSection = indexFromBottom === 0 && box.height < 600;
  if (tagName === "FOOTER" || isShortLastSection) return "footer";

  // A short <header> at the very top acts as the navigation bar.
  if (tagName === "HEADER" && indexFromTop === 0 && box.height < 140) return "nav";

  const h1 = tally("heading-1");
  const h2 = tally("heading-2");
  const h3 = tally("heading-3");
  const headingCount = h1 + h2 + h3;
  const buttons = tally("button-primary") + tally("button-secondary");
  const images = tally("image");
  const logos = tally("logo-mono");
  const bullets = tally("bullet");

  // Near the top with an H1 and a CTA.
  if (indexFromTop <= 1 && h1 >= 1 && buttons >= 1) return "hero";

  // Wide, short band of small images.
  const isShortBand = box.height < box.width * 0.25;
  if (logos >= 4 && headingCount <= 1 && isShortBand) return "proof-logos";

  // Bullet-heavy, several buttons, reasonably tall.
  if (bullets >= 6 && buttons >= 2 && box.height > 360) return "pricing";

  // Several sub-headings with little imagery.
  if (h2 + h3 >= 3 && images <= 2) return "feature-grid";

  // Heading plus media, few bullets, tall enough to be a showcase.
  if (headingCount >= 1 && images >= 1 && bullets <= 4 && box.height > 320) {
    return "feature-deep-dive";
  }

  // Short call-to-action band close to the bottom of the page.
  const nearBottom = indexFromBottom <= 2;
  if (headingCount <= 2 && buttons >= 1 && box.height < 480 && nearBottom) {
    return "conversion";
  }

  // Text-only block with several body slots.
  if (tally("body") >= 3 && images === 0 && bullets === 0 && buttons === 0) {
    return "proof-quotes";
  }

  return "other";
}
299
+
300
/**
 * Classify how a section lays out its content in columns.
 *
 * Inspects up to the first 400 descendants of `el` and keeps the largest
 * column count found on any visible grid container (number of tracks in its
 * computed `grid-template-columns`) or row-direction flex container (number
 * of visible children, ignored above 12). A row-direction flex container
 * with at least four visible children that are all under 80 px tall and
 * 200 px wide marks the section as a logo row, which takes precedence.
 *
 * @param el - The section element; only its descendants are inspected.
 * @returns `"logo-row"`, `"single-column"`, `"split-2"`, `"grid-3"`,
 *   `"grid-4"`, or `"list"` for five or more columns.
 */
function classifyComposition(el: HTMLElement): Composition {
  // A resolved track list can carry `[line-name]` groups between track sizes
  // (and one group may hold several space-separated names); strip them so
  // only track sizes are counted.
  const countTracks = (template: string): number =>
    template
      .replace(/\[[^\]]*\]/g, " ")
      .split(/\s+/)
      .filter((token) => token.length > 0).length;

  const candidates = el.querySelectorAll<HTMLElement>("*");
  let bestCols = 1;
  let bestKind: "grid" | "flex" | "none" = "none";
  let logoRowCols = 0;

  for (const c of Array.from(candidates).slice(0, 400)) {
    if (!isVisible(c)) continue;
    const s = getComputedStyle(c);

    if (s.display === "grid") {
      const cols = countTracks(s.gridTemplateColumns);
      if (cols > bestCols) {
        bestCols = cols;
        bestKind = "grid";
      }
    } else if (s.display === "flex" && s.flexDirection.startsWith("row")) {
      const kids = Array.from(c.children) as HTMLElement[];
      const visibleKids = kids.filter(isVisible);
      if (visibleKids.length > bestCols && visibleKids.length <= 12) {
        bestCols = visibleKids.length;
        bestKind = "flex";
      }
      if (visibleKids.length >= 4) {
        // Many uniformly small children in a row look like a logo strip.
        const allSmall = visibleKids.every((k) => {
          const kr = k.getBoundingClientRect();
          return kr.height < 80 && kr.width < 200;
        });
        if (allSmall) logoRowCols = Math.max(logoRowCols, visibleKids.length);
      }
    }
  }

  if (logoRowCols >= 4) return "logo-row";
  if (bestKind === "none" || bestCols <= 1) return "single-column";
  if (bestCols === 2) return "split-2";
  if (bestCols === 3) return "grid-3";
  if (bestCols === 4) return "grid-4";
  if (bestCols >= 5) return "list";
  // Not reachable with integer column counts; kept as a defensive default.
  return "unknown";
}
342
+
343
/**
 * Rate how tightly packed a section is: total slot count per 100 px of
 * rendered height (heights under 100 px count as one unit). Below 0.8 is
 * "thin", above 2.4 is "dense", anything else is "balanced".
 */
function classifyDensity(el: HTMLElement, slots: SlotCount[]): "thin" | "balanced" | "dense" {
  let slotTotal = 0;
  for (const slot of slots) slotTotal += slot.count;

  const heightUnits = Math.max(1, el.getBoundingClientRect().height / 100);
  const perUnit = slotTotal / heightUnits;

  if (perUnit > 2.4) return "dense";
  return perUnit < 0.8 ? "thin" : "balanced";
}
351
+
352
/**
 * Read a section's own style tokens from its computed style: background and
 * text color as hex (or `null` when `toHex` yields none), and top / bottom
 * padding rounded to whole pixels (`null` when the rounded value is zero or
 * the padding cannot be parsed).
 */
function extractSectionStyles(el: HTMLElement): SectionLayout["styles"] {
  const computed = getComputedStyle(el);

  const wholePxOrNull = (raw: string): number | null => {
    const px = Math.round(parseFloat(raw) || 0);
    return px || null;
  };

  const backgroundHex = toHex(computed.backgroundColor);
  const foregroundHex = toHex(computed.color);
  const paddingTopPx = wholePxOrNull(computed.paddingTop);
  const paddingBottomPx = wholePxOrNull(computed.paddingBottom);

  return { backgroundHex, foregroundHex, paddingTopPx, paddingBottomPx };
}
361
+
362
+ /* ----- page-level token extraction ----- */
363
+
364
/**
 * Derive page-level design tokens from the rendered document.
 *
 * - Fonts: first family of the body's and of the first h1/h2/h3's computed
 *   `font-family` (quotes stripped); body falls back to "system-ui", heading
 *   falls back to the body family.
 * - Background / foreground: the body's computed colors, defaulting to
 *   `#ffffff` / `#0a0a0a`.
 * - Primary: most frequent non-page background among visible buttons, links
 *   and `[role='button']` elements; falls back to the foreground color.
 * - Muted: most frequent non-page, non-primary background among all visible
 *   elements; falls back to the page background.
 * - Border: most frequent top border color among visible bordered elements;
 *   falls back to `#e5e7eb`.
 * - Radius: most frequent rounded top-left radius in (0, 64) px; falls back
 *   to 8.
 * - Container: rounded width of the largest-area visible layout block that is
 *   720–1600 px wide and at least 240 px tall, or `null` if none qualifies.
 *
 * Ties are resolved in favour of the value encountered first in document
 * order.
 */
function extractPageTokens(): SiteTokens {
  /** First family of a `font-family` list with surrounding quotes removed. */
  function firstFamily(fontFamily: string): string {
    return fontFamily.split(",")[0]!.trim().replace(/^["']|["']$/g, "");
  }

  /** Add one to `key`'s tally. */
  function bump<K>(tally: Map<K, number>, key: K): void {
    tally.set(key, (tally.get(key) ?? 0) + 1);
  }

  /**
   * Key with the strictly highest tally (first inserted wins ties), ignoring
   * `skip` when given; `fallback` when nothing qualifies.
   */
  function mostCommon<K>(tally: Map<K, number>, fallback: K, skip?: K): K {
    let best = fallback;
    let bestCount = 0;
    for (const [key, count] of tally) {
      if (skip !== undefined && key === skip) continue;
      if (count > bestCount) {
        best = key;
        bestCount = count;
      }
    }
    return best;
  }

  const bodyStyle = getComputedStyle(document.body);
  const bodyFontFamily = firstFamily(bodyStyle.fontFamily) || "system-ui";

  const firstHeading = document.querySelector("h1, h2, h3");
  const headingFontFamily = firstHeading
    ? firstFamily(getComputedStyle(firstHeading).fontFamily) || bodyFontFamily
    : bodyFontFamily;

  const bg = toHex(bodyStyle.backgroundColor) ?? "#ffffff";
  const fg = toHex(bodyStyle.color) ?? "#0a0a0a";

  // Primary: tally backgrounds of button-like elements that differ from the page.
  const buttonBgCounts = new Map<string, number>();
  for (const b of Array.from(
    document.querySelectorAll<HTMLElement>("button, a, [role='button']"),
  )) {
    if (!isVisible(b)) continue;
    const sb = toHex(getComputedStyle(b).backgroundColor);
    if (!sb || sb === bg) continue;
    bump(buttonBgCounts, sb);
  }
  const primary = mostCommon(buttonBgCounts, fg);

  // Surface, border and radius tallies share one walk over the document so
  // visibility and computed style are resolved once per element.
  const surfaceCounts = new Map<string, number>();
  const borderCounts = new Map<string, number>();
  const radiusCounts = new Map<number, number>();
  for (const el of Array.from(document.querySelectorAll<HTMLElement>("body *"))) {
    if (!isVisible(el)) continue;
    const s = getComputedStyle(el);

    const surface = toHex(s.backgroundColor);
    if (surface && surface !== bg) bump(surfaceCounts, surface);

    if (!(parseFloat(s.borderTopWidth) <= 0)) {
      const borderHex = toHex(s.borderTopColor);
      if (borderHex) bump(borderCounts, borderHex);
    }

    const r = parseFloat(s.borderTopLeftRadius);
    if (r > 0 && r < 64) bump(radiusCounts, Math.round(r));
  }
  const muted = mostCommon(surfaceCounts, bg, primary);
  const border = mostCommon(borderCounts, "#e5e7eb");
  const radius = mostCommon(radiusCounts, 8);

  // Container: width of the largest-area layout block within the size limits.
  let containerPx: number | null = null;
  let containerArea = 0;
  for (const el of Array.from(
    document.querySelectorAll<HTMLElement>("main, section, header, footer, div"),
  )) {
    if (!isVisible(el)) continue;
    const r = el.getBoundingClientRect();
    if (r.width < 720 || r.width > 1600) continue;
    if (r.height < 240) continue;
    const area = r.width * r.height;
    if (area > containerArea) {
      containerArea = area;
      containerPx = Math.round(r.width);
    }
  }

  return {
    bodyFontFamily,
    headingFontFamily,
    backgroundHex: bg,
    foregroundHex: fg,
    primaryHex: primary,
    mutedHex: muted,
    borderHex: border,
    radiusPx: radius,
    containerPx,
  };
}
480
+
481
+ /* ----- main pass ----- */
482
+
483
// Clamp a ratio into the [0, 1] range used by the normalized bounding box.
const clampUnit = (value: number): number => Math.max(0, Math.min(1, value));

const sectionEls = findSections();
const lastIndex = sectionEls.length - 1;

// Describe every discovered section: geometry, role, composition, density,
// slot inventory and style tokens.
const sections: SectionLayout[] = sectionEls.map((el, i): SectionLayout => {
  const box = el.getBoundingClientRect();
  const docTop = box.top + window.scrollY;

  const slots = countSlots(el);
  const composition = classifyComposition(el);
  const density = classifyDensity(el, slots);
  const role = classifyRole(el, i, lastIndex - i, slots);
  const styles = extractSectionStyles(el);

  const notes: string[] = [];
  if (slots.length === 0) {
    notes.push("No content slots detected; rendering an empty wrapper.");
  }
  if (composition === "unknown") {
    notes.push("Composition was ambiguous; fell back to single-column.");
  }

  return {
    id: `s${i + 1}`,
    role,
    composition,
    density,
    // [left, top, width, height]: horizontal values relative to the viewport
    // width, vertical values relative to the full page height.
    bbox: [
      clampUnit(box.left / VIEWPORT_W),
      clampUnit(docTop / PAGE_H),
      clampUnit(box.width / VIEWPORT_W),
      clampUnit(box.height / PAGE_H),
    ],
    slots,
    styles,
    notes,
  };
});

return {
  pageHeightPx: PAGE_H,
  sections,
  tokens: extractPageTokens(),
};
521
+ }
@@ -9,8 +9,8 @@
9
9
  * globals.css — shadcn-compatible CSS variables (light + dark)
10
10
  * theme-preview.tsx — a self-contained React component that renders
11
11
  * every token so you can eyeball the system
12
- * REPORT.md — what was extracted, from where, with the
13
- * anti-clone disclaimer
12
+ * REPORT.md — what was extracted, from where, and how the
13
+ * output is meant to be used
14
14
  */
15
15
 
16
16
  import { mkdirSync, writeFileSync } from "node:fs";
@@ -4,16 +4,23 @@
4
4
  * npm run extract -- https://site-a.com https://site-b.com https://site-c.com
5
5
  *
6
6
  * For each URL: open in Chromium, screenshot, harvest computed design
7
- * tokens via `browser-extract.ts`. After all sites: synthesize an
8
- * original shadcn-compatible design system and emit drop-in files.
7
+ * tokens via `browser-extract.ts`, and crawl the rendered DOM into a
8
+ * typed `SiteLayout` model via `dom-crawler.ts`. After all sites:
9
+ * - Synthesize a drop-in shadcn-compatible design system from the
10
+ * aggregated tokens.
11
+ * - Emit a per-site **layout mirror**: a Next.js page that reconstructs
12
+ * the source's section structure from typed primitives, with
13
+ * `<TextSlot>` / `<MediaSlot>` placeholders for the user's copy and
14
+ * brand assets.
9
15
  *
10
16
  * Output goes to `output/<runId>/`.
11
17
  *
12
- * Policy (from rules/anti-clone-policy.md):
13
- * - Honor robots.txt by default.
14
- * - Per-domain rate limit defaults to 15 req/min.
15
- * - Store only PNG screenshots and harvested computed-style values.
16
- * - Never persist HTML, JS, CSS, or third-party assets.
18
+ * Operational defaults (configurable via flags):
19
+ * - Honor robots.txt unless `--no-robots` is passed.
20
+ * - Per-domain rate limit defaults to 15 req/min (`--rate <n>`).
21
+ * - The crawler extracts a structured representation (section tree,
22
+ * computed style tokens, content kinds); it does not store raw HTML,
23
+ * copy text, or third-party assets in the output.
17
24
  */
18
25
 
19
26
  import { mkdirSync, writeFileSync } from "node:fs";
@@ -23,9 +30,11 @@ import { fileURLToPath, pathToFileURL } from "node:url";
23
30
  import { chromium, type Browser } from "playwright";
24
31
 
25
32
  import { harvestTokens } from "./browser-extract.js";
33
+ import { crawlLayout } from "./dom-crawler.js";
26
34
  import { emitAll } from "./emit.js";
35
+ import { emitMirror } from "./mirror-emit.js";
27
36
  import { synthesize } from "./synthesize.js";
28
- import type { ExtractionRun, RawTokens, SiteCapture } from "./types.js";
37
+ import type { ExtractionRun, RawTokens, SiteCapture, SiteLayout } from "./types.js";
29
38
 
30
39
  const __filename = fileURLToPath(import.meta.url);
31
40
  const __dirname = dirname(__filename);
@@ -90,9 +99,18 @@ function printHelp(): void {
90
99
  "Writes to ./output/<runId>/ in your current working directory unless",
91
100
  "you pass --out.",
92
101
  "",
93
- "Captures each URL at a desktop viewport, harvests computed design",
94
- "tokens (colors, type, spacing, radius, shadow), and synthesizes a",
95
- "drop-in shadcn-compatible design system in output/<runId>/.",
102
+ "For each URL the CLI:",
103
+ " 1. Renders the page at a desktop viewport in headless Chromium.",
104
+ " 2. Harvests computed design tokens (colors, type, spacing, radius,",
105
+ " shadow).",
106
+ " 3. Crawls the rendered DOM into a typed SiteLayout (section tree,",
107
+ " composition, slot counts, per-section style tokens).",
108
+ " 4. Emits a layout-mirror Next.js page at",
109
+ " output/<runId>/mirror/<host>/page.tsx with <TextSlot> /",
110
+ " <MediaSlot> placeholders for your own copy and imagery.",
111
+ "",
112
+ "After every URL, a drop-in shadcn-compatible design system is",
113
+ "synthesized from the aggregated tokens and written to output/<runId>/.",
96
114
  "",
97
115
  "Options:",
98
116
  " --out <dir> Output directory (default: output/<runId>)",
@@ -178,11 +196,13 @@ async function captureOne(
178
196
  url: string,
179
197
  viewport: { width: number; height: number },
180
198
  outDir: string,
181
- ): Promise<{ raw: RawTokens; capture: SiteCapture } | null> {
199
+ ): Promise<{ raw: RawTokens; layout: SiteLayout | null; capture: SiteCapture } | null> {
182
200
  const host = new URL(url).host;
183
201
  const stamp = `${host}.png`;
184
202
  const screenshotPath = join(outDir, "screenshots", stamp);
185
203
  const rawPath = join(outDir, "raw", `${host}.tokens.json`);
204
+ const layoutPath = join(outDir, "raw", `${host}.layout.json`);
205
+ const mirrorDir = join(outDir, "mirror", host);
186
206
 
187
207
  const ctx = await browser.newContext({
188
208
  userAgent: USER_AGENT,
@@ -215,18 +235,32 @@ async function captureOne(
215
235
  mkdirSync(dirname(rawPath), { recursive: true });
216
236
  writeFileSync(rawPath, JSON.stringify(raw, null, 2));
217
237
 
238
+ let layout: SiteLayout | null = null;
239
+ let mirrorWritten: string[] = [];
240
+ try {
241
+ layout = await crawlLayout(page, url, viewport);
242
+ mkdirSync(dirname(layoutPath), { recursive: true });
243
+ writeFileSync(layoutPath, JSON.stringify(layout, null, 2));
244
+ mirrorWritten = emitMirror(layout, mirrorDir);
245
+ } catch (err) {
246
+ console.warn(` ! layout crawl failed for ${url}: ${(err as Error).message}`);
247
+ }
248
+
218
249
  const capture: SiteCapture = {
219
250
  url,
220
251
  host,
221
252
  capturedAt: raw.capturedAt,
222
253
  screenshotPath,
223
254
  rawTokensPath: rawPath,
255
+ ...(layout ? { layoutPath } : {}),
256
+ ...(mirrorWritten.length > 0 ? { mirrorDir } : {}),
224
257
  status: "ok",
225
258
  };
226
- return { raw, capture };
259
+ return { raw, layout, capture };
227
260
  } catch (err) {
228
261
  return {
229
262
  raw: emptyRaw(url, viewport),
263
+ layout: null,
230
264
  capture: {
231
265
  url,
232
266
  host,
@@ -303,7 +337,11 @@ async function main(): Promise<void> {
303
337
  captures.push(result.capture);
304
338
  if (result.capture.status === "ok") {
305
339
  rawList.push(result.raw);
306
- console.log(` ✓ ${url}`);
340
+ const tag = result.layout ? "mirror" : "tokens-only";
341
+ const sectionCount = result.layout?.sections.length ?? 0;
342
+ console.log(
343
+ ` ✓ ${url} → ${tag}${result.layout ? ` (${sectionCount} sections)` : ""}`,
344
+ );
307
345
  } else {
308
346
  console.log(` ✗ ${url} ${result.capture.reason ?? ""}`);
309
347
  }
@@ -339,9 +377,21 @@ async function main(): Promise<void> {
339
377
  console.log("[extract] wrote:");
340
378
  for (const f of written) console.log(` → ${f}`);
341
379
  console.log(` → ${join(outDir, "run.json")}`);
380
+ const mirrorDirs = captures.filter((c) => c.mirrorDir).map((c) => c.mirrorDir!);
381
+ if (mirrorDirs.length > 0) {
382
+ console.log("");
383
+ console.log("[extract] layout mirrors:");
384
+ for (const d of mirrorDirs) console.log(` → ${d}/page.tsx`);
385
+ }
342
386
  console.log("");
343
- console.log(`[extract] done. Open ${join(outDir, "REPORT.md")} for the summary.`);
344
- console.log(`[extract] Give your AI: ${join(outDir, "FOR_AI.md")}`);
387
+ console.log(`[extract] done. Open ${join(outDir, "REPORT.md")} for the design-system summary.`);
388
+ if (mirrorDirs.length > 0) {
389
+ console.log(
390
+ `[extract] each mirror folder ships a Next.js page.tsx + MIRROR_NOTES.md.`,
391
+ );
392
+ console.log(`[extract] fill the <TextSlot> / <MediaSlot> placeholders with your own content.`);
393
+ }
394
+ console.log(`[extract] AI handoff: ${join(outDir, "FOR_AI.md")}`);
345
395
  }
346
396
 
347
397
  function makeRunId(startedAt: string, name: string | undefined): string {