launchframe 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,441 +1,443 @@
1
- /**
2
- * `extract` — the headline command.
3
- *
4
- * npm run extract -- https://site-a.com https://site-b.com https://site-c.com
5
- *
6
- * For each URL: open in Chromium, screenshot, harvest computed design
7
- * tokens via `browser-extract.ts`, and crawl the rendered DOM into a
8
- * typed `SiteLayout` model via `dom-crawler.ts`. After all sites:
9
- * - Synthesize a drop-in shadcn-compatible design system from the
10
- * aggregated tokens.
11
- * - Emit a per-site **layout mirror**: a Next.js page that reconstructs
12
- * the source's section structure from typed primitives, with
13
- * `<TextSlot>` / `<MediaSlot>` placeholders for the user's copy and
14
- * brand assets.
15
- *
16
- * Output goes to `output/<runId>/`.
17
- *
18
- * Operational defaults (configurable via flags):
19
- * - Honor robots.txt unless `--no-robots` is passed.
20
- * - Per-domain rate limit defaults to 15 req/min (`--rate <n>`).
21
- * - The crawler extracts a structured representation (section tree,
22
- * computed style tokens, content kinds) and writes a verbatim
23
- * `reference/<host>/` bundle (HTML + visible text + media URLs) for AI.
24
- */
25
-
26
- import { mkdirSync, writeFileSync } from "node:fs";
27
- import { dirname, join } from "node:path";
28
- import { fileURLToPath, pathToFileURL } from "node:url";
29
-
30
- import { chromium, type Browser } from "playwright";
31
-
32
- import { harvestTokens } from "./browser-extract.js";
33
- import { crawlLayout } from "./dom-crawler.js";
34
- import { emitAll } from "./emit.js";
35
- import { emitPageReference } from "./reference-dump.js";
36
- import { synthesize } from "./synthesize.js";
37
- import type { ExtractionRun, RawTokens, SiteCapture, SiteLayout } from "./types.js";
38
-
39
- const __filename = fileURLToPath(import.meta.url);
40
- const __dirname = dirname(__filename);
41
- /** Writes under the user's cwd so `npx launchframe` from any folder works. */
42
- const DEFAULT_OUTPUT_ROOT = join(process.cwd(), "output");
43
-
44
- const USER_AGENT =
45
- "launchframe/0.1 (+https://github.com/evangruhlkey/launchframe; design-token research; respects robots.txt)";
46
-
47
- interface CliArgs {
48
- urls: string[];
49
- outDir: string;
50
- viewport: { width: number; height: number };
51
- respectRobots: boolean;
52
- rateLimitPerMinute: number;
53
- runName?: string;
54
- }
55
-
56
- function parseArgs(argv: string[]): CliArgs {
57
- const args: CliArgs = {
58
- urls: [],
59
- outDir: "",
60
- viewport: { width: 1440, height: 900 },
61
- respectRobots: true,
62
- rateLimitPerMinute: 15,
63
- };
64
- for (let i = 0; i < argv.length; i++) {
65
- const a = argv[i]!;
66
- if (a === "--out") args.outDir = argv[++i]!;
67
- else if (a === "--name") args.runName = argv[++i];
68
- else if (a === "--no-robots") args.respectRobots = false;
69
- else if (a === "--rate") args.rateLimitPerMinute = parseInt(argv[++i]!, 10);
70
- else if (a === "--width") args.viewport.width = parseInt(argv[++i]!, 10);
71
- else if (a === "--height") args.viewport.height = parseInt(argv[++i]!, 10);
72
- else if (a === "--help" || a === "-h") {
73
- printHelp();
74
- process.exit(0);
75
- } else if (a.startsWith("http://") || a.startsWith("https://")) {
76
- args.urls.push(a);
77
- } else if (a.startsWith("--")) {
78
- console.error(`Unknown flag: ${a}`);
79
- process.exit(2);
80
- } else {
81
- console.error(`Unrecognized argument: ${a}`);
82
- process.exit(2);
83
- }
84
- }
85
- if (args.urls.length === 0) {
86
- printHelp();
87
- process.exit(2);
88
- }
89
- return args;
90
- }
91
-
92
- function printHelp(): void {
93
- console.log(
94
- [
95
- "Usage:",
96
- " npx launchframe <url> [<url> ...] [options] (from any folder)",
97
- " npm run extract -- <url> [<url> ...] [options] (from this repo)",
98
- "",
99
- "Writes to ./output/<runId>/ in your current working directory unless",
100
- "you pass --out.",
101
- "",
102
- "For each URL the CLI:",
103
- " 1. Renders the page at a desktop viewport in headless Chromium.",
104
- " 2. Captures a full-page screenshot and harvests computed design tokens",
105
- " (colors, type, spacing, radius, shadow) raw/<host>.tokens.json.",
106
- " 3. Writes a verbatim reference bundle reference/<host>/ (page.html,",
107
- " visible-text.json/.txt, media.json, meta.json, FOR_AI_REFERENCE.md).",
108
- " 4. Crawls the DOM into SiteLayout → raw/<host>.layout.json and emits",
109
- " mirror/<host>/page.tsx (Framer Motion + Phosphor + image/video slots).",
110
- "",
111
- "After every URL, a drop-in shadcn-compatible design system is",
112
- "synthesized from the aggregated tokens and written to output/<runId>/.",
113
- "",
114
- "Options:",
115
- " --out <dir> Output directory (default: output/<runId>)",
116
- " --name <slug> Human-friendly slug used in the runId",
117
- " --no-robots Skip robots.txt check (not recommended)",
118
- " --rate <per-min> Per-domain rate limit, default 15",
119
- " --width <px> Viewport width, default 1440",
120
- " --height <px> Viewport height, default 900",
121
- " --help Show this help",
122
- ].join("\n"),
123
- );
124
- }
125
-
126
- /* -------------------------------------------------------------------------- */
127
- /* robots.txt */
128
- /* -------------------------------------------------------------------------- */
129
-
130
- async function isAllowedByRobots(url: string): Promise<boolean> {
131
- try {
132
- const u = new URL(url);
133
- const res = await fetch(`${u.origin}/robots.txt`, {
134
- headers: { "User-Agent": USER_AGENT },
135
- signal: AbortSignal.timeout(8_000),
136
- });
137
- if (!res.ok) return true;
138
- const body = await res.text();
139
- return checkRobots(body, u.pathname);
140
- } catch {
141
- return true;
142
- }
143
- }
144
-
145
- function checkRobots(body: string, pathname: string): boolean {
146
- const lines = body.split(/\r?\n/);
147
- let inStarBlock = false;
148
- const disallow: string[] = [];
149
- const allow: string[] = [];
150
- for (const raw of lines) {
151
- const line = raw.split("#")[0]!.trim();
152
- if (!line) continue;
153
- const idx = line.indexOf(":");
154
- if (idx < 0) continue;
155
- const key = line.slice(0, idx).toLowerCase().trim();
156
- const value = line.slice(idx + 1).trim();
157
- if (key === "user-agent") inStarBlock = value === "*";
158
- else if (inStarBlock && key === "disallow" && value) disallow.push(value);
159
- else if (inStarBlock && key === "allow" && value) allow.push(value);
160
- }
161
- const len = (patterns: string[]) =>
162
- patterns.reduce((m, p) => (pathname.startsWith(p) ? Math.max(m, p.length) : m), -1);
163
- const a = len(allow);
164
- const d = len(disallow);
165
- if (d < 0) return true;
166
- return a >= d;
167
- }
168
-
169
- /* -------------------------------------------------------------------------- */
170
- /* Rate limiter */
171
- /* -------------------------------------------------------------------------- */
172
-
173
- class RateLimiter {
174
- private readonly intervalMs: number;
175
- private readonly lastByHost = new Map<string, number>();
176
- constructor(perMinute: number) {
177
- this.intervalMs = Math.ceil(60_000 / Math.max(1, perMinute));
178
- }
179
- async wait(host: string): Promise<void> {
180
- const last = this.lastByHost.get(host) ?? 0;
181
- const elapsed = Date.now() - last;
182
- if (elapsed < this.intervalMs) {
183
- await new Promise((r) => setTimeout(r, this.intervalMs - elapsed));
184
- }
185
- this.lastByHost.set(host, Date.now());
186
- }
187
- }
188
-
189
- /* -------------------------------------------------------------------------- */
190
- /* Pipeline */
191
- /* -------------------------------------------------------------------------- */
192
-
193
- async function captureOne(
194
- browser: Browser,
195
- url: string,
196
- viewport: { width: number; height: number },
197
- outDir: string,
198
- ): Promise<{ raw: RawTokens; layout: SiteLayout | null; capture: SiteCapture } | null> {
199
- const host = new URL(url).host;
200
- const stamp = `${host}.png`;
201
- const screenshotPath = join(outDir, "screenshots", stamp);
202
- const rawPath = join(outDir, "raw", `${host}.tokens.json`);
203
- const layoutPath = join(outDir, "raw", `${host}.layout.json`);
204
- const mirrorDir = join(outDir, "mirror", host);
205
- const referenceDir = join(outDir, "reference", host);
206
-
207
- const ctx = await browser.newContext({
208
- userAgent: USER_AGENT,
209
- viewport,
210
- deviceScaleFactor: 2,
211
- reducedMotion: "reduce",
212
- });
213
- const page = await ctx.newPage();
214
- try {
215
- const response = await page.goto(url, { waitUntil: "networkidle", timeout: 30_000 });
216
- if (!response || response.status() >= 400) {
217
- throw new Error(`HTTP ${response?.status() ?? "unknown"}`);
218
- }
219
-
220
- await page.evaluate(() => {
221
- const style = document.createElement("style");
222
- style.textContent = `*, *::before, *::after {
223
- animation: none !important;
224
- transition: none !important;
225
- scroll-behavior: auto !important;
226
- }`;
227
- document.head.appendChild(style);
228
- });
229
- await page.waitForTimeout(400);
230
-
231
- mkdirSync(dirname(screenshotPath), { recursive: true });
232
- await page.screenshot({ path: screenshotPath, fullPage: true, type: "png" });
233
-
234
- const raw = await harvestTokens(page, url, viewport);
235
- mkdirSync(dirname(rawPath), { recursive: true });
236
- writeFileSync(rawPath, JSON.stringify(raw, null, 2));
237
-
238
- let referenceWritten: string[] = [];
239
- try {
240
- referenceWritten = await emitPageReference(page, url, referenceDir);
241
- } catch (err) {
242
- console.warn(` ! reference dump failed for ${url}: ${(err as Error).message}`);
243
- }
244
-
245
- let layout: SiteLayout | null = null;
246
- let mirrorWritten: string[] = [];
247
- try {
248
- layout = await crawlLayout(page, url, viewport);
249
- mkdirSync(dirname(layoutPath), { recursive: true });
250
- writeFileSync(layoutPath, JSON.stringify(layout, null, 2));
251
- mirrorWritten = emitMirror(layout, mirrorDir);
252
- } catch (err) {
253
- console.warn(` ! layout crawl failed for ${url}: ${(err as Error).message}`);
254
- }
255
-
256
- const capture: SiteCapture = {
257
- url,
258
- host,
259
- capturedAt: raw.capturedAt,
260
- screenshotPath,
261
- rawTokensPath: rawPath,
262
- ...(referenceWritten.length > 0 ? { referenceDir } : {}),
263
- ...(layout ? { layoutPath } : {}),
264
- ...(mirrorWritten.length > 0 ? { mirrorDir } : {}),
265
- status: "ok",
266
- };
267
- return { raw, layout, capture };
268
- } catch (err) {
269
- return {
270
- raw: emptyRaw(url, viewport),
271
- layout: null,
272
- capture: {
273
- url,
274
- host,
275
- capturedAt: new Date().toISOString(),
276
- screenshotPath: "",
277
- rawTokensPath: "",
278
- status: "failed",
279
- reason: (err as Error).message,
280
- },
281
- };
282
- } finally {
283
- await ctx.close();
284
- }
285
- }
286
-
287
- function emptyRaw(url: string, viewport: { width: number; height: number }): RawTokens {
288
- return {
289
- url,
290
- capturedAt: new Date().toISOString(),
291
- viewport,
292
- colors: [],
293
- typography: [],
294
- spacing: [],
295
- radii: [],
296
- shadows: [],
297
- dominantContainerPx: null,
298
- };
299
- }
300
-
301
- async function main(): Promise<void> {
302
- const args = parseArgs(process.argv.slice(2));
303
- const startedAt = new Date().toISOString();
304
- const runId = makeRunId(startedAt, args.runName);
305
- const outDir = args.outDir || join(DEFAULT_OUTPUT_ROOT, runId);
306
-
307
- console.log(`[extract] runId=${runId}`);
308
- console.log(`[extract] urls=${args.urls.length} viewport=${args.viewport.width}x${args.viewport.height}`);
309
- console.log(`[extract] output=${outDir}`);
310
- console.log("");
311
-
312
- mkdirSync(outDir, { recursive: true });
313
-
314
- const limiter = new RateLimiter(args.rateLimitPerMinute);
315
- const captures: SiteCapture[] = [];
316
- const rawList: RawTokens[] = [];
317
-
318
- let browser: Browser | null = null;
319
- try {
320
- browser = await chromium.launch();
321
-
322
- for (const url of args.urls) {
323
- const host = new URL(url).host;
324
-
325
- if (args.respectRobots) {
326
- const allowed = await isAllowedByRobots(url);
327
- if (!allowed) {
328
- console.log(` ⊘ ${url} skipped robots.txt disallows`);
329
- captures.push({
330
- url,
331
- host,
332
- capturedAt: new Date().toISOString(),
333
- screenshotPath: "",
334
- rawTokensPath: "",
335
- status: "skipped",
336
- reason: "robots.txt",
337
- });
338
- continue;
339
- }
340
- }
341
-
342
- await limiter.wait(host);
343
- const result = await captureOne(browser, url, args.viewport, outDir);
344
- if (!result) continue;
345
- captures.push(result.capture);
346
- if (result.capture.status === "ok") {
347
- rawList.push(result.raw);
348
- const tag = result.layout ? "mirror" : "tokens-only";
349
- const sectionCount = result.layout?.sections.length ?? 0;
350
- console.log(
351
- ` ✓ ${url} → ${tag}${result.layout ? ` (${sectionCount} sections)` : ""}`,
352
- );
353
- } else {
354
- console.log(` ✗ ${url} ${result.capture.reason ?? ""}`);
355
- }
356
- }
357
- } finally {
358
- if (browser) await browser.close();
359
- }
360
-
361
- if (rawList.length === 0) {
362
- console.error("[extract] no successful captures — nothing to synthesize.");
363
- process.exit(1);
364
- }
365
-
366
- console.log("");
367
- console.log(`[extract] synthesizing design system from ${rawList.length} site(s)...`);
368
- const designSystem = synthesize(rawList, {
369
- runId,
370
- sources: rawList.map((r) => ({ url: r.url, capturedAt: r.capturedAt })),
371
- });
372
-
373
- const run: ExtractionRun = {
374
- runId,
375
- startedAt,
376
- finishedAt: new Date().toISOString(),
377
- outputDir: outDir,
378
- captures,
379
- designSystem,
380
- };
381
-
382
- const written = emitAll(designSystem, run);
383
- writeFileSync(join(outDir, "run.json"), JSON.stringify(run, null, 2));
384
- console.log("");
385
- console.log("[extract] wrote:");
386
- for (const f of written) console.log(` → ${f}`);
387
- console.log(` → ${join(outDir, "run.json")}`);
388
- const mirrorDirs = captures.filter((c) => c.mirrorDir).map((c) => c.mirrorDir!);
389
- const referenceDirs = captures.filter((c) => c.referenceDir).map((c) => c.referenceDir!);
390
- if (mirrorDirs.length > 0) {
391
- console.log("");
392
- console.log("[extract] layout mirrors:");
393
- for (const d of mirrorDirs) console.log(` → ${d}/page.tsx`);
394
- }
395
- if (referenceDirs.length > 0) {
396
- console.log("");
397
- console.log("[extract] AI reference (verbatim DOM + copy):");
398
- for (const d of referenceDirs) console.log(` → ${d}/FOR_AI_REFERENCE.md`);
399
- }
400
- console.log("");
401
- console.log(`[extract] done. Open ${join(outDir, "REPORT.md")} for the design-system summary.`);
402
- if (mirrorDirs.length > 0) {
403
- console.log(
404
- `[extract] each mirror folder ships a Next.js page.tsx + MIRROR_NOTES.md.`,
405
- );
406
- console.log(`[extract] fill the <TextSlot> / <MediaSlot> placeholders with your own content.`);
407
- }
408
- if (referenceDirs.length > 0) {
409
- console.log(
410
- `[extract] paste reference/<host>/visible-text.txt or page.html into your AI for exact structure + copy.`,
411
- );
412
- }
413
- console.log(`[extract] AI handoff: ${join(outDir, "FOR_AI.md")}`);
414
- }
415
-
416
- function makeRunId(startedAt: string, name: string | undefined): string {
417
- const stamp = startedAt.replace(/[-:T]/g, "").slice(0, 14);
418
- return name ? `${stamp}-${name}` : stamp;
419
- }
420
-
421
- if (isMainModule(import.meta.url)) {
422
- main().catch((err) => {
423
- console.error(err);
424
- process.exit(1);
425
- });
426
- }
427
-
428
- /**
429
- * Cross-platform entry-point check. On Windows, `process.argv[1]` is a
430
- * backslash path while `import.meta.url` is a proper file URL, so the
431
- * naive `file://${argv[1]}` template literal never matches and the
432
- * script silently exits. `pathToFileURL` produces the encoded URL form
433
- * on every platform.
434
- */
435
- function isMainModule(metaUrl: string): boolean {
436
- const entry = process.argv[1];
437
- if (!entry) return false;
438
- return metaUrl === pathToFileURL(entry).href;
439
- }
440
-
441
- export { main };
1
+ /**
2
+ * `extract` — the headline command.
3
+ *
4
+ * npm run extract -- https://site-a.com https://site-b.com https://site-c.com
5
+ *
6
+ * For each URL: open in Chromium, screenshot, harvest computed design
7
+ * tokens via `browser-extract.ts`, and crawl the rendered DOM into a
8
+ * typed `SiteLayout` model via `dom-crawler.ts`. After all sites:
9
+ * - Synthesize a drop-in shadcn-compatible design system from the
10
+ * aggregated tokens.
11
+ * - Emit a per-site **layout mirror**: a Next.js page that reconstructs
12
+ * the source's section structure from typed primitives, with
13
+ * `<TextSlot>` / `<MediaSlot>` placeholders for the user's copy and
14
+ * brand assets.
15
+ *
16
+ * Output goes to `output/<runId>/`.
17
+ *
18
+ * Operational defaults (configurable via flags):
19
+ * - Honor robots.txt unless `--no-robots` is passed.
20
+ * - Per-domain rate limit defaults to 15 req/min (`--rate <n>`).
21
+ * - The crawler extracts a structured representation (section tree,
22
+ * computed style tokens, content kinds) and writes a verbatim
23
+ * `reference/<host>/` bundle (HTML, DOM tree JSON, outlines, visible text,
24
+ * media index) for AI structure cloning.
25
+ */
26
+
27
+ import { mkdirSync, writeFileSync } from "node:fs";
28
+ import { dirname, join } from "node:path";
29
+ import { fileURLToPath, pathToFileURL } from "node:url";
30
+
31
+ import { chromium, type Browser } from "playwright";
32
+
33
+ import { harvestTokens } from "./browser-extract.js";
34
+ import { crawlLayout } from "./dom-crawler.js";
35
+ import { emitAll } from "./emit.js";
36
+ import { emitPageReference } from "./reference-dump.js";
37
+ import { synthesize } from "./synthesize.js";
38
+ import type { ExtractionRun, RawTokens, SiteCapture, SiteLayout } from "./types.js";
39
+
40
+ const __filename = fileURLToPath(import.meta.url);
41
+ const __dirname = dirname(__filename);
42
+ /** Writes under the user's cwd so `npx launchframe` from any folder works. */
43
+ const DEFAULT_OUTPUT_ROOT = join(process.cwd(), "output");
44
+
45
+ const USER_AGENT =
46
+ "launchframe/0.1 (+https://github.com/evangruhlkey/launchframe; design-token research; respects robots.txt)";
47
+
48
+ interface CliArgs {
49
+ urls: string[];
50
+ outDir: string;
51
+ viewport: { width: number; height: number };
52
+ respectRobots: boolean;
53
+ rateLimitPerMinute: number;
54
+ runName?: string;
55
+ }
56
+
57
+ function parseArgs(argv: string[]): CliArgs {
58
+ const args: CliArgs = {
59
+ urls: [],
60
+ outDir: "",
61
+ viewport: { width: 1440, height: 900 },
62
+ respectRobots: true,
63
+ rateLimitPerMinute: 15,
64
+ };
65
+ for (let i = 0; i < argv.length; i++) {
66
+ const a = argv[i]!;
67
+ if (a === "--out") args.outDir = argv[++i]!;
68
+ else if (a === "--name") args.runName = argv[++i];
69
+ else if (a === "--no-robots") args.respectRobots = false;
70
+ else if (a === "--rate") args.rateLimitPerMinute = parseInt(argv[++i]!, 10);
71
+ else if (a === "--width") args.viewport.width = parseInt(argv[++i]!, 10);
72
+ else if (a === "--height") args.viewport.height = parseInt(argv[++i]!, 10);
73
+ else if (a === "--help" || a === "-h") {
74
+ printHelp();
75
+ process.exit(0);
76
+ } else if (a.startsWith("http://") || a.startsWith("https://")) {
77
+ args.urls.push(a);
78
+ } else if (a.startsWith("--")) {
79
+ console.error(`Unknown flag: ${a}`);
80
+ process.exit(2);
81
+ } else {
82
+ console.error(`Unrecognized argument: ${a}`);
83
+ process.exit(2);
84
+ }
85
+ }
86
+ if (args.urls.length === 0) {
87
+ printHelp();
88
+ process.exit(2);
89
+ }
90
+ return args;
91
+ }
92
+
93
+ function printHelp(): void {
94
+ console.log(
95
+ [
96
+ "Usage:",
97
+ " npx launchframe <url> [<url> ...] [options] (from any folder)",
98
+ " npm run extract -- <url> [<url> ...] [options] (from this repo)",
99
+ "",
100
+ "Writes to ./output/<runId>/ in your current working directory unless",
101
+ "you pass --out.",
102
+ "",
103
+ "For each URL the CLI:",
104
+ " 1. Renders the page at a desktop viewport in headless Chromium.",
105
+ " 2. Captures a full-page screenshot and harvests computed design tokens",
106
+ " (colors, type, spacing, radius, shadow)raw/<host>.tokens.json.",
107
+ " 3. Writes a verbatim reference bundle → reference/<host>/ (page.html,",
108
+ " dom-structure.json, structure-outline.txt, visible-text.json/.txt,",
109
+ " media.json, meta.json, FOR_AI_REFERENCE.md).",
110
+ " 4. Crawls the DOM into SiteLayout → raw/<host>.layout.json and emits",
111
+ " mirror/<host>/page.tsx (Framer Motion + Phosphor + image/video slots).",
112
+ "",
113
+ "After every URL, a drop-in shadcn-compatible design system is",
114
+ "synthesized from the aggregated tokens and written to output/<runId>/.",
115
+ "",
116
+ "Options:",
117
+ " --out <dir> Output directory (default: output/<runId>)",
118
+ " --name <slug> Human-friendly slug used in the runId",
119
+ " --no-robots Skip robots.txt check (not recommended)",
120
+ " --rate <per-min> Per-domain rate limit, default 15",
121
+ " --width <px> Viewport width, default 1440",
122
+ " --height <px> Viewport height, default 900",
123
+ " --help Show this help",
124
+ ].join("\n"),
125
+ );
126
+ }
127
+
128
+ /* -------------------------------------------------------------------------- */
129
+ /* robots.txt */
130
+ /* -------------------------------------------------------------------------- */
131
+
132
+ async function isAllowedByRobots(url: string): Promise<boolean> {
133
+ try {
134
+ const u = new URL(url);
135
+ const res = await fetch(`${u.origin}/robots.txt`, {
136
+ headers: { "User-Agent": USER_AGENT },
137
+ signal: AbortSignal.timeout(8_000),
138
+ });
139
+ if (!res.ok) return true;
140
+ const body = await res.text();
141
+ return checkRobots(body, u.pathname);
142
+ } catch {
143
+ return true;
144
+ }
145
+ }
146
+
147
+ function checkRobots(body: string, pathname: string): boolean {
148
+ const lines = body.split(/\r?\n/);
149
+ let inStarBlock = false;
150
+ const disallow: string[] = [];
151
+ const allow: string[] = [];
152
+ for (const raw of lines) {
153
+ const line = raw.split("#")[0]!.trim();
154
+ if (!line) continue;
155
+ const idx = line.indexOf(":");
156
+ if (idx < 0) continue;
157
+ const key = line.slice(0, idx).toLowerCase().trim();
158
+ const value = line.slice(idx + 1).trim();
159
+ if (key === "user-agent") inStarBlock = value === "*";
160
+ else if (inStarBlock && key === "disallow" && value) disallow.push(value);
161
+ else if (inStarBlock && key === "allow" && value) allow.push(value);
162
+ }
163
+ const len = (patterns: string[]) =>
164
+ patterns.reduce((m, p) => (pathname.startsWith(p) ? Math.max(m, p.length) : m), -1);
165
+ const a = len(allow);
166
+ const d = len(disallow);
167
+ if (d < 0) return true;
168
+ return a >= d;
169
+ }
170
+
171
+ /* -------------------------------------------------------------------------- */
172
+ /* Rate limiter */
173
+ /* -------------------------------------------------------------------------- */
174
+
175
+ class RateLimiter {
176
+ private readonly intervalMs: number;
177
+ private readonly lastByHost = new Map<string, number>();
178
+ constructor(perMinute: number) {
179
+ this.intervalMs = Math.ceil(60_000 / Math.max(1, perMinute));
180
+ }
181
+ async wait(host: string): Promise<void> {
182
+ const last = this.lastByHost.get(host) ?? 0;
183
+ const elapsed = Date.now() - last;
184
+ if (elapsed < this.intervalMs) {
185
+ await new Promise((r) => setTimeout(r, this.intervalMs - elapsed));
186
+ }
187
+ this.lastByHost.set(host, Date.now());
188
+ }
189
+ }
190
+
191
+ /* -------------------------------------------------------------------------- */
192
+ /* Pipeline */
193
+ /* -------------------------------------------------------------------------- */
194
+
195
+ async function captureOne(
196
+ browser: Browser,
197
+ url: string,
198
+ viewport: { width: number; height: number },
199
+ outDir: string,
200
+ ): Promise<{ raw: RawTokens; layout: SiteLayout | null; capture: SiteCapture } | null> {
201
+ const host = new URL(url).host;
202
+ const stamp = `${host}.png`;
203
+ const screenshotPath = join(outDir, "screenshots", stamp);
204
+ const rawPath = join(outDir, "raw", `${host}.tokens.json`);
205
+ const layoutPath = join(outDir, "raw", `${host}.layout.json`);
206
+ const mirrorDir = join(outDir, "mirror", host);
207
+ const referenceDir = join(outDir, "reference", host);
208
+
209
+ const ctx = await browser.newContext({
210
+ userAgent: USER_AGENT,
211
+ viewport,
212
+ deviceScaleFactor: 2,
213
+ reducedMotion: "reduce",
214
+ });
215
+ const page = await ctx.newPage();
216
+ try {
217
+ const response = await page.goto(url, { waitUntil: "networkidle", timeout: 30_000 });
218
+ if (!response || response.status() >= 400) {
219
+ throw new Error(`HTTP ${response?.status() ?? "unknown"}`);
220
+ }
221
+
222
+ await page.evaluate(() => {
223
+ const style = document.createElement("style");
224
+ style.textContent = `*, *::before, *::after {
225
+ animation: none !important;
226
+ transition: none !important;
227
+ scroll-behavior: auto !important;
228
+ }`;
229
+ document.head.appendChild(style);
230
+ });
231
+ await page.waitForTimeout(400);
232
+
233
+ mkdirSync(dirname(screenshotPath), { recursive: true });
234
+ await page.screenshot({ path: screenshotPath, fullPage: true, type: "png" });
235
+
236
+ const raw = await harvestTokens(page, url, viewport);
237
+ mkdirSync(dirname(rawPath), { recursive: true });
238
+ writeFileSync(rawPath, JSON.stringify(raw, null, 2));
239
+
240
+ let referenceWritten: string[] = [];
241
+ try {
242
+ referenceWritten = await emitPageReference(page, url, referenceDir, viewport);
243
+ } catch (err) {
244
+ console.warn(` ! reference dump failed for ${url}: ${(err as Error).message}`);
245
+ }
246
+
247
+ let layout: SiteLayout | null = null;
248
+ let mirrorWritten: string[] = [];
249
+ try {
250
+ layout = await crawlLayout(page, url, viewport);
251
+ mkdirSync(dirname(layoutPath), { recursive: true });
252
+ writeFileSync(layoutPath, JSON.stringify(layout, null, 2));
253
+ mirrorWritten = emitMirror(layout, mirrorDir);
254
+ } catch (err) {
255
+ console.warn(` ! layout crawl failed for ${url}: ${(err as Error).message}`);
256
+ }
257
+
258
+ const capture: SiteCapture = {
259
+ url,
260
+ host,
261
+ capturedAt: raw.capturedAt,
262
+ screenshotPath,
263
+ rawTokensPath: rawPath,
264
+ ...(referenceWritten.length > 0 ? { referenceDir } : {}),
265
+ ...(layout ? { layoutPath } : {}),
266
+ ...(mirrorWritten.length > 0 ? { mirrorDir } : {}),
267
+ status: "ok",
268
+ };
269
+ return { raw, layout, capture };
270
+ } catch (err) {
271
+ return {
272
+ raw: emptyRaw(url, viewport),
273
+ layout: null,
274
+ capture: {
275
+ url,
276
+ host,
277
+ capturedAt: new Date().toISOString(),
278
+ screenshotPath: "",
279
+ rawTokensPath: "",
280
+ status: "failed",
281
+ reason: (err as Error).message,
282
+ },
283
+ };
284
+ } finally {
285
+ await ctx.close();
286
+ }
287
+ }
288
+
289
+ function emptyRaw(url: string, viewport: { width: number; height: number }): RawTokens {
290
+ return {
291
+ url,
292
+ capturedAt: new Date().toISOString(),
293
+ viewport,
294
+ colors: [],
295
+ typography: [],
296
+ spacing: [],
297
+ radii: [],
298
+ shadows: [],
299
+ dominantContainerPx: null,
300
+ };
301
+ }
302
+
303
+ async function main(): Promise<void> {
304
+ const args = parseArgs(process.argv.slice(2));
305
+ const startedAt = new Date().toISOString();
306
+ const runId = makeRunId(startedAt, args.runName);
307
+ const outDir = args.outDir || join(DEFAULT_OUTPUT_ROOT, runId);
308
+
309
+ console.log(`[extract] runId=${runId}`);
310
+ console.log(`[extract] urls=${args.urls.length} viewport=${args.viewport.width}x${args.viewport.height}`);
311
+ console.log(`[extract] output=${outDir}`);
312
+ console.log("");
313
+
314
+ mkdirSync(outDir, { recursive: true });
315
+
316
+ const limiter = new RateLimiter(args.rateLimitPerMinute);
317
+ const captures: SiteCapture[] = [];
318
+ const rawList: RawTokens[] = [];
319
+
320
+ let browser: Browser | null = null;
321
+ try {
322
+ browser = await chromium.launch();
323
+
324
+ for (const url of args.urls) {
325
+ const host = new URL(url).host;
326
+
327
+ if (args.respectRobots) {
328
+ const allowed = await isAllowedByRobots(url);
329
+ if (!allowed) {
330
+ console.log(` ⊘ ${url} skipped — robots.txt disallows`);
331
+ captures.push({
332
+ url,
333
+ host,
334
+ capturedAt: new Date().toISOString(),
335
+ screenshotPath: "",
336
+ rawTokensPath: "",
337
+ status: "skipped",
338
+ reason: "robots.txt",
339
+ });
340
+ continue;
341
+ }
342
+ }
343
+
344
+ await limiter.wait(host);
345
+ const result = await captureOne(browser, url, args.viewport, outDir);
346
+ if (!result) continue;
347
+ captures.push(result.capture);
348
+ if (result.capture.status === "ok") {
349
+ rawList.push(result.raw);
350
+ const tag = result.layout ? "mirror" : "tokens-only";
351
+ const sectionCount = result.layout?.sections.length ?? 0;
352
+ console.log(
353
+ ` ✓ ${url} ${tag}${result.layout ? ` (${sectionCount} sections)` : ""}`,
354
+ );
355
+ } else {
356
+ console.log(` ✗ ${url} ${result.capture.reason ?? ""}`);
357
+ }
358
+ }
359
+ } finally {
360
+ if (browser) await browser.close();
361
+ }
362
+
363
+ if (rawList.length === 0) {
364
+ console.error("[extract] no successful captures — nothing to synthesize.");
365
+ process.exit(1);
366
+ }
367
+
368
+ console.log("");
369
+ console.log(`[extract] synthesizing design system from ${rawList.length} site(s)...`);
370
+ const designSystem = synthesize(rawList, {
371
+ runId,
372
+ sources: rawList.map((r) => ({ url: r.url, capturedAt: r.capturedAt })),
373
+ });
374
+
375
+ const run: ExtractionRun = {
376
+ runId,
377
+ startedAt,
378
+ finishedAt: new Date().toISOString(),
379
+ outputDir: outDir,
380
+ captures,
381
+ designSystem,
382
+ };
383
+
384
+ const written = emitAll(designSystem, run);
385
+ writeFileSync(join(outDir, "run.json"), JSON.stringify(run, null, 2));
386
+ console.log("");
387
+ console.log("[extract] wrote:");
388
+ for (const f of written) console.log(` → ${f}`);
389
+ console.log(` → ${join(outDir, "run.json")}`);
390
+ const mirrorDirs = captures.filter((c) => c.mirrorDir).map((c) => c.mirrorDir!);
391
+ const referenceDirs = captures.filter((c) => c.referenceDir).map((c) => c.referenceDir!);
392
+ if (mirrorDirs.length > 0) {
393
+ console.log("");
394
+ console.log("[extract] layout mirrors:");
395
+ for (const d of mirrorDirs) console.log(` → ${d}/page.tsx`);
396
+ }
397
+ if (referenceDirs.length > 0) {
398
+ console.log("");
399
+ console.log("[extract] AI reference (verbatim DOM + copy):");
400
+ for (const d of referenceDirs) console.log(` → ${d}/FOR_AI_REFERENCE.md`);
401
+ }
402
+ console.log("");
403
+ console.log(`[extract] done. Open ${join(outDir, "REPORT.md")} for the design-system summary.`);
404
+ if (mirrorDirs.length > 0) {
405
+ console.log(
406
+ `[extract] each mirror folder ships a Next.js page.tsx + MIRROR_NOTES.md.`,
407
+ );
408
+ console.log(`[extract] fill the <TextSlot> / <MediaSlot> placeholders with your own content.`);
409
+ }
410
+ if (referenceDirs.length > 0) {
411
+ console.log(
412
+ `[extract] paste reference/<host>/visible-text.txt or page.html into your AI for exact structure + copy.`,
413
+ );
414
+ }
415
+ console.log(`[extract] AI handoff: ${join(outDir, "FOR_AI.md")}`);
416
+ }
417
+
418
+ function makeRunId(startedAt: string, name: string | undefined): string {
419
+ const stamp = startedAt.replace(/[-:T]/g, "").slice(0, 14);
420
+ return name ? `${stamp}-${name}` : stamp;
421
+ }
422
+
423
+ if (isMainModule(import.meta.url)) {
424
+ main().catch((err) => {
425
+ console.error(err);
426
+ process.exit(1);
427
+ });
428
+ }
429
+
430
+ /**
431
+ * Cross-platform entry-point check. On Windows, `process.argv[1]` is a
432
+ * backslash path while `import.meta.url` is a proper file URL, so the
433
+ * naive `file://${argv[1]}` template literal never matches and the
434
+ * script silently exits. `pathToFileURL` produces the encoded URL form
435
+ * on every platform.
436
+ */
437
+ function isMainModule(metaUrl: string): boolean {
438
+ const entry = process.argv[1];
439
+ if (!entry) return false;
440
+ return metaUrl === pathToFileURL(entry).href;
441
+ }
442
+
443
+ export { main };