launchframe 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +229 -229
- package/package.json +1 -1
- package/packages/extract/dom-crawler.ts +521 -521
- package/packages/extract/emit.ts +534 -466
- package/packages/extract/extract.ts +441 -441
- package/packages/extract/mirror-emit.ts +617 -617
- package/packages/extract/reference-dump.ts +232 -230
- package/packages/extract/types.ts +311 -311
|
@@ -1,230 +1,232 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Verbatim reference dump for AI / human review.
|
|
3
|
-
*
|
|
4
|
-
* Writes everything under `output/<runId>/reference/<host>/`:
|
|
5
|
-
* - page.html — full document HTML after JS render (`page.content()`)
|
|
6
|
-
* - visible-text.json — structured visible copy (headings, buttons, key blocks)
|
|
7
|
-
* - media.json — img / video / source URLs and attributes
|
|
8
|
-
* - meta.json — title, description, canonical, lang
|
|
9
|
-
* - FOR_AI_REFERENCE.md — how to use these files with an AI
|
|
10
|
-
*/
|
|
11
|
-
|
|
12
|
-
import { mkdirSync, writeFileSync } from "node:fs";
|
|
13
|
-
import { join } from "node:path";
|
|
14
|
-
|
|
15
|
-
import type { Page } from "playwright";
|
|
16
|
-
|
|
17
|
-
export interface ReferenceSnapshot {
|
|
18
|
-
url: string;
|
|
19
|
-
capturedAt: string;
|
|
20
|
-
title: string | null;
|
|
21
|
-
description: string | null;
|
|
22
|
-
canonical: string | null;
|
|
23
|
-
lang: string | null;
|
|
24
|
-
/** Flattened visible strings in DOM order (useful for grep / LLM context). */
|
|
25
|
-
visibleTextBlocks: Array<{
|
|
26
|
-
tag: string;
|
|
27
|
-
role: string | null;
|
|
28
|
-
text: string;
|
|
29
|
-
}>;
|
|
30
|
-
links: Array<{ href: string; text: string }>;
|
|
31
|
-
media: Array<
|
|
32
|
-
| { type: "img"; src: string; alt: string; width: number | null; height: number | null }
|
|
33
|
-
| { type: "video"; src: string | null; poster: string | null }
|
|
34
|
-
| { type: "source"; src: string; kind: string | null }
|
|
35
|
-
>;
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
export async function emitPageReference(page: Page, url: string, refDir: string): Promise<string[]> {
|
|
39
|
-
mkdirSync(refDir, { recursive: true });
|
|
40
|
-
const written: string[] = [];
|
|
41
|
-
const capturedAt = new Date().toISOString();
|
|
42
|
-
|
|
43
|
-
await page.evaluate(() => {
|
|
44
|
-
const g = globalThis as unknown as { __name?: (fn: unknown) => unknown };
|
|
45
|
-
if (typeof g.__name === "undefined") g.__name = (fn: unknown) => fn;
|
|
46
|
-
});
|
|
47
|
-
|
|
48
|
-
const html = await page.content();
|
|
49
|
-
const htmlPath = join(refDir, "page.html");
|
|
50
|
-
writeFileSync(htmlPath, html, "utf8");
|
|
51
|
-
written.push(htmlPath);
|
|
52
|
-
|
|
53
|
-
const snapshot = (await page.evaluate(collectSnapshot)) as Omit<ReferenceSnapshot, "url" | "capturedAt">;
|
|
54
|
-
const full: ReferenceSnapshot = {
|
|
55
|
-
url,
|
|
56
|
-
capturedAt,
|
|
57
|
-
...snapshot,
|
|
58
|
-
};
|
|
59
|
-
|
|
60
|
-
writeFileSync(join(refDir, "visible-text.json"), JSON.stringify(full, null, 2) + "\n", "utf8");
|
|
61
|
-
written.push(join(refDir, "visible-text.json"));
|
|
62
|
-
|
|
63
|
-
const txtLines = [
|
|
64
|
-
`# ${full.title ?? "Untitled"}`,
|
|
65
|
-
"",
|
|
66
|
-
...full.visibleTextBlocks.map((b) => b.text),
|
|
67
|
-
"",
|
|
68
|
-
"--- links ---",
|
|
69
|
-
...full.links.map((l) => `${l.text}\t${l.href}`),
|
|
70
|
-
];
|
|
71
|
-
writeFileSync(join(refDir, "visible-text.txt"), txtLines.join("\n"), "utf8");
|
|
72
|
-
written.push(join(refDir, "visible-text.txt"));
|
|
73
|
-
|
|
74
|
-
const mediaOnly = { url, capturedAt, media: full.media };
|
|
75
|
-
writeFileSync(join(refDir, "media.json"), JSON.stringify(mediaOnly, null, 2) + "\n", "utf8");
|
|
76
|
-
written.push(join(refDir, "media.json"));
|
|
77
|
-
|
|
78
|
-
const meta = {
|
|
79
|
-
url,
|
|
80
|
-
capturedAt,
|
|
81
|
-
title: full.title,
|
|
82
|
-
description: full.description,
|
|
83
|
-
canonical: full.canonical,
|
|
84
|
-
lang: full.lang,
|
|
85
|
-
};
|
|
86
|
-
writeFileSync(join(refDir, "meta.json"), JSON.stringify(meta, null, 2) + "\n", "utf8");
|
|
87
|
-
written.push(join(refDir, "meta.json"));
|
|
88
|
-
|
|
89
|
-
writeFileSync(join(refDir, "FOR_AI_REFERENCE.md"), emitAiReadme(url, refDir), "utf8");
|
|
90
|
-
written.push(join(refDir, "FOR_AI_REFERENCE.md"));
|
|
91
|
-
|
|
92
|
-
return written;
|
|
93
|
-
}
|
|
94
|
-
|
|
95
|
-
function emitAiReadme(url: string, refDir: string): string {
|
|
96
|
-
const base = refDir.replace(/\\/g, "/");
|
|
97
|
-
return [
|
|
98
|
-
`# Reference capture — ${url}`,
|
|
99
|
-
"",
|
|
100
|
-
"Use these files when rebuilding the page in React / Next.js:",
|
|
101
|
-
"",
|
|
102
|
-
"| File | Purpose |",
|
|
103
|
-
"| ---- | ------- |",
|
|
104
|
-
"| `page.html` | Full serialized DOM after JavaScript ran in Chromium. Layout, copy, and structure match what crawled (not necessarily valid static HTML elsewhere). |",
|
|
105
|
-
"| `visible-text.json` | Exact visible strings: headings, buttons, links, and block text — good for **verbatim copy** when rewriting `page.tsx`. |",
|
|
106
|
-
"| `media.json` | Every image / video / source URL from the DOM. Host your own assets or swap for placeholders; do not hotlink without permission. |",
|
|
107
|
-
"| `meta.json` | Title, description, lang. |",
|
|
108
|
-
"",
|
|
109
|
-
|
|
110
|
-
"",
|
|
111
|
-
`
|
|
112
|
-
"",
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
const
|
|
123
|
-
const
|
|
124
|
-
const
|
|
125
|
-
const
|
|
126
|
-
|
|
127
|
-
const
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
if (
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
if (
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
if (
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
if (
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
const
|
|
169
|
-
if (
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
const
|
|
180
|
-
if (!
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
links.push({ href, text: text.slice(0, 500) });
|
|
186
|
-
}
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
1
|
+
/**
|
|
2
|
+
* Verbatim reference dump for AI / human review.
|
|
3
|
+
*
|
|
4
|
+
* Writes everything under `output/<runId>/reference/<host>/`:
|
|
5
|
+
* - page.html — full document HTML after JS render (`page.content()`)
|
|
6
|
+
* - visible-text.json — structured visible copy (headings, buttons, key blocks)
|
|
7
|
+
* - media.json — img / video / source URLs and attributes
|
|
8
|
+
* - meta.json — title, description, canonical, lang
|
|
9
|
+
* - FOR_AI_REFERENCE.md — how to use these files with an AI
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import { mkdirSync, writeFileSync } from "node:fs";
|
|
13
|
+
import { join } from "node:path";
|
|
14
|
+
|
|
15
|
+
import type { Page } from "playwright";
|
|
16
|
+
|
|
17
|
+
export interface ReferenceSnapshot {
|
|
18
|
+
url: string;
|
|
19
|
+
capturedAt: string;
|
|
20
|
+
title: string | null;
|
|
21
|
+
description: string | null;
|
|
22
|
+
canonical: string | null;
|
|
23
|
+
lang: string | null;
|
|
24
|
+
/** Flattened visible strings in DOM order (useful for grep / LLM context). */
|
|
25
|
+
visibleTextBlocks: Array<{
|
|
26
|
+
tag: string;
|
|
27
|
+
role: string | null;
|
|
28
|
+
text: string;
|
|
29
|
+
}>;
|
|
30
|
+
links: Array<{ href: string; text: string }>;
|
|
31
|
+
media: Array<
|
|
32
|
+
| { type: "img"; src: string; alt: string; width: number | null; height: number | null }
|
|
33
|
+
| { type: "video"; src: string | null; poster: string | null }
|
|
34
|
+
| { type: "source"; src: string; kind: string | null }
|
|
35
|
+
>;
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
export async function emitPageReference(page: Page, url: string, refDir: string): Promise<string[]> {
|
|
39
|
+
mkdirSync(refDir, { recursive: true });
|
|
40
|
+
const written: string[] = [];
|
|
41
|
+
const capturedAt = new Date().toISOString();
|
|
42
|
+
|
|
43
|
+
await page.evaluate(() => {
|
|
44
|
+
const g = globalThis as unknown as { __name?: (fn: unknown) => unknown };
|
|
45
|
+
if (typeof g.__name === "undefined") g.__name = (fn: unknown) => fn;
|
|
46
|
+
});
|
|
47
|
+
|
|
48
|
+
const html = await page.content();
|
|
49
|
+
const htmlPath = join(refDir, "page.html");
|
|
50
|
+
writeFileSync(htmlPath, html, "utf8");
|
|
51
|
+
written.push(htmlPath);
|
|
52
|
+
|
|
53
|
+
const snapshot = (await page.evaluate(collectSnapshot)) as Omit<ReferenceSnapshot, "url" | "capturedAt">;
|
|
54
|
+
const full: ReferenceSnapshot = {
|
|
55
|
+
url,
|
|
56
|
+
capturedAt,
|
|
57
|
+
...snapshot,
|
|
58
|
+
};
|
|
59
|
+
|
|
60
|
+
writeFileSync(join(refDir, "visible-text.json"), JSON.stringify(full, null, 2) + "\n", "utf8");
|
|
61
|
+
written.push(join(refDir, "visible-text.json"));
|
|
62
|
+
|
|
63
|
+
const txtLines = [
|
|
64
|
+
`# ${full.title ?? "Untitled"}`,
|
|
65
|
+
"",
|
|
66
|
+
...full.visibleTextBlocks.map((b) => b.text),
|
|
67
|
+
"",
|
|
68
|
+
"--- links ---",
|
|
69
|
+
...full.links.map((l) => `${l.text}\t${l.href}`),
|
|
70
|
+
];
|
|
71
|
+
writeFileSync(join(refDir, "visible-text.txt"), txtLines.join("\n"), "utf8");
|
|
72
|
+
written.push(join(refDir, "visible-text.txt"));
|
|
73
|
+
|
|
74
|
+
const mediaOnly = { url, capturedAt, media: full.media };
|
|
75
|
+
writeFileSync(join(refDir, "media.json"), JSON.stringify(mediaOnly, null, 2) + "\n", "utf8");
|
|
76
|
+
written.push(join(refDir, "media.json"));
|
|
77
|
+
|
|
78
|
+
const meta = {
|
|
79
|
+
url,
|
|
80
|
+
capturedAt,
|
|
81
|
+
title: full.title,
|
|
82
|
+
description: full.description,
|
|
83
|
+
canonical: full.canonical,
|
|
84
|
+
lang: full.lang,
|
|
85
|
+
};
|
|
86
|
+
writeFileSync(join(refDir, "meta.json"), JSON.stringify(meta, null, 2) + "\n", "utf8");
|
|
87
|
+
written.push(join(refDir, "meta.json"));
|
|
88
|
+
|
|
89
|
+
writeFileSync(join(refDir, "FOR_AI_REFERENCE.md"), emitAiReadme(url, refDir), "utf8");
|
|
90
|
+
written.push(join(refDir, "FOR_AI_REFERENCE.md"));
|
|
91
|
+
|
|
92
|
+
return written;
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
function emitAiReadme(url: string, refDir: string): string {
|
|
96
|
+
const base = refDir.replace(/\\/g, "/");
|
|
97
|
+
return [
|
|
98
|
+
`# Reference capture — ${url}`,
|
|
99
|
+
"",
|
|
100
|
+
"Use these files when rebuilding the page in React / Next.js:",
|
|
101
|
+
"",
|
|
102
|
+
"| File | Purpose |",
|
|
103
|
+
"| ---- | ------- |",
|
|
104
|
+
"| `page.html` | Full serialized DOM after JavaScript ran in Chromium. Layout, copy, and structure match what crawled (not necessarily valid static HTML elsewhere). |",
|
|
105
|
+
"| `visible-text.json` | Exact visible strings: headings, buttons, links, and block text — good for **verbatim copy** when rewriting `page.tsx`. |",
|
|
106
|
+
"| `media.json` | Every image / video / source URL from the DOM. Host your own assets or swap for placeholders; do not hotlink without permission. |",
|
|
107
|
+
"| `meta.json` | Title, description, lang. |",
|
|
108
|
+
"",
|
|
109
|
+
"**Workflow:** (1) Recon — skim \`page.html\` for landmarks and grids; (2) Wire — map \`visible-text.*\` + \`media.json\` into sections; (3) Build — prefer editing sibling \`../mirror/<host>/page.tsx\` (section order via \`data-mirror-section\`) instead of inventing layout from scratch. See the run folder’s **FOR_AI.md** for full authority order and compliance notes.",
|
|
110
|
+
"",
|
|
111
|
+
`Sibling folder \`../mirror/<host>/\` has a typed \`page.tsx\` with Framer Motion, Phosphor icons, and slots — wire copy from \`visible-text.json\` and media from \`media.json\` into that file.`,
|
|
112
|
+
"",
|
|
113
|
+
`Captured path: \`${base}\``,
|
|
114
|
+
"",
|
|
115
|
+
].join("\n");
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
/**
|
|
119
|
+
* Runs in browser context.
|
|
120
|
+
*/
|
|
121
|
+
function collectSnapshot(): Omit<ReferenceSnapshot, "url" | "capturedAt"> {
|
|
122
|
+
const title = document.title || null;
|
|
123
|
+
const descEl = document.querySelector('meta[name="description"]');
|
|
124
|
+
const description = descEl?.getAttribute("content")?.trim() || null;
|
|
125
|
+
const canonicalEl = document.querySelector('link[rel="canonical"]');
|
|
126
|
+
const canonical = canonicalEl?.getAttribute("href") || null;
|
|
127
|
+
const lang = document.documentElement.getAttribute("lang");
|
|
128
|
+
|
|
129
|
+
const visibleTextBlocks: Array<{ tag: string; role: string | null; text: string }> = [];
|
|
130
|
+
const pushBlock = (tag: string, el: HTMLElement, role: string | null) => {
|
|
131
|
+
const text = el.innerText?.trim().replace(/\s+/g, " ") ?? "";
|
|
132
|
+
if (text.length < 2 || text.length > 4000) return;
|
|
133
|
+
visibleTextBlocks.push({ tag, role, text });
|
|
134
|
+
};
|
|
135
|
+
|
|
136
|
+
for (const tag of ["H1", "H2", "H3", "H4", "H5", "H6"] as const) {
|
|
137
|
+
for (const el of Array.from(document.querySelectorAll(tag))) {
|
|
138
|
+
if (!(el instanceof HTMLElement)) continue;
|
|
139
|
+
const style = getComputedStyle(el);
|
|
140
|
+
if (style.visibility === "hidden" || style.display === "none") continue;
|
|
141
|
+
pushBlock(tag, el, el.getAttribute("role"));
|
|
142
|
+
}
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
for (const el of Array.from(document.querySelectorAll("p, li, blockquote, figcaption, label"))) {
|
|
146
|
+
if (!(el instanceof HTMLElement)) continue;
|
|
147
|
+
const style = getComputedStyle(el);
|
|
148
|
+
if (style.visibility === "hidden" || style.display === "none") continue;
|
|
149
|
+
pushBlock(el.tagName, el, el.getAttribute("role"));
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
for (const el of Array.from(document.querySelectorAll("button, [role='button']"))) {
|
|
153
|
+
if (!(el instanceof HTMLElement)) continue;
|
|
154
|
+
const style = getComputedStyle(el);
|
|
155
|
+
if (style.visibility === "hidden" || style.display === "none") continue;
|
|
156
|
+
pushBlock("BUTTON", el, el.getAttribute("role"));
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
for (const el of Array.from(document.querySelectorAll("span, div"))) {
|
|
160
|
+
if (!(el instanceof HTMLElement)) continue;
|
|
161
|
+
const role = el.getAttribute("role");
|
|
162
|
+
if (
|
|
163
|
+
role !== "heading" &&
|
|
164
|
+
!el.classList.contains("badge") &&
|
|
165
|
+
el.getAttribute("data-slot") === null
|
|
166
|
+
) {
|
|
167
|
+
// Only capture labeled small UI chrome (badges, pills) via short text + uppercase heuristic
|
|
168
|
+
const style = getComputedStyle(el);
|
|
169
|
+
if (style.visibility === "hidden" || style.display === "none") continue;
|
|
170
|
+
const text = el.innerText?.trim().replace(/\s+/g, " ") ?? "";
|
|
171
|
+
if (text.length < 8 || text.length > 240) continue;
|
|
172
|
+
if (!/^[A-Z0-9\s&.,:]+$/.test(text)) continue; // ALL-CAPS-ish eyebrow labels
|
|
173
|
+
pushBlock(el.tagName, el, role);
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
const links: Array<{ href: string; text: string }> = [];
|
|
178
|
+
for (const a of Array.from(document.querySelectorAll("a[href]"))) {
|
|
179
|
+
const href = a.getAttribute("href") ?? "";
|
|
180
|
+
if (!href || href.startsWith("javascript:")) continue;
|
|
181
|
+
const text = (a.textContent ?? "").trim().replace(/\s+/g, " ");
|
|
182
|
+
if (!text) continue;
|
|
183
|
+
try {
|
|
184
|
+
const abs = new URL(href, document.baseURI).href;
|
|
185
|
+
links.push({ href: abs, text: text.slice(0, 500) });
|
|
186
|
+
} catch {
|
|
187
|
+
links.push({ href, text: text.slice(0, 500) });
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
const media: ReferenceSnapshot["media"] = [];
|
|
192
|
+
for (const img of Array.from(document.querySelectorAll("img"))) {
|
|
193
|
+
const src = img.currentSrc || img.src;
|
|
194
|
+
if (!src) continue;
|
|
195
|
+
media.push({
|
|
196
|
+
type: "img",
|
|
197
|
+
src,
|
|
198
|
+
alt: img.alt || "",
|
|
199
|
+
width: img.naturalWidth || null,
|
|
200
|
+
height: img.naturalHeight || null,
|
|
201
|
+
});
|
|
202
|
+
}
|
|
203
|
+
for (const video of Array.from(document.querySelectorAll("video"))) {
|
|
204
|
+
const poster = video.getAttribute("poster");
|
|
205
|
+
let src: string | null = null;
|
|
206
|
+
if (video.currentSrc) src = video.currentSrc;
|
|
207
|
+
else {
|
|
208
|
+
const s = video.querySelector("source[src]");
|
|
209
|
+
src = s?.getAttribute("src") ?? null;
|
|
210
|
+
}
|
|
211
|
+
media.push({ type: "video", src, poster: poster || null });
|
|
212
|
+
}
|
|
213
|
+
for (const source of Array.from(document.querySelectorAll("source[src]"))) {
|
|
214
|
+
const src = source.getAttribute("src");
|
|
215
|
+
if (!src) continue;
|
|
216
|
+
media.push({
|
|
217
|
+
type: "source",
|
|
218
|
+
src,
|
|
219
|
+
kind: source.getAttribute("type"),
|
|
220
|
+
});
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
return {
|
|
224
|
+
title,
|
|
225
|
+
description,
|
|
226
|
+
canonical,
|
|
227
|
+
lang: lang || null,
|
|
228
|
+
visibleTextBlocks,
|
|
229
|
+
links,
|
|
230
|
+
media,
|
|
231
|
+
};
|
|
232
|
+
}
|