launchframe 0.1.8 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +33 -24
- package/package.json +2 -1
- package/packages/extract/emit.ts +6 -6
- package/packages/extract/extract.ts +5 -3
- package/packages/extract/reference-dump.ts +206 -7
package/README.md
CHANGED
|
@@ -54,13 +54,15 @@ output/<runId>/
|
|
|
54
54
|
├── run.json ← full run metadata (sources, timing, status)
|
|
55
55
|
├── screenshots/ ← captured PNGs
|
|
56
56
|
├── raw/ ← per-site raw token + SiteLayout JSON
|
|
57
|
-
├── reference/ ← verbatim DOM + copy for AI
|
|
57
|
+
├── reference/ ← verbatim DOM + **exact structure JSON** + copy for AI
|
|
58
58
|
│ └── <host>/
|
|
59
|
-
│ ├── page.html
|
|
60
|
-
│ ├──
|
|
61
|
-
│ ├──
|
|
62
|
-
│ ├──
|
|
63
|
-
│ ├──
|
|
59
|
+
│ ├── page.html
|
|
60
|
+
│ ├── dom-structure.json ← canonical body tree (tags, attrs, text nodes)
|
|
61
|
+
│ ├── structure-outline.txt ← tag skeleton for quick scanning
|
|
62
|
+
│ ├── visible-text.txt
|
|
63
|
+
│ ├── visible-text.json
|
|
64
|
+
│ ├── media.json
|
|
65
|
+
│ ├── meta.json
|
|
64
66
|
│ └── FOR_AI_REFERENCE.md
|
|
65
67
|
└── mirror/
|
|
66
68
|
└── <host>/
|
|
@@ -74,15 +76,15 @@ output/<runId>/
|
|
|
74
76
|
## Hand the output to your AI
|
|
75
77
|
|
|
76
78
|
1. Run the command above so `output/<runId>/` exists.
|
|
77
|
-
2. Attach **`reference/<host
|
|
79
|
+
2. Attach **`reference/<host>/`**, especially **`dom-structure.json`** (exact tree) and **`visible-text.*`**, plus **`page.html`** and **`media.json`** so the model sees **exact structure and copy** from the crawl.
|
|
78
80
|
3. Pick the mirror folder: `output/<runId>/mirror/<host>/`.
|
|
79
81
|
4. Either:
|
|
80
82
|
- **Cursor:** `@`-attach `reference/<host>/`, `mirror/<host>/`, `FOR_AI.md`, and
|
|
81
83
|
`tokens.json`, then ask the agent to port copy from `visible-text.txt` into
|
|
82
84
|
`page.tsx` and wire media from `media.json`.
|
|
83
85
|
- **Claude Code:** copy both folders into your project, then ask the same.
|
|
84
|
-
5. The AI's authority order is
|
|
85
|
-
MIRROR_NOTES.md → page.tsx → tokens.json → tailwind.config.ts + globals.css**. It must:
|
|
86
|
+
5. The AI's authority order is **`dom-structure.json` (nesting) → `structure-outline.txt` / `page.html` → `visible-text.*` →
|
|
87
|
+
MIRROR_NOTES.md → mirror `page.tsx` → tokens.json → tailwind.config.ts + globals.css**. It must:
|
|
86
88
|
- Keep the section tree, grid composition, density, Motion, and Phosphor usage in `page.tsx`.
|
|
87
89
|
- Map strings from `visible-text.txt` into the right `<TextSlot>` slots (or replace slots with plain JSX).
|
|
88
90
|
- Use `media.json` for image/video `src` / `poster` (respect licensing; prefer your own assets).
|
|
@@ -191,22 +193,30 @@ npm run analyze # Run section classifier on captured screenshots
|
|
|
191
193
|
npm run formalize # Validate the pattern-atlas/*.json files
|
|
192
194
|
npm run evaluate # Grade a generated page (coherence + a11y)
|
|
193
195
|
npm run typecheck # Project-wide TypeScript check
|
|
196
|
+
npm run sync:agents # Regenerate Copilot / Cline / Continue / Amazon Q stubs from AGENTS.md
|
|
194
197
|
```
|
|
195
198
|
|
|
199
|
+
### AI agents in this repo
|
|
200
|
+
|
|
201
|
+
- **`AGENTS.md`** (root) is the **single source of truth** for how agents should work here (extract handoff, structure fidelity, compliance). **`docs/research/INSPECTION_GUIDE.md`** is inlined into derived configs when you run `npm run sync:agents`.
|
|
202
|
+
- **Cursor:** `.cursor/rules/project.mdc` points at `AGENTS.md`.
|
|
203
|
+
- **Claude Code / Gemini CLI:** `CLAUDE.md` and `GEMINI.md` import `AGENTS.md`.
|
|
204
|
+
- Edit `AGENTS.md`, then run `npm run sync:agents` (or `bash scripts/sync-agent-rules.sh`) to refresh `.github/copilot-instructions.md`, `.clinerules`, `.continue/rules/project.md`, and `.amazonq/rules/project.md`.
|
|
205
|
+
|
|
196
206
|
---
|
|
197
207
|
|
|
198
208
|
## What this is not
|
|
199
209
|
|
|
200
|
-
- **Not
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
+
- **Not the source site's frontend bundle.** The **layout mirror** is React
|
|
211
|
+
code emitted from a typed `SiteLayout` (section tree, composition,
|
|
212
|
+
density) — not a dump of the origin's original components or stylesheets.
|
|
213
|
+
- **Not a substitute for legal clearance.** `reference/<host>/` may contain
|
|
214
|
+
serialized DOM and visible text for tooling **you** run on pages you are
|
|
215
|
+
allowed to analyze. You are responsible for trademarks, copy licenses, and
|
|
216
|
+
`robots.txt`/ToS compliance when using those artifacts.
|
|
217
|
+
- **Not a component library replacement.** Launchframe sits *alongside*
|
|
218
|
+
shadcn/ui: theme files, reference dumps, and slot-driven mirror pages —
|
|
219
|
+
you integrate into your own app.
|
|
210
220
|
|
|
211
221
|
---
|
|
212
222
|
|
|
@@ -216,11 +226,10 @@ Launchframe is intended for layout research and design-system seeding
|
|
|
216
226
|
against pages you have permission to analyze (your own products, sites
|
|
217
227
|
where the operator has permission, or pages where structural analysis is
|
|
218
228
|
permitted by `robots.txt`). The crawler respects `robots.txt` by default
|
|
219
|
-
and rate-limits per domain.
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
trademarks, terms of service, and licenses.
|
|
229
|
+
and rate-limits per domain. Output includes synthesized theme files, a
|
|
230
|
+
typed **mirror** page scaffold, and (per capture) a **reference** bundle
|
|
231
|
+
(DOM snapshot, visible text, media URLs) for AI-assisted reconstruction.
|
|
232
|
+
Operators remain responsible for how they use copy, media, and branding.
|
|
224
233
|
|
|
225
234
|
---
|
|
226
235
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "launchframe",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.9",
|
|
4
4
|
"description": "Point Launchframe at SaaS sites you admire and get back a drop-in shadcn/ui design system (tokens, Tailwind theme, CSS variables, AI handoff) you can build your own UI on top of.",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"author": "Evan Gruhlkey",
|
|
@@ -46,6 +46,7 @@
|
|
|
46
46
|
"analyze": "tsx packages/analysis/analyze-screenshot.ts",
|
|
47
47
|
"formalize": "tsx packages/patterns/pattern-registry.ts",
|
|
48
48
|
"evaluate": "tsx packages/evaluation/evaluate-page.ts",
|
|
49
|
+
"sync:agents": "node scripts/sync-agent-rules.mjs",
|
|
49
50
|
"typecheck": "tsc -p tsconfig.json --noEmit",
|
|
50
51
|
"format:check": "prettier --check ."
|
|
51
52
|
},
|
package/packages/extract/emit.ts
CHANGED
|
@@ -304,7 +304,7 @@ function emitReport(system: DesignSystem, run: ExtractionRun): string {
|
|
|
304
304
|
const lines = [`- ${c.url}`, ` - screenshot: \`${relativize(c.screenshotPath, run.outputDir)}\``];
|
|
305
305
|
if (c.referenceDir) {
|
|
306
306
|
lines.push(
|
|
307
|
-
` - reference: \`${relativize(c.referenceDir, run.outputDir)}/\` — \`page.html\`, \`visible-text.*\`, \`media.json\`, \`FOR_AI_REFERENCE.md\``,
|
|
307
|
+
` - reference: \`${relativize(c.referenceDir, run.outputDir)}/\` — \`page.html\`, \`dom-structure.json\`, \`structure-outline.txt\`, \`visible-text.*\`, \`media.json\`, \`FOR_AI_REFERENCE.md\``,
|
|
308
308
|
);
|
|
309
309
|
}
|
|
310
310
|
if (c.mirrorDir) {
|
|
@@ -403,7 +403,7 @@ ${rampRows(system.dark)}
|
|
|
403
403
|
shadows). It does not, by itself, define every layout detail of a source page.
|
|
404
404
|
- **Per-site \`reference/\` and \`mirror/\` folders** (when emitted) are the
|
|
405
405
|
recon inputs for **structure and copy**: map landmarks and section order from
|
|
406
|
-
\`page.html\`, pull strings from \`visible-text.json\` or \`visible-text.txt\`,
|
|
406
|
+
\`dom-structure.json\` (canonical nesting) + \`page.html\`, pull strings from \`visible-text.json\` or \`visible-text.txt\`,
|
|
407
407
|
align media via \`media.json\`, and implement or refine UI starting from
|
|
408
408
|
\`mirror/<host>/page.tsx\` (\`data-mirror-section\` markers match the crawl).
|
|
409
409
|
- **Compliance:** Do not impersonate another company, ship their trademarks or
|
|
@@ -429,7 +429,7 @@ function emitForAi(system: DesignSystem, run: ExtractionRun): string {
|
|
|
429
429
|
.map((c) => {
|
|
430
430
|
const lines = [`### ${c.url}`];
|
|
431
431
|
if (c.referenceDir) {
|
|
432
|
-
lines.push(`- **Reference:** \`${relativize(c.referenceDir, run.outputDir)}/\` — start with \`FOR_AI_REFERENCE.md\`, then \`page.html\`, \`visible-text.txt\` (or \`.json\`), \`media.json\`.`);
|
|
432
|
+
lines.push(`- **Reference:** \`${relativize(c.referenceDir, run.outputDir)}/\` — start with \`FOR_AI_REFERENCE.md\`, then \`dom-structure.json\` (exact tree), \`structure-outline.txt\`, \`page.html\`, \`visible-text.txt\` (or \`.json\`), \`media.json\`.`);
|
|
433
433
|
}
|
|
434
434
|
if (c.mirrorDir) {
|
|
435
435
|
lines.push(`- **Mirror:** \`${relativize(c.mirrorDir, run.outputDir)}/page.tsx\` — section scaffold + \`data-mirror-section\`; read \`MIRROR_NOTES.md\`.`);
|
|
@@ -448,7 +448,7 @@ ${perHost || "_No reference/mirror paths on this run — token-only._"}
|
|
|
448
448
|
|
|
449
449
|
Workflow (similar in spirit to **recon → specs → build** pipelines):
|
|
450
450
|
|
|
451
|
-
1. **Recon:**
|
|
451
|
+
1. **Recon:** Use \`dom-structure.json\` for **exact nesting and sibling order**; use \`structure-outline.txt\` or \`page.html\` for skimming. Cross-check with \`mirror/.../page.tsx\` (\`data-mirror-section\`).
|
|
452
452
|
2. **Wire copy + media:** Map headings, buttons, and blocks from \`visible-text.*\` into the matching mirror sections (or your components). Use \`media.json\` for asset URLs; replace with licensed or original assets when shipping.
|
|
453
453
|
3. **Build:** Prefer editing **mirror \`page.tsx\`** inside the user's app (or port its structure into their file tree) rather than inventing a new section order from scratch. Apply **REPORT.md** / **tokens** for colors, type, spacing, radii — mirror CSS variables under \`.mirror-root\` should converge to the same semantic palette where possible.
|
|
454
454
|
|
|
@@ -474,7 +474,7 @@ In **Cursor**, \`@\` those paths explicitly.
|
|
|
474
474
|
|
|
475
475
|
## Authority order
|
|
476
476
|
|
|
477
|
-
1. **Structural fidelity:** \`reference/<host>/page.html\` + \`visible-text.*\` + \`mirror/<host>/page.tsx\` —
|
|
477
|
+
1. **Structural fidelity:** \`reference/<host>/dom-structure.json\` + \`page.html\` + \`visible-text.*\` + \`mirror/<host>/page.tsx\` — exact DOM tree shape, then copy and typed scaffold.
|
|
478
478
|
2. **Design tokens:** **REPORT.md** and **tokens.json** — typography scale, spacing, radii, colors, container width, notes.
|
|
479
479
|
3. **Integration:** **tailwind.config.ts** + **globals.css** — merge into a Next.js + Tailwind + shadcn-style app.
|
|
480
480
|
${structureSection}
|
|
@@ -485,7 +485,7 @@ You must use the attached \`output/${system.runId}/\` folder.
|
|
|
485
485
|
|
|
486
486
|
- Read REPORT.md and tokens.json before writing UI. Merge tailwind.config.ts and globals.css into my project (preserve my paths unless I say otherwise).
|
|
487
487
|
- Style with semantic tokens: bg-background, text-foreground, text-muted-foreground, border-border, bg-primary, text-primary-foreground, bg-card, text-card-foreground, etc. Prefer these over ad-hoc hex; mirror pages may use --mirror-* variables until merged.
|
|
488
|
-
- If reference/ and mirror/ exist for my source URL: treat them as mandatory context. Preserve
|
|
488
|
+
- If reference/ and mirror/ exist for my source URL: treat them as mandatory context. Preserve **exact DOM nesting and sibling order** from `dom-structure.json` (and cross-check `page.html`). Align components to `data-mirror-section` and the mirror scaffold. Wire copy from visible-text.* and media from media.json unless I say to rewrite for a different product.
|
|
489
489
|
- If I am building a NEW product unrelated to the crawl: keep layout inspiration from mirror/reference but REPLACE product names, claims, and sensitive copy with my copy. Never impersonate another brand.
|
|
490
490
|
|
|
491
491
|
My product / intent: [describe goal — faithful mirror of URL vs new product in same layout; tone and CTA]
|
|
@@ -20,7 +20,8 @@
|
|
|
20
20
|
* - Per-domain rate limit defaults to 15 req/min (`--rate <n>`).
|
|
21
21
|
* - The crawler extracts a structured representation (section tree,
|
|
22
22
|
* computed style tokens, content kinds) and writes a verbatim
|
|
23
|
-
* `reference/<host>/` bundle (HTML
|
|
23
|
+
* `reference/<host>/` bundle (HTML, DOM tree JSON, outlines, visible text,
|
|
24
|
+
* media index) for AI structure cloning.
|
|
24
25
|
*/
|
|
25
26
|
|
|
26
27
|
import { mkdirSync, writeFileSync } from "node:fs";
|
|
@@ -104,7 +105,8 @@ function printHelp(): void {
|
|
|
104
105
|
" 2. Captures a full-page screenshot and harvests computed design tokens",
|
|
105
106
|
" (colors, type, spacing, radius, shadow) → raw/<host>.tokens.json.",
|
|
106
107
|
" 3. Writes a verbatim reference bundle → reference/<host>/ (page.html,",
|
|
107
|
-
"
|
|
108
|
+
" dom-structure.json, structure-outline.txt, visible-text.json/.txt,",
|
|
109
|
+
" media.json, meta.json, FOR_AI_REFERENCE.md).",
|
|
108
110
|
" 4. Crawls the DOM into SiteLayout → raw/<host>.layout.json and emits",
|
|
109
111
|
" mirror/<host>/page.tsx (Framer Motion + Phosphor + image/video slots).",
|
|
110
112
|
"",
|
|
@@ -237,7 +239,7 @@ async function captureOne(
|
|
|
237
239
|
|
|
238
240
|
let referenceWritten: string[] = [];
|
|
239
241
|
try {
|
|
240
|
-
referenceWritten = await emitPageReference(page, url, referenceDir);
|
|
242
|
+
referenceWritten = await emitPageReference(page, url, referenceDir, viewport);
|
|
241
243
|
} catch (err) {
|
|
242
244
|
console.warn(` ! reference dump failed for ${url}: ${(err as Error).message}`);
|
|
243
245
|
}
|
|
@@ -2,11 +2,13 @@
|
|
|
2
2
|
* Verbatim reference dump for AI / human review.
|
|
3
3
|
*
|
|
4
4
|
* Writes everything under `output/<runId>/reference/<host>/`:
|
|
5
|
-
* - page.html
|
|
6
|
-
* -
|
|
7
|
-
* -
|
|
8
|
-
* -
|
|
9
|
-
* -
|
|
5
|
+
* - page.html — full document HTML after JS render (`page.content()`)
|
|
6
|
+
* - dom-structure.json — exact `body` subtree: tag order, attributes, text nodes (JSON)
|
|
7
|
+
* - structure-outline.txt — indented tag skeleton (ids/classes/roles) for quick scanning
|
|
8
|
+
* - visible-text.json — structured visible copy (headings, buttons, key blocks)
|
|
9
|
+
* - media.json — img / video / source URLs and attributes
|
|
10
|
+
* - meta.json — title, description, canonical, lang
|
|
11
|
+
* - FOR_AI_REFERENCE.md — how to use these files with an AI
|
|
10
12
|
*/
|
|
11
13
|
|
|
12
14
|
import { mkdirSync, writeFileSync } from "node:fs";
|
|
@@ -14,6 +16,33 @@ import { join } from "node:path";
|
|
|
14
16
|
|
|
15
17
|
import type { Page } from "playwright";
|
|
16
18
|
|
|
19
|
+
/** Element node in the serialized `body` tree; `attrs` and `children` are left out when empty. */
|
|
20
|
+
export interface DomStructureElement {
|
|
21
|
+
tag: string; // lower-cased tag name of the element
|
|
22
|
+
attrs?: Record<string, string>; // attribute name -> value; over-long values are cut and end with "…"
|
|
23
|
+
children?: DomStructureChild[]; // element and text children, in DOM order
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
export interface DomStructureTextNode { // text node entry; whitespace-only text is never emitted
|
|
27
|
+
type: "text"; // literal tag that tells text entries apart from elements (which have no `type`)
|
|
28
|
+
value: string; // whitespace-collapsed, trimmed text; over-long text is cut and ends with "…"
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
export type DomStructureChild = DomStructureElement | DomStructureTextNode; // one entry in `children`
|
|
32
|
+
|
|
33
|
+
export interface DomStructurePayload { // result of serializing `document.body` in the page
|
|
34
|
+
title: string | null; // document title, or null when it is empty
|
|
35
|
+
lang: string | null; // `lang` attribute of the root element, or null when absent
|
|
36
|
+
stats: {
|
|
37
|
+
elementNodes: number; // number of elements that made it into the tree
|
|
38
|
+
omitted: { script: number; style: number; noscript: number; template: number }; // skipped tags, counted only
|
|
39
|
+
truncated: boolean; // set when the node-count limit cut the walk short
|
|
40
|
+
maxNodes: number; // element limit that was in force
|
|
41
|
+
maxDepth: number; // nesting limit that was in force
|
|
42
|
+
};
|
|
43
|
+
root: DomStructureElement | null; // serialized `body`, or null when the document has no body
|
|
44
|
+
}
|
|
45
|
+
|
|
17
46
|
export interface ReferenceSnapshot {
|
|
18
47
|
url: string;
|
|
19
48
|
capturedAt: string;
|
|
@@ -35,7 +64,12 @@ export interface ReferenceSnapshot {
|
|
|
35
64
|
>;
|
|
36
65
|
}
|
|
37
66
|
|
|
38
|
-
export async function emitPageReference(
|
|
67
|
+
export async function emitPageReference(
|
|
68
|
+
page: Page,
|
|
69
|
+
url: string,
|
|
70
|
+
refDir: string,
|
|
71
|
+
viewport?: { width: number; height: number },
|
|
72
|
+
): Promise<string[]> {
|
|
39
73
|
mkdirSync(refDir, { recursive: true });
|
|
40
74
|
const written: string[] = [];
|
|
41
75
|
const capturedAt = new Date().toISOString();
|
|
@@ -50,6 +84,21 @@ export async function emitPageReference(page: Page, url: string, refDir: string)
|
|
|
50
84
|
writeFileSync(htmlPath, html, "utf8");
|
|
51
85
|
written.push(htmlPath);
|
|
52
86
|
|
|
87
|
+
const domEval = (await page.evaluate(collectDomStructureInPage)) as DomStructurePayload;
|
|
88
|
+
const domMerged = {
|
|
89
|
+
url,
|
|
90
|
+
capturedAt,
|
|
91
|
+
viewport: viewport ?? null,
|
|
92
|
+
...domEval,
|
|
93
|
+
};
|
|
94
|
+
const domPath = join(refDir, "dom-structure.json");
|
|
95
|
+
writeFileSync(domPath, JSON.stringify(domMerged, null, 2) + "\n", "utf8");
|
|
96
|
+
written.push(domPath);
|
|
97
|
+
|
|
98
|
+
const outlinePath = join(refDir, "structure-outline.txt");
|
|
99
|
+
writeFileSync(outlinePath, buildStructureOutline(domMerged.root, domMerged.stats) + "\n", "utf8");
|
|
100
|
+
written.push(outlinePath);
|
|
101
|
+
|
|
53
102
|
const snapshot = (await page.evaluate(collectSnapshot)) as Omit<ReferenceSnapshot, "url" | "capturedAt">;
|
|
54
103
|
const full: ReferenceSnapshot = {
|
|
55
104
|
url,
|
|
@@ -102,11 +151,13 @@ function emitAiReadme(url: string, refDir: string): string {
|
|
|
102
151
|
"| File | Purpose |",
|
|
103
152
|
"| ---- | ------- |",
|
|
104
153
|
"| `page.html` | Full serialized DOM after JavaScript ran in Chromium. Layout, copy, and structure match what crawled (not necessarily valid static HTML elsewhere). |",
|
|
154
|
+
"| `dom-structure.json` | **Exact body subtree (JSON):** same child order as the live DOM; every attribute on each element (long `src` / `href` / `style` values truncated); inline text as separate typed entries. Use this as the canonical structure map for React. |",
|
|
155
|
+
"| `structure-outline.txt` | Indented tag skeleton (ids/classes/roles only; no text) for quick navigation. |",
|
|
105
156
|
"| `visible-text.json` | Exact visible strings: headings, buttons, links, and block text — good for **verbatim copy** when rewriting `page.tsx`. |",
|
|
106
157
|
"| `media.json` | Every image / video / source URL from the DOM. Host your own assets or swap for placeholders; do not hotlink without permission. |",
|
|
107
158
|
"| `meta.json` | Title, description, lang. |",
|
|
108
159
|
"",
|
|
109
|
-
"**Workflow:** (1) Recon —
|
|
160
|
+
"**Workflow:** (1) Recon — use **`dom-structure.json`** (or `structure-outline.txt` + `page.html`) for **exact tag order and nesting**; (2) Wire — map `visible-text.*` + `media.json` into that tree; (3) Build — implement `page.tsx` (start from `../mirror/<host>/page.tsx` if present) so component boundaries follow this structure. See the run folder’s **FOR_AI.md** for full authority order and compliance notes.",
|
|
110
161
|
"",
|
|
111
162
|
`Sibling folder \`../mirror/<host>/\` has a typed \`page.tsx\` with Framer Motion, Phosphor icons, and slots — wire copy from \`visible-text.json\` and media from \`media.json\` into that file.`,
|
|
112
163
|
"",
|
|
@@ -115,6 +166,154 @@ function emitAiReadme(url: string, refDir: string): string {
|
|
|
115
166
|
].join("\n");
|
|
116
167
|
}
|
|
117
168
|
|
|
169
|
+
/**
 * Renders the serialized element tree as an indented, text-free tag skeleton
 * (`structure-outline.txt`).
 *
 * Each element becomes one line: `tag`, then `#id` (whitespace removed), up to
 * six `.class` tokens, and `[role=…]` when present. Text nodes are skipped.
 *
 * @param root     Serialized `body` element, or `null` when nothing was captured.
 * @param stats    Capture statistics echoed into the header comment.
 * @param maxLines Upper bound on emitted element lines (defaults to 15 000).
 * @returns The outline; a trailing "… outline truncated …" marker is appended
 *          only when at least one element was actually left out.
 */
function buildStructureOutline(
  root: DomStructureElement | null,
  stats: DomStructurePayload["stats"],
  maxLines = 15_000,
): string {
  const lines: string[] = [
    "# structure-outline.txt — tag skeleton under <body>",
    "# Text nodes omitted here; see dom-structure.json for interleaved copy.",
    `# elementNodes=${stats.elementNodes} truncated=${stats.truncated} omitted=${JSON.stringify(stats.omitted)}`,
    "",
  ];
  let emitted = 0;
  // Set only when an element is refused, so an outline that lands exactly on
  // the limit is not reported as truncated.
  let cut = false;

  function walkEl(node: DomStructureElement, depth: number): void {
    if (emitted >= maxLines) {
      cut = true;
      return;
    }
    const a = node.attrs ?? {};
    let suffix = "";
    if (a.id) suffix += `#${String(a.id).replace(/\s+/g, "")}`;
    if (a.class) {
      suffix += String(a.class)
        .split(/\s+/)
        .filter(Boolean)
        .slice(0, 6)
        .map((c) => `.${c}`)
        .join("");
    }
    if (a.role) suffix += `[role=${a.role}]`;
    lines.push(`${" ".repeat(depth)}${node.tag}${suffix}`);
    emitted++;

    for (const child of node.children ?? []) {
      // Text entries carry a `type` tag; elements do not.
      if ("type" in child && child.type === "text") continue;
      if (cut) return;
      walkEl(child, depth + 1);
    }
  }

  if (root) walkEl(root, 0);
  if (cut) lines.push("\n… outline truncated …");
  return lines.join("\n");
}
|
|
204
|
+
|
|
205
|
+
/**
 * Serializes `document.body` into a JSON-friendly tree that keeps the live
 * DOM's child order.
 *
 * Runs in the browser context: the function is handed to `page.evaluate`, so
 * every constant and helper it needs is declared inside its own body.
 *
 * - `script`, `style`, `noscript` and `template` elements are skipped and only
 *   counted in `stats.omitted`.
 * - Text nodes are whitespace-collapsed and trimmed; empty ones are dropped.
 * - Long attribute values and long text are cut and suffixed with "…".
 * - `stats.truncated` is set whenever part of the tree was left out, either
 *   because the element budget ran out or because a subtree was nested deeper
 *   than the depth limit.
 *
 * @returns The payload; `root` is `null` when the document has no body or the
 *          body itself could not be serialized.
 */
function collectDomStructureInPage(): DomStructurePayload {
  const MAX_NODES = 40_000;
  const MAX_DEPTH = 128;
  const MAX_TEXT = 12_000;

  let nodeCount = 0;
  let truncated = false;
  const omitted = { script: 0, style: 0, noscript: 0, template: 0 };

  /** True for tags that are counted but never serialized. */
  function isOmittedTag(tag: string): tag is keyof typeof omitted {
    return tag === "script" || tag === "style" || tag === "noscript" || tag === "template";
  }

  /** Caps an attribute value; URL-, style- and data-like attributes get a larger budget. */
  function trimAttr(name: string, v: string): string {
    const big =
      name === "src" ||
      name === "href" ||
      name === "srcset" ||
      name === "style" ||
      name === "content" ||
      name.startsWith("data-");
    const max = big ? 3500 : 2000;
    if (v.length <= max) return v;
    return v.slice(0, max) + "…";
  }

  /** Serializes one element and its descendants; returns null when the element is skipped. */
  function walk(el: Element, depth: number): DomStructureElement | null {
    if (nodeCount >= MAX_NODES) {
      truncated = true;
      return null;
    }
    const tag = el.tagName.toLowerCase();
    if (isOmittedTag(tag)) {
      omitted[tag]++;
      return null;
    }
    if (depth > MAX_DEPTH) {
      // The subtree is dropped, so the payload must say it is incomplete.
      truncated = true;
      return null;
    }

    nodeCount++;
    const attrs: Record<string, string> = {};
    for (let i = 0; i < el.attributes.length; i++) {
      const a = el.attributes.item(i);
      if (!a) continue;
      attrs[a.name] = trimAttr(a.name, a.value);
    }

    const children: DomStructureChild[] = [];
    for (const child of el.childNodes) {
      if (nodeCount >= MAX_NODES) {
        truncated = true;
        break;
      }
      if (child.nodeType === 3) {
        // Text node: normalize whitespace and drop it when nothing is left.
        let t = (child.textContent ?? "").replace(/\s+/g, " ").trim();
        if (!t) continue;
        if (t.length > MAX_TEXT) t = t.slice(0, MAX_TEXT) + "…";
        children.push({ type: "text", value: t });
      } else if (child.nodeType === 1) {
        const sub = walk(child as Element, depth + 1);
        if (sub) children.push(sub);
      }
    }

    // Empty `attrs` / `children` are left off to keep the JSON compact.
    const out: DomStructureElement = { tag };
    if (Object.keys(attrs).length > 0) out.attrs = attrs;
    if (children.length > 0) out.children = children;
    return out;
  }

  const body = document.body;
  const root = body ? walk(body, 0) : null;
  return {
    title: document.title || null,
    lang: document.documentElement.getAttribute("lang"),
    stats: {
      elementNodes: nodeCount,
      omitted,
      truncated,
      maxNodes: MAX_NODES,
      maxDepth: MAX_DEPTH,
    },
    root,
  };
}
|
|
316
|
+
|
|
118
317
|
/**
|
|
119
318
|
* Runs in browser context.
|
|
120
319
|
*/
|