launchframe 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -54,13 +54,15 @@ output/<runId>/
54
54
  ├── run.json ← full run metadata (sources, timing, status)
55
55
  ├── screenshots/ ← captured PNGs
56
56
  ├── raw/ ← per-site raw token + SiteLayout JSON
57
- ├── reference/ ← verbatim DOM + copy for AI (see below)
57
+ ├── reference/ ← verbatim DOM + **exact structure JSON** + copy for AI
58
58
  │ └── <host>/
59
- │ ├── page.html ← full HTML after JavaScript
60
- │ ├── visible-text.txt ← paste-friendly copy extraction
61
- │ ├── visible-text.json ← structured headings / body / buttons
62
- │ ├── media.json ← img + video URLs
63
- │ ├── meta.json ← title, description, lang
59
+ │ ├── page.html
60
+ │ ├── dom-structure.json ← canonical body tree (tags, attrs, text nodes)
61
+ │ ├── structure-outline.txt ← tag skeleton for quick scanning
62
+ │ ├── visible-text.txt
63
+ │ ├── visible-text.json
64
+ │ ├── media.json
65
+ │ ├── meta.json
64
66
  │ └── FOR_AI_REFERENCE.md
65
67
  └── mirror/
66
68
  └── <host>/
@@ -74,15 +76,15 @@ output/<runId>/
74
76
  ## Hand the output to your AI
75
77
 
76
78
  1. Run the command above so `output/<runId>/` exists.
77
- 2. Attach **`reference/<host>/`** (`visible-text.txt`, `page.html`, `media.json`) so the model sees **exact copy and structure** from the crawl.
79
+ 2. Attach **`reference/<host>/`**, especially **`dom-structure.json`** (exact tree) and **`visible-text.*`**, plus **`page.html`** and **`media.json`** so the model sees **exact structure and copy** from the crawl.
78
80
  3. Pick the mirror folder: `output/<runId>/mirror/<host>/`.
79
81
  4. Either:
80
82
  - **Cursor:** `@`-attach `reference/<host>/`, `mirror/<host>/`, `FOR_AI.md`, and
81
83
  `tokens.json`, then ask the agent to port copy from `visible-text.txt` into
82
84
  `page.tsx` and wire media from `media.json`.
83
85
  - **Claude Code:** copy both folders into your project, then ask the same.
84
- 5. The AI's authority order is **reference/visible-text.txt & page.html →
85
- MIRROR_NOTES.md → page.tsx → tokens.json → tailwind.config.ts + globals.css**. It must:
86
+ 5. The AI's authority order is **`dom-structure.json` (nesting) → `structure-outline.txt` / `page.html` → `visible-text.*` →
87
+ MIRROR_NOTES.md → mirror `page.tsx` → tokens.json → tailwind.config.ts + globals.css**. It must:
86
88
  - Keep the section tree, grid composition, density, Motion, and Phosphor usage in `page.tsx`.
87
89
  - Map strings from `visible-text.txt` into the right `<TextSlot>` slots (or replace slots with plain JSX).
88
90
  - Use `media.json` for image/video `src` / `poster` (respect licensing; prefer your own assets).
@@ -191,22 +193,30 @@ npm run analyze # Run section classifier on captured screenshots
191
193
  npm run formalize # Validate the pattern-atlas/*.json files
192
194
  npm run evaluate # Grade a generated page (coherence + a11y)
193
195
  npm run typecheck # Project-wide TypeScript check
196
+ npm run sync:agents # Regenerate Copilot / Cline / Continue / Amazon Q stubs from AGENTS.md
194
197
  ```
195
198
 
199
+ ### AI agents in this repo
200
+
201
+ - **`AGENTS.md`** (root) is the **single source of truth** for how agents should work here (extract handoff, structure fidelity, compliance). **`docs/research/INSPECTION_GUIDE.md`** is inlined into derived configs when you run `npm run sync:agents`.
202
+ - **Cursor:** `.cursor/rules/project.mdc` points at `AGENTS.md`.
203
+ - **Claude Code / Gemini CLI:** `CLAUDE.md` and `GEMINI.md` import `AGENTS.md`.
204
+ - Edit `AGENTS.md`, then run `npm run sync:agents` (or `bash scripts/sync-agent-rules.sh`) to refresh `.github/copilot-instructions.md`, `.clinerules`, `.continue/rules/project.md`, and `.amazonq/rules/project.md`.
205
+
196
206
  ---
197
207
 
198
208
  ## What this is not
199
209
 
200
- - **Not a verbatim site downloader.** The crawler builds a typed
201
- `SiteLayout` model from the rendered DOM — section tree, geometry,
202
- computed style tokens, content kinds — and emits code generated from
203
- that model. It does not save the source's HTML/CSS to disk.
204
- - **Not a content lift.** Heading text, body copy, logos, illustrations,
205
- and product imagery become `<TextSlot>` / `<MediaSlot>` placeholders in
206
- the mirror page. You fill them with your own copy and assets before
207
- shipping.
208
- - **Not a component library replacement.** It sits *on top* of
209
- shadcn/ui and produces theme files plus slot-driven page templates.
210
+ - **Not the source site's frontend bundle.** The **layout mirror** is React
211
+ code emitted from a typed `SiteLayout` (section tree, composition,
212
+ density) — not a dump of the origin's original components or stylesheets.
213
+ - **Not a substitute for legal clearance.** `reference/<host>/` may contain
214
+ serialized DOM and visible text for tooling **you** run on pages you are
215
+ allowed to analyze. You are responsible for trademarks, copy licenses, and
216
+ `robots.txt`/ToS compliance when using those artifacts.
217
+ - **Not a component library replacement.** Launchframe sits *alongside*
218
+ shadcn/ui: theme files, reference dumps, and slot-driven mirror pages
219
+ you integrate into your own app.
210
220
 
211
221
  ---
212
222
 
@@ -216,11 +226,10 @@ Launchframe is intended for layout research and design-system seeding
216
226
  against pages you have permission to analyze (your own products, sites
217
227
  where the operator has permission, or pages where structural analysis is
218
228
  permitted by `robots.txt`). The crawler respects `robots.txt` by default
219
- and rate-limits per domain. The output is generated code derived from a
220
- normalized typed model and slot placeholders — not a verbatim copy of
221
- the source's markup, copy, or assets. Operators are responsible for the
222
- content they paste into those slots and for honoring third-party
223
- trademarks, terms of service, and licenses.
229
+ and rate-limits per domain. Output includes synthesized theme files, a
230
+ typed **mirror** page scaffold, and (per capture) a **reference** bundle
231
+ (DOM snapshot, visible text, media URLs) for AI-assisted reconstruction.
232
+ Operators remain responsible for how they use copy, media, and branding.
224
233
 
225
234
  ---
226
235
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "launchframe",
3
- "version": "0.1.8",
3
+ "version": "0.1.9",
4
4
  "description": "Point Launchframe at SaaS sites you admire and get back a drop-in shadcn/ui design system (tokens, Tailwind theme, CSS variables, AI handoff) you can build your own UI on top of.",
5
5
  "license": "MIT",
6
6
  "author": "Evan Gruhlkey",
@@ -46,6 +46,7 @@
46
46
  "analyze": "tsx packages/analysis/analyze-screenshot.ts",
47
47
  "formalize": "tsx packages/patterns/pattern-registry.ts",
48
48
  "evaluate": "tsx packages/evaluation/evaluate-page.ts",
49
+ "sync:agents": "node scripts/sync-agent-rules.mjs",
49
50
  "typecheck": "tsc -p tsconfig.json --noEmit",
50
51
  "format:check": "prettier --check ."
51
52
  },
@@ -304,7 +304,7 @@ function emitReport(system: DesignSystem, run: ExtractionRun): string {
304
304
  const lines = [`- ${c.url}`, ` - screenshot: \`${relativize(c.screenshotPath, run.outputDir)}\``];
305
305
  if (c.referenceDir) {
306
306
  lines.push(
307
- ` - reference: \`${relativize(c.referenceDir, run.outputDir)}/\` — \`page.html\`, \`visible-text.*\`, \`media.json\`, \`FOR_AI_REFERENCE.md\``,
307
+ ` - reference: \`${relativize(c.referenceDir, run.outputDir)}/\` — \`page.html\`, \`dom-structure.json\`, \`structure-outline.txt\`, \`visible-text.*\`, \`media.json\`, \`FOR_AI_REFERENCE.md\``,
308
308
  );
309
309
  }
310
310
  if (c.mirrorDir) {
@@ -403,7 +403,7 @@ ${rampRows(system.dark)}
403
403
  shadows). It does not, by itself, define every layout detail of a source page.
404
404
  - **Per-site \`reference/\` and \`mirror/\` folders** (when emitted) are the
405
405
  recon inputs for **structure and copy**: map landmarks and section order from
406
- \`page.html\`, pull strings from \`visible-text.json\` or \`visible-text.txt\`,
406
+ \`dom-structure.json\` (canonical nesting) + \`page.html\`, pull strings from \`visible-text.json\` or \`visible-text.txt\`,
407
407
  align media via \`media.json\`, and implement or refine UI starting from
408
408
  \`mirror/<host>/page.tsx\` (\`data-mirror-section\` markers match the crawl).
409
409
  - **Compliance:** Do not impersonate another company, ship their trademarks or
@@ -429,7 +429,7 @@ function emitForAi(system: DesignSystem, run: ExtractionRun): string {
429
429
  .map((c) => {
430
430
  const lines = [`### ${c.url}`];
431
431
  if (c.referenceDir) {
432
- lines.push(`- **Reference:** \`${relativize(c.referenceDir, run.outputDir)}/\` — start with \`FOR_AI_REFERENCE.md\`, then \`page.html\`, \`visible-text.txt\` (or \`.json\`), \`media.json\`.`);
432
+ lines.push(`- **Reference:** \`${relativize(c.referenceDir, run.outputDir)}/\` — start with \`FOR_AI_REFERENCE.md\`, then \`dom-structure.json\` (exact tree), \`structure-outline.txt\`, \`page.html\`, \`visible-text.txt\` (or \`.json\`), \`media.json\`.`);
433
433
  }
434
434
  if (c.mirrorDir) {
435
435
  lines.push(`- **Mirror:** \`${relativize(c.mirrorDir, run.outputDir)}/page.tsx\` — section scaffold + \`data-mirror-section\`; read \`MIRROR_NOTES.md\`.`);
@@ -448,7 +448,7 @@ ${perHost || "_No reference/mirror paths on this run — token-only._"}
448
448
 
449
449
  Workflow (similar in spirit to **recon → specs → build** pipelines):
450
450
 
451
- 1. **Recon:** Skim \`page.html\` for DOM landmarks (\`<header>\`, main columns, card grids, repeated list patterns). Cross-check section order with \`mirror/.../page.tsx\` (\`data-mirror-section\`).
451
+ 1. **Recon:** Use \`dom-structure.json\` for **exact nesting and sibling order**; use \`structure-outline.txt\` or \`page.html\` for skimming. Cross-check with \`mirror/.../page.tsx\` (\`data-mirror-section\`).
452
452
  2. **Wire copy + media:** Map headings, buttons, and blocks from \`visible-text.*\` into the matching mirror sections (or your components). Use \`media.json\` for asset URLs; replace with licensed or original assets when shipping.
453
453
  3. **Build:** Prefer editing **mirror \`page.tsx\`** inside the user's app (or port its structure into their file tree) rather than inventing a new section order from scratch. Apply **REPORT.md** / **tokens** for colors, type, spacing, radii — mirror CSS variables under \`.mirror-root\` should converge to the same semantic palette where possible.
454
454
 
@@ -474,7 +474,7 @@ In **Cursor**, \`@\` those paths explicitly.
474
474
 
475
475
  ## Authority order
476
476
 
477
- 1. **Structural fidelity:** \`reference/<host>/page.html\` + \`visible-text.*\` + \`mirror/<host>/page.tsx\` — section order, composition patterns, and strings (when the user wants fidelity).
477
+ 1. **Structural fidelity:** \`reference/<host>/dom-structure.json\` + \`page.html\` + \`visible-text.*\` + \`mirror/<host>/page.tsx\` — exact DOM tree shape, then copy and typed scaffold.
478
478
  2. **Design tokens:** **REPORT.md** and **tokens.json** — typography scale, spacing, radii, colors, container width, notes.
479
479
  3. **Integration:** **tailwind.config.ts** + **globals.css** — merge into a Next.js + Tailwind + shadcn-style app.
480
480
  ${structureSection}
@@ -485,7 +485,7 @@ You must use the attached \`output/${system.runId}/\` folder.
485
485
 
486
486
  - Read REPORT.md and tokens.json before writing UI. Merge tailwind.config.ts and globals.css into my project (preserve my paths unless I say otherwise).
487
487
  - Style with semantic tokens: bg-background, text-foreground, text-muted-foreground, border-border, bg-primary, text-primary-foreground, bg-card, text-card-foreground, etc. Prefer these over ad-hoc hex; mirror pages may use --mirror-* variables until merged.
488
- - If reference/ and mirror/ exist for my source URL: treat them as mandatory context. Preserve crawled section order and layout patterns: align components to data-mirror-section and page.html landmarks. Wire copy from visible-text.* and media from media.json unless I say to rewrite for a different product.
488
+ - If reference/ and mirror/ exist for my source URL: treat them as mandatory context. Preserve **exact DOM nesting and sibling order** from `dom-structure.json` (and cross-check `page.html`). Align components to `data-mirror-section` and the mirror scaffold. Wire copy from visible-text.* and media from media.json unless I say to rewrite for a different product.
489
489
  - If I am building a NEW product unrelated to the crawl: keep layout inspiration from mirror/reference but REPLACE product names, claims, and sensitive copy with my copy. Never impersonate another brand.
490
490
 
491
491
  My product / intent: [describe goal — faithful mirror of URL vs new product in same layout; tone and CTA]
@@ -20,7 +20,8 @@
20
20
  * - Per-domain rate limit defaults to 15 req/min (`--rate <n>`).
21
21
  * - The crawler extracts a structured representation (section tree,
22
22
  * computed style tokens, content kinds) and writes a verbatim
23
- * `reference/<host>/` bundle (HTML + visible text + media URLs) for AI.
23
+ * `reference/<host>/` bundle (HTML, DOM tree JSON, outlines, visible text,
24
+ * media index) for AI structure cloning.
24
25
  */
25
26
 
26
27
  import { mkdirSync, writeFileSync } from "node:fs";
@@ -104,7 +105,8 @@ function printHelp(): void {
104
105
  " 2. Captures a full-page screenshot and harvests computed design tokens",
105
106
  " (colors, type, spacing, radius, shadow) → raw/<host>.tokens.json.",
106
107
  " 3. Writes a verbatim reference bundle → reference/<host>/ (page.html,",
107
- " visible-text.json/.txt, media.json, meta.json, FOR_AI_REFERENCE.md).",
108
+ " dom-structure.json, structure-outline.txt, visible-text.json/.txt,",
109
+ " media.json, meta.json, FOR_AI_REFERENCE.md).",
108
110
  " 4. Crawls the DOM into SiteLayout → raw/<host>.layout.json and emits",
109
111
  " mirror/<host>/page.tsx (Framer Motion + Phosphor + image/video slots).",
110
112
  "",
@@ -237,7 +239,7 @@ async function captureOne(
237
239
 
238
240
  let referenceWritten: string[] = [];
239
241
  try {
240
- referenceWritten = await emitPageReference(page, url, referenceDir);
242
+ referenceWritten = await emitPageReference(page, url, referenceDir, viewport);
241
243
  } catch (err) {
242
244
  console.warn(` ! reference dump failed for ${url}: ${(err as Error).message}`);
243
245
  }
@@ -2,11 +2,13 @@
2
2
  * Verbatim reference dump for AI / human review.
3
3
  *
4
4
  * Writes everything under `output/<runId>/reference/<host>/`:
5
- * - page.html — full document HTML after JS render (`page.content()`)
6
- * - visible-text.json — structured visible copy (headings, buttons, key blocks)
7
- * - media.json — img / video / source URLs and attributes
8
- * - meta.json — title, description, canonical, lang
9
- * - FOR_AI_REFERENCE.md — how to use these files with an AI
5
+ * - page.html — full document HTML after JS render (`page.content()`)
6
+ * - dom-structure.json — exact `body` subtree: tag order, attributes, text nodes (JSON)
7
+ * - structure-outline.txt — indented tag skeleton (ids/classes/roles) for quick scanning
8
+ * - visible-text.json — structured visible copy (headings, buttons, key blocks)
9
+ * - media.json — img / video / source URLs and attributes
10
+ * - meta.json — title, description, canonical, lang
11
+ * - FOR_AI_REFERENCE.md — how to use these files with an AI
10
12
  */
11
13
 
12
14
  import { mkdirSync, writeFileSync } from "node:fs";
@@ -14,6 +16,33 @@ import { join } from "node:path";
14
16
 
15
17
  import type { Page } from "playwright";
16
18
 
19
+ /** Element node in the serialized `body` tree. */
20
+ export interface DomStructureElement {
21
+ tag: string;
22
+ attrs?: Record<string, string>;
23
+ children?: DomStructureChild[];
24
+ }
25
+
26
+ export interface DomStructureTextNode {
27
+ type: "text";
28
+ value: string;
29
+ }
30
+
31
+ export type DomStructureChild = DomStructureElement | DomStructureTextNode;
32
+
33
+ export interface DomStructurePayload {
34
+ title: string | null;
35
+ lang: string | null;
36
+ stats: {
37
+ elementNodes: number;
38
+ omitted: { script: number; style: number; noscript: number; template: number };
39
+ truncated: boolean;
40
+ maxNodes: number;
41
+ maxDepth: number;
42
+ };
43
+ root: DomStructureElement | null;
44
+ }
45
+
17
46
  export interface ReferenceSnapshot {
18
47
  url: string;
19
48
  capturedAt: string;
@@ -35,7 +64,12 @@ export interface ReferenceSnapshot {
35
64
  >;
36
65
  }
37
66
 
38
- export async function emitPageReference(page: Page, url: string, refDir: string): Promise<string[]> {
67
+ export async function emitPageReference(
68
+ page: Page,
69
+ url: string,
70
+ refDir: string,
71
+ viewport?: { width: number; height: number },
72
+ ): Promise<string[]> {
39
73
  mkdirSync(refDir, { recursive: true });
40
74
  const written: string[] = [];
41
75
  const capturedAt = new Date().toISOString();
@@ -50,6 +84,21 @@ export async function emitPageReference(page: Page, url: string, refDir: string)
50
84
  writeFileSync(htmlPath, html, "utf8");
51
85
  written.push(htmlPath);
52
86
 
87
+ const domEval = (await page.evaluate(collectDomStructureInPage)) as DomStructurePayload;
88
+ const domMerged = {
89
+ url,
90
+ capturedAt,
91
+ viewport: viewport ?? null,
92
+ ...domEval,
93
+ };
94
+ const domPath = join(refDir, "dom-structure.json");
95
+ writeFileSync(domPath, JSON.stringify(domMerged, null, 2) + "\n", "utf8");
96
+ written.push(domPath);
97
+
98
+ const outlinePath = join(refDir, "structure-outline.txt");
99
+ writeFileSync(outlinePath, buildStructureOutline(domMerged.root, domMerged.stats) + "\n", "utf8");
100
+ written.push(outlinePath);
101
+
53
102
  const snapshot = (await page.evaluate(collectSnapshot)) as Omit<ReferenceSnapshot, "url" | "capturedAt">;
54
103
  const full: ReferenceSnapshot = {
55
104
  url,
@@ -102,11 +151,13 @@ function emitAiReadme(url: string, refDir: string): string {
102
151
  "| File | Purpose |",
103
152
  "| ---- | ------- |",
104
153
  "| `page.html` | Full serialized DOM after JavaScript ran in Chromium. Layout, copy, and structure match what crawled (not necessarily valid static HTML elsewhere). |",
154
+ "| `dom-structure.json` | **Exact body subtree (JSON):** same child order as the live DOM; every attribute on each element (long `src` / `href` / `style` values truncated); inline text as separate typed entries. Use this as the canonical structure map for React. |",
155
+ "| `structure-outline.txt` | Indented tag skeleton (ids/classes/roles only; no text) for quick navigation. |",
105
156
  "| `visible-text.json` | Exact visible strings: headings, buttons, links, and block text — good for **verbatim copy** when rewriting `page.tsx`. |",
106
157
  "| `media.json` | Every image / video / source URL from the DOM. Host your own assets or swap for placeholders; do not hotlink without permission. |",
107
158
  "| `meta.json` | Title, description, lang. |",
108
159
  "",
109
- "**Workflow:** (1) Recon — skim \`page.html\` for landmarks and grids; (2) Wire — map \`visible-text.*\` + \`media.json\` into sections; (3) Build — prefer editing sibling \`../mirror/<host>/page.tsx\` (section order via \`data-mirror-section\`) instead of inventing layout from scratch. See the run folder’s **FOR_AI.md** for full authority order and compliance notes.",
160
+ "**Workflow:** (1) Recon — use **`dom-structure.json`** (or `structure-outline.txt` + `page.html`) for **exact tag order and nesting**; (2) Wire — map `visible-text.*` + `media.json` into that tree; (3) Build — implement `page.tsx` (start from `../mirror/<host>/page.tsx` if present) so component boundaries follow this structure. See the run folder’s **FOR_AI.md** for full authority order and compliance notes.",
110
161
  "",
111
162
  `Sibling folder \`../mirror/<host>/\` has a typed \`page.tsx\` with Framer Motion, Phosphor icons, and slots — wire copy from \`visible-text.json\` and media from \`media.json\` into that file.`,
112
163
  "",
@@ -115,6 +166,154 @@ function emitAiReadme(url: string, refDir: string): string {
115
166
  ].join("\n");
116
167
  }
117
168
 
169
+ function buildStructureOutline(root: DomStructureElement | null, stats: DomStructurePayload["stats"]): string {
170
+ const MAX_LINES = 15_000;
171
+ const lines: string[] = [
172
+ "# structure-outline.txt — tag skeleton under <body>",
173
+ "# Text nodes omitted here; see dom-structure.json for interleaved copy.",
174
+ `# elementNodes=${stats.elementNodes} truncated=${stats.truncated} omitted=${JSON.stringify(stats.omitted)}`,
175
+ "",
176
+ ];
177
+ let n = 0;
178
+ function walkEl(node: DomStructureElement, depth: number) {
179
+ if (n >= MAX_LINES) return;
180
+ const a = node.attrs ?? {};
181
+ let suffix = "";
182
+ if (a.id) suffix += `#${String(a.id).replace(/\s+/g, "")}`;
183
+ if (a.class) {
184
+ const parts = String(a.class)
185
+ .split(/\s+/)
186
+ .filter(Boolean)
187
+ .slice(0, 6)
188
+ .map((c) => `.${c}`);
189
+ suffix += parts.join("");
190
+ }
191
+ if (a.role) suffix += `[role=${a.role}]`;
192
+ lines.push(`${" ".repeat(depth)}${node.tag}${suffix}`);
193
+ n++;
194
+ for (const c of node.children ?? []) {
195
+ if (n >= MAX_LINES) return;
196
+ if ("type" in c && c.type === "text") continue;
197
+ walkEl(c as DomStructureElement, depth + 1);
198
+ }
199
+ }
200
+ if (root) walkEl(root, 0);
201
+ if (n >= MAX_LINES) lines.push("\n… outline truncated …");
202
+ return lines.join("\n");
203
+ }
204
+
205
+ /**
206
+ * Runs in browser context. Serializes `document.body` with exact child order; skips
207
+ * script/style/noscript/template (counts only).
208
+ */
209
+ function collectDomStructureInPage(): DomStructurePayload {
210
+ const MAX_NODES = 40_000;
211
+ const MAX_DEPTH = 128;
212
+ const MAX_TEXT = 12_000;
213
+
214
+ let nodeCount = 0;
215
+ let truncated = false;
216
+ const omitted = { script: 0, style: 0, noscript: 0, template: 0 };
217
+
218
+ function trimAttr(name: string, v: string): string {
219
+ const big =
220
+ name === "src" ||
221
+ name === "href" ||
222
+ name === "srcset" ||
223
+ name === "style" ||
224
+ name === "content" ||
225
+ name.startsWith("data-");
226
+ const max = big ? 3500 : 2000;
227
+ if (v.length <= max) return v;
228
+ return v.slice(0, max) + "…";
229
+ }
230
+
231
+ function walk(el: Element, depth: number): DomStructureElement | null {
232
+ if (nodeCount >= MAX_NODES) {
233
+ truncated = true;
234
+ return null;
235
+ }
236
+ const tag = el.tagName.toLowerCase();
237
+ if (tag === "script") {
238
+ omitted.script++;
239
+ return null;
240
+ }
241
+ if (tag === "style") {
242
+ omitted.style++;
243
+ return null;
244
+ }
245
+ if (tag === "noscript") {
246
+ omitted.noscript++;
247
+ return null;
248
+ }
249
+ if (tag === "template") {
250
+ omitted.template++;
251
+ return null;
252
+ }
253
+ if (depth > MAX_DEPTH) return null;
254
+
255
+ nodeCount++;
256
+ const attrs: Record<string, string> = {};
257
+ for (let i = 0; i < el.attributes.length; i++) {
258
+ const a = el.attributes.item(i);
259
+ if (!a) continue;
260
+ attrs[a.name] = trimAttr(a.name, a.value);
261
+ }
262
+
263
+ const children: DomStructureChild[] = [];
264
+ for (const child of el.childNodes) {
265
+ if (nodeCount >= MAX_NODES) {
266
+ truncated = true;
267
+ break;
268
+ }
269
+ if (child.nodeType === 3) {
270
+ let t = (child.textContent ?? "").replace(/\s+/g, " ").trim();
271
+ if (!t) continue;
272
+ if (t.length > MAX_TEXT) t = t.slice(0, MAX_TEXT) + "…";
273
+ children.push({ type: "text", value: t });
274
+ } else if (child.nodeType === 1) {
275
+ const sub = walk(child as Element, depth + 1);
276
+ if (sub) children.push(sub);
277
+ }
278
+ }
279
+
280
+ const out: DomStructureElement = { tag };
281
+ if (Object.keys(attrs).length > 0) out.attrs = attrs;
282
+ if (children.length > 0) out.children = children;
283
+ return out;
284
+ }
285
+
286
+ const body = document.body;
287
+ if (!body) {
288
+ return {
289
+ title: document.title || null,
290
+ lang: document.documentElement.getAttribute("lang"),
291
+ stats: {
292
+ elementNodes: 0,
293
+ omitted,
294
+ truncated: false,
295
+ maxNodes: MAX_NODES,
296
+ maxDepth: MAX_DEPTH,
297
+ },
298
+ root: null,
299
+ };
300
+ }
301
+
302
+ const root = walk(body, 0);
303
+ return {
304
+ title: document.title || null,
305
+ lang: document.documentElement.getAttribute("lang"),
306
+ stats: {
307
+ elementNodes: nodeCount,
308
+ omitted,
309
+ truncated,
310
+ maxNodes: MAX_NODES,
311
+ maxDepth: MAX_DEPTH,
312
+ },
313
+ root,
314
+ };
315
+ }
316
+
118
317
  /**
119
318
  * Runs in browser context.
120
319
  */