libretto 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -7
- package/dist/cli/commands/ai.js +3 -5
- package/dist/cli/commands/browser.js +23 -2
- package/dist/cli/commands/init.js +157 -114
- package/dist/cli/commands/snapshot.js +147 -26
- package/dist/cli/core/ai-config.js +38 -46
- package/dist/cli/core/api-snapshot-analyzer.js +74 -0
- package/dist/cli/core/browser.js +21 -4
- package/dist/cli/core/context.js +1 -1
- package/dist/cli/core/snapshot-analyzer.js +295 -104
- package/dist/cli/core/snapshot-api-config.js +137 -0
- package/dist/cli/index.js +1 -0
- package/dist/shared/condense-dom/condense-dom.cjs +462 -0
- package/dist/shared/condense-dom/condense-dom.d.cts +34 -0
- package/dist/shared/condense-dom/condense-dom.d.ts +34 -0
- package/dist/shared/condense-dom/condense-dom.js +438 -0
- package/dist/shared/llm/ai-sdk-adapter.cjs +5 -1
- package/dist/shared/llm/ai-sdk-adapter.js +5 -1
- package/dist/shared/llm/client.cjs +106 -27
- package/dist/shared/llm/client.d.cts +8 -1
- package/dist/shared/llm/client.d.ts +8 -1
- package/dist/shared/llm/client.js +89 -23
- package/dist/shared/llm/types.d.cts +4 -3
- package/dist/shared/llm/types.d.ts +4 -3
- package/dist/shared/state/session-state.cjs +8 -1
- package/dist/shared/state/session-state.d.cts +24 -18
- package/dist/shared/state/session-state.d.ts +24 -18
- package/dist/shared/state/session-state.js +7 -1
- package/package.json +39 -33
|
@@ -0,0 +1,462 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __defProp = Object.defineProperty;
|
|
3
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
6
|
+
var __export = (target, all) => {
|
|
7
|
+
for (var name in all)
|
|
8
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
9
|
+
};
|
|
10
|
+
var __copyProps = (to, from, except, desc) => {
|
|
11
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
12
|
+
for (let key of __getOwnPropNames(from))
|
|
13
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
14
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
15
|
+
}
|
|
16
|
+
return to;
|
|
17
|
+
};
|
|
18
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
|
+
var condense_dom_exports = {};
|
|
20
|
+
__export(condense_dom_exports, {
|
|
21
|
+
condenseDom: () => condenseDom
|
|
22
|
+
});
|
|
23
|
+
module.exports = __toCommonJS(condense_dom_exports);
|
|
24
|
+
/** Test-hook attributes; keepAttribute keeps them whenever their value is non-blank. */
const TEST_ATTRS = /* @__PURE__ */ new Set(["data-testid", "data-test", "data-qa", "data-cy"]);

/** General-purpose attributes that pass the allowlist (subject to shouldDropEmptyValue). */
const TRUSTED_ATTRS = /* @__PURE__ */ new Set([
  "id", "name", "for", "tabindex",
  "contenteditable", "role", "title", "alt",
  "type", "value", "placeholder", "autocomplete",
  "href", "action", "method", "src"
]);

/** Element-state attributes; kept on every tag regardless of their value. */
const STATE_ATTRS = /* @__PURE__ */ new Set([
  "disabled", "hidden", "inert",
  "readonly", "required", "checked",
  "selected", "open", "multiple"
]);

/** State attributes plus the script-loading flags. */
const BOOLEAN_ATTRS = /* @__PURE__ */ new Set([...STATE_ATTRS, "async", "defer", "nomodule"]);

/** Trusted attributes that are dropped when present with a blank (empty/whitespace) value. */
const EMPTY_VALUE_DROP_ATTRS = /* @__PURE__ */ new Set([
  "alt", "autocomplete", "href", "action",
  "method", "name", "placeholder", "src",
  "tabindex", "title", "type"
]);

/** Attributes whose values are passed through normalizeUrlValue. */
const URL_ATTRS = /* @__PURE__ */ new Set(["href", "src", "action"]);

/** Attributes kept on <script> tags. */
const SCRIPT_ATTRS = /* @__PURE__ */ new Set([
  "src", "type", "id",
  "defer", "async", "crossorigin",
  "integrity", "nomodule", "referrerpolicy"
]);

/** Attributes kept on <style> tags when their value is non-blank. */
const STYLE_TAG_ATTRS = /* @__PURE__ */ new Set(["media", "type", "nonce", "title"]);

/** Tag names that isInteractiveElement always treats as interactive. */
const INTERACTIVE_TAGS = /* @__PURE__ */ new Set([
  "a", "button", "input",
  "select", "textarea", "form",
  "details", "dialog", "label"
]);

/** `role` values that make an otherwise plain element count as interactive. */
const INTERACTIVE_ROLES = /* @__PURE__ */ new Set([
  "button", "link", "tab",
  "menuitem", "checkbox", "radio",
  "switch", "slider", "combobox"
]);

/**
 * Matches one opening tag. Captures: (1) the tag name, (2) the raw attribute
 * text including its leading whitespace (quoted values may contain `>`),
 * (3) the optional self-closing slash.
 */
const OPEN_TAG_PATTERN = /<([a-zA-Z][\w:-]*)(\s(?:[^"'<>/]|"[^"]*"|'[^']*')*)?\s*(\/?)>/g;
|
|
109
|
+
/**
 * Condense a serialized HTML string by applying a fixed sequence of
 * regex-based reduction rules.
 *
 * Rules, in order: remove <noscript> blocks, remove comments, hollow out
 * <script> and <style> contents, replace long base64 data URIs, apply the
 * attribute allowlist, collapse <svg> elements, keep only layout-related
 * inline style properties, filter obfuscated class names, strip SVG/XML
 * presentation attributes, and collapse whitespace outside <pre> blocks.
 *
 * @param html Serialized HTML to condense.
 * @returns An object with the condensed `html`, the input's `originalLength`,
 *   the output's `condensedLength`, and `reductions`: characters removed per
 *   rule label (only rules that shrank the text are recorded).
 */
function condenseDom(html) {
  const originalLength = html.length;
  const reductions = {};

  // Accumulates the size reduction of one rule and passes the new text through.
  function track(label, before, after) {
    const diff = before.length - after.length;
    if (diff > 0) {
      reductions[label] = (reductions[label] ?? 0) + diff;
    }
    return after;
  }

  let result = html;

  result = track(
    "noscript",
    result,
    result.replace(/<noscript\b[^>]*>[\s\S]*?<\/noscript>/gi, "")
  );

  // An unterminated comment is removed through the end of the input.
  result = track(
    "comments",
    result,
    result.replace(/<!--[\s\S]*?(?:-->|$)/g, "")
  );

  // Script bodies are replaced by a short marker stating their kind and length.
  result = track(
    "scripts",
    result,
    result.replace(
      /(<script\b[^>]*>)([\s\S]*?)(<\/script(?:\s[^>]*)?>)/gi,
      (_match, open, content, close) => {
        if (!content.trim()) return `${open}${close}`;
        const isDataScript = /type\s*=\s*["']application\/(json|ld\+json)["']/i.test(open);
        const label = isDataScript ? "JSON data" : "script";
        return `${open}[${label}, ${content.length} chars]${close}`;
      }
    )
  );

  result = track(
    "styles",
    result,
    result.replace(
      /(<style\b[^>]*>)([\s\S]*?)(<\/style(?:\s[^>]*)?>)/gi,
      (_match, open, content, close) => {
        if (!content.trim()) return `${open}${close}`;
        return `${open}[CSS, ${content.length} chars]${close}`;
      }
    )
  );

  // Only base64 payloads of at least 100 characters are replaced.
  result = track(
    "base64",
    result,
    result.replace(
      /(src|href)\s*=\s*["'](data:[^;]+;base64,)[A-Za-z0-9+/=]{100,}["']/gi,
      (_match, attr, prefix) => {
        const mime = prefix.replace("data:", "").replace(";base64,", "");
        return `${attr}="[base64 ${mime}]"`;
      }
    )
  );

  result = track("attribute-allowlist", result, rewriteTagAttributes(result));

  // The pattern only matches an <svg> with no nested <svg> inside it, so the
  // replacement is repeated until the text stops changing; nested SVGs are
  // collapsed from the inside out.
  const svgPattern = /<svg\b([^>]*)>((?:(?!<svg\b)[\s\S])*?)<\/svg>/gi;
  const svgAttrNames = [
    "id",
    "class",
    "role",
    "aria-label",
    "aria-hidden",
    "title",
    "data-testid"
  ];
  const collapseSvg = (_match, attrs, inner) => {
    const keepAttrs = [];
    for (const name of svgAttrNames) {
      const attrToken = findAttributeToken(attrs, name);
      if (attrToken) keepAttrs.push(attrToken);
    }
    // Without an explicit aria-label, fall back to the SVG's <title> or <desc> text.
    const hasAriaLabel = /aria-label\s*=/i.test(attrs);
    if (!hasAriaLabel) {
      const titleMatch = inner.match(/<title[^>]*>([^<]+)<\/title>/i);
      const descMatch = inner.match(/<desc[^>]*>([^<]+)<\/desc>/i);
      const labelText = titleMatch?.[1]?.trim() || descMatch?.[1]?.trim();
      if (labelText) {
        keepAttrs.push(`aria-label="${escapeHtmlAttribute(labelText)}"`);
      }
    }
    const attrStr = keepAttrs.length > 0 ? ` ${keepAttrs.join(" ")}` : "";
    return `<svg${attrStr}><!-- [icon] --></svg>`;
  };
  let collapsed = result;
  let previous;
  do {
    previous = collapsed;
    collapsed = collapsed.replace(svgPattern, collapseSvg);
  } while (collapsed !== previous);
  result = track("svg-collapse", result, collapsed);

  // Quoted attribute values are matched per quote style ("..." or '...') so a
  // value that contains the other quote character, such as
  // style="font-family: 'Arial'", is consumed whole instead of being cut at
  // the inner quote.
  const layoutProps = /(?:^|;)\s*(?:display|visibility|opacity|pointer-events|position|z-index|overflow)(?:-[a-z]+)?\s*:[^;"]*/gi;
  result = track(
    "inline-styles",
    result,
    result.replace(
      /\sstyle\s*=\s*(?:"([^"]*)"|'([^']*)')/gi,
      (_match, doubleQuoted, singleQuoted) => {
        const value = doubleQuoted ?? singleQuoted ?? "";
        const kept = (value.match(layoutProps) ?? []).map(
          (prop) => prop.replace(/^[;\s]+/, "").trim()
        );
        if (kept.length === 0) return "";
        return ` style="${kept.join("; ")}"`;
      }
    )
  );

  result = track(
    "obfuscated-classes",
    result,
    result.replace(
      /\sclass\s*=\s*(?:"([^"]*)"|'([^']*)')/gi,
      (_match, doubleQuoted, singleQuoted) => {
        const filtered = filterSemanticClasses(doubleQuoted ?? singleQuoted ?? "");
        if (!filtered) return "";
        return ` class="${filtered}"`;
      }
    )
  );

  const removableAttrs = /\s(?:xmlns(?::[a-z]+)?|xml:space|xml:lang|fill|stroke|stroke-width|stroke-linecap|stroke-linejoin|stroke-miterlimit|stroke-dasharray|stroke-dashoffset|stroke-opacity|fill-opacity|clip-rule|fill-rule|focusable)\s*=\s*(?:"[^"]*"|'[^']*')/gi;
  result = track(
    "framework-svg-attrs",
    result,
    result.replace(removableAttrs, "")
  );

  // <pre> blocks are swapped for placeholders so whitespace collapsing cannot
  // touch their contents, then restored afterwards.
  const preBlocks = [];
  result = result.replace(
    /(<pre\b[^>]*>)([\s\S]*?)(<\/pre>)/gi,
    (_match, open, content, close) => {
      const idx = preBlocks.length;
      preBlocks.push(`${open}${content}${close}`);
      return `__PRE_PLACEHOLDER_${idx}__`;
    }
  );
  result = track(
    "whitespace",
    result,
    result.replace(/[ \t]+/g, " ").replace(/\n\s*\n/g, "\n")
  );
  for (let i = 0; i < preBlocks.length; i++) {
    const placeholder = `__PRE_PLACEHOLDER_${i}__`;
    const preBlock = preBlocks[i];
    // A function replacement keeps `$` sequences in the block literal.
    result = result.replace(placeholder, () => preBlock);
  }

  return {
    html: result,
    originalLength,
    condensedLength: result.length,
    reductions
  };
}
|
|
280
|
+
/**
 * Rewrite every opening tag matched by OPEN_TAG_PATTERN so that it carries
 * only the attributes keepAttribute decides to keep.
 *
 * Tags without attribute text, or whose attribute text yields no parsed
 * attributes, are returned untouched.
 *
 * @param html Serialized HTML.
 * @returns The HTML with filtered opening tags.
 */
function rewriteTagAttributes(html) {
  const rewriteTag = (match, rawTagName, rawAttrs, selfClosing) => {
    if (!rawAttrs?.trim()) return match;

    const attrs = parseAttributes(rawAttrs);
    if (attrs.length === 0) return match;

    const tagName = rawTagName.toLowerCase();
    const interactive = isInteractiveElement(tagName, attrs);

    const kept = [];
    for (const attr of attrs) {
      const token = keepAttribute(tagName, attr, interactive);
      if (token !== null) kept.push(token);
    }

    const attrStr = kept.length === 0 ? "" : ` ${kept.join(" ")}`;
    // The tag name is emitted with its original casing.
    return `<${rawTagName}${attrStr}${selfClosing ? " /" : ""}>`;
  };
  return html.replace(OPEN_TAG_PATTERN, rewriteTag);
}
|
|
296
|
+
/**
 * Decide whether a single parsed attribute survives the allowlist pass.
 *
 * @param tagName Lower-cased name of the owning tag.
 * @param attr Parsed attribute with `name`, `rawToken` and `value` (null when valueless).
 * @param interactive Whether the owning element counts as interactive.
 * @returns The attribute text to emit, or null to drop the attribute.
 */
function keepAttribute(tagName, attr, interactive) {
  const name = attr.name.toLowerCase();
  const value = attr.value;
  // True for valueless attributes as well as empty / whitespace-only values.
  const isBlank = !value?.trim();

  if (name === "class") {
    const filtered = isBlank ? "" : filterSemanticClasses(value);
    return filtered ? serializeAttribute(attr.name, filtered) : null;
  }
  if (name === "style") {
    return isBlank ? null : serializeAttribute(attr.name, value);
  }
  if (name.startsWith("aria-") || TEST_ATTRS.has(name)) {
    return isBlank ? null : attr.rawToken;
  }
  // Tag-specific lists are consulted before the generic state/URL/trusted lists.
  if (tagName === "script" && SCRIPT_ATTRS.has(name)) {
    return serializePreservedAttribute(attr);
  }
  if (tagName === "style" && STYLE_TAG_ATTRS.has(name)) {
    return isBlank ? null : attr.rawToken;
  }
  if (STATE_ATTRS.has(name)) {
    return serializePreservedAttribute(attr);
  }
  if (URL_ATTRS.has(name)) {
    if (isBlank) return null;
    const normalized = normalizeUrlValue(value);
    // Unchanged URLs keep their original token (quoting included).
    return normalized === value ? attr.rawToken : serializeAttribute(attr.name, normalized);
  }
  if (TRUSTED_ATTRS.has(name)) {
    return shouldDropEmptyValue(name, value) ? null : serializePreservedAttribute(attr);
  }
  return shouldKeepCustomDataAttribute(tagName, name, value, interactive) ? attr.rawToken : null;
}
|
|
342
|
+
/**
 * Serialize an attribute that is kept as-is.
 *
 * Boolean, valueless and valued attributes are all emitted exactly as they
 * appeared in the source, so the original token is returned verbatim.
 *
 * @param attr Parsed attribute carrying the original source text in `rawToken`.
 * @returns The attribute's original token.
 */
function serializePreservedAttribute(attr) {
  return attr.rawToken;
}
|
|
349
|
+
/**
 * Tell whether an attribute should be dropped because its value is blank.
 *
 * Valueless attributes (value === null) and attributes with visible content
 * are never dropped here. A blank string value is dropped for `aria-*` names
 * and for names listed in EMPTY_VALUE_DROP_ATTRS.
 *
 * @param name Lower-cased attribute name.
 * @param value Attribute value, or null when the attribute has no value.
 * @returns true when the attribute should be removed.
 */
function shouldDropEmptyValue(name, value) {
  const isBlankString = value !== null && !value.trim();
  if (!isBlankString) return false;
  return name.startsWith("aria-") || EMPTY_VALUE_DROP_ATTRS.has(name);
}
|
|
355
|
+
/**
 * Shorten a URL attribute value.
 *
 * - `blob:`, `javascript:`, `vbscript:` and `data:` URLs are reduced to
 *   `<scheme>:[omitted]` (scheme check ignores case and surrounding whitespace).
 * - Values of at most 160 characters are returned unchanged.
 * - Longer values are parsed: absolute URLs become `protocol//host/path`,
 *   relative ones become `path` plus fragment; a query string, when present,
 *   is replaced by `?[query omitted]`, placed before the fragment.
 * - Values that fail to parse are cut to 96 characters plus `[omitted]`.
 *
 * @param value Raw attribute value.
 * @returns The normalized value.
 */
function normalizeUrlValue(value) {
  const lowered = value.trim().toLowerCase();
  for (const scheme of ["blob:", "javascript:", "vbscript:", "data:"]) {
    if (lowered.startsWith(scheme)) return `${scheme}[omitted]`;
  }
  if (value.length <= 160) return value;
  try {
    const isAbsolute = /^[a-z][a-z0-9+.-]*:/i.test(value);
    // Relative values are resolved against a placeholder origin purely so they can be parsed.
    const parsed = isAbsolute ? new URL(value) : new URL(value, "https://condensed.local");
    const query = parsed.search ? "?[query omitted]" : "";
    if (isAbsolute) {
      return `${parsed.protocol}//${parsed.host}${parsed.pathname}${query}`;
    }
    // The query marker must precede the fragment; otherwise it becomes part of it.
    return `${parsed.pathname}${query}${parsed.hash}`;
  } catch {
    return `${value.slice(0, 96)}[omitted]`;
  }
}
|
|
372
|
+
/**
 * Remove class names that isObfuscatedClass flags as machine-generated.
 *
 * @param value Raw `class` attribute value (whitespace-separated names).
 * @returns The remaining class names joined by single spaces; empty when none remain.
 */
function filterSemanticClasses(value) {
  const kept = [];
  for (const cls of value.split(/\s+/)) {
    // Splitting can yield empty entries at the edges; skip them.
    if (cls && !isObfuscatedClass(cls)) kept.push(cls);
  }
  return kept.join(" ");
}
|
|
377
|
+
/**
 * Heuristically detect machine-generated (hashed / minified) class names.
 *
 * A name is flagged when it is longer than 80 characters, matches one of the
 * hash-like shapes below, or is at least 6 characters long with two or more
 * digits that number at least half its letter count.
 *
 * @param cls A single class name.
 * @returns true when the name looks generated.
 */
function isObfuscatedClass(cls) {
  if (cls.length > 80) return true;

  const hashShapes = [
    /^_?[0-9a-f]{6,}$/i, // bare hex run, optional leading underscore
    /^[a-z]+_[0-9a-f]{4,}$/i, // word + "_" + hex suffix
    /^[a-z]{1,2}[0-9]{2,}$/i // one or two letters followed by digits
  ];
  if (hashShapes.some((shape) => shape.test(cls))) return true;

  const digitCount = cls.replace(/[^0-9]/g, "").length;
  const letterCount = cls.replace(/[^a-zA-Z]/g, "").length;
  return cls.length >= 6 && digitCount >= 2 && digitCount >= letterCount * 0.5;
}
|
|
387
|
+
/**
 * Split the raw attribute text of an opening tag into attribute records.
 *
 * @param rawAttrs Text between the tag name and the closing `>`.
 * @returns One `{ name, rawToken, value }` record per attribute, in source
 *   order. `rawToken` is the attribute exactly as written; `value` is the
 *   unquoted value, or null when the attribute has no `=value` part.
 */
function parseAttributes(rawAttrs) {
  const attrPattern = /([^\s"'<>\/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+)))?/g;
  const attrs = [];
  for (const [token, name, doubleQuoted, singleQuoted, bare] of rawAttrs.matchAll(attrPattern)) {
    if (!name) continue;
    attrs.push({
      name,
      rawToken: token.trim(),
      // `??` (not `||`) so an explicitly empty value stays "" instead of becoming null.
      value: doubleQuoted ?? singleQuoted ?? bare ?? null
    });
  }
  return attrs;
}
|
|
402
|
+
/**
 * Decide whether an element counts as interactive.
 *
 * An element is interactive when its tag is in INTERACTIVE_TAGS, when it has
 * a `tabindex` or `contenteditable` attribute (any value), or when its `role`
 * value is in INTERACTIVE_ROLES.
 *
 * @param tagName Lower-cased tag name.
 * @param attrs Parsed attributes of the tag.
 * @returns true when the element is considered interactive.
 */
function isInteractiveElement(tagName, attrs) {
  if (INTERACTIVE_TAGS.has(tagName)) return true;
  return attrs.some((attr) => {
    const name = attr.name.toLowerCase();
    if (name === "tabindex" || name === "contenteditable") return true;
    if (name !== "role") return false;
    const role = attr.value?.trim().toLowerCase();
    return Boolean(role) && INTERACTIVE_ROLES.has(role);
  });
}
|
|
415
|
+
/**
 * Decide whether a custom `data-*` attribute is worth keeping.
 *
 * Only attributes on interactive elements qualify. The attribute must not be
 * one of TEST_ATTRS, must have a non-blank value of at most 80 characters,
 * must not sit on a script or style tag, and both its key (the part after
 * `data-`) and its value must pass the "meaningful" heuristics.
 *
 * @param tagName Lower-cased tag name.
 * @param attrName Lower-cased attribute name.
 * @param value Attribute value, or null when valueless.
 * @param interactive Whether the owning element is interactive.
 * @returns true when the attribute should be kept.
 */
function shouldKeepCustomDataAttribute(tagName, attrName, value, interactive) {
  const eligibleName = interactive && attrName.startsWith("data-") && !TEST_ATTRS.has(attrName);
  if (!eligibleName) return false;

  const usableValue = Boolean(value?.trim()) && value.length <= 80;
  if (!usableValue) return false;

  if (tagName === "script" || tagName === "style") return false;

  const key = attrName.slice("data-".length);
  return looksMeaningfulToken(key) && looksMeaningfulDataValue(value);
}
|
|
427
|
+
/**
 * Heuristic for a `data-*` key that is likely to be meaningful.
 *
 * The key must start with a letter, consist of 2 to 41 letters, digits or
 * hyphens, contain a run of at least three letters, and must not contain one
 * of the listed noise words.
 *
 * @param value The key, i.e. the attribute name without its `data-` prefix.
 * @returns true when the key passes all three checks.
 */
function looksMeaningfulToken(value) {
  const wellFormed = /^[a-z][a-z0-9-]{1,40}$/i.test(value);
  const hasWordLikeRun = /[a-z]{3}/i.test(value);
  const isNoise = /(track|metric|telemetry|analytics|component|display|loaded|token|dps|color|screen|strict|rehydr|fetch)/i.test(value);
  return wellFormed && hasWordLikeRun && !isNoise;
}
|
|
435
|
+
/**
 * Heuristic for a `data-*` value that is likely to be meaningful.
 *
 * The value must be non-empty, at most 80 characters, free of `<` / `>` and of
 * `http://` / `https://`, and made only of letters, digits, spaces and the
 * characters `: _ . / -`.
 *
 * @param value The attribute value.
 * @returns true when the value passes every check.
 */
function looksMeaningfulDataValue(value) {
  const tooLong = value.length > 80;
  const hasMarkup = /[<>]/.test(value);
  const hasUrl = /https?:\/\//i.test(value);
  return !tooLong && !hasMarkup && !hasUrl && /^[a-z0-9:_./ -]+$/i.test(value);
}
|
|
441
|
+
/**
 * Find one attribute, by exact name, inside raw attribute text.
 *
 * The name is matched case-insensitively and must be a complete attribute
 * name: it has to be followed by whitespace, `=`, `/`, `>` or the end of the
 * text. Looking up `aria-label` therefore does not match `aria-labelledby`.
 *
 * @param attrs Raw attribute text of a tag.
 * @param name Attribute name to look for.
 * @returns The attribute as written (name plus optional `=value`), or null
 *   when it is not present.
 */
function findAttributeToken(attrs, name) {
  const pattern = new RegExp(
    // (?![^\s=/>]) rejects a match that is only a prefix of a longer attribute name.
    `(?:^|\\s)(${escapeRegExp(name)}(?![^\\s=/>])(?:\\s*=\\s*(?:"[^"]*"|'[^']*'|[^\\s"'=<>\\x60]+))?)`,
    "i"
  );
  const match = attrs.match(pattern);
  return match?.[1] ?? null;
}

/**
 * Escape regular-expression metacharacters so the text matches literally.
 *
 * @param value Text to embed in a RegExp source.
 * @returns The text with every metacharacter backslash-escaped.
 */
function escapeRegExp(value) {
  return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}
|
|
453
|
+
/**
 * Render an attribute as `name="value"`, escaping the value with
 * escapeHtmlAttribute.
 *
 * @param name Attribute name, emitted as given.
 * @param value Unescaped attribute value.
 * @returns The double-quoted attribute text.
 */
function serializeAttribute(name, value) {
  const escaped = escapeHtmlAttribute(value);
  return name + '="' + escaped + '"';
}
|
|
456
|
+
/**
 * Escape text for use inside a double-quoted HTML attribute value.
 *
 * Replaces `&`, `"`, `<` and `>` with their character references. `&` is
 * handled first so the ampersands introduced by the later replacements are
 * not escaped again.
 *
 * @param value Raw text.
 * @returns The escaped text.
 */
function escapeHtmlAttribute(value) {
  return value
    .replace(/&/g, "&amp;")
    .replace(/"/g, "&quot;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;");
}
|
|
459
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
460
|
+
0 && (module.exports = {
|
|
461
|
+
condenseDom
|
|
462
|
+
});
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* DOM condensation — reduces serialized HTML for LLM consumption.
|
|
3
|
+
*
|
|
4
|
+
* All rules run unconditionally (no tiers). The function operates on
|
|
5
|
+
* already-serialized HTML strings (the output of `page.content()`),
|
|
6
|
+
* not a browser-side DOM walk or parsed DOM tree.
|
|
7
|
+
*
|
|
8
|
+
* Rules applied in order:
|
|
9
|
+
* 1. Noscript blocks — remove entirely
|
|
10
|
+
* 2. HTML comments — remove entirely
|
|
11
|
+
* 3. Script contents — hollow out, keep tags + useful attributes
|
|
12
|
+
* 4. Style contents — hollow out, keep tags + useful attributes
|
|
13
|
+
* 5. Embedded binary data — replace base64 data URIs
|
|
14
|
+
* 6. Attribute allowlist — keep trusted attrs, special-case class/style/URLs
|
|
15
|
+
* 7. SVG elements — collapse to single tag, extract title/desc
|
|
16
|
+
* 8. Inline style properties — keep only layout-relevant props
|
|
17
|
+
* 9. Non-semantic class names — filter or delete class values
|
|
18
|
+
* 10. (Cross-reference IDs — preserved, no action needed)
|
|
19
|
+
* 11. Framework-internal and SVG visual attributes — remove
|
|
20
|
+
* 12. Whitespace — collapse (preserve <pre> content)
|
|
21
|
+
*/
|
|
22
|
+
/** Result of a `condenseDom` call: the condensed markup plus size accounting. */
type CondenseDomResult = {
    /** Condensed HTML string; intended to remain valid, parseable HTML. */
    html: string;
    /** Number of characters in the input string. */
    originalLength: number;
    /** Number of characters in the condensed output. */
    condensedLength: number;
    /** Number of characters removed by each rule, keyed by the rule's name. */
    reductions: Record<string, number>;
};

/**
 * Condense a serialized HTML string.
 *
 * @param html Serialized HTML, e.g. the output of `page.content()`.
 * @returns The condensed HTML together with before/after lengths and per-rule reductions.
 */
declare function condenseDom(html: string): CondenseDomResult;
|
|
33
|
+
|
|
34
|
+
export { type CondenseDomResult, condenseDom };
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* DOM condensation — reduces serialized HTML for LLM consumption.
|
|
3
|
+
*
|
|
4
|
+
* All rules run unconditionally (no tiers). The function operates on
|
|
5
|
+
* already-serialized HTML strings (the output of `page.content()`),
|
|
6
|
+
* not a browser-side DOM walk or parsed DOM tree.
|
|
7
|
+
*
|
|
8
|
+
* Rules applied in order:
|
|
9
|
+
* 1. Noscript blocks — remove entirely
|
|
10
|
+
* 2. HTML comments — remove entirely
|
|
11
|
+
* 3. Script contents — hollow out, keep tags + useful attributes
|
|
12
|
+
* 4. Style contents — hollow out, keep tags + useful attributes
|
|
13
|
+
* 5. Embedded binary data — replace base64 data URIs
|
|
14
|
+
* 6. Attribute allowlist — keep trusted attrs, special-case class/style/URLs
|
|
15
|
+
* 7. SVG elements — collapse to single tag, extract title/desc
|
|
16
|
+
* 8. Inline style properties — keep only layout-relevant props
|
|
17
|
+
* 9. Non-semantic class names — filter or delete class values
|
|
18
|
+
* 10. (Cross-reference IDs — preserved, no action needed)
|
|
19
|
+
* 11. Framework-internal and SVG visual attributes — remove
|
|
20
|
+
* 12. Whitespace — collapse (preserve <pre> content)
|
|
21
|
+
*/
|
|
22
|
+
/** Result of a `condenseDom` call: the condensed markup plus size accounting. */
type CondenseDomResult = {
    /** Condensed HTML string; intended to remain valid, parseable HTML. */
    html: string;
    /** Number of characters in the input string. */
    originalLength: number;
    /** Number of characters in the condensed output. */
    condensedLength: number;
    /** Number of characters removed by each rule, keyed by the rule's name. */
    reductions: Record<string, number>;
};

/**
 * Condense a serialized HTML string.
 *
 * @param html Serialized HTML, e.g. the output of `page.content()`.
 * @returns The condensed HTML together with before/after lengths and per-rule reductions.
 */
declare function condenseDom(html: string): CondenseDomResult;
|
|
33
|
+
|
|
34
|
+
export { type CondenseDomResult, condenseDom };
|