libretto 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,462 @@
1
+ "use strict";
2
+ var __defProp = Object.defineProperty;
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
+ var __getOwnPropNames = Object.getOwnPropertyNames;
5
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
6
+ var __export = (target, all) => {
7
+ for (var name in all)
8
+ __defProp(target, name, { get: all[name], enumerable: true });
9
+ };
10
+ var __copyProps = (to, from, except, desc) => {
11
+ if (from && typeof from === "object" || typeof from === "function") {
12
+ for (let key of __getOwnPropNames(from))
13
+ if (!__hasOwnProp.call(to, key) && key !== except)
14
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
15
+ }
16
+ return to;
17
+ };
18
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
19
+ var condense_dom_exports = {};
20
+ __export(condense_dom_exports, {
21
+ condenseDom: () => condenseDom
22
+ });
23
+ module.exports = __toCommonJS(condense_dom_exports);
24
// Test-hook attributes.
const TEST_ATTRS = /* @__PURE__ */ new Set(["data-testid", "data-test", "data-qa", "data-cy"]);

// General-purpose attributes on the allowlist.
const TRUSTED_ATTRS = /* @__PURE__ */ new Set([
  "id", "name", "for", "tabindex", "contenteditable", "role", "title", "alt",
  "type", "value", "placeholder", "autocomplete", "href", "action", "method", "src"
]);

// Attributes that describe element state.
const STATE_ATTRS = /* @__PURE__ */ new Set([
  "disabled", "hidden", "inert", "readonly", "required",
  "checked", "selected", "open", "multiple"
]);

// State attributes plus the script-loading flags.
const BOOLEAN_ATTRS = /* @__PURE__ */ new Set([...STATE_ATTRS, "async", "defer", "nomodule"]);

// Allowlisted attributes that are dropped when their value is an empty string.
const EMPTY_VALUE_DROP_ATTRS = /* @__PURE__ */ new Set([
  "alt", "autocomplete", "href", "action", "method", "name",
  "placeholder", "src", "tabindex", "title", "type"
]);

// Attributes whose values are URLs.
const URL_ATTRS = /* @__PURE__ */ new Set(["href", "src", "action"]);

// Attributes preserved on <script> tags.
const SCRIPT_ATTRS = /* @__PURE__ */ new Set([
  "src", "type", "id", "defer", "async", "crossorigin",
  "integrity", "nomodule", "referrerpolicy"
]);

// Attributes preserved on <style> tags.
const STYLE_TAG_ATTRS = /* @__PURE__ */ new Set(["media", "type", "nonce", "title"]);

// Tag names that are always considered interactive.
const INTERACTIVE_TAGS = /* @__PURE__ */ new Set([
  "a", "button", "input", "select", "textarea", "form", "details", "dialog", "label"
]);

// ARIA roles that make an element count as interactive.
const INTERACTIVE_ROLES = /* @__PURE__ */ new Set([
  "button", "link", "tab", "menuitem", "checkbox", "radio", "switch", "slider", "combobox"
]);

// Matches an opening tag. Captures: (1) tag name, (2) raw attribute text
// including its leading whitespace, (3) "/" when the tag is self-closing.
const OPEN_TAG_PATTERN = /<([a-zA-Z][\w:-]*)(\s(?:[^"'<>/]|"[^"]*"|'[^']*')*)?\s*(\/?)>/g;
109
/**
 * Condense a serialized HTML string by applying a fixed sequence of
 * regex-based rules: noscript removal, comment removal, script/style
 * hollowing, base64 data-URI replacement, attribute allowlisting, SVG
 * collapsing, inline-style filtering, obfuscated-class filtering, removal of
 * SVG presentation attributes, and whitespace collapsing (with `<pre>`
 * blocks shielded from the whitespace step only).
 *
 * @param html Serialized HTML text.
 * @returns An object with the condensed `html`, the `originalLength` and
 *   `condensedLength` character counts, and `reductions`: characters removed
 *   per rule label (only rules that shrank the text are recorded).
 */
function condenseDom(html) {
  const originalLength = html.length;
  const reductions = {};
  // Record how many characters a rule removed and hand back the new text.
  // A rule that leaves the text the same size or larger is not recorded.
  function track(label, before, after) {
    const diff = before.length - after.length;
    if (diff > 0) {
      reductions[label] = (reductions[label] ?? 0) + diff;
    }
    return after;
  }
  let result = html;
  result = track(
    "noscript",
    result,
    result.replace(/<noscript\b[^>]*>[\s\S]*?<\/noscript>/gi, "")
  );
  // An unterminated comment is removed through to the end of the input.
  result = track(
    "comments",
    result,
    result.replace(/<!--[\s\S]*?(?:-->|$)/g, "")
  );
  // Keep the <script> tags but replace non-blank bodies with a size marker.
  result = track(
    "scripts",
    result,
    result.replace(
      /(<script\b[^>]*>)([\s\S]*?)(<\/script(?:\s[^>]*)?>)/gi,
      (_match, open, content, close) => {
        if (!content.trim()) return `${open}${close}`;
        const isDataScript = /type\s*=\s*["']application\/(json|ld\+json)["']/i.test(open);
        if (isDataScript) {
          return `${open}[JSON data, ${content.length} chars]${close}`;
        }
        return `${open}[script, ${content.length} chars]${close}`;
      }
    )
  );
  // Same treatment for <style> bodies.
  result = track(
    "styles",
    result,
    result.replace(
      /(<style\b[^>]*>)([\s\S]*?)(<\/style(?:\s[^>]*)?>)/gi,
      (_match, open, content, close) => {
        if (!content.trim()) return `${open}${close}`;
        return `${open}[CSS, ${content.length} chars]${close}`;
      }
    )
  );
  // Replace long base64 data URIs in src/href with a marker naming the MIME type.
  result = track(
    "base64",
    result,
    result.replace(
      /(src|href)\s*=\s*["'](data:[^;]+;base64,)[A-Za-z0-9+/=]{100,}["']/gi,
      (_match, attr, prefix) => {
        const mime = prefix.replace("data:", "").replace(";base64,", "");
        return `${attr}="[base64 ${mime}]"`;
      }
    )
  );
  result = track("attribute-allowlist", result, rewriteTagAttributes(result));
  // The pattern refuses to span a nested <svg, so innermost SVGs collapse
  // first; the loop repeats until a pass changes nothing.
  const svgPattern = /<svg\b([^>]*)>((?:(?!<svg\b)[\s\S])*?)<\/svg>/gi;
  result = track(
    "svg-collapse",
    result,
    (() => {
      let prev;
      let current = result;
      do {
        prev = current;
        current = current.replace(
          svgPattern,
          (_match, attrs, inner) => {
            const keepAttrs = [];
            const attrPatterns = [
              "id",
              "class",
              "role",
              "aria-label",
              "aria-hidden",
              "title",
              "data-testid"
            ];
            for (const name of attrPatterns) {
              const attrToken = findAttributeToken(attrs, name);
              if (attrToken) keepAttrs.push(attrToken);
            }
            // Without an explicit aria-label, derive one from <title> or <desc>.
            const hasAriaLabel = /aria-label\s*=/i.test(attrs);
            if (!hasAriaLabel) {
              const titleMatch = inner.match(
                /<title[^>]*>([^<]+)<\/title>/i
              );
              const descMatch = inner.match(
                /<desc[^>]*>([^<]+)<\/desc>/i
              );
              const labelText = titleMatch?.[1]?.trim() || descMatch?.[1]?.trim();
              if (labelText) {
                keepAttrs.push(
                  `aria-label="${escapeHtmlAttribute(labelText)}"`
                );
              }
            }
            const attrStr = keepAttrs.length > 0 ? ` ${keepAttrs.join(" ")}` : "";
            return `<svg${attrStr}><!-- [icon] --></svg>`;
          }
        );
        svgPattern.lastIndex = 0;
      } while (current !== prev);
      return current;
    })()
  );
  const layoutProps = /(?:^|;)\s*(?:display|visibility|opacity|pointer-events|position|z-index|overflow)(?:-[a-z]+)?\s*:[^;"]*/gi;
  // Each quote style is matched separately so that an apostrophe inside a
  // double-quoted value (e.g. font-family: 'Arial') cannot end the match
  // early and leave the remainder of the value stranded inside the tag.
  result = track(
    "inline-styles",
    result,
    result.replace(
      /\sstyle\s*=\s*(?:"([^"]*)"|'([^']*)')/gi,
      (_match, doubleQuoted, singleQuoted) => {
        const value = doubleQuoted ?? singleQuoted;
        const kept = [];
        let propMatch;
        layoutProps.lastIndex = 0;
        while ((propMatch = layoutProps.exec(value)) !== null) {
          kept.push(propMatch[0].replace(/^[;\s]+/, "").trim());
        }
        if (kept.length === 0) return "";
        return ` style="${kept.join("; ")}"`;
      }
    )
  );
  // Same quote pairing for class values; class names may legitimately
  // contain the other quote character (e.g. content-['x']). The original
  // quote style is reused on output so the value can never close it early.
  result = track(
    "obfuscated-classes",
    result,
    result.replace(
      /\sclass\s*=\s*(?:"([^"]*)"|'([^']*)')/gi,
      (_match, doubleQuoted, singleQuoted) => {
        const filtered = filterSemanticClasses(doubleQuoted ?? singleQuoted);
        if (!filtered) return "";
        const quote = doubleQuoted !== undefined ? '"' : "'";
        return ` class=${quote}${filtered}${quote}`;
      }
    )
  );
  const removableAttrs = /\s(?:xmlns(?::[a-z]+)?|xml:space|xml:lang|fill|stroke|stroke-width|stroke-linecap|stroke-linejoin|stroke-miterlimit|stroke-dasharray|stroke-dashoffset|stroke-opacity|fill-opacity|clip-rule|fill-rule|focusable)\s*=\s*(?:"[^"]*"|'[^']*')/gi;
  result = track(
    "framework-svg-attrs",
    result,
    result.replace(removableAttrs, "")
  );
  // Swap <pre> blocks for placeholders so whitespace collapsing leaves them
  // alone, then put them back afterwards.
  const preBlocks = [];
  result = result.replace(
    /(<pre\b[^>]*>)([\s\S]*?)(<\/pre>)/gi,
    (_match, open, content, close) => {
      const idx = preBlocks.length;
      preBlocks.push(`${open}${content}${close}`);
      return `__PRE_PLACEHOLDER_${idx}__`;
    }
  );
  result = track(
    "whitespace",
    result,
    result.replace(/[ \t]+/g, " ").replace(/\n\s*\n/g, "\n")
  );
  for (let i = 0; i < preBlocks.length; i++) {
    const placeholder = `__PRE_PLACEHOLDER_${i}__`;
    const preBlock = preBlocks[i];
    // A function replacement keeps "$" sequences in the block literal.
    result = result.replace(placeholder, () => preBlock);
  }
  return {
    html: result,
    originalLength,
    condensedLength: result.length,
    reductions
  };
}
280
/**
 * Rewrite every opening tag in `html`, keeping only the attributes that
 * `keepAttribute` approves. Tags with no attributes are returned untouched.
 */
function rewriteTagAttributes(html) {
  const rewriteTag = (match, rawTagName, rawAttrs, selfClosing) => {
    if (!rawAttrs?.trim()) return match;
    const attrs = parseAttributes(rawAttrs);
    if (attrs.length === 0) return match;

    const tagName = rawTagName.toLowerCase();
    const interactive = isInteractiveElement(tagName, attrs);

    const kept = [];
    for (const attr of attrs) {
      const serialized = keepAttribute(tagName, attr, interactive);
      if (serialized !== null) kept.push(serialized);
    }

    // Rebuild the tag with its original tag-name casing.
    let tag = `<${rawTagName}`;
    if (kept.length > 0) tag += ` ${kept.join(" ")}`;
    if (selfClosing) tag += " /";
    return `${tag}>`;
  };
  return html.replace(OPEN_TAG_PATTERN, rewriteTag);
}
296
/**
 * Decide whether one parsed attribute survives condensation.
 *
 * @param tagName Lower-cased tag name of the owning element.
 * @param attr Parsed attribute ({ name, rawToken, value }).
 * @param interactive Whether the owning element counts as interactive.
 * @returns The attribute text to emit, or null to drop the attribute.
 */
function keepAttribute(tagName, attr, interactive) {
  const name = attr.name.toLowerCase();
  const value = attr.value;
  // True for a missing, empty, or whitespace-only value.
  const isBlank = () => !value?.trim();

  if (name === "class") {
    const filtered = isBlank() ? "" : filterSemanticClasses(value);
    return filtered ? serializeAttribute(attr.name, filtered) : null;
  }
  if (name === "style") {
    return isBlank() ? null : serializeAttribute(attr.name, value);
  }
  // aria-* and test-hook attributes are kept verbatim when they carry a value.
  if (name.startsWith("aria-") || TEST_ATTRS.has(name)) {
    return isBlank() ? null : attr.rawToken;
  }
  if (tagName === "script" && SCRIPT_ATTRS.has(name)) {
    return serializePreservedAttribute(attr);
  }
  if (tagName === "style" && STYLE_TAG_ATTRS.has(name)) {
    return isBlank() ? null : attr.rawToken;
  }
  if (STATE_ATTRS.has(name)) {
    return serializePreservedAttribute(attr);
  }
  if (URL_ATTRS.has(name)) {
    if (isBlank()) return null;
    const normalized = normalizeUrlValue(value);
    // Re-serialize only when normalization actually changed the value.
    return normalized === value ? attr.rawToken : serializeAttribute(attr.name, normalized);
  }
  if (TRUSTED_ATTRS.has(name)) {
    return shouldDropEmptyValue(name, value) ? null : serializePreservedAttribute(attr);
  }
  return shouldKeepCustomDataAttribute(tagName, name, value, interactive) ? attr.rawToken : null;
}
342
/**
 * Serialize an attribute that is preserved as-is.
 *
 * The previous implementation branched on boolean attributes and on null
 * values, but every branch returned `attr.rawToken`; the dead conditionals
 * are removed so the actual behavior is visible.
 *
 * @param attr Parsed attribute ({ name, rawToken, value }).
 * @returns The attribute text exactly as it appeared in the source tag.
 */
function serializePreservedAttribute(attr) {
  return attr.rawToken;
}
349
/**
 * Report whether an attribute should be dropped because its value is an
 * empty (or whitespace-only) string. Valueless attributes (`value === null`)
 * are never dropped here.
 */
function shouldDropEmptyValue(name, value) {
  const isEmptyString = value !== null && !value.trim();
  if (!isEmptyString) return false;
  return name.startsWith("aria-") || EMPTY_VALUE_DROP_ATTRS.has(name);
}
355
/**
 * Shorten a URL attribute value.
 *
 * - `blob:`, `javascript:`, `vbscript:` and `data:` URLs collapse to
 *   `<scheme>:[omitted]` (case-insensitive, leading whitespace ignored).
 * - Values of at most 160 characters are returned unchanged.
 * - Longer values are parsed; the query string is replaced by
 *   `?[query omitted]`. Absolute URLs keep protocol, host and path (the
 *   fragment is dropped); relative URLs keep path and fragment.
 * - If parsing fails, the first 96 characters plus `[omitted]` are returned.
 *
 * @param value Raw attribute value.
 * @returns The possibly shortened value.
 */
function normalizeUrlValue(value) {
  const loweredValue = value.trim().toLowerCase();
  for (const scheme of ["blob:", "javascript:", "vbscript:", "data:"]) {
    if (loweredValue.startsWith(scheme)) return `${scheme}[omitted]`;
  }
  if (value.length <= 160) return value;
  try {
    const isAbsolute = /^[a-z][a-z0-9+.-]*:/i.test(value);
    // Relative values are resolved against a throwaway base purely so that
    // they can be parsed; the base never appears in the output.
    const parsed = isAbsolute ? new URL(value) : new URL(value, "https://condensed.local");
    const query = parsed.search ? "?[query omitted]" : "";
    if (isAbsolute) {
      return `${parsed.protocol}//${parsed.host}${parsed.pathname}${query}`;
    }
    // The query marker goes before the fragment, matching URL syntax
    // (previously it was appended after the "#fragment").
    return `${parsed.pathname}${query}${parsed.hash}`;
  } catch {
    return `${value.slice(0, 96)}[omitted]`;
  }
}
372
/**
 * Remove class names that look machine-generated from a class attribute
 * value and return the survivors joined by single spaces (possibly "").
 */
function filterSemanticClasses(value) {
  const semantic = [];
  for (const cls of value.split(/\s+/)) {
    // Splitting can yield empty strings at the edges; skip them.
    if (cls && !isObfuscatedClass(cls)) semantic.push(cls);
  }
  return semantic.join(" ");
}
377
/**
 * Heuristically decide whether a class name looks machine-generated.
 *
 * A name is flagged when it is longer than 80 characters, matches one of the
 * hash-like shapes below, or is at least 6 characters long with two or more
 * digits that number at least half as many as its ASCII letters.
 */
function isObfuscatedClass(cls) {
  if (cls.length > 80) return true;

  const hashShapes = [
    /^_?[0-9a-f]{6,}$/i, // bare hex run, optionally prefixed with "_"
    /^[a-z]+_[0-9a-f]{4,}$/i, // letters, underscore, hex suffix
    /^[a-z]{1,2}[0-9]{2,}$/i // one or two letters followed by digits
  ];
  if (hashShapes.some((shape) => shape.test(cls))) return true;

  const count = (pattern) => (cls.match(pattern) || []).length;
  const digits = count(/[0-9]/g);
  const letters = count(/[a-zA-Z]/g);
  return cls.length >= 6 && digits >= 2 && digits >= letters * 0.5;
}
387
/**
 * Split the raw attribute text of a tag into parsed attributes.
 *
 * @param rawAttrs Text between the tag name and the closing `>`.
 * @returns One entry per attribute: `name` as written, `rawToken` (the
 *   trimmed source text of the attribute) and `value` (contents of a
 *   double-quoted, single-quoted or unquoted value; null when there is none).
 */
function parseAttributes(rawAttrs) {
  const attrPattern = /([^\s"'<>\/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+)))?/g;
  return Array.from(rawAttrs.matchAll(attrPattern), (match) => {
    const [token, name, doubleQuoted, singleQuoted, bare] = match;
    return {
      name,
      rawToken: token.trim(),
      value: doubleQuoted ?? singleQuoted ?? bare ?? null
    };
  });
}
402
/**
 * Report whether an element counts as interactive: its tag is in
 * INTERACTIVE_TAGS, or it carries `tabindex`, `contenteditable`, or a `role`
 * listed in INTERACTIVE_ROLES.
 */
function isInteractiveElement(tagName, attrs) {
  if (INTERACTIVE_TAGS.has(tagName)) return true;
  return attrs.some((attr) => {
    switch (attr.name.toLowerCase()) {
      case "tabindex":
      case "contenteditable":
        return true;
      case "role": {
        const role = attr.value?.trim().toLowerCase();
        return Boolean(role) && INTERACTIVE_ROLES.has(role);
      }
      default:
        return false;
    }
  });
}
415
/**
 * Decide whether a custom `data-*` attribute is worth keeping. It must sit
 * on an interactive element that is not a script or style tag, must not be
 * one of the test-hook attributes, and both its key and its non-blank value
 * (at most 80 characters) must pass the "meaningful" heuristics.
 */
function shouldKeepCustomDataAttribute(tagName, attrName, value, interactive) {
  if (!interactive || !attrName.startsWith("data-") || TEST_ATTRS.has(attrName)) {
    return false;
  }
  if (!value?.trim() || value.length > 80) return false;
  if (tagName === "script" || tagName === "style") return false;

  const key = attrName.slice("data-".length);
  return looksMeaningfulToken(key) && looksMeaningfulDataValue(value);
}
427
/**
 * Heuristic for a `data-*` key: it must start with a letter, be 2-41
 * characters of letters, digits or hyphens, contain a run of three letters,
 * and must not contain any of the blocklisted fragments below.
 */
function looksMeaningfulToken(value) {
  const wellFormed = /^[a-z][a-z0-9-]{1,40}$/i.test(value);
  const hasWord = /[a-z]{3}/i.test(value);
  const noisy = /(track|metric|telemetry|analytics|component|display|loaded|token|dps|color|screen|strict|rehydr|fetch)/i.test(value);
  return wellFormed && hasWord && !noisy;
}
435
/**
 * Heuristic for a `data-*` value: at most 80 characters, no angle brackets,
 * no http(s) URL, and made up only of letters, digits, and `: _ . / - space`.
 */
function looksMeaningfulDataValue(value) {
  const tooLong = value.length > 80;
  const hasMarkup = /[<>]/.test(value);
  const hasUrl = /https?:\/\//i.test(value);
  if (tooLong || hasMarkup || hasUrl) return false;
  return /^[a-z0-9:_./ -]+$/i.test(value);
}
441
/**
 * Find an attribute by name in the raw attribute text of a tag and return
 * its source text (`name`, `name=value`, `name="value"` or `name='value'`).
 *
 * The text is scanned attribute by attribute and names are compared as whole
 * tokens, case-insensitively. This avoids two false positives of a plain
 * regex search: matching a mere prefix of a longer name (`id` inside
 * `idx="3"`) and matching a word inside another attribute's quoted value
 * (`id` inside `class="foo id bar"`).
 *
 * @param attrs Raw attribute text of a tag.
 * @param name Attribute name to look for.
 * @returns The first matching attribute as written in the source, or null.
 */
function findAttributeToken(attrs, name) {
  const wanted = name.toLowerCase();
  // Name, then an optional double-quoted, single-quoted or unquoted value.
  const attrPattern = /([^\s"'<>\/=]+)(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s"'=<>`]+))?/g;
  let match;
  while ((match = attrPattern.exec(attrs)) !== null) {
    if (match[1].toLowerCase() === wanted) return match[0];
  }
  return null;
}
450
/**
 * Backslash-escape every regular-expression metacharacter in `value` so the
 * result can be embedded in a pattern as literal text.
 */
function escapeRegExp(value) {
  const metacharacters = /[.*+?^${}()|[\]\\]/g;
  return value.replace(metacharacters, (ch) => `\\${ch}`);
}
453
/**
 * Render an attribute as `name="value"`, with the value escaped for use
 * inside double quotes.
 */
function serializeAttribute(name, value) {
  const escaped = escapeHtmlAttribute(value);
  return name + '="' + escaped + '"';
}
456
/**
 * Escape text for placement inside a double-quoted HTML attribute.
 *
 * The callers in this file pass raw slices of already-serialized HTML
 * (attribute values and <title>/<desc> text), which may already contain
 * character references. An ampersand that starts a well-formed reference
 * (`&name;`, `&#123;`, `&#x1F;`) is therefore left alone instead of being
 * turned into `&amp;` a second time; every other `&`, `"`, `<` and `>` is
 * escaped.
 *
 * @param value Raw text taken from serialized HTML.
 * @returns Text that is safe between double quotes in a tag.
 */
function escapeHtmlAttribute(value) {
  return value
    .replace(/&(?!(?:[a-z][a-z0-9]*|#[0-9]+|#x[0-9a-f]+);)/gi, "&amp;")
    .replace(/"/g, "&quot;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;");
}
459
+ // Annotate the CommonJS export names for ESM import in node:
460
+ 0 && (module.exports = {
461
+ condenseDom
462
+ });
@@ -0,0 +1,34 @@
1
+ /**
2
+ * DOM condensation — reduces serialized HTML for LLM consumption.
3
+ *
4
+ * All rules run unconditionally (no tiers). The function operates on
5
+ * already-serialized HTML strings (the output of `page.content()`),
6
+ * not a browser-side DOM walk or parsed DOM tree.
7
+ *
8
+ * Rules applied in order:
9
+ * 1. Noscript blocks — remove entirely
10
+ * 2. HTML comments — remove entirely
11
+ * 3. Script contents — hollow out, keep tags + useful attributes
12
+ * 4. Style contents — hollow out, keep tags + useful attributes
13
+ * 5. Embedded binary data — replace base64 data URIs
14
+ * 6. Attribute allowlist — keep trusted attrs, special-case class/style/URLs
15
+ * 7. SVG elements — collapse to single tag, extract title/desc
16
+ * 8. Inline style properties — keep only layout-relevant props
17
+ * 9. Non-semantic class names — filter or delete class values
18
+ * 10. (Cross-reference IDs — preserved, no action needed)
19
+ * 11. Framework-internal and SVG visual attributes — remove
20
+ * 12. Whitespace — collapse (preserve <pre> content)
21
+ */
22
+ /** Result of one `condenseDom` call: the condensed markup plus size accounting. */
+ type CondenseDomResult = {
23
+ /** The condensed HTML string. Valid, parseable HTML. */
24
+ html: string;
25
+ /** Character count of the input. */
26
+ originalLength: number;
27
+ /** Character count of the output. */
28
+ condensedLength: number;
29
+ /** Characters removed, keyed by rule name. */
30
+ reductions: Record<string, number>;
31
+ };
32
+ /**
+  * Condenses a serialized HTML string by applying, in order, the rules
+  * listed in the comment at the top of this file.
+  *
+  * @param html - Serialized HTML, e.g. the output of `page.content()`.
+  * @returns The condensed HTML together with input/output character counts
+  *   and the per-rule reduction totals.
+  */
+ declare function condenseDom(html: string): CondenseDomResult;
33
+
34
+ export { type CondenseDomResult, condenseDom };
@@ -0,0 +1,34 @@
1
+ /**
2
+ * DOM condensation — reduces serialized HTML for LLM consumption.
3
+ *
4
+ * All rules run unconditionally (no tiers). The function operates on
5
+ * already-serialized HTML strings (the output of `page.content()`),
6
+ * not a browser-side DOM walk or parsed DOM tree.
7
+ *
8
+ * Rules applied in order:
9
+ * 1. Noscript blocks — remove entirely
10
+ * 2. HTML comments — remove entirely
11
+ * 3. Script contents — hollow out, keep tags + useful attributes
12
+ * 4. Style contents — hollow out, keep tags + useful attributes
13
+ * 5. Embedded binary data — replace base64 data URIs
14
+ * 6. Attribute allowlist — keep trusted attrs, special-case class/style/URLs
15
+ * 7. SVG elements — collapse to single tag, extract title/desc
16
+ * 8. Inline style properties — keep only layout-relevant props
17
+ * 9. Non-semantic class names — filter or delete class values
18
+ * 10. (Cross-reference IDs — preserved, no action needed)
19
+ * 11. Framework-internal and SVG visual attributes — remove
20
+ * 12. Whitespace — collapse (preserve <pre> content)
21
+ */
22
+ /** Result of one `condenseDom` call: the condensed markup plus size accounting. */
+ type CondenseDomResult = {
23
+ /** The condensed HTML string. Valid, parseable HTML. */
24
+ html: string;
25
+ /** Character count of the input. */
26
+ originalLength: number;
27
+ /** Character count of the output. */
28
+ condensedLength: number;
29
+ /** Characters removed, keyed by rule name. */
30
+ reductions: Record<string, number>;
31
+ };
32
+ /**
+  * Condenses a serialized HTML string by applying, in order, the rules
+  * listed in the comment at the top of this file.
+  *
+  * @param html - Serialized HTML, e.g. the output of `page.content()`.
+  * @returns The condensed HTML together with input/output character counts
+  *   and the per-rule reduction totals.
+  */
+ declare function condenseDom(html: string): CondenseDomResult;
33
+
34
+ export { type CondenseDomResult, condenseDom };