@letsrunit/playwright 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. package/README.md +44 -0
  2. package/dist/index.d.ts +106 -0
  3. package/dist/index.js +3006 -0
  4. package/dist/index.js.map +1 -0
  5. package/package.json +67 -0
  6. package/src/browser.ts +20 -0
  7. package/src/field/calendar.ts +300 -0
  8. package/src/field/date-group.ts +253 -0
  9. package/src/field/date-text-input.ts +270 -0
  10. package/src/field/index.ts +57 -0
  11. package/src/field/native-checkbox.ts +21 -0
  12. package/src/field/native-date.ts +62 -0
  13. package/src/field/native-input.ts +17 -0
  14. package/src/field/native-select.ts +75 -0
  15. package/src/field/otp.ts +22 -0
  16. package/src/field/radio-group.ts +27 -0
  17. package/src/field/slider.ts +132 -0
  18. package/src/field/types.ts +16 -0
  19. package/src/format-html.ts +17 -0
  20. package/src/index.ts +12 -0
  21. package/src/locator.ts +102 -0
  22. package/src/page-info.ts +33 -0
  23. package/src/screenshot.ts +84 -0
  24. package/src/scroll.ts +10 -0
  25. package/src/scrub-html.ts +333 -0
  26. package/src/selector/date-selector.ts +272 -0
  27. package/src/selector/field-selector.ts +121 -0
  28. package/src/selector/index.ts +2 -0
  29. package/src/snapshot.ts +55 -0
  30. package/src/suppress-interferences.ts +288 -0
  31. package/src/translations/af.ts +41 -0
  32. package/src/translations/ar.ts +7 -0
  33. package/src/translations/az.ts +40 -0
  34. package/src/translations/bg.ts +7 -0
  35. package/src/translations/bn.ts +40 -0
  36. package/src/translations/bs.ts +7 -0
  37. package/src/translations/ca.ts +41 -0
  38. package/src/translations/cs.ts +7 -0
  39. package/src/translations/da.ts +44 -0
  40. package/src/translations/de.ts +47 -0
  41. package/src/translations/el.ts +40 -0
  42. package/src/translations/en.ts +7 -0
  43. package/src/translations/es.ts +7 -0
  44. package/src/translations/et.ts +7 -0
  45. package/src/translations/eu.ts +7 -0
  46. package/src/translations/fa.ts +7 -0
  47. package/src/translations/fi.ts +39 -0
  48. package/src/translations/fr.ts +42 -0
  49. package/src/translations/ga.ts +40 -0
  50. package/src/translations/he.ts +45 -0
  51. package/src/translations/hi.ts +39 -0
  52. package/src/translations/hr.ts +7 -0
  53. package/src/translations/hu.ts +7 -0
  54. package/src/translations/hy.ts +7 -0
  55. package/src/translations/id.ts +7 -0
  56. package/src/translations/index.ts +68 -0
  57. package/src/translations/is.ts +7 -0
  58. package/src/translations/it.ts +7 -0
  59. package/src/translations/ja.ts +7 -0
  60. package/src/translations/ka.ts +36 -0
  61. package/src/translations/ko.ts +7 -0
  62. package/src/translations/lt.ts +7 -0
  63. package/src/translations/lv.ts +43 -0
  64. package/src/translations/nl.ts +43 -0
  65. package/src/translations/no.ts +46 -0
  66. package/src/translations/pl.ts +39 -0
  67. package/src/translations/pt.ts +41 -0
  68. package/src/translations/ro.ts +40 -0
  69. package/src/translations/ru.ts +7 -0
  70. package/src/translations/sk.ts +7 -0
  71. package/src/translations/sl.ts +7 -0
  72. package/src/translations/sv.ts +44 -0
  73. package/src/translations/sw.ts +7 -0
  74. package/src/translations/ta.ts +7 -0
  75. package/src/translations/th.ts +39 -0
  76. package/src/translations/tl.ts +7 -0
  77. package/src/translations/tr.ts +41 -0
  78. package/src/translations/uk.ts +7 -0
  79. package/src/translations/ur.ts +43 -0
  80. package/src/translations/vi.ts +7 -0
  81. package/src/translations/zh.ts +7 -0
  82. package/src/types.ts +37 -0
  83. package/src/unified-html-diff.ts +22 -0
  84. package/src/utils/date.ts +40 -0
  85. package/src/utils/pick-field-element.ts +48 -0
  86. package/src/utils/type-check.ts +7 -0
  87. package/src/wait.ts +170 -0
@@ -0,0 +1,16 @@
1
+ import type { Range, Scalar } from '@letsrunit/utils';
2
+ import type { Locator } from '@playwright/test';
3
+
4
/**
 * Optional knobs accepted when setting a field's value.
 *
 * NOTE(review): the member names match Playwright's per-action options
 * (`force`, `noWaitAfter`, `timeout`); presumably they are forwarded to the
 * underlying Playwright call — confirm at the call sites.
 */
export interface SetOptions {
  /** Presumably skips actionability checks — TODO confirm against callers. */
  force?: boolean;
  /** Presumably disables waiting for navigation after the action — TODO confirm. */
  noWaitAfter?: boolean;
  /** Presumably a maximum wait in milliseconds — TODO confirm. */
  timeout?: number;
}
9
+
10
/** Any value a field can be set to: one scalar, a list of scalars, a range, a boolean, or `null`. */
export type Value =
  | Scalar
  | Scalar[]
  | Range
  | boolean
  | null;
11
+
12
/**
 * A located element together with two facts about it.
 *
 * NOTE(review): `tag` and `type` look like the element's tag name and its
 * `type` attribute (null when absent) — confirm where `Loc` is constructed.
 */
export interface Loc {
  /** Playwright locator pointing at the element. */
  el: Locator;
  tag: string;
  type: string | null;
}
@@ -0,0 +1,17 @@
1
+ import type { Page } from '@playwright/test';
2
+ import rehypeFormat from 'rehype-format';
3
+ import rehypeParse from 'rehype-parse';
4
+ import rehypeStringify from 'rehype-stringify';
5
+ import { unified } from 'unified';
6
+
7
/**
 * Pretty-print HTML with a two-space indent.
 *
 * @param page Either raw HTML or a Playwright page whose current content is read.
 * @returns The re-serialised, formatted markup.
 */
export async function formatHtml(page: string | Page) {
  let html: string;
  if (typeof page === 'string') {
    html = page;
  } else {
    html = await page.content();
  }

  // Parse as a fragment, let rehype-format collapse existing whitespace and
  // re-indent the nodes, then serialise back to a string.
  const pipeline = unified()
    .use(rehypeParse, { fragment: true })
    .use(rehypeFormat, { indent: 2 })
    .use(rehypeStringify);

  const formatted = await pipeline.process(html);
  return String(formatted);
}
package/src/index.ts ADDED
@@ -0,0 +1,12 @@
1
+ export * from './browser';
2
+ export * from './field/index';
3
+ export * from './format-html';
4
+ export * from './locator';
5
+ export * from './selector';
6
+ export * from './snapshot';
7
+ export * from './screenshot';
8
+ export * from './scroll';
9
+ export * from './suppress-interferences';
10
+ export * from './types';
11
+ export * from './utils/date';
12
+ export * from './wait';
package/src/locator.ts ADDED
@@ -0,0 +1,102 @@
1
+ import { Locator, Page } from '@playwright/test';
2
+
3
/** Options bag accepted as the second argument of `page.locator`. */
type LocatorOptions = Parameters<Page['locator']>[1];

/**
 * Resolve `selector` to a locator, trying progressively looser fallbacks when
 * the selector itself matches nothing.
 *
 * The fallbacks run in a fixed order and the first one that yields a match
 * wins. When none match, the locator for the original selector is returned
 * so the caller can still wait or assert on it.
 */
export async function locator(page: Page, selector: string): Promise<Locator> {
  const primary = page.locator(selector).first();
  const directHits = await primary.count();
  if (directHits) return primary;

  const fallbacks = [
    tryRelaxNameToHasText,
    tryTagInsteadOfRole,
    tryRoleNameProximity,
    tryFieldAlternative,
    tryAsField,
  ];

  for (const attempt of fallbacks) {
    const found = await attempt(page, selector);
    if (found) return found;
  }

  return primary;
}
19
+
20
/**
 * Return the first element of the first selector that matches anything on the page.
 *
 * @param sel One selector or an ordered list of candidates.
 * @param opts Locator options applied to every candidate.
 * @returns The matching locator, or `null` when no candidate matches.
 */
async function firstMatch(page: Page, sel: string | string[], opts: LocatorOptions = {}): Promise<Locator | null> {
  const candidates = Array.isArray(sel) ? sel : [sel];

  for (const candidate of candidates) {
    const match = page.locator(candidate, opts).first();
    const hits = await match.count();
    if (hits) return match;
  }

  return null;
}
28
+
29
// Relax an exact accessible-name filter into a "contains text" filter.
// Only selectors that start with `role=` are handled. The `[name="..."]` part
// is cut out, everything before and after it is kept, and the name is passed
// to the locator as its `hasText` option.
// Example:
//  - role=link[name="Foo"]  →  role=link, filtered with hasText "Foo"
async function tryRelaxNameToHasText(page: Page, selector: string): Promise<Locator | null> {
  const parts = /^(role=.*)\[name="([^"]+)"i?](.*)$/i.exec(selector);
  if (parts === null) return null;

  const prefix = parts[1];
  const nameText = parts[2];
  const suffix = parts[3];

  return firstMatch(page, `${prefix}${suffix}`, { hasText: nameText });
}
42
+
43
// Try the HTML tag for `link`, `button` and `option` instead of the ARIA role.
// The rest of the selector (anything after the name filter) is kept, and the
// name is applied as a `hasText` filter.
// Example:
//  - role=button[name="Save"]  →  css=button, filtered with hasText "Save"
//  - role=link[name="Home"]    →  css=a, filtered with hasText "Home"
async function tryTagInsteadOfRole(page: Page, selector: string): Promise<Locator | null> {
  const matchAnyNameFull = selector.match(/^role=(link|button|option)\s*\[name="([^"]+)"i?](.*)$/i);
  if (!matchAnyNameFull) return null;

  const [, role, nameText, post] = matchAnyNameFull;

  // The pattern is case-insensitive, so normalise before mapping the role to a
  // tag; otherwise `role=Link` would produce `css=Link`, which targets <link>
  // elements rather than anchors.
  const normalizedRole = role.toLowerCase();
  const tag = normalizedRole === 'link' ? 'a' : normalizedRole;

  const containsSelector = `css=${tag}${post}`;
  return firstMatch(page, containsSelector, { hasText: nameText });
}
55
+
56
// Proximity fallback for `role=<role>[name="..."]` selectors: find the text of
// the name, step up to its parent, and look for the role inside that parent.
// The role and whatever followed the name filter are preserved.
// Example: role=switch[name="Adres tonen"i] → text=Adres tonen >> .. >> role=switch
async function tryRoleNameProximity(page: Page, selector: string): Promise<Locator | null> {
  const parts = /^role=(\w+)\s*\[name="([^"]+)"i?](.*)$/i.exec(selector);
  if (parts === null) return null;

  const role = parts[1];
  const name = parts[2];
  const rest = parts[3];

  return firstMatch(page, `text=${name} >> .. >> role=${role}${rest}`);
}
66
+
67
// Fallback for `field="..."` selectors: look for an <input> that is a direct
// child of the element whose id equals the field name.
//   field="foo" → css=[id="foo"] > input
// An attribute selector is used instead of `#foo` so that ids which are not
// valid CSS identifiers (containing spaces, dots, colons, leading digits, …)
// still match literally instead of being parsed as other selector syntax.
async function tryFieldAlternative(page: Page, selector: string): Promise<Locator | null> {
  const matchField = selector.match(/^field="([^"]+)"i?$/i);
  if (!matchField) return null;

  const [, field] = matchField;

  // The pattern guarantees there is no double quote in `field`; backslashes
  // are the only character that still needs escaping inside a quoted CSS string.
  const quotedId = field.replace(/\\/g, '\\\\');

  return firstMatch(page, `css=[id="${quotedId}"] > input`);
}
75
+
76
// Role-mismatch fallback: when a `role=<role>[name="..."]` selector fails and
// the role is one that a form control could plausibly carry, retry using the
// `field=` selector with the same name (plus whatever followed the name filter).
async function tryAsField(page: Page, selector: string): Promise<Locator | null> {
  const parts = /^role=(\w+)\s*\[name="([^"]+)"i?](.*)$/i.exec(selector);
  if (parts === null) return null;

  const role = parts[1];
  const name = parts[2];
  const rest = parts[3];

  // ARIA roles that correspond to field-like controls.
  const fieldLikeRoles = [
    'button', // sometimes used for date pickers or checkboxes
    'textbox',
    'searchbox',
    'combobox',
    'spinbutton',
    'slider',
    'checkbox',
    'switch',
    'radio',
    'radiogroup',
    'listbox',
    'option',
  ];

  const isFieldLike = fieldLikeRoles.includes(role.toLowerCase());
  if (!isFieldLike) return null;

  return firstMatch(page, `field=${name}${rest}`);
}
@@ -0,0 +1,33 @@
1
+ import metascraper, { type MetascraperOptions } from 'metascraper';
2
+ import metascraperDescription from 'metascraper-description';
3
+ import metascraperImage from 'metascraper-image';
4
+ import metascraperLang from 'metascraper-lang';
5
+ import metascraperLogo from 'metascraper-logo';
6
+ import metascraperLogoFavicon from 'metascraper-logo-favicon';
7
+ import metascraperTitle from 'metascraper-title';
8
+ import metascraperUrl from 'metascraper-url';
9
+ import type { PageInfo, Snapshot } from './types';
10
+
11
// Metascraper rule bundles, in the order they are registered.
const scrapeRules = [
  metascraperTitle(),
  metascraperDescription(),
  metascraperImage(),
  metascraperLogo(),
  metascraperLogoFavicon(),
  metascraperLang(),
  metascraperUrl(),
];

/** Shared metascraper instance used to extract page metadata. */
const scrape = metascraper(scrapeRules);
20
+
21
/**
 * Run the metadata scraper over the given input and map the result onto a `PageInfo`.
 *
 * Empty scraped values become `undefined`; the URL falls back to `options.url`,
 * and the screenshot is passed through from `options` untouched.
 */
export async function extractPageInfo(options: MetascraperOptions & Partial<Snapshot>): Promise<PageInfo> {
  const { url, title, description, image, logo, lang } = await scrape(options);

  const info: PageInfo = {
    url: url || options.url,
    name: title || undefined,
    description: description || undefined,
    image: image || undefined,
    favicon: logo || undefined,
    lang: lang || undefined,
    screenshot: options.screenshot,
  };

  return info;
}
@@ -0,0 +1,84 @@
1
+ import { hashKey } from '@letsrunit/utils';
2
+ import type { Page, PageScreenshotOptions } from '@playwright/test';
3
+
4
/**
 * Capture the page as a PNG `File` whose name is derived from the image bytes
 * through `hashKey`.
 *
 * When `options.mask` holds at least one locator the capture goes through
 * `screenshotWithMask`; otherwise Playwright's own `page.screenshot` is used.
 */
export async function screenshot(page: Page, options?: PageScreenshotOptions): Promise<File> {
  let buffer: Buffer;
  if (options?.mask?.length) {
    buffer = await screenshotWithMask(page, options);
  } else {
    buffer = await page.screenshot(options);
  }

  const filename = await hashKey(`screenshot-{hash}.png`, buffer);
  const bytes = new Uint8Array(buffer);

  return new File([bytes], filename, { type: 'image/png' });
}
10
+
11
/**
 * Take a screenshot in which the elements matched by `options.mask` stand out.
 *
 * A dark, fixed, full-viewport overlay is injected into the page, and every
 * element matched by the mask locators gets a class that lifts it above the
 * overlay and outlines it. The remaining options are passed to
 * `page.screenshot` (the `mask` option itself is not forwarded).
 *
 * All page changes are undone afterwards, including when resolving locators,
 * adding the highlight or taking the screenshot fails.
 *
 * @throws Error when `options.mask` is missing or empty.
 */
async function screenshotWithMask(page: Page, options: PageScreenshotOptions): Promise<Buffer> {
  const { mask: locators, ...otherOptions } = options;

  if (!locators?.length) throw new Error('No locators specified');

  try {
    // 1. Inject CSS + overlay (both guarded by id so they are only added once)
    await page.evaluate(() => {
      const doc = document;

      if (!doc.getElementById('lri-mask-style')) {
        const style = doc.createElement('style');
        style.id = 'lri-mask-style';
        style.textContent = `
          .lri-mask-overlay {
            position: fixed;
            inset: 0;
            background: rgba(0, 0, 0, 0.65);
            pointer-events: none;
            z-index: 2147483646;
          }
          .lri-mask-highlight {
            position: relative !important;
            z-index: 2147483647 !important;
            box-shadow: 0 0 0 2px rgba(255, 255, 255, 0.95);
            border-radius: 4px;
          }
        `;
        doc.head.appendChild(style);
      }

      if (!doc.getElementById('lri-mask-overlay')) {
        const overlay = doc.createElement('div');
        overlay.id = 'lri-mask-overlay';
        overlay.className = 'lri-mask-overlay';
        doc.body.appendChild(overlay);
      }
    });

    // 2. Resolve locators to element handles
    const handleGroups = await Promise.all(locators.map((loc) => loc.elementHandles()));
    const handles = handleGroups.flat();

    try {
      // 3. Highlight the elements and take the screenshot with the overlay in place
      await Promise.all(
        handles.map((h) =>
          h.evaluate((el) => {
            (el as HTMLElement).classList.add('lri-mask-highlight');
          }),
        ),
      );

      return await page.screenshot(otherOptions);
    } finally {
      // 4a. Remove the highlight from every handle. allSettled keeps one
      //     detached element from aborting the cleanup of the others.
      await Promise.allSettled(
        handles.map((h) =>
          h.evaluate((el) => {
            (el as HTMLElement).classList.remove('lri-mask-highlight');
          }),
        ),
      );

      // Release the handles so they do not keep the elements alive.
      await Promise.allSettled(handles.map((h) => h.dispose()));
    }
  } finally {
    // 4b. Remove the overlay and the injected stylesheet (zero residue)
    await page.evaluate(() => {
      const overlay = document.getElementById('lri-mask-overlay');
      if (overlay) overlay.remove();

      const style = document.getElementById('lri-mask-style');
      if (style) style.remove();
    });
  }
}
package/src/scroll.ts ADDED
@@ -0,0 +1,10 @@
1
+ import type { Locator } from '@playwright/test';
2
+
3
/**
 * Scroll the located element into the middle of the viewport, without animation.
 * Does nothing when the locator matches no element.
 */
export async function scrollToCenter(locator: Locator) {
  const matches = await locator.count();
  if (matches === 0) return;

  await locator.evaluate((target) => {
    const centered: ScrollIntoViewOptions = { block: 'center', inline: 'center', behavior: 'instant' };
    target.scrollIntoView(centered);
  });
}
@@ -0,0 +1,333 @@
1
+ // Conservatively scrub HTML for LLMs while preserving semantics.
2
+ // - Keeps structural tags (header/nav/main/section/article/aside/footer, headings, lists, tables, forms, etc.)
3
+ // - Removes <head>, <script>, <style>, <template>, <noscript>, comments
4
+ // - Drops nodes that are clearly hidden/inert via attributes (hidden, aria-hidden="true", display:none, visibility:hidden, opacity:0)
5
+ // - Strips event handlers (on*) and inline styles, keeps semantic attrs (id, class, role, aria-*, data-*, href, src, alt, etc.)
6
+ // - Collapses whitespace in text nodes (but NOT inside <pre>/<code>)
7
+ // - Does NOT change tag names, does NOT unwrap containers, does NOT reorder content
8
+
9
+ import { memoize } from '@letsrunit/utils/src/memoize';
10
+ import type { Page } from '@playwright/test';
11
+ import stringify from 'fast-json-stable-stringify';
12
+ import { JSDOM } from 'jsdom';
13
+ import { isPage } from './utils/type-check';
14
+
15
/** Switches for the individual scrub passes. Unset options fall back to size-dependent defaults. */
export type ScrubHtmlOptions = {
  /** Drop nodes that look hidden or inert judging by their attributes. Default: true */
  dropHidden?: boolean;
  /**
   * Remove the entire <head>. Default: true
   * NOTE(review): the visible scrub pipeline never reads this flag; its output
   * is always the <body> markup — confirm whether the option is still needed.
   */
  dropHead?: boolean;
  /** Drop <svg> elements. Default: false */
  dropSvg?: boolean;
  /** Keep only the <main> element. Default: false (true for very large documents) */
  pickMain?: boolean;
  /** Attribute allowlist level: 0 = keep everything, 1 = normal, 2 = aggressive. Default: 1 (2 for large documents) */
  stripAttributes?: 0 | 1 | 2;
  /** Collapse whitespace in text nodes (outside pre/code). Default: true */
  normalizeWhitespace?: boolean;
  /** Drop HTML comments. Default: true */
  dropComments?: boolean;
  /** Replace <br> inside headings (h1–h6) with a space. Default: true */
  replaceBrInHeadings?: boolean;
  /** Maximum number of items kept per list; -1 means no limit. Default: -1 (20 for large documents) */
  limitLists?: number;
};
35
+
36
// Size thresholds (in characters of HTML) at which scrubbing becomes more aggressive.
const HTML_MIN_ATTR_THRESHOLD = 250_000; // ~70k tokens
const HTML_LIMIT_LISTS_THRESHOLD = 400_000; // ~110k tokens
const HTML_MAIN_ONLY_THRESHOLD = 600_000; // ~170k tokens

/**
 * Build the default option set for a document of `contentLength` characters.
 * Larger documents get a tighter attribute allowlist, then capped lists, and
 * finally only their <main> element.
 */
function getDefaults(contentLength: number): Required<ScrubHtmlOptions> {
  const mainOnly = contentLength >= HTML_MAIN_ONLY_THRESHOLD;
  const aggressiveAttrs = contentLength >= HTML_MIN_ATTR_THRESHOLD;
  const capLists = contentLength >= HTML_LIMIT_LISTS_THRESHOLD;

  return {
    dropHidden: true,
    dropHead: true,
    dropSvg: false,
    pickMain: mainOnly,
    stripAttributes: aggressiveAttrs ? 2 : 1,
    normalizeWhitespace: true,
    dropComments: true,
    replaceBrInHeadings: true,
    limitLists: capLists ? 20 : -1,
  };
}
53
+
54
// Attributes kept at the "normal" stripping level: an exact-name allowlist
// plus a pattern for the open-ended aria-* and data-* families.
const ALLOWED_ATTRS = {
  match: new Set([
    // identity/semantics
    'id',
    'class',
    'role',
    // internationalization
    'lang',
    'dir',

    // anchors & media
    'href',
    'title',
    'target',
    'rel',
    'src',
    'alt',
    'width',
    'height',
    'loading',

    // tables
    'scope',
    'headers',
    'colspan',
    'rowspan',

    // forms (pure semantics, does not change structure)
    'name',
    'value',
    'type',
    'for',
    'placeholder',
    'checked',
    'selected',
    'multiple',
    'method',
    'action',

    // time, figure, etc.
    'datetime',
  ]),
  // ARIA attributes & data-* attributes. The alternation is grouped so that
  // both anchors apply to both prefixes; ungrouped, `$` only bound to the
  // `data-` branch and any name merely starting with `aria-x` was accepted.
  regexp: /^(?:aria|data)-[\w-]+$/i,
};
99
+
100
// Attributes kept at the "aggressive" stripping level: a shorter exact-name
// allowlist and no pattern, so only the listed aria-*/data-* names survive.
const ALLOWED_ATTRS_AGGRESSIVE = {
  match: new Set([
    // structure / general selectors
    ...['id', 'class', 'role'],

    // links / media
    ...['href', 'src', 'alt', 'title'],

    // tables
    ...['scope'],

    // forms / fields
    ...['name', 'type', 'for', 'placeholder', 'value', 'checked', 'selected'],

    // ARIA used by Playwright getByRole/getByLabel
    ...['aria-label', 'aria-labelledby', 'aria-describedby'],

    // commonly used test selectors
    ...['data-testid', 'data-test-id', 'data-cy', 'data-qa'],
  ]),
  regexp: null,
};
138
+
139
// Selector list for elements that are hidden or inert judging by their own attributes.
// Inline-style checks are substring matches, so both the compact (`display:none`)
// and the spaced (`display: none`) spellings are listed. The opacity entries
// exclude fractional values such as `opacity:0.5`, which a bare
// `[style*="opacity:0"]` would also match.
const HIDDEN_SELECTORS = [
  '[hidden]',
  '[inert]',
  '[aria-hidden="true"]',
  '[style*="display:none"]',
  '[style*="display: none"]',
  '[style*="visibility:hidden"]',
  '[style*="visibility: hidden"]',
  '[style*="opacity:0"]:not([style*="opacity:0."])',
  '[style*="opacity: 0"]:not([style*="opacity: 0."])',
].join(',');
147
+
148
// Infrastructure/noise tags that are removed regardless of options.
const ALWAYS_DROP = [
  'script',
  'style',
  'template',
  'noscript',
  'slot',
  'object',
  'embed',
];
152
+
153
/**
 * Scrub a page's HTML for LLM consumption, reusing a cached result when the
 * same HTML, URL and options were scrubbed recently.
 *
 * @param page A Playwright page (its content and URL are read) or an explicit `{ html, url }` pair.
 * @param opts Overrides for the size-dependent defaults.
 */
export async function scrubHtml(
  page: { html: string, url: string } | Page,
  opts: ScrubHtmlOptions = {},
): Promise<string> {
  const input = isPage(page)
    ? { html: await page.content(), url: page.url() }
    : page;

  return await memoizedScrubHtml(input, opts);
}
160
+
161
// Cached wrapper around realScrubHtml: at most 16 entries, each kept for ten
// minutes. The key is a stable JSON string of the HTML, the URL and every option.
const memoizedScrubHtml = memoize(realScrubHtml, {
  max: 16,
  ttl: 10 * 60_000,
  cacheKey: ([input, options]) => stringify({ html: input.html, url: input.url, ...options }),
});
166
+
167
/**
 * Parse `html` with JSDOM (using `url` as the document URL), run the enabled
 * scrub passes in a fixed order, and return the resulting <body> markup.
 *
 * Options missing from `opts` take size-dependent defaults based on `html.length`.
 * Hidden-node removal runs before attribute stripping.
 */
export async function realScrubHtml(
  { html, url }: { html: string, url: string },
  opts: ScrubHtmlOptions = {},
): Promise<string> {
  const settings = { ...getDefaults(html.length), ...opts };
  const { document: doc } = new JSDOM(html, { url }).window;

  if (settings.pickMain) {
    pickMain(doc);
  }

  dropInfraAndSvg(doc, !!settings.dropSvg);

  if (settings.dropHidden) {
    dropHiddenTrees(doc);
  }
  if (settings.stripAttributes) {
    stripAttributesAndSanitize(doc, settings.stripAttributes);
  }
  if (settings.dropComments) {
    dropHtmlComments(doc);
  }
  if (settings.replaceBrInHeadings) {
    replaceBrsInHeadings(doc);
  }
  if (settings.limitLists >= 0) {
    limitListsAndRows(doc, settings.limitLists);
  }
  if (settings.normalizeWhitespace) {
    normalizeWhitespace(doc.body);
  }

  return doc.body.innerHTML;
}
190
+
191
+ /* ---------------- helpers ---------------- */
192
+
193
// Inline-style patterns that mark an element as not rendered / not visible.
const ANCESTOR_DISPLAY_NONE = /\bdisplay\s*:\s*none\b/i;
const ANCESTOR_VISIBILITY_HIDDEN = /\bvisibility\s*:\s*hidden\b/i;
// Matches only a zero opacity value (`0`, `0.0`, `.0`, `0%`), optionally
// followed by `;`, `!important` or the end of the declaration. Fractional
// values such as `0.5` and longhands such as `fill-opacity` do not match.
const ANCESTOR_OPACITY_ZERO = /(?<![\w-])opacity\s*:\s*(?:0+(?:\.0*)?|\.0+)%?\s*(?:;|!|$)/i;

/**
 * Report whether any ancestor of `el` is hidden or inert, judging by the
 * `hidden`, `inert` and `aria-hidden="true"` attributes or by an inline style
 * of `display:none`, `visibility:hidden` or zero opacity.
 *
 * `el` itself is not inspected, only its ancestors.
 */
function hasHiddenAncestor(el: Element): boolean {
  for (let p: Element | null = el.parentElement; p; p = p.parentElement) {
    if (
      p.hasAttribute('hidden') ||
      p.hasAttribute('inert') ||
      p.getAttribute('aria-hidden') === 'true'
    ) return true;

    const style = p.getAttribute('style') || '';
    if (!style) continue;

    if (ANCESTOR_DISPLAY_NONE.test(style)) return true;
    if (ANCESTOR_VISIBILITY_HIDDEN.test(style)) return true;
    if (ANCESTOR_OPACITY_ZERO.test(style)) return true;
  }
  return false;
}
211
+
212
/**
 * Collapse runs of whitespace in every text node under `root` to one space.
 *
 * - Text anywhere inside <pre>, <code>, <samp> or <kbd> is left untouched,
 *   including text nested in other elements within them.
 * - Inside block-ish parents, leading whitespace is trimmed only when the text
 *   node is the first child and trailing whitespace only when it is the last
 *   child, so the space between text and a neighbouring inline element
 *   (`Hello <b>world</b>`) is never lost.
 */
function normalizeWhitespace(root: Element) {
  const blockish = /^(P|LI|DIV|SECTION|ARTICLE|ASIDE|HEADER|FOOTER|MAIN|NAV|H[1-6]|BLOCKQUOTE|FIGCAPTION|TD|TH)$/i;
  const doc = root.ownerDocument!;
  const walker = doc.createTreeWalker(root, 4 /*NodeFilter.SHOW_TEXT*/);

  let node: Node | null;

  // Only node values change (never the tree shape), so editing during the walk is safe.
  while ((node = walker.nextNode())) {
    const text = node as Text;
    const parent = text.parentElement;
    if (!parent) continue;
    if (parent.closest('pre, code, samp, kbd')) continue; // preformatted: keep as-is

    const before = text.nodeValue ?? '';
    let after = before.replace(/\s+/g, ' ');

    if (blockish.test(parent.tagName)) {
      if (!text.previousSibling) after = after.trimStart();
      if (!text.nextSibling) after = after.trimEnd();
    }

    if (after !== before) text.nodeValue = after;
  }
}
239
+
240
+ // Split-out helper steps (top-level, no nested functions)
241
+
242
/**
 * Reduce the body to a deep copy of the first <main> element.
 *
 * @returns `true` when a <main> was found and the body was replaced, `false`
 *          when the document has no <main> (the body is then left as it was).
 */
function pickMain(doc: Document): boolean {
  const main = doc.querySelector('main');

  if (main) {
    // Copy first: clearing the body detaches the original subtree.
    const copy = main.cloneNode(true);
    doc.body.innerHTML = '';
    doc.body.appendChild(copy);
    return true;
  }

  return false;
}
250
+
251
/**
 * Remove every element whose tag is in the always-drop list, plus all <svg>
 * elements when `dropSvg` is set.
 */
function dropInfraAndSvg(doc: Document, dropSvg: boolean) {
  const tags = dropSvg ? [...ALWAYS_DROP, 'svg'] : [...ALWAYS_DROP];
  const selector = tags.filter(Boolean).join(',');
  if (selector === '') return;

  const doomed = doc.querySelectorAll(selector);
  doomed.forEach((node) => {
    node.remove();
  });
}
256
+
257
/**
 * Report whether `el` itself is hidden or inert by its own attributes or
 * inline style. Whitespace inside the style declaration is tolerated and only
 * a zero opacity counts (not `0.5`, not `fill-opacity`).
 */
function isHiddenByOwnAttributes(el: Element): boolean {
  if (el.hasAttribute('hidden') || el.hasAttribute('inert')) return true;
  if (el.getAttribute('aria-hidden') === 'true') return true;

  const style = el.getAttribute('style') || '';
  if (!style) return false;

  return (
    /\bdisplay\s*:\s*none\b/i.test(style) ||
    /\bvisibility\s*:\s*hidden\b/i.test(style) ||
    /(?<![\w-])opacity\s*:\s*(?:0+(?:\.0*)?|\.0+)%?\s*(?:;|!|$)/i.test(style)
  );
}

/**
 * Remove every element that is hidden by its own attributes/inline style,
 * together with its subtree, and every element that sits under a hidden ancestor.
 */
function dropHiddenTrees(doc: Document) {
  // Pass 1: cheap selector pre-filter. The style selectors are substring
  // matches and can over-match (e.g. `opacity:0.5`), so each hit is confirmed
  // with the precise check before it is removed.
  doc.querySelectorAll<HTMLElement>(HIDDEN_SELECTORS).forEach((el) => {
    if (isHiddenByOwnAttributes(el)) el.remove();
  });

  // Pass 1 may have removed <body> itself; nothing is left to scan then.
  const body = doc.body;
  if (!body) return;

  // Pass 2: catch what the selectors miss (e.g. `display: none` written with
  // a space) and anything under a hidden ancestor. Removing a hidden element
  // takes its text and descendants with it; those show up as disconnected.
  const all = [...body.querySelectorAll<HTMLElement>('*')];
  for (const el of all) {
    if (!el.isConnected) continue;
    if (isHiddenByOwnAttributes(el) || hasHiddenAncestor(el)) el.remove();
  }
}
265
+
266
/**
 * Strip attributes from every element under <body> and neutralise
 * `javascript:` links.
 *
 * - Level 0 does nothing.
 * - Event handlers (`on*`) and `style` are removed from every element.
 * - SVG-namespaced elements keep all their other attributes.
 * - Everything else is checked against the allowlist for the level
 *   (1 = normal, 2 = aggressive).
 */
function stripAttributesAndSanitize(doc: Document, level: 0 | 1 | 2) {
  if (!level) return;

  const allowed = level === 1 ? ALLOWED_ATTRS : ALLOWED_ATTRS_AGGRESSIVE;
  const elements = Array.from(doc.body.querySelectorAll<HTMLElement>('*'));

  for (const el of elements) {
    const isSvg = el.namespaceURI === 'http://www.w3.org/2000/svg';

    // Snapshot the attribute list: it is live and shrinks as attributes are removed.
    for (const attr of Array.from(el.attributes)) {
      const name = attr.name;
      const lower = name.toLowerCase();

      const alwaysRemoved = lower.startsWith('on') || lower === 'style';
      if (alwaysRemoved) {
        el.removeAttribute(name);
        continue;
      }

      if (isSvg) continue; // keep svg attrs

      const permitted = allowed.match.has(lower) || !!allowed.regexp?.test(name);
      if (!permitted) el.removeAttribute(name);
    }
  }

  // sanitize javascript: hrefs
  doc.querySelectorAll('a[href]').forEach((anchor) => {
    const target = anchor.getAttribute('href') || '';
    if (/^\s*javascript:/i.test(target)) anchor.removeAttribute('href');
  });
}
297
+
298
/** Remove every comment node from the document. */
function dropHtmlComments(doc: Document) {
  // Use the window's NodeFilter constant when available, else the numeric value.
  const showComment = doc.defaultView?.NodeFilter?.SHOW_COMMENT ?? 128;
  const walker = doc.createTreeWalker(doc, showComment as unknown as number);

  // Collect first, remove afterwards, so the walk is not disturbed by removals.
  const comments: Comment[] = [];
  for (let current = walker.nextNode(); current; current = walker.nextNode()) {
    comments.push(current as Comment);
  }

  for (const comment of comments) {
    comment.parentNode?.removeChild(comment);
  }
}
310
+
311
/** Swap every <br> inside a heading (h1–h6) for a single-space text node. */
function replaceBrsInHeadings(doc: Document) {
  const headings = doc.querySelectorAll('h1, h2, h3, h4, h5, h6');

  headings.forEach((heading) => {
    const lineBreaks = heading.querySelectorAll('br');
    lineBreaks.forEach((lineBreak) => {
      (lineBreak as Element).replaceWith(doc.createTextNode(' '));
    });
  });
}
319
+
320
/**
 * Within each container matched by `containers`, remove the direct children
 * with tag `childTag` that come after the first `keep` of them.
 */
function removeChildrenBeyond(doc: Document, containers: string, childTag: string, keep: number) {
  doc.querySelectorAll(containers).forEach((container) => {
    Array.from(container.children)
      .filter((child) => child.tagName === childTag)
      .slice(keep)
      .forEach((child) => child.remove());
  });
}

/**
 * Cap lists and tables at `limit` entries: every <ul>/<ol> keeps its first
 * `limit` direct <li> children, and every <table>/<thead>/<tbody>/<tfoot>
 * keeps its first `limit` direct <tr> children.
 *
 * A negative or non-finite `limit` means "no limit" and leaves the document
 * untouched; a fractional limit is rounded down.
 */
function limitListsAndRows(doc: Document, limit: number) {
  if (!Number.isFinite(limit) || limit < 0) return;
  const keep = Math.floor(limit);

  // lists
  removeChildrenBeyond(doc, 'ul, ol', 'LI', keep);

  // table rows
  removeChildrenBeyond(doc, 'table, thead, tbody, tfoot', 'TR', keep);
}