@letsrunit/playwright 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +44 -0
- package/dist/index.d.ts +106 -0
- package/dist/index.js +3006 -0
- package/dist/index.js.map +1 -0
- package/package.json +67 -0
- package/src/browser.ts +20 -0
- package/src/field/calendar.ts +300 -0
- package/src/field/date-group.ts +253 -0
- package/src/field/date-text-input.ts +270 -0
- package/src/field/index.ts +57 -0
- package/src/field/native-checkbox.ts +21 -0
- package/src/field/native-date.ts +62 -0
- package/src/field/native-input.ts +17 -0
- package/src/field/native-select.ts +75 -0
- package/src/field/otp.ts +22 -0
- package/src/field/radio-group.ts +27 -0
- package/src/field/slider.ts +132 -0
- package/src/field/types.ts +16 -0
- package/src/format-html.ts +17 -0
- package/src/index.ts +12 -0
- package/src/locator.ts +102 -0
- package/src/page-info.ts +33 -0
- package/src/screenshot.ts +84 -0
- package/src/scroll.ts +10 -0
- package/src/scrub-html.ts +333 -0
- package/src/selector/date-selector.ts +272 -0
- package/src/selector/field-selector.ts +121 -0
- package/src/selector/index.ts +2 -0
- package/src/snapshot.ts +55 -0
- package/src/suppress-interferences.ts +288 -0
- package/src/translations/af.ts +41 -0
- package/src/translations/ar.ts +7 -0
- package/src/translations/az.ts +40 -0
- package/src/translations/bg.ts +7 -0
- package/src/translations/bn.ts +40 -0
- package/src/translations/bs.ts +7 -0
- package/src/translations/ca.ts +41 -0
- package/src/translations/cs.ts +7 -0
- package/src/translations/da.ts +44 -0
- package/src/translations/de.ts +47 -0
- package/src/translations/el.ts +40 -0
- package/src/translations/en.ts +7 -0
- package/src/translations/es.ts +7 -0
- package/src/translations/et.ts +7 -0
- package/src/translations/eu.ts +7 -0
- package/src/translations/fa.ts +7 -0
- package/src/translations/fi.ts +39 -0
- package/src/translations/fr.ts +42 -0
- package/src/translations/ga.ts +40 -0
- package/src/translations/he.ts +45 -0
- package/src/translations/hi.ts +39 -0
- package/src/translations/hr.ts +7 -0
- package/src/translations/hu.ts +7 -0
- package/src/translations/hy.ts +7 -0
- package/src/translations/id.ts +7 -0
- package/src/translations/index.ts +68 -0
- package/src/translations/is.ts +7 -0
- package/src/translations/it.ts +7 -0
- package/src/translations/ja.ts +7 -0
- package/src/translations/ka.ts +36 -0
- package/src/translations/ko.ts +7 -0
- package/src/translations/lt.ts +7 -0
- package/src/translations/lv.ts +43 -0
- package/src/translations/nl.ts +43 -0
- package/src/translations/no.ts +46 -0
- package/src/translations/pl.ts +39 -0
- package/src/translations/pt.ts +41 -0
- package/src/translations/ro.ts +40 -0
- package/src/translations/ru.ts +7 -0
- package/src/translations/sk.ts +7 -0
- package/src/translations/sl.ts +7 -0
- package/src/translations/sv.ts +44 -0
- package/src/translations/sw.ts +7 -0
- package/src/translations/ta.ts +7 -0
- package/src/translations/th.ts +39 -0
- package/src/translations/tl.ts +7 -0
- package/src/translations/tr.ts +41 -0
- package/src/translations/uk.ts +7 -0
- package/src/translations/ur.ts +43 -0
- package/src/translations/vi.ts +7 -0
- package/src/translations/zh.ts +7 -0
- package/src/types.ts +37 -0
- package/src/unified-html-diff.ts +22 -0
- package/src/utils/date.ts +40 -0
- package/src/utils/pick-field-element.ts +48 -0
- package/src/utils/type-check.ts +7 -0
- package/src/wait.ts +170 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import type { Range, Scalar } from '@letsrunit/utils';
|
|
2
|
+
import type { Locator } from '@playwright/test';
|
|
3
|
+
|
|
4
|
+
/**
 * Options accepted when setting a value on a field.
 *
 * NOTE(review): the member names mirror Playwright's action options; presumably they are
 * forwarded to the underlying Playwright call — confirm against the field setters.
 */
export interface SetOptions {
  /** Upper bound on waiting; presumably milliseconds, as in Playwright — confirm. */
  timeout?: number;
  /** NOTE(review): presumably bypasses actionability checks, as in Playwright — confirm. */
  force?: boolean;
  /** NOTE(review): presumably skips waiting for triggered navigations — confirm. */
  noWaitAfter?: boolean;
}
|
|
9
|
+
|
|
10
|
+
/**
 * Union of the value shapes handled by the field helpers: a single scalar, a list of
 * scalars, a range, a boolean, or `null`.
 */
export type Value =
  | Scalar
  | Scalar[]
  | Range
  | boolean
  | null;
|
|
11
|
+
|
|
12
|
+
/**
 * A located element together with a few facts about it.
 */
export interface Loc {
  /** Playwright locator pointing at the element. */
  el: Locator;
  /** NOTE(review): presumably the element's tag name — confirm casing with the producer. */
  tag: string;
  /** NOTE(review): presumably the element's `type` attribute, or `null` when absent — confirm. */
  type: string | null;
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import type { Page } from '@playwright/test';
|
|
2
|
+
import rehypeFormat from 'rehype-format';
|
|
3
|
+
import rehypeParse from 'rehype-parse';
|
|
4
|
+
import rehypeStringify from 'rehype-stringify';
|
|
5
|
+
import { unified } from 'unified';
|
|
6
|
+
|
|
7
|
+
/**
 * Pretty-prints HTML through a unified/rehype pipeline (parse as fragment → format → stringify).
 *
 * @param page Raw HTML, or a Playwright page whose current markup is read with `page.content()`.
 * @returns The formatted HTML as a string.
 */
export async function formatHtml(page: string | Page) {
  let source: string;
  if (typeof page === 'string') {
    source = page;
  } else {
    source = await page.content();
  }

  const pipeline = unified()
    .use(rehypeParse, { fragment: true })
    .use(rehypeFormat, { indent: 2 }) // collapses existing whitespace and formats nodes
    .use(rehypeStringify);

  const formatted = await pipeline.process(source);

  return String(formatted);
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
export * from './browser';
|
|
2
|
+
export * from './field/index';
|
|
3
|
+
export * from './format-html';
|
|
4
|
+
export * from './locator';
|
|
5
|
+
export * from './selector';
|
|
6
|
+
export * from './snapshot';
|
|
7
|
+
export * from './screenshot';
|
|
8
|
+
export * from './scroll';
|
|
9
|
+
export * from './suppress-interferences';
|
|
10
|
+
export * from './types';
|
|
11
|
+
export * from './utils/date';
|
|
12
|
+
export * from './wait';
|
package/src/locator.ts
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
import { Locator, Page } from '@playwright/test';
|
|
2
|
+
|
|
3
|
+
/** Type of the second (options) argument of Playwright's `page.locator()`. */
type LocatorOptions = Parameters<Page['locator']>[1];
|
|
4
|
+
|
|
5
|
+
/**
 * Resolves `selector` to the first matching element on `page`, with fallbacks.
 *
 * When the selector itself matches nothing, a fixed sequence of relaxed strategies is tried in
 * order and the first one that yields a match wins.
 *
 * @returns The matching locator. If nothing matches at all, the original (non-matching) locator
 *   is returned so the caller can still wait or assert on it.
 */
export async function locator(page: Page, selector: string): Promise<Locator> {
  const primary = page.locator(selector).first();
  const directHits = await primary.count();
  if (directHits) return primary;

  // Ordered from the least to the most invasive rewrite of the selector.
  const fallbacks = [
    tryRelaxNameToHasText,
    tryTagInsteadOfRole,
    tryRoleNameProximity,
    tryFieldAlternative,
    tryAsField,
  ];

  for (const attempt of fallbacks) {
    const found = await attempt(page, selector);
    if (found) return found;
  }

  return primary;
}
|
|
19
|
+
|
|
20
|
+
/**
 * Tries one or more selectors in order and returns the first locator that matches an element.
 *
 * @param sel A single selector or a list of candidates, tried in the given order.
 * @param opts Options passed unchanged to `page.locator()` for every candidate.
 * @returns The `.first()` locator of the first candidate with a non-zero count, or `null`.
 */
async function firstMatch(page: Page, sel: string | string[], opts: LocatorOptions = {}): Promise<Locator | null> {
  const candidates = Array.isArray(sel) ? sel : [sel];

  for (const candidate of candidates) {
    const hit = page.locator(candidate, opts).first();
    const matches = await hit.count();
    if (matches) return hit;
  }

  return null;
}
|
|
28
|
+
|
|
29
|
+
// For `role=` selectors: drop the `[name="..."]` filter and require the name as contained text
// instead (passed as the `hasText` locator option). Everything before and after the name filter
// is kept as-is. Selectors that do not start with `role=` are not handled by this fallback.
// Example:
//  - role=link[name="Foo"]      → role=link, filtered with hasText "Foo"
//  - role=link[name="Foo"i]:foo → role=link:foo, filtered with hasText "Foo"
async function tryRelaxNameToHasText(page: Page, selector: string): Promise<Locator | null> {
  const parts = /^(role=.*)\[name="([^"]+)"i?](.*)$/i.exec(selector);
  if (!parts) return null;

  const [, head, wantedText, tail] = parts;
  const withoutName = `${head}${tail}`;

  return firstMatch(page, withoutName, { hasText: wantedText });
}
|
|
42
|
+
|
|
43
|
+
// Try using the tag name for `link`, `button` and `option` instead of the ARIA role.
// The `[name="..."]` filter is dropped and applied as contained text (`hasText` option);
// whatever follows the name filter is kept.
// Examples:
//  - role=button[name="Save"] → css=button, filtered with hasText "Save"
//  - role=link[name="Home"]   → css=a, filtered with hasText "Home"
async function tryTagInsteadOfRole(page: Page, selector: string): Promise<Locator | null> {
  const matchAnyNameFull = selector.match(/^role=(link|button|option)\s*\[name="([^"]+)"i?](.*)$/i);
  if (!matchAnyNameFull) return null;

  const [, role, nameText, post] = matchAnyNameFull;

  // The pattern is case-insensitive, so normalise before mapping role → tag. Without this,
  // `role=LINK[...]` would become `css=LINK`, which targets <link> elements instead of <a>.
  const normalizedRole = role.toLowerCase();
  const tag = normalizedRole === 'link' ? 'a' : normalizedRole;

  const containsSelector = `css=${tag}${post}`;
  return firstMatch(page, containsSelector, { hasText: nameText });
}
|
|
55
|
+
|
|
56
|
+
// Proximity fallback for a failing `role=...[name="..."]` selector: find the name as visible
// text, step up to its parent, and look for the role (plus any selector remainder) from there.
// Example: role=switch[name="Adres tonen"i] → text=Adres tonen >> .. >> role=switch
async function tryRoleNameProximity(page: Page, selector: string): Promise<Locator | null> {
  const parsed = /^role=(\w+)\s*\[name="([^"]+)"i?](.*)$/i.exec(selector);
  if (!parsed) return null;

  const [, role, name, rest] = parsed;
  const steps = [`text=${name}`, '..', `role=${role}${rest}`];

  return firstMatch(page, steps.join(' >> '));
}
|
|
66
|
+
|
|
67
|
+
// Try alternatives if the field is not found:
//   field="foo" → an <input> that is a direct child of the element whose id is "foo"
async function tryFieldAlternative(page: Page, selector: string): Promise<Locator | null> {
  const matchField = selector.match(/^field="([^"]+)"i?$/i);
  if (!matchField) return null;

  const [, field] = matchField;

  // Match the id through a quoted attribute selector instead of `#id`. Interpolating the raw
  // name into `#...` breaks for ids such as `user.name`, `form:email` or `1st`: they are either
  // parsed as something else or are invalid CSS, which makes the whole lookup throw.
  // The name cannot contain `"` (see the pattern above), so only backslashes need escaping.
  const quotedId = field.replace(/\\/g, '\\\\');

  return firstMatch(page, `css=[id="${quotedId}"] > input`);
}
|
|
75
|
+
|
|
76
|
+
// Retry a `role=...[name="..."]` selector through the `field=` selector, for cases where the
// element exists but exposes a different role than the one asked for.
async function tryAsField(page: Page, selector: string): Promise<Locator | null> {
  const parsed = /^role=(\w+)\s*\[name="([^"]+)"i?](.*)$/i.exec(selector);
  if (!parsed) return null;

  const [, role, name, rest] = parsed;

  // Only ARIA roles that correspond to field-like controls qualify.
  const fieldRoles = new Set([
    'button', // Sometimes used for date pickers or checkboxes
    'textbox',
    'searchbox',
    'combobox',
    'spinbutton',
    'slider',
    'checkbox',
    'switch',
    'radio',
    'radiogroup',
    'listbox',
    'option',
  ]);

  const isFieldRole = fieldRoles.has(role.toLowerCase());
  if (!isFieldRole) return null;

  return firstMatch(page, `field=${name}${rest}`);
}
|
package/src/page-info.ts
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import metascraper, { type MetascraperOptions } from 'metascraper';
|
|
2
|
+
import metascraperDescription from 'metascraper-description';
|
|
3
|
+
import metascraperImage from 'metascraper-image';
|
|
4
|
+
import metascraperLang from 'metascraper-lang';
|
|
5
|
+
import metascraperLogo from 'metascraper-logo';
|
|
6
|
+
import metascraperLogoFavicon from 'metascraper-logo-favicon';
|
|
7
|
+
import metascraperTitle from 'metascraper-title';
|
|
8
|
+
import metascraperUrl from 'metascraper-url';
|
|
9
|
+
import type { PageInfo, Snapshot } from './types';
|
|
10
|
+
|
|
11
|
+
/**
 * Metascraper instance configured with the title, description, image, logo, favicon-logo,
 * language and URL rule bundles (passed in this order).
 */
const scrape = metascraper(
  [
    metascraperTitle(),
    metascraperDescription(),
    metascraperImage(),
    metascraperLogo(),
    metascraperLogoFavicon(),
    metascraperLang(),
    metascraperUrl(),
  ],
);
|
|
20
|
+
|
|
21
|
+
/**
 * Extracts basic page metadata with metascraper and maps it onto a `PageInfo`.
 *
 * @param options Metascraper input, optionally carrying snapshot fields; `options.url` is used
 *   when no URL is scraped and `options.screenshot` is passed through unchanged.
 * @returns The page info; scraped fields that are empty/falsy are reported as `undefined`.
 */
export async function extractPageInfo(options: MetascraperOptions & Partial<Snapshot>): Promise<PageInfo> {
  const { url, title, description, image, logo, lang } = await scrape(options);

  return {
    url: url || options.url,
    name: title || undefined,
    description: description || undefined,
    image: image || undefined,
    favicon: logo || undefined,
    lang: lang || undefined,
    screenshot: options.screenshot,
  };
}
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import { hashKey } from '@letsrunit/utils';
|
|
2
|
+
import type { Page, PageScreenshotOptions } from '@playwright/test';
|
|
3
|
+
|
|
4
|
+
/**
 * Takes a PNG screenshot of `page` and wraps it in a `File`.
 *
 * When `options.mask` lists at least one locator, the screenshot is taken through
 * `screenshotWithMask`; otherwise Playwright's `page.screenshot` is used directly.
 *
 * @returns A `File` of type `image/png` whose name is produced by `hashKey` from the image bytes.
 */
export async function screenshot(page: Page, options?: PageScreenshotOptions): Promise<File> {
  let buffer: Buffer;
  if (options?.mask?.length) {
    buffer = await screenshotWithMask(page, options);
  } else {
    buffer = await page.screenshot(options);
  }

  // NOTE(review): presumably `hashKey` replaces `{hash}` with a hash of the buffer — confirm.
  const filename = await hashKey(`screenshot-{hash}.png`, buffer);
  const bytes = new Uint8Array(buffer);

  return new File([bytes], filename, { type: 'image/png' });
}
|
|
10
|
+
|
|
11
|
+
/**
 * Takes a screenshot in which the elements matched by `options.mask` stand out: a dark overlay
 * is laid over the page and the matched elements are raised above it with a light outline.
 *
 * The `mask` option itself is not forwarded to `page.screenshot`; all other options are.
 * Everything injected into the page (overlay, style element, highlight class) is removed again,
 * also when preparing or taking the screenshot fails, and the element handles are disposed.
 *
 * @throws Error when `options.mask` is missing or empty.
 */
async function screenshotWithMask(page: Page, options: PageScreenshotOptions): Promise<Buffer> {
  const { mask: locators, ...otherOptions } = options;

  if (!locators?.length) throw new Error('No locators specified');

  try {
    // 1. Inject CSS + overlay (inside the try, so a later failure still cleans them up)
    await page.evaluate(() => {
      const doc = document;

      // Inject CSS once
      if (!doc.getElementById('lri-mask-style')) {
        const style = doc.createElement('style');
        style.id = 'lri-mask-style';
        style.textContent = `
          .lri-mask-overlay {
            position: fixed;
            inset: 0;
            background: rgba(0, 0, 0, 0.65);
            pointer-events: none;
            z-index: 2147483646;
          }
          .lri-mask-highlight {
            position: relative !important;
            z-index: 2147483647 !important;
            box-shadow: 0 0 0 2px rgba(255, 255, 255, 0.95);
            border-radius: 4px;
          }
        `;
        doc.head.appendChild(style);
      }

      if (!doc.getElementById('lri-mask-overlay')) {
        const overlay = doc.createElement('div');
        overlay.id = 'lri-mask-overlay';
        overlay.className = 'lri-mask-overlay';
        doc.body.appendChild(overlay);
      }
    });

    // 2. Resolve locators to element handles
    const handleGroups = await Promise.all(locators.map((loc) => loc.elementHandles()));
    const handles = handleGroups.flat();

    try {
      // ... and add the highlight class
      await Promise.all(
        handles.map((h) =>
          h.evaluate((el) => {
            (el as HTMLElement).classList.add('lri-mask-highlight');
          }),
        ),
      );

      // 3. Take screenshot with overlay in place
      return await page.screenshot(otherOptions);
    } finally {
      // 4a. Cleanup: remove the highlight class and release every handle
      await Promise.all(
        handles.map(async (h) => {
          try {
            await h.evaluate((el) => {
              (el as HTMLElement).classList.remove('lri-mask-highlight');
            });
          } finally {
            await h.dispose();
          }
        }),
      );
    }
  } finally {
    // 4b. Cleanup: remove overlay and the injected style so the page is left without residue
    await page.evaluate(() => {
      const overlay = document.getElementById('lri-mask-overlay');
      if (overlay) overlay.remove();

      const style = document.getElementById('lri-mask-style');
      if (style) style.remove();
    });
  }
}
|
package/src/scroll.ts
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import type { Locator } from '@playwright/test';
|
|
2
|
+
|
|
3
|
+
/**
 * Scrolls the element behind `locator` into the centre of the viewport, without animation.
 * Does nothing when the locator currently matches no element.
 */
export async function scrollToCenter(locator: Locator) {
  const matches = await locator.count();
  if (matches) {
    await locator.evaluate((target) => {
      target.scrollIntoView({ block: 'center', inline: 'center', behavior: 'instant' });
    });
  }
}
|
|
@@ -0,0 +1,333 @@
|
|
|
1
|
+
// Conservatively scrub HTML for LLMs while preserving semantics.
|
|
2
|
+
// - Keeps structural tags (header/nav/main/section/article/aside/footer, headings, lists, tables, forms, etc.)
|
|
3
|
+
// - Removes <head>, <script>, <style>, <template>, <noscript>, comments
|
|
4
|
+
// - Drops nodes that are clearly hidden/inert via attributes (hidden, aria-hidden="true", display:none, visibility:hidden, opacity:0)
|
|
5
|
+
// - Strips event handlers (on*) and inline styles, keeps semantic attrs (id, class, role, aria-*, data-*, href, src, alt, etc.)
|
|
6
|
+
// - Collapses whitespace in text nodes (but NOT inside <pre>/<code>)
|
|
7
|
+
// - Does NOT change tag names, does NOT unwrap containers, does NOT reorder content
|
|
8
|
+
|
|
9
|
+
import { memoize } from '@letsrunit/utils/src/memoize';
|
|
10
|
+
import type { Page } from '@playwright/test';
|
|
11
|
+
import stringify from 'fast-json-stable-stringify';
|
|
12
|
+
import { JSDOM } from 'jsdom';
|
|
13
|
+
import { isPage } from './utils/type-check';
|
|
14
|
+
|
|
15
|
+
/** Switches for the individual scrub steps; omitted options fall back to `getDefaults`. */
export type ScrubHtmlOptions = {
  /** Remove nodes that look hidden/inert by attributes. Default: true */
  dropHidden?: boolean;
  /** Remove the entire <head>. Default: true */
  dropHead?: boolean;
  /** Remove <svg> glyphs. Default: false */
  dropSvg?: boolean;
  /** Only keep the <main> element. Default: false (true for very large documents) */
  pickMain?: boolean;
  /**
   * Attribute stripping level: 0 = leave attributes alone, 1 = keep the normal allowlist,
   * 2 = keep only the aggressive (smaller) allowlist. Default: 1 (2 for large documents)
   */
  stripAttributes?: 0 | 1 | 2;
  /** Normalize whitespace in text nodes (outside pre/code). Default: true */
  normalizeWhitespace?: boolean;
  /** Remove HTML comments. Default: true */
  dropComments?: boolean;
  /** Replace <br> within headings (h1–h6) with a space. Default: true */
  replaceBrInHeadings?: boolean;
  /**
   * Limit lists and table row groups to at most this many items; a negative value means no
   * limit. Default: -1 (20 for large documents)
   */
  limitLists?: number;
};
|
|
35
|
+
|
|
36
|
+
// HTML length thresholds (in characters) at which the default scrub options get stricter.

/** From this size on, attributes are stripped with the aggressive allowlist (~70k tokens). */
const HTML_MIN_ATTR_THRESHOLD = 250_000;
/** From this size on, lists and table rows are truncated by default (~110k tokens). */
const HTML_LIMIT_LISTS_THRESHOLD = 400_000;
/** From this size on, only the <main> element is kept by default (~170k tokens). */
const HTML_MAIN_ONLY_THRESHOLD = 600_000;
|
|
39
|
+
|
|
40
|
+
/**
 * Builds the default scrub options for a document of the given length (in characters).
 * Larger documents get progressively stricter defaults: aggressive attribute stripping first,
 * then list/row truncation to 20 items, then reduction to the <main> element.
 */
function getDefaults(contentLength: number): Required<ScrubHtmlOptions> {
  const aggressiveAttrs = contentLength >= HTML_MIN_ATTR_THRESHOLD;
  const truncateLists = contentLength >= HTML_LIMIT_LISTS_THRESHOLD;
  const mainOnly = contentLength >= HTML_MAIN_ONLY_THRESHOLD;

  return {
    dropHidden: true,
    dropHead: true,
    dropSvg: false,
    pickMain: mainOnly,
    stripAttributes: aggressiveAttrs ? 2 : 1,
    normalizeWhitespace: true,
    dropComments: true,
    replaceBrInHeadings: true,
    limitLists: truncateLists ? 20 : -1,
  };
}
|
|
53
|
+
|
|
54
|
+
// Attributes we keep to preserve semantics and minimal usefulness (normal stripping level).
// `match` holds exact (lower-case) attribute names; `regexp` admits whole attribute families.
const ALLOWED_ATTRS = {
  match: new Set([
    // identity/semantics
    'id',
    'class',
    'role',
    // internationalization
    'lang',
    'dir',

    // anchors & media
    'href',
    'title',
    'target',
    'rel',
    'src',
    'alt',
    'width',
    'height',
    'loading',

    // tables
    'scope',
    'headers',
    'colspan',
    'rowspan',

    // forms (pure semantics—doesn’t change structure)
    'name',
    'value',
    'type',
    'for',
    'placeholder',
    'checked',
    'selected',
    'multiple',
    'method',
    'action',

    // time, figure, etc.
    'datetime',
  ]),
  // ARIA attributes & data-* attributes. The alternation is grouped so that BOTH families are
  // anchored at start and end; ungrouped, `^aria-…|^data-…$` left the aria branch open-ended.
  regexp: /^(?:aria|data)-[\w-]+$/i,
};
|
|
99
|
+
|
|
100
|
+
// Reduced allowlist used for the aggressive stripping level: exact names only, no families.
const ALLOWED_ATTRS_AGGRESSIVE = {
  match: new Set([
    // structure / general selectors
    'id', 'class', 'role',

    // links / media
    'href', 'src', 'alt', 'title',

    // tables
    'scope',

    // forms / fields
    'name', 'type', 'for', 'placeholder', 'value', 'checked', 'selected',

    // ARIA for Playwright getByRole/getByLabel
    'aria-label', 'aria-labelledby', 'aria-describedby',

    // commonly used test selectors
    'data-testid', 'data-test-id', 'data-cy', 'data-qa',
  ]),
  regexp: null,
};
|
|
138
|
+
|
|
139
|
+
// Selector list for elements that are hidden/inert by attribute or by inline style.
// The style checks are plain substring matches on the `style` attribute, so they only hit the
// space-less spelling (e.g. `display:none`), and `opacity:0` also hits values such as
// `opacity:0.5`.
const HIDDEN_SELECTORS = [
  '[hidden]',
  '[inert]',
  '[aria-hidden="true"]',
  ...['display:none', 'visibility:hidden', 'opacity:0'].map((declaration) => `[style*="${declaration}"]`),
].join(',');
|
|
147
|
+
|
|
148
|
+
// Tags that are removed unconditionally because they carry infrastructure or noise
// rather than page content.
const ALWAYS_DROP = [
  'script',
  'style',
  'template',
  'noscript',
  'slot',
  'object',
  'embed',
];
|
|
152
|
+
|
|
153
|
+
/**
 * Scrubs a page's HTML for LLM consumption, going through a memoized scrubber.
 *
 * @param page Either `{ html, url }`, or a Playwright page whose content and URL are read first.
 * @param opts Scrub options; anything omitted falls back to the size-dependent defaults.
 * @returns The scrubbed HTML.
 */
export async function scrubHtml(
  page: { html: string, url: string } | Page,
  opts: ScrubHtmlOptions = {},
): Promise<string> {
  const source = isPage(page) ? { html: await page.content(), url: page.url() } : page;

  return await memoizedScrubHtml(source, opts);
}
|
|
160
|
+
|
|
161
|
+
/**
 * Memoized wrapper around `realScrubHtml`. The cache key is a stable JSON string of the HTML,
 * the URL and all options, so any change to one of them produces a separate entry.
 *
 * NOTE(review): `max` / `ttl` are presumably "at most 16 entries" and "10 minutes in ms" —
 * confirm against the `memoize` helper.
 */
const memoizedScrubHtml = memoize(realScrubHtml, {
  max: 16,
  ttl: 10 * 60_000,
  cacheKey: ([source, options]) => stringify({ html: source.html, url: source.url, ...options }),
});
|
|
166
|
+
|
|
167
|
+
/**
 * Scrub HTML conservatively for LLMs without destroying semantics.
 *
 * Parses `html` with JSDOM (with `url` as the document URL), runs the enabled scrub steps in a
 * fixed order and returns the inner HTML of the resulting `<body>`. Options that are not given
 * fall back to `getDefaults(html.length)`.
 *
 * NOTE(review): `dropHead` is not consulted here; since only the body's inner HTML is
 * returned, the head never appears in the output regardless of that flag.
 */
export async function realScrubHtml(
  { html, url }: { html: string, url: string },
  opts: ScrubHtmlOptions = {},
): Promise<string> {
  const settings = { ...getDefaults(html.length), ...opts };
  const doc = new JSDOM(html, { url }).window.document;

  if (settings.pickMain) {
    pickMain(doc);
  }

  dropInfraAndSvg(doc, !!settings.dropSvg);

  if (settings.dropHidden) {
    dropHiddenTrees(doc);
  }
  if (settings.stripAttributes) {
    stripAttributesAndSanitize(doc, settings.stripAttributes);
  }
  if (settings.dropComments) {
    dropHtmlComments(doc);
  }
  if (settings.replaceBrInHeadings) {
    replaceBrsInHeadings(doc);
  }
  if (settings.limitLists >= 0) {
    limitListsAndRows(doc, settings.limitLists);
  }
  if (settings.normalizeWhitespace) {
    normalizeWhitespace(doc.body);
  }

  return doc.body.innerHTML;
}
|
|
190
|
+
|
|
191
|
+
/* ---------------- helpers ---------------- */
|
|
192
|
+
|
|
193
|
+
/**
 * Tells whether any ancestor of `el` is hidden or inert according to its attributes
 * (`hidden`, `inert`, `aria-hidden="true"`) or its inline style (`display: none`,
 * `visibility: hidden`, `opacity: 0`). The element itself is not inspected.
 */
function hasHiddenAncestor(el: Element): boolean {
  const displayNone = /\bdisplay\s*:\s*none\b/i;
  const visibilityHidden = /\bvisibility\s*:\s*hidden\b/i;
  // Only a true zero counts (`0`, `0.0`, `.0`, `0%`): `opacity: 0.5` must not be treated as
  // hidden, and `fill-opacity` / `--opacity` are different properties.
  const opacityZero = /(?<![\w-])opacity\s*:\s*(?:0+(?:\.0*)?|\.0+)(?![\d.])/i;

  let p: Element | null = el.parentElement;
  while (p) {
    if (
      p.hasAttribute('hidden') ||
      p.hasAttribute('inert') ||
      p.getAttribute('aria-hidden') === 'true'
    ) return true;

    const style = p.getAttribute('style') || '';
    if (displayNone.test(style)) return true;
    if (visibilityHidden.test(style)) return true;
    if (opacityZero.test(style)) return true;

    p = p.parentElement;
  }
  return false;
}
|
|
211
|
+
|
|
212
|
+
/**
 * Collapses runs of whitespace in the text nodes below `root` to a single space.
 *
 * - Text anywhere inside <pre>, <code>, <samp> or <kbd> is left untouched, also when it sits
 *   in a nested element such as `<pre><span>…</span></pre>`.
 * - Inside block-ish parents, a collapsed text node additionally loses its leading space when
 *   it is the first child and its trailing space when it is the last child. Spaces next to a
 *   sibling are kept, so `Hello <b>world</b>` does not turn into `Hello<b>world</b>`.
 */
function normalizeWhitespace(root: Element) {
  const preLikeSelector = 'pre, code, samp, kbd';
  const blockish = /^(P|LI|DIV|SECTION|ARTICLE|ASIDE|HEADER|FOOTER|MAIN|NAV|H[1-6]|BLOCKQUOTE|FIGCAPTION|TD|TH)$/i;

  const doc = root.ownerDocument!;
  const walker = doc.createTreeWalker(root, 4 /*NodeFilter.SHOW_TEXT*/);
  const changes: Text[] = [];

  let node: Node | null;

  // Pass 1: collect the text nodes that actually contain collapsible whitespace.
  while ((node = walker.nextNode())) {
    const text = node as Text;
    const parent = text.parentElement;
    if (!parent) continue;
    if (parent.closest(preLikeSelector)) continue; // don't touch pre/code, however deeply nested

    const v = text.nodeValue ?? '';
    if (v.replace(/\s+/g, ' ') !== v) changes.push(text);
  }

  // Pass 2: rewrite them.
  for (const t of changes) {
    let value = (t.nodeValue ?? '').replace(/\s+/g, ' ');

    // Trim only at the outer edges of a block-ish parent; whitespace that separates the text
    // from a neighbouring node is significant.
    if (blockish.test(t.parentElement!.tagName)) {
      if (!t.previousSibling) value = value.trimStart();
      if (!t.nextSibling) value = value.trimEnd();
    }

    t.nodeValue = value;
  }
}
|
|
239
|
+
|
|
240
|
+
// Split-out helper steps (top-level, no nested functions)
|
|
241
|
+
|
|
242
|
+
/**
 * Reduces the document body to a deep copy of its first <main> element.
 *
 * @returns `true` when a <main> was found and the body was replaced; `false` (body untouched)
 *   when the document has no <main>.
 */
function pickMain(doc: Document): boolean {
  const mainEl = doc.querySelector('main');

  if (mainEl) {
    const copy = mainEl.cloneNode(true);
    const { body } = doc;
    body.innerHTML = '';
    body.appendChild(copy);
    return true;
  }

  return false;
}
|
|
250
|
+
|
|
251
|
+
/**
 * Removes every element whose tag is listed in `ALWAYS_DROP` and, when `dropSvg` is set,
 * every <svg> element as well.
 */
function dropInfraAndSvg(doc: Document, dropSvg: boolean) {
  const tags = [...ALWAYS_DROP];
  tags.push(dropSvg ? 'svg' : '');

  const selector = tags.filter(Boolean).join(',');
  if (!selector) return;

  for (const el of Array.from(doc.querySelectorAll(selector))) {
    el.remove();
  }
}
|
|
256
|
+
|
|
257
|
+
/**
 * Removes every element (with its whole subtree) that is hidden or inert by attribute or by
 * inline style.
 *
 * The selector pass catches the attribute cases and the space-less style spellings. The second
 * pass catches what a substring selector cannot: inline styles written with whitespace
 * (`display: none`), both on the element itself and on one of its ancestors.
 */
function dropHiddenTrees(doc: Document) {
  doc.querySelectorAll<HTMLElement>(HIDDEN_SELECTORS).forEach((el) => el.remove());

  // Whitespace-tolerant check of an element's own inline style. Only a true zero opacity
  // counts (not `0.5`), and `fill-opacity` / `--opacity` are ignored.
  const hiddenByOwnStyle = (el: Element): boolean => {
    const style = el.getAttribute('style') || '';
    return (
      /\bdisplay\s*:\s*none\b/i.test(style) ||
      /\bvisibility\s*:\s*hidden\b/i.test(style) ||
      /(?<![\w-])opacity\s*:\s*(?:0+(?:\.0*)?|\.0+)(?![\d.])/i.test(style)
    );
  };

  // Document order: parents come first, so once a parent is removed its descendants are
  // disconnected and skipped.
  const all = [...doc.body.querySelectorAll<HTMLElement>('*')];
  for (const el of all) {
    if (!el.isConnected) continue;
    if (hiddenByOwnStyle(el) || hasHiddenAncestor(el)) el.remove();
  }
}
|
|
265
|
+
|
|
266
|
+
/**
 * Strips attributes from every element below <body> and neutralises `javascript:` links.
 *
 * - Level 0: nothing is done.
 * - Level 1 / 2: event handlers (`on*`) and `style` are always removed. On non-SVG elements
 *   every attribute outside the allowlist for that level is removed too; SVG elements keep
 *   their remaining attributes.
 * - Finally, `href` is removed from anchors whose target starts with `javascript:`.
 */
function stripAttributesAndSanitize(doc: Document, level: 0 | 1 | 2) {
  if (!level) return;

  const SVG_NS = 'http://www.w3.org/2000/svg';
  const allowed = level === 1 ? ALLOWED_ATTRS : ALLOWED_ATTRS_AGGRESSIVE;

  for (const el of Array.from(doc.body.querySelectorAll<HTMLElement>('*'))) {
    const keepsOtherAttrs = el.namespaceURI === SVG_NS;

    for (const attr of Array.from(el.attributes)) {
      const name = attr.name;
      const lower = name.toLowerCase();

      const alwaysRemoved = lower.startsWith('on') || lower === 'style';
      if (alwaysRemoved) {
        el.removeAttribute(name);
        continue;
      }

      if (keepsOtherAttrs) continue; // keep svg attrs

      const isAllowed = allowed.match.has(lower) || allowed.regexp?.test(name);
      if (!isAllowed) el.removeAttribute(name);
    }
  }

  // sanitize javascript: hrefs
  for (const anchor of Array.from(doc.querySelectorAll('a[href]'))) {
    const target = anchor.getAttribute('href') || '';
    if (/^\s*javascript:/i.test(target)) anchor.removeAttribute('href');
  }
}
|
|
297
|
+
|
|
298
|
+
/**
 * Removes all comment nodes from the document. The comments are collected first and removed
 * afterwards, so the tree is not modified while it is being walked.
 */
function dropHtmlComments(doc: Document) {
  // Take the mask from the document's own window when available; 128 is the fallback constant.
  const showComment = doc.defaultView?.NodeFilter?.SHOW_COMMENT ?? 128;

  // createTreeWalker expects a NodeFilter mask
  const walker = doc.createTreeWalker(doc, showComment as unknown as number);

  const found: Comment[] = [];
  for (let c = walker.nextNode() as Comment | null; c; c = walker.nextNode() as Comment | null) {
    found.push(c);
  }

  for (const comment of found) {
    comment.parentNode?.removeChild(comment);
  }
}
|
|
310
|
+
|
|
311
|
+
/**
 * Replaces every <br> inside a heading (h1–h6) with a single space text node, so the parts of
 * a heading broken over several lines stay separated.
 */
function replaceBrsInHeadings(doc: Document) {
  const headings = Array.from(doc.querySelectorAll('h1, h2, h3, h4, h5, h6'));

  for (const heading of headings) {
    const lineBreaks = Array.from(heading.querySelectorAll('br'));
    for (const lineBreak of lineBreaks) {
      const space = doc.createTextNode(' ');
      (lineBreak as Element).replaceWith(space);
    }
  }
}
|
|
319
|
+
|
|
320
|
+
/**
 * Truncates long lists and tables: each <ul>/<ol> keeps at most `limit` direct <li> children,
 * and each <table>/<thead>/<tbody>/<tfoot> keeps at most `limit` direct <tr> children.
 * Children with other tag names are neither counted nor removed.
 */
function limitListsAndRows(doc: Document, limit: number) {
  const truncate = (containerSelector: string, childTag: string) => {
    for (const container of Array.from(doc.querySelectorAll(containerSelector))) {
      const counted = Array.from(container.children).filter((child) => child.tagName === childTag);
      let index = limit;
      while (index < counted.length) {
        counted[index].remove();
        index++;
      }
    }
  };

  // lists
  truncate('ul, ol', 'LI');

  // table rows
  truncate('table, thead, tbody, tfoot', 'TR');
}
|