@letsrunit/playwright 0.3.6 → 0.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -104,4 +104,36 @@ declare function waitAfterInteraction(page: Page, target: Locator, opts?: {
104
104
  quietMs?: number;
105
105
  }): Promise<void>;
106
106
 
107
- export { type PageInfo, type Snapshot, browse, createDateEngine, createFieldEngine, formatDate, formatDateForInput, formatHtml, getMonthNames, locator, screenshot, screenshotElement, scrollToCenter, setFieldValue, snapshot, suppressInterferences, waitAfterInteraction, waitForAnimationsToFinish, waitForDomIdle, waitForIdle, waitForMeta, waitForUrlChange, waitUntilEnabled };
107
+ type ScrubHtmlOptions = {
108
+ /** Remove elements marked hidden/inert/aria-hidden="true" or hidden via inline style (display, visibility, opacity). Default: true */
109
+ dropHidden?: boolean;
110
+ /** Remove the entire <head>. Default: true */
111
+ dropHead?: boolean;
112
+ /** Remove <svg> elements. Default: false */
113
+ dropSvg?: boolean;
114
+ /** Keep only the <main> element (no-op when the document has none). Default: false; true when html.length >= 600000 */
115
+ pickMain?: boolean;
116
+ /** Attribute allowlist level (0 = keep all attributes, 1 = normal allowlist, 2 = aggressive allowlist). Default: 1; 2 when html.length >= 250000 */
117
+ stripAttributes?: 0 | 1 | 2;
118
+ /** Collapse runs of whitespace in text nodes (outside pre/code/samp/kbd). Default: true */
119
+ normalizeWhitespace?: boolean;
120
+ /** Remove HTML comments. Default: true */
121
+ dropComments?: boolean;
122
+ /** Replace <br> within headings (h1–h6) with a space. Default: true */
123
+ replaceBrInHeadings?: boolean;
124
+ /** Keep at most this many <li> per list and <tr> per table section; -1 means no limit. Default: -1; 20 when html.length >= 400000 */
125
+ limitLists?: number;
126
+ };
127
+ /** Scrub the HTML of a Page (read through page.content() and page.url()) or of a given { html, url } pair; calls go through a memoized wrapper around realScrubHtml keyed on html, url and opts. */ declare function scrubHtml(page: {
128
+ html: string;
129
+ url: string;
130
+ } | Page, opts?: ScrubHtmlOptions): Promise<string>;
131
+ /**
132
+ * Scrub HTML conservatively for LLMs without destroying semantics. This is the uncached implementation: it parses `html` with jsdom using `url` as the document URL and resolves to the scrubbed markup as a string.
133
+ */
134
+ declare function realScrubHtml({ html, url }: {
135
+ html: string;
136
+ url: string;
137
+ }, opts?: ScrubHtmlOptions): Promise<string>;
138
+
139
+ export { type PageInfo, type ScrubHtmlOptions, type Snapshot, browse, createDateEngine, createFieldEngine, formatDate, formatDateForInput, formatHtml, getMonthNames, locator, realScrubHtml, screenshot, screenshotElement, scrollToCenter, scrubHtml, setFieldValue, snapshot, suppressInterferences, waitAfterInteraction, waitForAnimationsToFinish, waitForDomIdle, waitForIdle, waitForMeta, waitForUrlChange, waitUntilEnabled };
package/dist/index.js CHANGED
@@ -1,8 +1,10 @@
1
- import { getWeekNumber, sleep, chain, hashKey, isRange, isDate, isArray, diffArray, cartesian, uniqueItem } from '@letsrunit/utils';
1
+ import { memoize, getWeekNumber, sleep, chain, hashKey, isRange, isDate, isArray, diffArray, cartesian, uniqueItem } from '@letsrunit/utils';
2
2
  import rehypeFormat from 'rehype-format';
3
3
  import rehypeParse from 'rehype-parse';
4
4
  import rehypeStringify from 'rehype-stringify';
5
5
  import { unified } from 'unified';
6
+ import stringify from 'fast-json-stable-stringify';
7
+ import { JSDOM } from 'jsdom';
6
8
 
7
9
  // src/browser.ts
8
10
  async function browse(browser, options = {}) {
@@ -3006,6 +3008,257 @@ async function suppressInterferences(page, opts = {}) {
3006
3008
  }
3007
3009
  }
3008
3010
 
3009
- export { browse, createDateEngine, createFieldEngine, formatDate, formatDateForInput, formatHtml, getMonthNames, locator, screenshot, screenshotElement, scrollToCenter, setFieldValue, snapshot, suppressInterferences, waitAfterInteraction, waitForAnimationsToFinish, waitForDomIdle, waitForIdle, waitForMeta, waitForUrlChange, waitUntilEnabled };
3011
+ // src/utils/type-check.ts
3012
/**
 * Duck-type check for a Playwright-style page object.
 *
 * @param page - Any value. `null` and `undefined` are rejected instead of
 *   raising a TypeError on property access.
 * @returns `true` only when `page` exposes callable `content`, `url` and
 *   `screenshot` members; a plain `{ html, url }` snapshot yields `false`
 *   because its `url` is a string, not a function.
 */
function isPage(page) {
  if (page == null) return false;
  return typeof page.content === "function" && typeof page.url === "function" && typeof page.screenshot === "function";
}
3015
+
3016
+ // src/scrub-html.ts
3017
// Size thresholds, compared against the length of the raw HTML string, at
// which the default scrub options become stricter.
var HTML_MIN_ATTR_THRESHOLD = 250000;
var HTML_LIMIT_LISTS_THRESHOLD = 400000;
var HTML_MAIN_ONLY_THRESHOLD = 600000;

/**
 * Build the default scrub options for a document of the given size.
 *
 * Larger documents get progressively more aggressive defaults: the
 * aggressive attribute allowlist first, then capped lists/rows, then
 * main-only extraction.
 *
 * @param contentLength - Length of the HTML string about to be scrubbed.
 * @returns A fresh options object with every option populated.
 */
function getDefaults(contentLength) {
  const useAggressiveAttrs = contentLength >= HTML_MIN_ATTR_THRESHOLD;
  const capLists = contentLength >= HTML_LIMIT_LISTS_THRESHOLD;
  const keepMainOnly = contentLength >= HTML_MAIN_ONLY_THRESHOLD;

  return {
    dropHidden: true,
    dropHead: true,
    dropSvg: false,
    pickMain: keepMainOnly,
    stripAttributes: useAggressiveAttrs ? 2 : 1,
    normalizeWhitespace: true,
    dropComments: true,
    replaceBrInHeadings: true,
    limitLists: capLists ? 20 : -1
  };
}
3033
/**
 * Attribute allowlist used at strip level 1 ("normal").
 *
 * An attribute is kept when its lower-cased name is in `match` or its name
 * matches `regexp`.
 */
var ALLOWED_ATTRS = {
  match: /* @__PURE__ */ new Set([
    // identity/semantics
    "id",
    "class",
    "role",
    // internationalization
    "lang",
    "dir",
    // anchors & media
    "href",
    "title",
    "target",
    "rel",
    "src",
    "alt",
    "width",
    "height",
    "loading",
    // tables
    "scope",
    "headers",
    "colspan",
    "rowspan",
    // forms (pure semantics, does not change structure)
    "name",
    "value",
    "type",
    "for",
    "placeholder",
    "checked",
    "selected",
    "multiple",
    "method",
    "action",
    // time, figure, etc.
    "datetime"
  ]),
  // ARIA attributes & data-* attributes. The alternation is grouped so that
  // BOTH prefixes are anchored at the start and the end of the name; without
  // the group the `$` only applied to the `data-` branch and any name that
  // merely started with `aria-x` was accepted.
  regexp: /^(?:aria|data)-[\w-]+$/i
};
3074
/**
 * Attribute allowlist used at strip level 2 ("aggressive").
 *
 * Only exact names are accepted; there is no pattern fallback, so `regexp`
 * is `null`.
 */
var ALLOWED_ATTRS_AGGRESSIVE = {
  match: /* @__PURE__ */ new Set([
    // structure / general selectors
    "id", "class", "role",
    // links / media
    "href", "src", "alt", "title",
    // tables
    "scope",
    // forms / fields
    "name", "type", "for", "placeholder", "value", "checked", "selected",
    // ARIA for Playwright getByRole/getByLabel
    "aria-label", "aria-labelledby", "aria-describedby",
    // commonly used test selectors
    "data-testid", "data-test-id", "data-cy", "data-qa"
  ]),
  regexp: null
};
3107
/**
 * Comma-joined selector list for elements that declare themselves hidden:
 * the `hidden` / `inert` attributes, `aria-hidden="true"`, and three inline
 * style fragments.
 *
 * NOTE(review): `[style*=...]` is a plain substring match, so only the exact
 * no-whitespace spelling is caught, and `opacity:0` also matches values such
 * as `opacity:0.5`.
 */
var HIDDEN_SELECTORS = [
  ...["hidden", "inert"].map((attribute) => `[${attribute}]`),
  '[aria-hidden="true"]',
  ...["display:none", "visibility:hidden", "opacity:0"].map((declaration) => `[style*="${declaration}"]`)
].join(",");
3115
/** Tag names that are removed from every scrubbed document, regardless of options. */
var ALWAYS_DROP = [
  // scripting & styling
  "script", "style",
  // inert / fallback containers
  "template", "noscript", "slot",
  // embedded external content
  "object", "embed"
];
3124
/**
 * Scrub the HTML of a page or of an `{ html, url }` snapshot.
 *
 * When `page` passes `isPage`, its markup and URL are read via
 * `page.content()` and `page.url()`; otherwise `page` is used as the
 * snapshot directly. The work is delegated to the memoized scrubber.
 *
 * @param page - A page object or an `{ html, url }` pair.
 * @param opts - Scrub options; defaults to `{}`.
 * @returns The scrubbed markup.
 */
async function scrubHtml(page, opts = {}) {
  const snapshot = isPage(page)
    ? { html: await page.content(), url: page.url() }
    : page;
  const scrubbed = await memoizedScrubHtml(snapshot, opts);
  return scrubbed;
}
3128
// Memoized wrapper around realScrubHtml. The cache key is a stable JSON
// string of the html, the url and every option, so any difference in input
// or options produces a distinct entry.
// NOTE(review): `max` / `ttl` are passed straight to `memoize` from
// @letsrunit/utils; presumably 16 entries and a 10-minute lifetime in
// milliseconds. Confirm against that helper.
var memoizedScrubHtml = memoize(realScrubHtml, {
  max: 16,
  ttl: 10 * 60000,
  cacheKey: ([snapshot, options]) => stringify({ html: snapshot.html, url: snapshot.url, ...options })
});
3133
/**
 * Scrub HTML conservatively for LLMs without destroying semantics.
 *
 * Parses `html` with jsdom (using `url` as the document URL), applies the
 * enabled clean-up passes in a fixed order and serializes the result.
 *
 * @param input - `{ html, url }` of the document to scrub.
 * @param opts - Overrides for the size-dependent defaults from `getDefaults`.
 *   Options that are explicitly `undefined` are ignored, so they no longer
 *   silently switch off a default-on pass.
 * @returns The body's inner HTML, or the whole document element when
 *   `dropHead` is explicitly `false` (previously the option was never read
 *   and the head was always dropped).
 */
async function realScrubHtml({ html, url }, opts = {}) {
  // Drop `undefined` values before merging: `{ ...defaults, ...{ x: undefined } }`
  // would otherwise overwrite the default for `x`.
  const explicit = Object.fromEntries(
    Object.entries(opts ?? {}).filter(([, value]) => value !== undefined)
  );
  const o = { ...getDefaults(html.length), ...explicit };
  const dom = new JSDOM(html, { url });
  try {
    const doc = dom.window.document;
    if (o.pickMain) pickMain(doc);
    dropInfraAndSvg(doc, !!o.dropSvg);
    if (o.dropHidden) dropHiddenTrees(doc);
    if (o.stripAttributes) stripAttributesAndSanitize(doc, o.stripAttributes);
    if (o.dropComments) dropHtmlComments(doc);
    if (o.replaceBrInHeadings) replaceBrsInHeadings(doc);
    if (o.limitLists >= 0) limitListsAndRows(doc, o.limitLists);
    if (o.normalizeWhitespace) normalizeWhitespace(doc.body);
    // The string is fully built here, before the window is torn down below.
    return o.dropHead ? doc.body.innerHTML : doc.documentElement.outerHTML;
  } finally {
    // Release the jsdom window; documents can be several hundred KB and this
    // function runs once per cache miss.
    dom.window.close();
  }
}
3147
+ function hasHiddenAncestor(el) {
3148
+ let p = el.parentElement;
3149
+ while (p) {
3150
+ if (p.hasAttribute("hidden") || p.hasAttribute("inert") || p.getAttribute("aria-hidden") === "true") return true;
3151
+ const style = p.getAttribute("style") || "";
3152
+ if (/\bdisplay\s*:\s*none\b/i.test(style)) return true;
3153
+ if (/\bvisibility\s*:\s*hidden\b/i.test(style)) return true;
3154
+ if (/\bopacity\s*:\s*0(?:\D|$)/i.test(style)) return true;
3155
+ p = p.parentElement;
3156
+ }
3157
+ return false;
3158
+ }
3159
+ function normalizeWhitespace(root) {
3160
+ const preLike = /* @__PURE__ */ new Set(["PRE", "CODE", "SAMP", "KBD"]);
3161
+ const doc = root.ownerDocument;
3162
+ const walker = doc.createTreeWalker(
3163
+ root,
3164
+ 4
3165
+ /*NodeFilter.SHOW_TEXT*/
3166
+ );
3167
+ const changes = [];
3168
+ let node;
3169
+ while (node = walker.nextNode()) {
3170
+ const text = node;
3171
+ const parent = text.parentElement;
3172
+ if (!parent) continue;
3173
+ if (preLike.has(parent.tagName)) continue;
3174
+ const v = text.nodeValue ?? "";
3175
+ const collapsed = v.replace(/\s+/g, " ");
3176
+ if (collapsed !== v) changes.push(text);
3177
+ }
3178
+ for (const t of changes) {
3179
+ const parent = t.parentElement;
3180
+ const isBlockish = /^(P|LI|DIV|SECTION|ARTICLE|ASIDE|HEADER|FOOTER|MAIN|NAV|H[1-6]|BLOCKQUOTE|FIGCAPTION|TD|TH)$/i.test(parent.tagName);
3181
+ t.nodeValue = (t.nodeValue || "").replace(/\s+/g, " ");
3182
+ if (isBlockish) t.nodeValue = (t.nodeValue || "").trim();
3183
+ }
3184
+ }
3185
+ function pickMain(doc) {
3186
+ const main = doc.querySelector("main");
3187
+ if (!main) return false;
3188
+ const clone = main.cloneNode(true);
3189
+ doc.body.innerHTML = "";
3190
+ doc.body.appendChild(clone);
3191
+ return true;
3192
+ }
3193
/**
 * Remove every element whose tag is in `ALWAYS_DROP`, plus `<svg>` elements
 * when requested.
 *
 * @param doc - Document to modify in place.
 * @param dropSvg - When truthy, `<svg>` elements are removed as well.
 */
function dropInfraAndSvg(doc, dropSvg) {
  const tagNames = [...ALWAYS_DROP];
  if (dropSvg) tagNames.push("svg");

  const selector = tagNames.filter(Boolean).join(",");
  if (selector) {
    doc.querySelectorAll(selector).forEach((element) => {
      element.remove();
    });
  }
}
3198
+ function dropHiddenTrees(doc) {
3199
+ doc.querySelectorAll(HIDDEN_SELECTORS).forEach((el) => el.remove());
3200
+ const all = [...doc.body.querySelectorAll("*")];
3201
+ for (const el of all) {
3202
+ if (!el.isConnected) continue;
3203
+ if (hasHiddenAncestor(el)) el.remove();
3204
+ }
3205
+ }
3206
/**
 * Strip attributes from every element in `<body>` and neutralize script URLs.
 *
 * For each element:
 * - event-handler attributes (names starting with `on`) and `style` are
 *   always removed;
 * - on non-SVG elements, any attribute not on the allowlist for `level` is
 *   removed (SVG elements keep their remaining attributes).
 *
 * Afterwards, `javascript:` URLs are removed from URL-bearing attributes on
 * any element of the document. The previous pass covered only `a[href]`,
 * although `src` and `action` are allowlisted too. Tab and newline
 * characters are ignored when detecting the scheme, since URL parsers strip
 * them.
 *
 * @param doc - Document to modify in place.
 * @param level - `0` does nothing, `1` uses `ALLOWED_ATTRS`, any other truthy
 *   value uses `ALLOWED_ATTRS_AGGRESSIVE`.
 */
function stripAttributesAndSanitize(doc, level) {
  if (!level) return;

  const SVG_NS = "http://www.w3.org/2000/svg";
  // The allowlist depends only on `level`, so pick it once.
  const allowed = level === 1 ? ALLOWED_ATTRS : ALLOWED_ATTRS_AGGRESSIVE;

  for (const el of [...doc.body.querySelectorAll("*")]) {
    const isSvg = el.namespaceURI === SVG_NS;
    // Snapshot the attribute list: it is live and shrinks while we remove.
    for (const { name } of [...el.attributes]) {
      const lower = name.toLowerCase();
      const alwaysStripped = lower.startsWith("on") || lower === "style";
      const notAllowlisted = !isSvg && !allowed.match.has(lower) && !allowed.regexp?.test(name);
      if (alwaysStripped || notAllowlisted) el.removeAttribute(name);
    }
  }

  const urlAttributes = ["href", "src", "action", "formaction", "xlink:href"];
  const isScriptUrl = (value) => /^[\s\u0000-\u0020]*javascript:/i.test(value.replace(/[\t\n\r]/g, ""));

  doc.querySelectorAll("*").forEach((el) => {
    for (const attribute of urlAttributes) {
      const value = el.getAttribute(attribute);
      if (value !== null && isScriptUrl(value)) el.removeAttribute(attribute);
    }
  });
}
3233
+ function dropHtmlComments(doc) {
3234
+ const nf = doc.defaultView?.NodeFilter;
3235
+ const SHOW_COMMENT = nf?.SHOW_COMMENT ?? 128;
3236
+ const walker = doc.createTreeWalker(doc, SHOW_COMMENT);
3237
+ const toRemove = [];
3238
+ let n;
3239
+ while (n = walker.nextNode()) toRemove.push(n);
3240
+ toRemove.forEach((c) => c.parentNode?.removeChild(c));
3241
+ }
3242
+ function replaceBrsInHeadings(doc) {
3243
+ doc.querySelectorAll("h1, h2, h3, h4, h5, h6").forEach((h) => {
3244
+ h.querySelectorAll("br").forEach((br) => {
3245
+ const space = doc.createTextNode(" ");
3246
+ br.replaceWith(space);
3247
+ });
3248
+ });
3249
+ }
3250
+ function limitListsAndRows(doc, limit) {
3251
+ doc.querySelectorAll("ul, ol").forEach((list) => {
3252
+ const items = Array.from(list.children).filter((c) => c.tagName === "LI");
3253
+ for (let i = limit; i < items.length; i++) items[i].remove();
3254
+ });
3255
+ const rowContainers = doc.querySelectorAll("table, thead, tbody, tfoot");
3256
+ rowContainers.forEach((container) => {
3257
+ const rows = Array.from(container.children).filter((c) => c.tagName === "TR");
3258
+ for (let i = limit; i < rows.length; i++) rows[i].remove();
3259
+ });
3260
+ }
3261
+
3262
+ export { browse, createDateEngine, createFieldEngine, formatDate, formatDateForInput, formatHtml, getMonthNames, locator, realScrubHtml, screenshot, screenshotElement, scrollToCenter, scrubHtml, setFieldValue, snapshot, suppressInterferences, waitAfterInteraction, waitForAnimationsToFinish, waitForDomIdle, waitForIdle, waitForMeta, waitForUrlChange, waitUntilEnabled };
3010
3263
  //# sourceMappingURL=index.js.map
3011
3264
  //# sourceMappingURL=index.js.map