@intuned/browser-dev 0.1.9-dev.0 → 0.1.10-dev.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. package/dist/ai/extractStructuredData.js +21 -27
  2. package/dist/ai/tests/testCreateMatchesMapping.spec.js +216 -0
  3. package/dist/ai/tests/testExtractStructuredData.spec.js +346 -0
  4. package/dist/ai/tests/testExtractStructuredDataDomMatchingIframes.spec.js +459 -0
  5. package/dist/ai/tests/testExtractStructuredDataUnit.spec.js +375 -0
  6. package/dist/ai/tests/testMatching.spec.js +342 -0
  7. package/dist/ai/tests/testValidateMatchesMapping.spec.js +265 -0
  8. package/dist/common/extendedTest.js +38 -30
  9. package/dist/common/frame_utils/frameTree.js +116 -0
  10. package/dist/common/frame_utils/getContentWithNestedIframes.js +13 -0
  11. package/dist/common/frame_utils/index.js +95 -0
  12. package/dist/common/frame_utils/stitchIframe.js +105 -0
  13. package/dist/{helpers → common}/frame_utils/tests/testFindAllIframes.spec.js +24 -15
  14. package/dist/common/frame_utils/tests/testGetContentWithNestedIframes.spec.js +241 -0
  15. package/dist/common/frame_utils/utils.js +91 -0
  16. package/dist/common/getSimplifiedHtml.js +20 -20
  17. package/dist/common/matching/matching.js +91 -16
  18. package/dist/common/tests/matching.test.js +225 -0
  19. package/dist/common/tests/testGetSimplifiedHtml.spec.js +324 -0
  20. package/dist/helpers/extractMarkdown.js +16 -7
  21. package/dist/helpers/tests/testExtractMarkdown.spec.js +29 -0
  22. package/dist/helpers/waitForDomSettled.js +4 -4
  23. package/dist/types/intuned-runtime.d.ts +6 -32
  24. package/package.json +1 -1
  25. package/dist/helpers/frame_utils/constants.js +0 -8
  26. package/dist/helpers/frame_utils/findAllIframes.js +0 -82
  27. package/dist/helpers/frame_utils/index.js +0 -44
  28. /package/dist/{helpers → common}/frame_utils/checkFrameAllowsAsyncScripts.js +0 -0
  29. /package/dist/{helpers → common}/frame_utils/getContainerFrame.js +0 -0
@@ -0,0 +1,241 @@
1
+ "use strict";
2
+
3
+ var _extendedTest = require("../../extendedTest");
4
+ var _playwrightCore = require("playwright-core");
5
+ var _getContentWithNestedIframes = require("../getContentWithNestedIframes");
6
+ var _nodeHtmlParser = require("node-html-parser");
7
+ var path = _interopRequireWildcard(require("path"));
8
// Generated module-interop helper (looks like Babel's `interopRequireWildcard` output — confirm against the build config).
// Behaviour visible in the code below:
//   - When `t` is falsy and `e` is already flagged `__esModule`, `e` is returned untouched.
//   - When `e` is null or is neither an object nor a function, a null-prototype wrapper `{ default: e }` is returned.
//   - Otherwise a null-prototype namespace object is built: `default` points at `e`, and every own
//     enumerable key of `e` other than "default" is copied over; keys defined via getters/setters are
//     copied as property descriptors so they stay live instead of being snapshotted.
//   - Results are memoised per source module in one of two WeakMaps (chosen by `t`) when WeakMap exists.
//   - The function reassigns itself on first call so the WeakMaps are created only once.
+ function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
9
// Browser-level test suite for getContentWithNestedIframes.
// One Chromium instance is shared by the whole suite; every test gets a fresh page.
(0, _extendedTest.describe)("Test getContentWithNestedIframes", () => {
  let browser;
  let page;

  (0, _extendedTest.beforeAll)(async () => {
    browser = await _playwrightCore.chromium.launch({
      headless: true
    });
  });

  (0, _extendedTest.afterAll)(async () => {
    // If launch() failed, `browser` is still undefined; skipping close() keeps the
    // original launch error visible instead of adding a TypeError from teardown.
    if (browser) {
      await browser.close();
    }
  });

  (0, _extendedTest.beforeEach)(async () => {
    page = await browser.newPage();
  });

  (0, _extendedTest.afterEach)(async () => {
    // Same reasoning as afterAll: newPage() may have failed before `page` was assigned.
    if (page) {
      await page.close();
      page = undefined;
    }
  });

  /**
   * Navigates the shared page to the `fixtures/test_with_iframes.html` fixture.
   *
   * @param {boolean} [waitForDynamicContent=false] When true, sleeps 1500 ms after
   *   navigation (presumably so the fixture's script-created iframes exist — confirm
   *   against the fixture file).
   * @returns The shared `page`.
   */
  async function getTestPageWithIframes(waitForDynamicContent = false) {
    const testHtmlPath = path.resolve(__dirname, "fixtures", "test_with_iframes.html");
    await page.goto(`file://${testHtmlPath}`);
    if (waitForDynamicContent) {
      await page.waitForTimeout(1500);
    }
    return page;
  }

  (0, _extendedTest.test)("should get content with nested iframes stitched - basic", async () => {
    await getTestPageWithIframes();
    const content = await (0, _getContentWithNestedIframes.getContentWithNestedIframes)(page);
    (0, _extendedTest.expect)(content).toContain("Static Iframe Content");
    (0, _extendedTest.expect)(content).toContain("Srcdoc Iframe");
    (0, _extendedTest.expect)(content).toContain("Outer Iframe");
    const iframeContentCount = (content.match(/<iframe-content/g) || []).length;
    (0, _extendedTest.expect)(iframeContentCount).toBeGreaterThanOrEqual(4);
    (0, _extendedTest.expect)(content).toContain('id="static-iframe"');
    (0, _extendedTest.expect)(content).toContain("<h2>Static Iframe Content</h2>");
    (0, _extendedTest.expect)(content).toContain("iframe-element");
    (0, _extendedTest.expect)(content).toContain("srcdoc-span");
  });

  (0, _extendedTest.test)("should handle nested iframes recursively", async () => {
    await getTestPageWithIframes();
    const content = await (0, _getContentWithNestedIframes.getContentWithNestedIframes)(page);
    (0, _extendedTest.expect)(content).toContain("Outer Iframe");
    (0, _extendedTest.expect)(content).toContain("Inner Iframe");
    (0, _extendedTest.expect)(content).toContain("Deep Element");
    const iframeContentCount = (content.match(/<iframe-content/g) || []).length;
    (0, _extendedTest.expect)(iframeContentCount).toBeGreaterThanOrEqual(4);
    (0, _extendedTest.expect)(content).toContain("deep-element");
  });

  (0, _extendedTest.test)("should handle dynamically created iframes", async () => {
    await getTestPageWithIframes(true);
    const content = await (0, _getContentWithNestedIframes.getContentWithNestedIframes)(page);
    (0, _extendedTest.expect)(content).toContain("Dynamic Iframe");
    (0, _extendedTest.expect)(content).toContain("dynamic-element");
  });

  (0, _extendedTest.test)("should handle empty iframes gracefully", async () => {
    await getTestPageWithIframes();
    const content = await (0, _getContentWithNestedIframes.getContentWithNestedIframes)(page);
    (0, _extendedTest.expect)(content).toContain("empty-iframe-id");
    // The id must appear exactly once, i.e. the empty iframe is not duplicated by stitching.
    const emptyIframeCount = (content.match(/empty-iframe-id/g) || []).length;
    (0, _extendedTest.expect)(emptyIframeCount).toBe(1);
  });

  (0, _extendedTest.test)("should preserve main page content", async () => {
    await getTestPageWithIframes();
    const content = await (0, _getContentWithNestedIframes.getContentWithNestedIframes)(page);
    (0, _extendedTest.expect)(content).toContain("Main Page Content");
    (0, _extendedTest.expect)(content).toContain("This is the main page content");
    (0, _extendedTest.expect)(content).toContain("content-after-iframes");
    (0, _extendedTest.expect)(content).toContain("Item 1");
    (0, _extendedTest.expect)(content).toContain("Item 2");
  });

  (0, _extendedTest.test)("should handle cross-origin iframes gracefully", async () => {
    await page.setContent(`
      <html>
      <body element_id="main-body">
      <h1>Main Content</h1>
      <iframe id="cross-origin-iframe"
      element_id="cross-origin-id"
      src="https://example.com"
      width="300"
      height="200">
      </iframe>
      <p>Content after iframe</p>
      </body>
      </html>
    `);
    await page.waitForTimeout(500);
    const content = await (0, _getContentWithNestedIframes.getContentWithNestedIframes)(page);
    (0, _extendedTest.expect)(content).toContain("Main Content");
    (0, _extendedTest.expect)(content).toContain("Content after iframe");
    (0, _extendedTest.expect)(content).toContain("cross-origin-id");
  });

  (0, _extendedTest.test)("should handle malformed HTML in iframe content", async () => {
    await page.setContent(`
      <html>
      <body element_id="main-body">
      <h1>Malformed HTML Test</h1>
      <iframe id="malformed-iframe"
      element_id="malformed-id"
      srcdoc="<html><body><h2>Malformed</h2><p>Unclosed tag<div>Missing closing</p></body></html>"
      width="300"
      height="200">
      </iframe>
      </body>
      </html>
    `);
    const content = await (0, _getContentWithNestedIframes.getContentWithNestedIframes)(page);
    (0, _extendedTest.expect)(content).toContain("Malformed HTML Test");
    (0, _extendedTest.expect)(content).toContain("malformed-id");
    (0, _extendedTest.expect)(content).toContain("Malformed");
  });

  (0, _extendedTest.test)("should handle problematic iframe sources without hanging", async () => {
    await page.setContent(`
      <html>
      <body element_id="main-body">
      <h1>Main Content</h1>
      <iframe id="empty-javascript-iframe"
      element_id="empty-javascript-id"
      src="javascript:"
      width="300"
      height="200">
      </iframe>
      <iframe id="invalid-blob-iframe"
      element_id="invalid-blob-id"
      src="blob:null/invalid"
      width="300"
      height="200">
      </iframe>
      <p>Content after iframes</p>
      </body>
      </html>
    `, {
      waitUntil: "domcontentloaded",
      timeout: 5000
    });
    const startTime = Date.now();
    // NOTE(review): the second argument is 2.0 here but 10000 in the custom-extractor
    // test below — confirm which unit getContentWithNestedIframes expects.
    const content = await (0, _getContentWithNestedIframes.getContentWithNestedIframes)(page, 2.0);
    const elapsedTime = Date.now() - startTime;
    (0, _extendedTest.expect)(elapsedTime).toBeLessThan(10000);
    (0, _extendedTest.expect)(content).toContain("Main Content");
    (0, _extendedTest.expect)(content).toContain("Content after iframes");
    (0, _extendedTest.expect)(content).toContain("empty-javascript-id");
    (0, _extendedTest.expect)(content).toContain("invalid-blob-id");
  });

  (0, _extendedTest.test)("should handle legacy frame elements", async () => {
    await page.setContent(`
      <html>
      <head><title>Legacy Frameset</title></head>
      <frameset cols="50%,50%">
      <frame id="frame-1"
      element_id="frame-1-id"
      src="data:text/html,<html><body><h2>Frame 1</h2><p>Frame 1 Content</p></body></html>">
      </frame>
      <frame id="frame-2"
      element_id="frame-2-id"
      src="data:text/html,<html><body><h2>Frame 2</h2><p>Frame 2 Content</p></body></html>">
      </frame>
      </frameset>
      </html>
    `, {
      waitUntil: "domcontentloaded"
    });
    const content = await (0, _getContentWithNestedIframes.getContentWithNestedIframes)(page);
    // This used to be `includes("frame-1-id") || includes("frame-2-id") || length > 0`,
    // which is logically just "content is non-empty". Assert that directly so the
    // failure message is meaningful and the weakness of the check is explicit.
    // TODO(review): tighten to assert the frame ids once <frame> stitching is guaranteed.
    (0, _extendedTest.expect)(content.length).toBeGreaterThan(0);
  });

  (0, _extendedTest.test)("should support custom HTML extractor to skip tags", async () => {
    await page.setContent(`
      <html>
      <body element_id="main-body">
      <h1>Main Content</h1>
      <p>This is main page paragraph that should remain.</p>
      <iframe id="test-iframe"
      element_id="test-iframe-id"
      srcdoc="<html><body><h2>Iframe Title</h2><p>This paragraph should be removed.</p><div>This div should remain.</div><p>Another paragraph to remove.</p></body></html>"
      width="300"
      height="200">
      </iframe>
      <p>Another main page paragraph that should remain.</p>
      </body>
      </html>
    `, {
      waitUntil: "domcontentloaded"
    });
    // Extractor under test: serialises the root (page-like roots expose content(),
    // anything else is evaluated for outerHTML) and strips every <p> element.
    const customExtractor = async root => {
      let htmlContent;
      if ("content" in root && typeof root.content === "function") {
        htmlContent = await root.content();
      } else {
        htmlContent = await root.evaluate(el => el.outerHTML);
      }
      const parsed = (0, _nodeHtmlParser.parse)(htmlContent);
      const pTags = parsed.querySelectorAll("p");
      pTags.forEach(p => p.remove());
      return parsed.toString();
    };
    const content = await (0, _getContentWithNestedIframes.getContentWithNestedIframes)(page, 10000, customExtractor);
    (0, _extendedTest.expect)(content).toContain("Main Content");
    (0, _extendedTest.expect)(content).toContain("test-iframe-id");
    (0, _extendedTest.expect)(content).toContain("Iframe Title");
    (0, _extendedTest.expect)(content).toContain("This div should remain");
    const pTagCount = (content.match(/<p>/g) || []).length;
    (0, _extendedTest.expect)(pTagCount).toBe(0);
    (0, _extendedTest.expect)(content).toContain("<h2>Iframe Title</h2>");
    (0, _extendedTest.expect)(content).toContain("<div>This div should remain.</div>");
    const iframeContentMatch = content.match(/<iframe-content[^>]*>(.*?)<\/iframe-content>/s);
    (0, _extendedTest.expect)(iframeContentMatch).toBeTruthy();
    if (iframeContentMatch) {
      const nestedContent = iframeContentMatch[1];
      (0, _extendedTest.expect)(nestedContent).not.toContain("<p>");
    }
  });

  (0, _extendedTest.test)("should work with locator", async () => {
    await page.setContent(`
      <html>
      <body>
      <div id="container">
      <h1>Container Content</h1>
      <iframe srcdoc="<html><body><h2>Iframe in Container</h2></body></html>"></iframe>
      </div>
      <div id="other">
      <p>Other content</p>
      </div>
      </body>
      </html>`, {
      waitUntil: "domcontentloaded"
    });
    await page.waitForSelector("#container iframe");
    const locator = page.locator("#container");
    const content = await (0, _getContentWithNestedIframes.getContentWithNestedIframes)(locator);
    (0, _extendedTest.expect)(content).toContain("Container Content");
    (0, _extendedTest.expect)(content).toContain("Iframe in Container");
    // Scoping to the locator must exclude sibling content outside #container.
    (0, _extendedTest.expect)(content).not.toContain("Other content");
  });
});
@@ -0,0 +1,91 @@
1
+ "use strict";
2
+
3
+ Object.defineProperty(exports, "__esModule", {
4
+ value: true
5
+ });
6
+ exports.IFRAME_TAGS = exports.IFRAME_SRC_ATTRS = exports.IFRAME_REPLACEMENT_TAG = exports.IFRAME_PIXEL_SRC_DOMAINS = exports.IFRAME_CAPTCHA_SRC_PATTERNS = exports.ALL_IFRAMES_CSS_SELECTOR = void 0;
7
+ exports.findTopLevelIframeElements = findTopLevelIframeElements;
8
+ const IFRAME_TAGS = exports.IFRAME_TAGS = ["iframe", "frame"];
9
+ const ALL_IFRAMES_CSS_SELECTOR = exports.ALL_IFRAMES_CSS_SELECTOR = IFRAME_TAGS.join(", ");
10
+ const IFRAME_SRC_ATTRS = exports.IFRAME_SRC_ATTRS = ["src", "data-src", "data-lazy-src", "data-original-src", "data-url", "data-iframe-src"];
11
+ const IFRAME_CAPTCHA_SRC_PATTERNS = exports.IFRAME_CAPTCHA_SRC_PATTERNS = ["google.com/recaptcha", "recaptcha.net/recaptcha", "gstatic.com/recaptcha", "hcaptcha.com", "hcaptcha.net", "funcaptcha.com", "arkoselabs.com", "challenges.cloudflare.com", "cloudflare.com/cdn-cgi/challenge", "cloudflare.com/turnstile", "geetest.com", "awswaf", "aws-waf", "leminnow.com", "captcha"];
12
+ const IFRAME_PIXEL_SRC_DOMAINS = exports.IFRAME_PIXEL_SRC_DOMAINS = ["facebook.com/tr", "doubleclick.net", "google-analytics.com", "bat.bing.com", "analytics.twitter.com", "px.ads.linkedin.com", "t.co/i/adsct", "adsrvr.org", "demdex.net", "crwdcntrl.net"];
13
+ const IFRAME_REPLACEMENT_TAG = exports.IFRAME_REPLACEMENT_TAG = {
14
+ iframe: "iframe-content",
15
+ frame: "frame-content"
16
+ };
17
/**
 * Returns element handles for the `<iframe>` / `<frame>` elements found under `root`.
 *
 * The lookup runs inside the browser via `querySelectorAll`, so only frame elements
 * that live in the queried document/element are returned.
 *
 * @param root Playwright-style object. If it exposes `elementHandle` (locator-like),
 *   the search is scoped to that element; otherwise `root.evaluateHandle` is invoked
 *   with an expression that searches the whole `document`.
 * @param {boolean} [skipUseless=true] When true, drops frames whose source URL matches
 *   a captcha pattern or tracking-pixel domain, and frames whose declared width or
 *   height (attribute, else inline style) is exactly 0 or 1.
 * @returns {Promise<Array>} Element handles for the matching frames; an empty array
 *   when the locator-like root resolves to no element.
 */
async function findTopLevelIframeElements(root, skipUseless = true) {
  let jsCode;
  if (skipUseless) {
    // The filter runs in the page, so the constant lists are inlined as JSON literals.
    const srcAttrs = JSON.stringify(IFRAME_SRC_ATTRS);
    const captchaPatterns = JSON.stringify(IFRAME_CAPTCHA_SRC_PATTERNS.map(p => p.toLowerCase()));
    // Lower-cased like the captcha patterns: both are compared against the lower-cased URL.
    const pixelDomains = JSON.stringify(IFRAME_PIXEL_SRC_DOMAINS.map(d => d.toLowerCase()));
    jsCode = `(root) => {
      const srcAttrs = ${srcAttrs};
      const captchaPatterns = ${captchaPatterns};
      const pixelDomains = ${pixelDomains};

      // Numeric attribute value, or null when absent / carrying a unit or percentage.
      const parseNum = (val) => {
        if (!val) return null;
        const str = String(val).trim();
        if (/[%a-zA-Z]/.test(str)) return null;
        const n = parseFloat(str);
        return isNaN(n) ? null : n;
      };

      return Array.from(root.querySelectorAll('${ALL_IFRAMES_CSS_SELECTOR}')).filter(el => {
        // First non-empty source-like attribute wins.
        let url = '';
        for (const attr of srcAttrs) {
          const val = el.getAttribute(attr);
          if (val) { url = val; break; }
        }
        const urlLower = url.toLowerCase();

        if (captchaPatterns.some(p => urlLower.includes(p))) return false;
        if (pixelDomains.some(d => urlLower.includes(d))) return false;

        const style = el.getAttribute('style') || '';
        let width = parseNum(el.getAttribute('width'));
        let height = parseNum(el.getAttribute('height'));

        // Inline-style fallback. The pattern is anchored to the start of a declaration
        // so that 'border-width', 'min-width', 'max-height', 'line-height', ... are not
        // mistaken for 'width' / 'height', and it keeps the fractional part of the value.
        if (width === null) {
          const wMatch = style.match(/(?:^|;)\\s*width\\s*:\\s*(\\d*\\.?\\d+)/i);
          if (wMatch) width = parseFloat(wMatch[1]);
        }
        if (height === null) {
          const hMatch = style.match(/(?:^|;)\\s*height\\s*:\\s*(\\d*\\.?\\d+)/i);
          if (hMatch) height = parseFloat(hMatch[1]);
        }

        // 0x? / 1x? frames are treated as invisible tracking frames.
        return !(width === 0 || width === 1 || height === 0 || height === 1);
      });
    }`;
  } else {
    jsCode = `(root) => Array.from(root.querySelectorAll('${ALL_IFRAMES_CSS_SELECTOR}'))`;
  }

  let arrayHandle;
  if ("elementHandle" in root) {
    const element = await root.elementHandle();
    if (!element) {
      return [];
    }
    // The finder is shipped as source text and rebuilt in the page with `new Function`;
    // the text is assembled above from module constants only, never from caller input.
    arrayHandle = await element.evaluateHandle((el, code) => {
      const fn = new Function("return " + code)();
      return fn(el);
    }, jsCode);
  } else {
    arrayHandle = await root.evaluateHandle(`(${jsCode})(document)`);
  }

  // Unpack the in-page array into individual element handles.
  const properties = await arrayHandle.getProperties();
  const handles = [];
  for (const jsHandle of properties.values()) {
    const el = jsHandle.asElement();
    if (el) {
      handles.push(el);
    }
  }
  return handles;
}
@@ -4,17 +4,26 @@ Object.defineProperty(exports, "__esModule", {
4
4
  value: true
5
5
  });
6
6
  exports.getSimplifiedHtml = getSimplifiedHtml;
7
- async function getSimplifiedHtml(containerHandle, options) {
8
- const tagName = await containerHandle.evaluateHandle(element => element.tagName.toLowerCase());
9
- const ALLOWED_ATTRIBUTES = ["aria-label", "data-name", "name", "type", "placeholder", "value", "role", "title", "href", "id", "alt", new RegExp(/^data-/)];
10
- const shouldReturnFullHtml = (await tagName.jsonValue()) === "html";
7
+ async function getSimplifiedHtml(root, options) {
8
+ let containerHandle;
9
+ if ("content" in root && typeof root.content === "function") {
10
+ containerHandle = await root.locator("html").elementHandle();
11
+ } else {
12
+ containerHandle = await root.elementHandle();
13
+ }
14
+ if (!containerHandle) {
15
+ return "";
16
+ }
17
+ const tagNameHandle = await containerHandle.evaluateHandle(element => element.tagName.toLowerCase());
18
+ const tagName = await tagNameHandle.jsonValue();
19
+ const shouldReturnFullHtml = tagName === "html";
11
20
  const optionsWithDefault = {
12
- shouldIncludeOnClick: false,
13
- shouldIncludeContentAsProp: false,
14
- keepOnlyVisibleElements: true,
15
- shouldReturnFullHtml,
16
- ...options
21
+ shouldIncludeOnClick: (options === null || options === void 0 ? void 0 : options.shouldIncludeOnClick) ?? false,
22
+ shouldIncludeContentAsProp: (options === null || options === void 0 ? void 0 : options.shouldIncludeContentAsProp) ?? false,
23
+ keepOnlyVisibleElements: (options === null || options === void 0 ? void 0 : options.keepOnlyVisibleElements) ?? true,
24
+ shouldReturnFullHtml
17
25
  };
26
+ const ALLOWED_ATTRIBUTES = ["aria-label", "data-name", "name", "type", "placeholder", "value", "role", "title", "href", "id", "alt", new RegExp(/^data-/)];
18
27
  const simplifiedHtml = await containerHandle.evaluate((element, {
19
28
  optionsWithDefault,
20
29
  ALLOWED_ATTRIBUTES
@@ -42,15 +51,6 @@ async function getSimplifiedHtml(containerHandle, options) {
42
51
  function isElementInteractive(element, style) {
43
52
  return element.tagName === "A" || element.tagName === "INPUT" || element.tagName === "BUTTON" || element.tagName === "SELECT" || element.tagName === "TEXTAREA" || element.hasAttribute("onclick") || element.hasAttribute("onmousedown") || element.hasAttribute("onmouseup") || element.hasAttribute("onkeydown") || element.hasAttribute("onkeyup") || style.cursor === "pointer";
44
53
  }
45
- function getDocumentFromIframeElementSafely(element) {
46
- try {
47
- if (element.contentWindow && element.contentWindow.document) {
48
- return element.contentWindow.document.documentElement;
49
- }
50
- } catch (error) {
51
- return undefined;
52
- }
53
- }
54
54
  function isInputWithValue(element) {
55
55
  return element.tagName === "INPUT" && element.value && element.value.trim();
56
56
  }
@@ -59,7 +59,7 @@ async function getSimplifiedHtml(containerHandle, options) {
59
59
  if (element.nodeType === 3 && (_element$textContent = element.textContent) !== null && _element$textContent !== void 0 && _element$textContent.trim()) {
60
60
  return document.createTextNode(element.textContent + " ");
61
61
  }
62
- const shouldSkipElementChecks = optionsWithDefault.shouldIncludeIframes ? ["BODY", "HTML", "IFRAME"] : ["BODY", "HTML"].includes(element.nodeName);
62
+ const shouldSkipElementChecks = ["BODY", "HTML"].includes(element.nodeName);
63
63
  if (!isElementNode(element)) {
64
64
  return null;
65
65
  }
@@ -68,7 +68,7 @@ async function getSimplifiedHtml(containerHandle, options) {
68
68
  if (keepOnlyVisibleElements && !isVisible && !isInputWithValue(element)) {
69
69
  return null;
70
70
  }
71
- let children = optionsWithDefault.shouldIncludeIframes && element.nodeName === "IFRAME" ? [getDocumentFromIframeElementSafely(element)].filter(Boolean) : Array.from(element.childNodes).map(c => generateSimplifiedDom(c, interactiveElements, document, allowedAttributes, shouldIncludeContentAsProp, keepOnlyVisibleElements)).filter(truthyFilter);
71
+ let children = Array.from(element.childNodes).map(c => generateSimplifiedDom(c, interactiveElements, document, allowedAttributes, shouldIncludeContentAsProp, keepOnlyVisibleElements)).filter(truthyFilter);
72
72
  if (element.tagName === "BODY") children = children.filter(c => c.nodeType !== 3);
73
73
  const interactive = isElementInteractive(element, style) || element.hasAttribute("role");
74
74
  const hasLabel = element.hasAttribute("aria-label") || element.hasAttribute("name");
@@ -3,6 +3,7 @@
3
3
  Object.defineProperty(exports, "__esModule", {
4
4
  value: true
5
5
  });
6
+ exports.createMatchesMapping = createMatchesMapping;
6
7
  exports.filterAndRankMatches = filterAndRankMatches;
7
8
  exports.isMatchExact = isMatchExact;
8
9
  exports.matchStringsWithDomContent = matchStringsWithDomContent;
@@ -12,7 +13,9 @@ exports.rankMatch = rankMatch;
12
13
  exports.removePunctuationAndSpaces = removePunctuationAndSpaces;
13
14
  exports.replaceWithBestMatches = replaceWithBestMatches;
14
15
  exports.selectBestMatch = selectBestMatch;
16
+ exports.validateMatchesMapping = validateMatchesMapping;
15
17
  var _utils = require("../../helpers/utils");
18
+ var _frame_utils = require("../frame_utils");
16
19
  const logger = {
17
20
  info: message => console.info(message),
18
21
  warning: message => console.warn(message),
@@ -153,22 +156,18 @@ function selectBestMatch({
153
156
  }
154
157
  async function matchStringsWithDomContent({
155
158
  pageObject,
156
- stringsList,
157
- container = null
159
+ stringsList
158
160
  }) {
159
161
  try {
160
162
  await (0, _utils.ensureBrowserScripts)(pageObject);
161
- let handle;
162
- if (container) {
163
- handle = container;
164
- } else {
165
- handle = await pageObject.locator("html").elementHandle();
166
- }
167
- const matches = await pageObject.evaluate(async args => {
168
- const [container, searchTexts] = args;
163
+ const htmlContent = await (0, _frame_utils.getContentWithNestedIframes)(pageObject);
164
+ const matches = await pageObject.evaluate(([htmlContent, searchTexts]) => {
169
165
  try {
170
166
  if (typeof window.__INTUNED__ !== "undefined" && typeof window.__INTUNED__.matchStringsWithDomContent === "function") {
171
- return await window.__INTUNED__.matchStringsWithDomContent(container, searchTexts);
167
+ const parser = new DOMParser();
168
+ const doc = parser.parseFromString(htmlContent, "text/html");
169
+ const domNode = doc.body || doc.documentElement;
170
+ return window.__INTUNED__.matchStringsWithDomContent(domNode, searchTexts);
172
171
  } else {
173
172
  return searchTexts.reduce((acc, text) => {
174
173
  acc[text] = [];
@@ -182,7 +181,7 @@ async function matchStringsWithDomContent({
182
181
  return acc;
183
182
  }, {});
184
183
  }
185
- }, [handle, stringsList]);
184
+ }, [htmlContent, stringsList]);
186
185
  return matches;
187
186
  } catch (error) {
188
187
  logger.warning(`Error matching strings with DOM content: ${error}`);
@@ -194,13 +193,11 @@ async function matchStringsWithDomContent({
194
193
  }
195
194
  async function replaceWithBestMatches({
196
195
  stringsToMatch,
197
- pageObject,
198
- container = null
196
+ pageObject
199
197
  }) {
200
198
  const matchesMap = await matchStringsWithDomContent({
201
199
  pageObject,
202
- stringsList: stringsToMatch,
203
- container
200
+ stringsList: stringsToMatch
204
201
  });
205
202
  const replacements = {};
206
203
  const xpathMapping = {};
@@ -222,6 +219,84 @@ async function replaceWithBestMatches({
222
219
  xpathMapping
223
220
  };
224
221
  }
222
/**
 * Builds a lookup from each extracted value to the DOM locations where it was found.
 *
 * Values are gathered (stringified and de-duplicated) from `extractedData`, which may be
 * an array of strings, an array of objects, or a single object. They are then matched
 * against the page through `matchStringsWithDomContent`. Fuzzy matches are kept only
 * when `rankMatch` rates them "HIGH"; matches of any other mode are always kept.
 *
 * @param page Page object forwarded to `matchStringsWithDomContent` as `pageObject`.
 * @param extractedData Array of strings, array of objects, or a plain object.
 * @returns {Promise<Object>} Map of value -> list of `{ xpath, matched_value }`.
 */
async function createMatchesMapping(page, extractedData) {
  const collected = new Set();
  const isRecord = candidate => typeof candidate === "object" && candidate !== null;
  const addRecordValues = record => {
    for (const fieldValue of Object.values(record)) {
      collected.add(String(fieldValue));
    }
  };

  if (Array.isArray(extractedData)) {
    // The first element decides whether the array is treated as a list of strings.
    const looksLikeStringList = extractedData.length > 0 && typeof extractedData[0] === "string";
    extractedData.forEach(entry => {
      if (looksLikeStringList) {
        collected.add(String(entry));
      } else if (isRecord(entry)) {
        addRecordValues(entry);
      }
    });
  } else if (isRecord(extractedData)) {
    addRecordValues(extractedData);
  }

  const matchesByValue = await matchStringsWithDomContent({
    pageObject: page,
    stringsList: Array.from(collected)
  });

  const mapping = {};
  for (const [value, candidates] of Object.entries(matchesByValue)) {
    const accepted = candidates.filter(candidate => candidate.match_mode !== "fuzzy" || rankMatch({
      original: value,
      match: candidate.matched_value
    }) === "HIGH");
    mapping[value] = accepted.map(candidate => ({
      xpath: candidate.xpath || "",
      matched_value: candidate.matched_value || ""
    }));
  }
  return mapping;
}
260
/**
 * Checks whether a previously cached value -> matches mapping still describes the page.
 *
 * The cached keys are re-matched through `matchStringsWithDomContent`. For every key,
 * the cached list and the fresh list (reduced to `{ xpath, matched_value }`) are sorted
 * by the same key and compared pairwise.
 *
 * @param page Page object forwarded to `matchStringsWithDomContent` as `pageObject`.
 * @param cachedMapping Map of value -> list of `{ xpath, matched_value }`.
 * @returns {Promise<boolean>} `true` only when every cached entry has an identical set
 *   of matches on the current page; `false` on any difference or on any error
 *   (errors are logged, not thrown).
 */
async function validateMatchesMapping(page, cachedMapping) {
  // Shared ordering so both lists line up index by index.
  const byXpathThenValue = (left, right) => {
    const leftKey = `${left.xpath}|${left.matched_value}`;
    const rightKey = `${right.xpath}|${right.matched_value}`;
    return leftKey.localeCompare(rightKey);
  };

  try {
    const freshMatches = await matchStringsWithDomContent({
      pageObject: page,
      stringsList: Object.keys(cachedMapping)
    });

    for (const [value, cachedList] of Object.entries(cachedMapping)) {
      if (!(value in freshMatches)) {
        return false;
      }

      const freshList = freshMatches[value].map(found => ({
        xpath: found.xpath || "",
        matched_value: found.matched_value || ""
      }));
      // Copy before sorting so the caller's cached list is never reordered.
      const cachedSorted = [...cachedList].sort(byXpathThenValue);
      const freshSorted = freshList.sort(byXpathThenValue);

      if (cachedSorted.length !== freshSorted.length) {
        return false;
      }
      const hasMismatch = cachedSorted.some((cachedItem, index) => {
        const freshItem = freshSorted[index];
        return cachedItem.xpath !== freshItem.xpath || cachedItem.matched_value !== freshItem.matched_value;
      });
      if (hasMismatch) {
        return false;
      }
    }
    return true;
  } catch (error) {
    logger.error(`Error validating matches mapping: ${error}`);
    return false;
  }
}
225
300
  async function filterAndRankMatches(frame, matches) {
226
301
  const filteredMatches = matches.filter(match => {
227
302
  const xpath = match.xpath;