@intuned/browser-dev 0.1.9-dev.0 → 0.1.10-dev.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ai/extractStructuredData.js +21 -27
- package/dist/ai/tests/testCreateMatchesMapping.spec.js +216 -0
- package/dist/ai/tests/testExtractStructuredData.spec.js +346 -0
- package/dist/ai/tests/testExtractStructuredDataDomMatchingIframes.spec.js +459 -0
- package/dist/ai/tests/testExtractStructuredDataUnit.spec.js +375 -0
- package/dist/ai/tests/testMatching.spec.js +342 -0
- package/dist/ai/tests/testValidateMatchesMapping.spec.js +265 -0
- package/dist/common/extendedTest.js +38 -30
- package/dist/common/frame_utils/frameTree.js +116 -0
- package/dist/common/frame_utils/getContentWithNestedIframes.js +13 -0
- package/dist/common/frame_utils/index.js +95 -0
- package/dist/common/frame_utils/stitchIframe.js +105 -0
- package/dist/{helpers → common}/frame_utils/tests/testFindAllIframes.spec.js +24 -15
- package/dist/common/frame_utils/tests/testGetContentWithNestedIframes.spec.js +241 -0
- package/dist/common/frame_utils/utils.js +91 -0
- package/dist/common/getSimplifiedHtml.js +20 -20
- package/dist/common/matching/matching.js +91 -16
- package/dist/common/tests/matching.test.js +225 -0
- package/dist/common/tests/testGetSimplifiedHtml.spec.js +324 -0
- package/dist/helpers/extractMarkdown.js +16 -7
- package/dist/helpers/tests/testExtractMarkdown.spec.js +29 -0
- package/dist/helpers/waitForDomSettled.js +4 -4
- package/dist/types/intuned-runtime.d.ts +6 -32
- package/package.json +1 -1
- package/dist/helpers/frame_utils/constants.js +0 -8
- package/dist/helpers/frame_utils/findAllIframes.js +0 -82
- package/dist/helpers/frame_utils/index.js +0 -44
- /package/dist/{helpers → common}/frame_utils/checkFrameAllowsAsyncScripts.js +0 -0
- /package/dist/{helpers → common}/frame_utils/getContainerFrame.js +0 -0
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
var _extendedTest = require("../../extendedTest");
|
|
4
|
+
var _playwrightCore = require("playwright-core");
|
|
5
|
+
var _getContentWithNestedIframes = require("../getContentWithNestedIframes");
|
|
6
|
+
var _nodeHtmlParser = require("node-html-parser");
|
|
7
|
+
var path = _interopRequireWildcard(require("path"));
|
|
8
|
+
/**
 * Babel interop helper: wraps a CommonJS export value in a namespace-like
 * object so it can be consumed as `import * as ns`.
 *
 * - Values flagged `__esModule` are returned untouched unless `t` is truthy.
 * - Primitives and `null` yield `{ default: value }` on a null-prototype object.
 * - Objects/functions get their own enumerable keys (except "default") copied
 *   over; accessor properties are copied as accessors so getters stay lazy.
 * - Results are memoised per source value, with a separate cache per `t` mode.
 *
 * On first use the function swaps itself for the inner implementation so the
 * two caches are created only once.
 */
function _interopRequireWildcard(e, t) {
  let plainCache;
  let nodeInteropCache;
  if (typeof WeakMap === "function") {
    plainCache = new WeakMap();
    nodeInteropCache = new WeakMap();
  }
  _interopRequireWildcard = function (source, nodeInterop) {
    if (!nodeInterop && source && source.__esModule) {
      return source;
    }
    const namespace = { __proto__: null, default: source };
    const isWrappable = source !== null && (typeof source === "object" || typeof source === "function");
    if (!isWrappable) {
      return namespace;
    }
    const cache = nodeInterop ? nodeInteropCache : plainCache;
    if (cache) {
      if (cache.has(source)) {
        return cache.get(source);
      }
      // Register before copying so self-referencing modules resolve to the same wrapper.
      cache.set(source, namespace);
    }
    for (const key in source) {
      if (key === "default" || !{}.hasOwnProperty.call(source, key)) {
        continue;
      }
      const descriptor = Object.getOwnPropertyDescriptor(source, key);
      if (descriptor && (descriptor.get || descriptor.set)) {
        Object.defineProperty(namespace, key, descriptor);
      } else {
        namespace[key] = source[key];
      }
    }
    return namespace;
  };
  return _interopRequireWildcard(e, t);
}
|
|
9
|
+
(0, _extendedTest.describe)("Test getContentWithNestedIframes", () => {
  // Local aliases for the compiled module namespaces; all calls stay unbound,
  // exactly like the `(0, ns.fn)(...)` form Babel emits.
  const { test, expect, beforeAll, afterAll, beforeEach, afterEach } = _extendedTest;
  const { getContentWithNestedIframes } = _getContentWithNestedIframes;
  const { parse } = _nodeHtmlParser;

  let browser;
  let page;

  beforeAll(async () => {
    browser = await _playwrightCore.chromium.launch({
      headless: true
    });
  });

  afterAll(async () => {
    // Guarded so a failed launch surfaces its own error instead of a TypeError here.
    if (browser) {
      await browser.close();
    }
  });

  beforeEach(async () => {
    page = await browser.newPage();
  });

  afterEach(async () => {
    // Guarded for the same reason as afterAll: newPage() may have thrown.
    if (page) {
      await page.close();
    }
  });

  /** Asserts that `content` contains every string in `snippets`. */
  function expectContainsAll(content, snippets) {
    for (const snippet of snippets) {
      expect(content).toContain(snippet);
    }
  }

  /** Counts non-overlapping matches of a global regex in `content`. */
  function countMatches(content, pattern) {
    return (content.match(pattern) || []).length;
  }

  /**
   * Navigates the shared page to the `fixtures/test_with_iframes.html` fixture.
   *
   * @param {boolean} waitForDynamicContent When true, sleeps 1500 ms after navigation.
   * @returns The shared `page`.
   */
  async function getTestPageWithIframes(waitForDynamicContent = false) {
    const testHtmlPath = path.resolve(__dirname, "fixtures", "test_with_iframes.html");
    // pathToFileURL percent-encodes special characters and handles Windows
    // drive letters; a hand-built `file://${path}` does neither.
    const fixtureUrl = require("url").pathToFileURL(testHtmlPath).href;
    await page.goto(fixtureUrl);
    if (waitForDynamicContent) {
      // presumably lets the fixture's script inject its iframe — fixture not visible here
      await page.waitForTimeout(1500);
    }
    return page;
  }

  test("should get content with nested iframes stitched - basic", async () => {
    await getTestPageWithIframes();
    const content = await getContentWithNestedIframes(page);
    expectContainsAll(content, ["Static Iframe Content", "Srcdoc Iframe", "Outer Iframe"]);
    expect(countMatches(content, /<iframe-content/g)).toBeGreaterThanOrEqual(4);
    expectContainsAll(content, ['id="static-iframe"', "<h2>Static Iframe Content</h2>", "iframe-element", "srcdoc-span"]);
  });

  test("should handle nested iframes recursively", async () => {
    await getTestPageWithIframes();
    const content = await getContentWithNestedIframes(page);
    expectContainsAll(content, ["Outer Iframe", "Inner Iframe", "Deep Element"]);
    expect(countMatches(content, /<iframe-content/g)).toBeGreaterThanOrEqual(4);
    expect(content).toContain("deep-element");
  });

  test("should handle dynamically created iframes", async () => {
    await getTestPageWithIframes(true);
    const content = await getContentWithNestedIframes(page);
    expectContainsAll(content, ["Dynamic Iframe", "dynamic-element"]);
  });

  test("should handle empty iframes gracefully", async () => {
    await getTestPageWithIframes();
    const content = await getContentWithNestedIframes(page);
    expect(content).toContain("empty-iframe-id");
    // The id must appear exactly once, i.e. the empty iframe is not duplicated.
    expect(countMatches(content, /empty-iframe-id/g)).toBe(1);
  });

  test("should preserve main page content", async () => {
    await getTestPageWithIframes();
    const content = await getContentWithNestedIframes(page);
    expectContainsAll(content, ["Main Page Content", "This is the main page content", "content-after-iframes", "Item 1", "Item 2"]);
  });

  test("should handle cross-origin iframes gracefully", async () => {
    await page.setContent(`
      <html>
        <body element_id="main-body">
          <h1>Main Content</h1>
          <iframe id="cross-origin-iframe"
                  element_id="cross-origin-id"
                  src="https://example.com"
                  width="300"
                  height="200">
          </iframe>
          <p>Content after iframe</p>
        </body>
      </html>
    `);
    await page.waitForTimeout(500);
    const content = await getContentWithNestedIframes(page);
    expectContainsAll(content, ["Main Content", "Content after iframe", "cross-origin-id"]);
  });

  test("should handle malformed HTML in iframe content", async () => {
    await page.setContent(`
      <html>
        <body element_id="main-body">
          <h1>Malformed HTML Test</h1>
          <iframe id="malformed-iframe"
                  element_id="malformed-id"
                  srcdoc="<html><body><h2>Malformed</h2><p>Unclosed tag<div>Missing closing</p></body></html>"
                  width="300"
                  height="200">
          </iframe>
        </body>
      </html>
    `);
    const content = await getContentWithNestedIframes(page);
    expectContainsAll(content, ["Malformed HTML Test", "malformed-id", "Malformed"]);
  });

  test("should handle problematic iframe sources without hanging", async () => {
    await page.setContent(`
      <html>
        <body element_id="main-body">
          <h1>Main Content</h1>
          <iframe id="empty-javascript-iframe"
                  element_id="empty-javascript-id"
                  src="javascript:"
                  width="300"
                  height="200">
          </iframe>
          <iframe id="invalid-blob-iframe"
                  element_id="invalid-blob-id"
                  src="blob:null/invalid"
                  width="300"
                  height="200">
          </iframe>
          <p>Content after iframes</p>
        </body>
      </html>
    `, {
      waitUntil: "domcontentloaded",
      timeout: 5000
    });
    const startTime = Date.now();
    // NOTE(review): another test in this suite passes 10000 as this argument.
    // If the unit is milliseconds, 2.0 is effectively "no wait" — confirm the
    // intended unit against getContentWithNestedIframes.
    const content = await getContentWithNestedIframes(page, 2.0);
    const elapsedTime = Date.now() - startTime;
    expect(elapsedTime).toBeLessThan(10000);
    expectContainsAll(content, ["Main Content", "Content after iframes", "empty-javascript-id", "invalid-blob-id"]);
  });

  test("should handle legacy frame elements", async () => {
    await page.setContent(`
      <html>
        <head><title>Legacy Frameset</title></head>
        <frameset cols="50%,50%">
          <frame id="frame-1"
                 element_id="frame-1-id"
                 src="data:text/html,<html><body><h2>Frame 1</h2><p>Frame 1 Content</p></body></html>">
          </frame>
          <frame id="frame-2"
                 element_id="frame-2-id"
                 src="data:text/html,<html><body><h2>Frame 2</h2><p>Frame 2 Content</p></body></html>">
          </frame>
        </frameset>
      </html>
    `, {
      waitUntil: "domcontentloaded"
    });
    const content = await getContentWithNestedIframes(page);
    // The previous condition (`includes(frame-1-id) || includes(frame-2-id) ||
    // length > 0`) reduces to "content is non-empty", so assert exactly that.
    // NOTE(review): tighten to the frame ids once <frame> stitching is confirmed.
    expect(content.length).toBeGreaterThan(0);
  });

  test("should support custom HTML extractor to skip tags", async () => {
    await page.setContent(`
      <html>
        <body element_id="main-body">
          <h1>Main Content</h1>
          <p>This is main page paragraph that should remain.</p>
          <iframe id="test-iframe"
                  element_id="test-iframe-id"
                  srcdoc="<html><body><h2>Iframe Title</h2><p>This paragraph should be removed.</p><div>This div should remain.</div><p>Another paragraph to remove.</p></body></html>"
                  width="300"
                  height="200">
          </iframe>
          <p>Another main page paragraph that should remain.</p>
        </body>
      </html>
    `, {
      waitUntil: "domcontentloaded"
    });

    // Extractor under test: serialises the root, then strips every <p> element.
    const customExtractor = async root => {
      let htmlContent;
      if ("content" in root && typeof root.content === "function") {
        // Roots exposing content() are serialised through it.
        htmlContent = await root.content();
      } else {
        htmlContent = await root.evaluate(el => el.outerHTML);
      }
      const parsed = parse(htmlContent);
      parsed.querySelectorAll("p").forEach(p => p.remove());
      return parsed.toString();
    };

    const content = await getContentWithNestedIframes(page, 10000, customExtractor);
    expectContainsAll(content, ["Main Content", "test-iframe-id", "Iframe Title", "This div should remain"]);
    expect(countMatches(content, /<p>/g)).toBe(0);
    expectContainsAll(content, ["<h2>Iframe Title</h2>", "<div>This div should remain.</div>"]);

    const iframeContentMatch = content.match(/<iframe-content[^>]*>(.*?)<\/iframe-content>/s);
    // toBeTruthy() throws on a miss, so indexing below is safe.
    expect(iframeContentMatch).toBeTruthy();
    expect(iframeContentMatch[1]).not.toContain("<p>");
  });

  test("should work with locator", async () => {
    await page.setContent(`
      <html>
        <body>
          <div id="container">
            <h1>Container Content</h1>
            <iframe srcdoc="<html><body><h2>Iframe in Container</h2></body></html>"></iframe>
          </div>
          <div id="other">
            <p>Other content</p>
          </div>
        </body>
      </html>`, {
      waitUntil: "domcontentloaded"
    });
    await page.waitForSelector("#container iframe");
    const locator = page.locator("#container");
    const content = await getContentWithNestedIframes(locator);
    expectContainsAll(content, ["Container Content", "Iframe in Container"]);
    // Content outside the locator's subtree must not leak into the result.
    expect(content).not.toContain("Other content");
  });
});
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
Object.defineProperty(exports, "__esModule", {
|
|
4
|
+
value: true
|
|
5
|
+
});
|
|
6
|
+
exports.IFRAME_TAGS = exports.IFRAME_SRC_ATTRS = exports.IFRAME_REPLACEMENT_TAG = exports.IFRAME_PIXEL_SRC_DOMAINS = exports.IFRAME_CAPTCHA_SRC_PATTERNS = exports.ALL_IFRAMES_CSS_SELECTOR = void 0;
|
|
7
|
+
exports.findTopLevelIframeElements = findTopLevelIframeElements;
|
|
8
|
+
// Tag names treated as frames.
const IFRAME_TAGS = ["iframe", "frame"];
exports.IFRAME_TAGS = IFRAME_TAGS;

// CSS selector matching every tag in IFRAME_TAGS ("iframe, frame").
const ALL_IFRAMES_CSS_SELECTOR = IFRAME_TAGS.join(", ");
exports.ALL_IFRAMES_CSS_SELECTOR = ALL_IFRAMES_CSS_SELECTOR;

// Attributes probed, in this order, for a frame's URL; the first non-empty one wins.
const IFRAME_SRC_ATTRS = [
  "src",
  "data-src",
  "data-lazy-src",
  "data-original-src",
  "data-url",
  "data-iframe-src"
];
exports.IFRAME_SRC_ATTRS = IFRAME_SRC_ATTRS;

// URL substrings that make findTopLevelIframeElements skip a frame.
// The final generic "captcha" entry also covers most of the specific ones.
const IFRAME_CAPTCHA_SRC_PATTERNS = [
  "google.com/recaptcha",
  "recaptcha.net/recaptcha",
  "gstatic.com/recaptcha",
  "hcaptcha.com",
  "hcaptcha.net",
  "funcaptcha.com",
  "arkoselabs.com",
  "challenges.cloudflare.com",
  "cloudflare.com/cdn-cgi/challenge",
  "cloudflare.com/turnstile",
  "geetest.com",
  "awswaf",
  "aws-waf",
  "leminnow.com",
  "captcha"
];
exports.IFRAME_CAPTCHA_SRC_PATTERNS = IFRAME_CAPTCHA_SRC_PATTERNS;

// Further URL substrings that make findTopLevelIframeElements skip a frame.
const IFRAME_PIXEL_SRC_DOMAINS = [
  "facebook.com/tr",
  "doubleclick.net",
  "google-analytics.com",
  "bat.bing.com",
  "analytics.twitter.com",
  "px.ads.linkedin.com",
  "t.co/i/adsct",
  "adsrvr.org",
  "demdex.net",
  "crwdcntrl.net"
];
exports.IFRAME_PIXEL_SRC_DOMAINS = IFRAME_PIXEL_SRC_DOMAINS;

// Replacement tag name per frame tag (consumers are outside this file).
const IFRAME_REPLACEMENT_TAG = {
  iframe: "iframe-content",
  frame: "frame-content"
};
exports.IFRAME_REPLACEMENT_TAG = IFRAME_REPLACEMENT_TAG;
|
|
17
|
+
/**
 * Collects element handles for the <iframe>/<frame> elements found under `root`.
 *
 * @param root An object exposing `elementHandle()` (e.g. a Playwright locator),
 *   searched below its element, or an object exposing `evaluateHandle()`
 *   (e.g. a page or frame), searched from `document`.
 * @param {boolean} [skipUseless=true] When truthy, drops frames whose URL
 *   contains a captcha pattern or pixel domain, and frames whose declared
 *   width or height (attribute, else inline style) is exactly 0 or 1.
 * @returns {Promise<Array>} Element handles of the retained frames; an empty
 *   array when `root.elementHandle()` resolves to nothing.
 */
async function findTopLevelIframeElements(root, skipUseless = true) {
  // Runs inside the page. Playwright ships it via toString(), so it must be
  // self-contained: it may only use `scope`, `cfg` and browser globals.
  // Passing a real function (instead of rebuilding one from a string with
  // `new Function`) keeps this working on pages whose CSP forbids unsafe-eval.
  const collectIframes = (scope, cfg) => {
    const candidates = Array.from(scope.querySelectorAll(cfg.selector));
    if (!cfg.skipUseless) {
      return candidates;
    }

    // Anchored at a declaration boundary so `border-width`, `max-width`,
    // `min-height`, `line-height`… are not mistaken for width/height, and
    // restricted to unitless/px values so `1em` or `1%` is not read as 1 pixel.
    const WIDTH_DECL = /(?:^|;)\s*width\s*:\s*(\d+(?:\.\d+)?)(?:px)?\s*(?:;|!|$)/i;
    const HEIGHT_DECL = /(?:^|;)\s*height\s*:\s*(\d+(?:\.\d+)?)(?:px)?\s*(?:;|!|$)/i;

    // Numeric value of a width/height attribute; null for missing, percentage
    // or unit-bearing values.
    const parseNum = val => {
      if (!val) return null;
      const str = String(val).trim();
      if (/[%a-zA-Z]/.test(str)) return null;
      const n = parseFloat(str);
      return isNaN(n) ? null : n;
    };

    const fromStyle = (style, pattern) => {
      const found = style.match(pattern);
      return found ? parseFloat(found[1]) : null;
    };

    return candidates.filter(el => {
      // First non-empty source attribute is taken as the frame URL.
      let url = "";
      for (const attr of cfg.srcAttrs) {
        const val = el.getAttribute(attr);
        if (val) {
          url = val;
          break;
        }
      }
      const urlLower = url.toLowerCase();
      if (cfg.captchaPatterns.some(p => urlLower.includes(p))) return false;
      if (cfg.pixelDomains.some(d => urlLower.includes(d))) return false;

      const style = el.getAttribute("style") || "";
      let width = parseNum(el.getAttribute("width"));
      let height = parseNum(el.getAttribute("height"));
      // Inline style is only a fallback for a missing/unparseable attribute.
      if (width === null) width = fromStyle(style, WIDTH_DECL);
      if (height === null) height = fromStyle(style, HEIGHT_DECL);

      // 0- or 1-sized frames are treated as tracking pixels.
      return !(width === 0 || width === 1 || height === 0 || height === 1);
    });
  };

  // Plain-data argument for the page function; both pattern lists are
  // lower-cased because they are compared against the lower-cased URL.
  const cfg = {
    selector: ALL_IFRAMES_CSS_SELECTOR,
    skipUseless: Boolean(skipUseless),
    srcAttrs: IFRAME_SRC_ATTRS,
    captchaPatterns: IFRAME_CAPTCHA_SRC_PATTERNS.map(p => p.toLowerCase()),
    pixelDomains: IFRAME_PIXEL_SRC_DOMAINS.map(d => d.toLowerCase())
  };

  // Search scope: the root's own element, or the document for page-like roots.
  const scopeHandle = "elementHandle" in root ? await root.elementHandle() : await root.evaluateHandle("document");
  if (!scopeHandle) {
    return [];
  }

  let arrayHandle;
  try {
    arrayHandle = await scopeHandle.evaluateHandle(collectIframes, cfg);
    const properties = await arrayHandle.getProperties();
    const handles = [];
    for (const jsHandle of properties.values()) {
      const el = jsHandle.asElement();
      if (el) {
        handles.push(el);
      }
    }
    return handles;
  } finally {
    // Release the temporary handles; the per-element handles returned above
    // are separate objects and remain usable by the caller.
    if (arrayHandle) {
      await arrayHandle.dispose();
    }
    await scopeHandle.dispose();
  }
}
|
|
@@ -4,17 +4,26 @@ Object.defineProperty(exports, "__esModule", {
|
|
|
4
4
|
value: true
|
|
5
5
|
});
|
|
6
6
|
exports.getSimplifiedHtml = getSimplifiedHtml;
|
|
7
|
-
async function getSimplifiedHtml(
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
7
|
+
async function getSimplifiedHtml(root, options) {
|
|
8
|
+
let containerHandle;
|
|
9
|
+
if ("content" in root && typeof root.content === "function") {
|
|
10
|
+
containerHandle = await root.locator("html").elementHandle();
|
|
11
|
+
} else {
|
|
12
|
+
containerHandle = await root.elementHandle();
|
|
13
|
+
}
|
|
14
|
+
if (!containerHandle) {
|
|
15
|
+
return "";
|
|
16
|
+
}
|
|
17
|
+
const tagNameHandle = await containerHandle.evaluateHandle(element => element.tagName.toLowerCase());
|
|
18
|
+
const tagName = await tagNameHandle.jsonValue();
|
|
19
|
+
const shouldReturnFullHtml = tagName === "html";
|
|
11
20
|
const optionsWithDefault = {
|
|
12
|
-
shouldIncludeOnClick: false,
|
|
13
|
-
shouldIncludeContentAsProp: false,
|
|
14
|
-
keepOnlyVisibleElements: true,
|
|
15
|
-
shouldReturnFullHtml
|
|
16
|
-
...options
|
|
21
|
+
shouldIncludeOnClick: (options === null || options === void 0 ? void 0 : options.shouldIncludeOnClick) ?? false,
|
|
22
|
+
shouldIncludeContentAsProp: (options === null || options === void 0 ? void 0 : options.shouldIncludeContentAsProp) ?? false,
|
|
23
|
+
keepOnlyVisibleElements: (options === null || options === void 0 ? void 0 : options.keepOnlyVisibleElements) ?? true,
|
|
24
|
+
shouldReturnFullHtml
|
|
17
25
|
};
|
|
26
|
+
const ALLOWED_ATTRIBUTES = ["aria-label", "data-name", "name", "type", "placeholder", "value", "role", "title", "href", "id", "alt", new RegExp(/^data-/)];
|
|
18
27
|
const simplifiedHtml = await containerHandle.evaluate((element, {
|
|
19
28
|
optionsWithDefault,
|
|
20
29
|
ALLOWED_ATTRIBUTES
|
|
@@ -42,15 +51,6 @@ async function getSimplifiedHtml(containerHandle, options) {
|
|
|
42
51
|
function isElementInteractive(element, style) {
|
|
43
52
|
return element.tagName === "A" || element.tagName === "INPUT" || element.tagName === "BUTTON" || element.tagName === "SELECT" || element.tagName === "TEXTAREA" || element.hasAttribute("onclick") || element.hasAttribute("onmousedown") || element.hasAttribute("onmouseup") || element.hasAttribute("onkeydown") || element.hasAttribute("onkeyup") || style.cursor === "pointer";
|
|
44
53
|
}
|
|
45
|
-
function getDocumentFromIframeElementSafely(element) {
|
|
46
|
-
try {
|
|
47
|
-
if (element.contentWindow && element.contentWindow.document) {
|
|
48
|
-
return element.contentWindow.document.documentElement;
|
|
49
|
-
}
|
|
50
|
-
} catch (error) {
|
|
51
|
-
return undefined;
|
|
52
|
-
}
|
|
53
|
-
}
|
|
54
54
|
function isInputWithValue(element) {
|
|
55
55
|
return element.tagName === "INPUT" && element.value && element.value.trim();
|
|
56
56
|
}
|
|
@@ -59,7 +59,7 @@ async function getSimplifiedHtml(containerHandle, options) {
|
|
|
59
59
|
if (element.nodeType === 3 && (_element$textContent = element.textContent) !== null && _element$textContent !== void 0 && _element$textContent.trim()) {
|
|
60
60
|
return document.createTextNode(element.textContent + " ");
|
|
61
61
|
}
|
|
62
|
-
const shouldSkipElementChecks =
|
|
62
|
+
const shouldSkipElementChecks = ["BODY", "HTML"].includes(element.nodeName);
|
|
63
63
|
if (!isElementNode(element)) {
|
|
64
64
|
return null;
|
|
65
65
|
}
|
|
@@ -68,7 +68,7 @@ async function getSimplifiedHtml(containerHandle, options) {
|
|
|
68
68
|
if (keepOnlyVisibleElements && !isVisible && !isInputWithValue(element)) {
|
|
69
69
|
return null;
|
|
70
70
|
}
|
|
71
|
-
let children =
|
|
71
|
+
let children = Array.from(element.childNodes).map(c => generateSimplifiedDom(c, interactiveElements, document, allowedAttributes, shouldIncludeContentAsProp, keepOnlyVisibleElements)).filter(truthyFilter);
|
|
72
72
|
if (element.tagName === "BODY") children = children.filter(c => c.nodeType !== 3);
|
|
73
73
|
const interactive = isElementInteractive(element, style) || element.hasAttribute("role");
|
|
74
74
|
const hasLabel = element.hasAttribute("aria-label") || element.hasAttribute("name");
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
Object.defineProperty(exports, "__esModule", {
|
|
4
4
|
value: true
|
|
5
5
|
});
|
|
6
|
+
exports.createMatchesMapping = createMatchesMapping;
|
|
6
7
|
exports.filterAndRankMatches = filterAndRankMatches;
|
|
7
8
|
exports.isMatchExact = isMatchExact;
|
|
8
9
|
exports.matchStringsWithDomContent = matchStringsWithDomContent;
|
|
@@ -12,7 +13,9 @@ exports.rankMatch = rankMatch;
|
|
|
12
13
|
exports.removePunctuationAndSpaces = removePunctuationAndSpaces;
|
|
13
14
|
exports.replaceWithBestMatches = replaceWithBestMatches;
|
|
14
15
|
exports.selectBestMatch = selectBestMatch;
|
|
16
|
+
exports.validateMatchesMapping = validateMatchesMapping;
|
|
15
17
|
var _utils = require("../../helpers/utils");
|
|
18
|
+
var _frame_utils = require("../frame_utils");
|
|
16
19
|
const logger = {
|
|
17
20
|
info: message => console.info(message),
|
|
18
21
|
warning: message => console.warn(message),
|
|
@@ -153,22 +156,18 @@ function selectBestMatch({
|
|
|
153
156
|
}
|
|
154
157
|
async function matchStringsWithDomContent({
|
|
155
158
|
pageObject,
|
|
156
|
-
stringsList
|
|
157
|
-
container = null
|
|
159
|
+
stringsList
|
|
158
160
|
}) {
|
|
159
161
|
try {
|
|
160
162
|
await (0, _utils.ensureBrowserScripts)(pageObject);
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
handle = container;
|
|
164
|
-
} else {
|
|
165
|
-
handle = await pageObject.locator("html").elementHandle();
|
|
166
|
-
}
|
|
167
|
-
const matches = await pageObject.evaluate(async args => {
|
|
168
|
-
const [container, searchTexts] = args;
|
|
163
|
+
const htmlContent = await (0, _frame_utils.getContentWithNestedIframes)(pageObject);
|
|
164
|
+
const matches = await pageObject.evaluate(([htmlContent, searchTexts]) => {
|
|
169
165
|
try {
|
|
170
166
|
if (typeof window.__INTUNED__ !== "undefined" && typeof window.__INTUNED__.matchStringsWithDomContent === "function") {
|
|
171
|
-
|
|
167
|
+
const parser = new DOMParser();
|
|
168
|
+
const doc = parser.parseFromString(htmlContent, "text/html");
|
|
169
|
+
const domNode = doc.body || doc.documentElement;
|
|
170
|
+
return window.__INTUNED__.matchStringsWithDomContent(domNode, searchTexts);
|
|
172
171
|
} else {
|
|
173
172
|
return searchTexts.reduce((acc, text) => {
|
|
174
173
|
acc[text] = [];
|
|
@@ -182,7 +181,7 @@ async function matchStringsWithDomContent({
|
|
|
182
181
|
return acc;
|
|
183
182
|
}, {});
|
|
184
183
|
}
|
|
185
|
-
}, [
|
|
184
|
+
}, [htmlContent, stringsList]);
|
|
186
185
|
return matches;
|
|
187
186
|
} catch (error) {
|
|
188
187
|
logger.warning(`Error matching strings with DOM content: ${error}`);
|
|
@@ -194,13 +193,11 @@ async function matchStringsWithDomContent({
|
|
|
194
193
|
}
|
|
195
194
|
async function replaceWithBestMatches({
|
|
196
195
|
stringsToMatch,
|
|
197
|
-
pageObject
|
|
198
|
-
container = null
|
|
196
|
+
pageObject
|
|
199
197
|
}) {
|
|
200
198
|
const matchesMap = await matchStringsWithDomContent({
|
|
201
199
|
pageObject,
|
|
202
|
-
stringsList: stringsToMatch
|
|
203
|
-
container
|
|
200
|
+
stringsList: stringsToMatch
|
|
204
201
|
});
|
|
205
202
|
const replacements = {};
|
|
206
203
|
const xpathMapping = {};
|
|
@@ -222,6 +219,84 @@ async function replaceWithBestMatches({
|
|
|
222
219
|
xpathMapping
|
|
223
220
|
};
|
|
224
221
|
}
|
|
222
|
+
/**
 * Builds a mapping from each extracted value (stringified) to the DOM matches
 * found for it on `page`.
 *
 * Accepted shapes of `extractedData`:
 *  - array whose first item is a string: every item is stringified and searched;
 *  - any other array: the property values of each non-null object item are searched;
 *  - non-null object: its property values are searched.
 * Anything else yields no search strings.
 *
 * Matches whose `match_mode` is "fuzzy" are kept only when `rankMatch` rates
 * them "HIGH"; every other match is kept as-is.
 *
 * @returns {Promise<Object>} value -> [{ xpath, matched_value }], with missing
 *   fields normalised to "".
 */
async function createMatchesMapping(page, extractedData) {
  const searchStrings = new Set();
  const isRecord = candidate => typeof candidate === "object" && candidate !== null;
  const addFieldValues = record => {
    Object.values(record).forEach(fieldValue => searchStrings.add(String(fieldValue)));
  };

  if (Array.isArray(extractedData)) {
    const isStringList = extractedData.length > 0 && typeof extractedData[0] === "string";
    extractedData.forEach(item => {
      if (isStringList) {
        searchStrings.add(String(item));
      } else if (isRecord(item)) {
        addFieldValues(item);
      }
    });
  } else if (isRecord(extractedData)) {
    addFieldValues(extractedData);
  }

  const matchesMap = await matchStringsWithDomContent({
    pageObject: page,
    stringsList: Array.from(searchStrings)
  });

  // Exact-style matches always pass; fuzzy ones must be ranked "HIGH".
  const isAcceptable = (original, candidate) => {
    if (candidate.match_mode !== "fuzzy") {
      return true;
    }
    return rankMatch({
      original,
      match: candidate.matched_value
    }) === "HIGH";
  };

  const mapping = {};
  for (const [value, matches] of Object.entries(matchesMap)) {
    const kept = [];
    for (const candidate of matches.filter(m => isAcceptable(value, m))) {
      kept.push({
        xpath: candidate.xpath || "",
        matched_value: candidate.matched_value || ""
      });
    }
    mapping[value] = kept;
  }
  return mapping;
}
|
|
260
|
+
/**
 * Checks whether a previously computed matches mapping still describes `page`.
 *
 * Re-runs the DOM matching for every key of `cachedMapping` and compares, per
 * key, the set of `{ xpath, matched_value }` pairs (order-insensitive) against
 * the cached list.
 *
 * @returns {Promise<boolean>} true when every key is still present with the
 *   same pairs; false on any difference or if anything throws (the error is
 *   logged).
 */
async function validateMatchesMapping(page, cachedMapping) {
  const sortKey = entry => `${entry.xpath}|${entry.matched_value}`;
  const bySortKey = (a, b) => sortKey(a).localeCompare(sortKey(b));
  const normalise = match => ({
    xpath: match.xpath || "",
    matched_value: match.matched_value || ""
  });

  try {
    const currentMatches = await matchStringsWithDomContent({
      pageObject: page,
      stringsList: Object.keys(cachedMapping)
    });

    for (const [value, cachedMatchList] of Object.entries(cachedMapping)) {
      if (!(value in currentMatches)) {
        return false;
      }
      const currentSorted = currentMatches[value].map(normalise).sort(bySortKey);
      const cachedSorted = [...cachedMatchList].sort(bySortKey);

      const identical = cachedSorted.length === currentSorted.length && cachedSorted.every((cached, index) => {
        const current = currentSorted[index];
        return cached.xpath === current.xpath && cached.matched_value === current.matched_value;
      });
      if (!identical) {
        return false;
      }
    }
    return true;
  } catch (error) {
    logger.error(`Error validating matches mapping: ${error}`);
    return false;
  }
}
|
|
225
300
|
async function filterAndRankMatches(frame, matches) {
|
|
226
301
|
const filteredMatches = matches.filter(match => {
|
|
227
302
|
const xpath = match.xpath;
|