@intuned/browser-dev 2.2.3-test-build.0 → 2.2.3-test-build.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/BROWSER_SCRIPTS_SETUP.md +78 -0
- package/dist/common/ensureBrowserScripts.js +2 -6
- package/dist/common/{browser_scripts.js → script.js} +143 -136
- package/dist/helpers/clickUntilExhausted.js +3 -3
- package/dist/helpers/export.d.ts +5 -5
- package/dist/helpers/index.d.ts +5 -5
- package/dist/helpers/index.js +2 -8
- package/dist/helpers/tests/testClickUntilExhausted.spec.js +10 -25
- package/package.json +4 -2
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# Browser Scripts Setup
|
|
2
|
+
|
|
3
|
+
This document explains how the TypeScript SDK handles the shared `browser_scripts.js` file from the monorepo root.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
The `common/browser_scripts.js` file at the root of the monorepo needs to be included in both the TypeScript and Python SDK builds. Instead of copying the file into the build output and reading it from disk at runtime, the TypeScript SDK generates a TypeScript module that exports the script content as a string constant.
|
|
8
|
+
|
|
9
|
+
## How It Works
|
|
10
|
+
|
|
11
|
+
1. **Source File**: `/common/browser_scripts.js` - The shared browser scripts file
|
|
12
|
+
2. **Generated File**: `/typescript-sdk/src/common/script.ts` - Auto-generated TypeScript module
|
|
13
|
+
3. **Generator Script**: `/typescript-sdk/scripts/generate-browser-script.js` - Node.js script that reads the source and generates the TS module
|
|
14
|
+
|
|
15
|
+
## Build Process
|
|
16
|
+
|
|
17
|
+
The generation happens automatically:
|
|
18
|
+
|
|
19
|
+
### During Installation
|
|
20
|
+
|
|
21
|
+
When you run `yarn install`, the `postinstall` hook automatically generates `script.ts`:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
yarn install # automatically runs generate-browser-script
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
### During Build
|
|
28
|
+
|
|
29
|
+
The build script also regenerates `script.ts` before compiling:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
yarn build # runs generate-browser-script → tsc → babel
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### Manual Generation
|
|
36
|
+
|
|
37
|
+
You can manually regenerate the file:
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
yarn generate-browser-script
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Usage in Code
|
|
44
|
+
|
|
45
|
+
In `ensureBrowserScripts.ts`:
|
|
46
|
+
|
|
47
|
+
```typescript
|
|
48
|
+
import { BROWSER_SCRIPT } from "./script";
|
|
49
|
+
|
|
50
|
+
async function ensureBrowserScripts(page: Page): Promise<void> {
|
|
51
|
+
// Check if already loaded
|
|
52
|
+
const pageHasScript = await page.evaluate(
|
|
53
|
+
'() => typeof window.__INTUNED__ !== "undefined"'
|
|
54
|
+
);
|
|
55
|
+
|
|
56
|
+
if (pageHasScript) {
|
|
57
|
+
return;
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
// Inject the script
|
|
61
|
+
await page.evaluate(BROWSER_SCRIPT);
|
|
62
|
+
}
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Important Notes
|
|
66
|
+
|
|
67
|
+
- **DO NOT** edit `script.ts` manually - it's auto-generated
|
|
68
|
+
- **DO NOT** commit `script.ts` to git - it's in `.gitignore`
|
|
69
|
+
- The file is regenerated on every build and install
|
|
70
|
+
- If you update `common/browser_scripts.js`, run `yarn generate-browser-script` to regenerate `script.ts`
|
|
71
|
+
|
|
72
|
+
## Benefits
|
|
73
|
+
|
|
74
|
+
1. ✅ **No runtime file reading** - Script is bundled as a string constant
|
|
75
|
+
2. ✅ **Type-safe** - TypeScript can check imports
|
|
76
|
+
3. ✅ **Better performance** - No filesystem access at runtime
|
|
77
|
+
4. ✅ **Cleaner builds** - No need to copy .js files during build
|
|
78
|
+
5. ✅ **Single source of truth** - Every generation of `script.ts` uses the latest `browser_scripts.js` from `common/`
|
|
@@ -4,15 +4,11 @@ Object.defineProperty(exports, "__esModule", {
|
|
|
4
4
|
value: true
|
|
5
5
|
});
|
|
6
6
|
exports.ensureBrowserScripts = ensureBrowserScripts;
|
|
7
|
-
var
|
|
8
|
-
var _path = require("path");
|
|
7
|
+
var _script = require("./script");
|
|
9
8
|
async function ensureBrowserScripts(page) {
|
|
10
9
|
const pageHasScript = await page.evaluate('() => typeof window.__INTUNED__ !== "undefined"');
|
|
11
10
|
if (pageHasScript) {
|
|
12
11
|
return;
|
|
13
12
|
}
|
|
14
|
-
|
|
15
|
-
console.log("scriptPath", scriptPath);
|
|
16
|
-
const scriptContent = (0, _fs.readFileSync)(scriptPath, "utf-8");
|
|
17
|
-
await page.evaluate(scriptContent);
|
|
13
|
+
await page.evaluate(_script.BROWSER_SCRIPT);
|
|
18
14
|
}
|
|
@@ -1,4 +1,10 @@
|
|
|
1
|
-
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
Object.defineProperty(exports, "__esModule", {
|
|
4
|
+
value: true
|
|
5
|
+
});
|
|
6
|
+
exports.BROWSER_SCRIPT = void 0;
|
|
7
|
+
const BROWSER_SCRIPT = exports.BROWSER_SCRIPT = `(function () {
|
|
2
8
|
"use strict";
|
|
3
9
|
|
|
4
10
|
var MatchSource;
|
|
@@ -223,7 +229,7 @@
|
|
|
223
229
|
dist: needleIdx,
|
|
224
230
|
};
|
|
225
231
|
} else {
|
|
226
|
-
candidates.set(
|
|
232
|
+
candidates.set(\`\${i},\${needleIdx + 1},\${needleIdx}\`, {
|
|
227
233
|
startIdx: i,
|
|
228
234
|
needleIdx: needleIdx + 1,
|
|
229
235
|
dist: needleIdx,
|
|
@@ -243,9 +249,9 @@
|
|
|
243
249
|
} else {
|
|
244
250
|
// otherwise, update the candidate's needleIdx and keep it
|
|
245
251
|
candidates.set(
|
|
246
|
-
|
|
252
|
+
\`\${candidate.startIdx},\${candidate.needleIdx + 1},\${
|
|
247
253
|
candidate.dist
|
|
248
|
-
}
|
|
254
|
+
}\`,
|
|
249
255
|
{
|
|
250
256
|
startIdx: candidate.startIdx,
|
|
251
257
|
needleIdx: candidate.needleIdx + 1,
|
|
@@ -256,9 +262,9 @@
|
|
|
256
262
|
} else {
|
|
257
263
|
if (candidate.dist === maxDist) continue;
|
|
258
264
|
candidates.set(
|
|
259
|
-
|
|
265
|
+
\`\${candidate.startIdx},\${candidate.needleIdx},\${
|
|
260
266
|
candidate.dist + 1
|
|
261
|
-
}
|
|
267
|
+
}\`,
|
|
262
268
|
{
|
|
263
269
|
startIdx: candidate.startIdx,
|
|
264
270
|
needleIdx: candidate.needleIdx,
|
|
@@ -288,9 +294,9 @@
|
|
|
288
294
|
};
|
|
289
295
|
} else {
|
|
290
296
|
candidates.set(
|
|
291
|
-
|
|
297
|
+
\`\${candidate.startIdx},\${
|
|
292
298
|
candidate.needleIdx + 1 + nSkipped
|
|
293
|
-
}
|
|
299
|
+
},\${candidate.dist + nSkipped}\`,
|
|
294
300
|
{
|
|
295
301
|
startIdx: candidate.startIdx,
|
|
296
302
|
needleIdx: candidate.needleIdx + 1 + nSkipped,
|
|
@@ -303,9 +309,9 @@
|
|
|
303
309
|
}
|
|
304
310
|
if (i + 1 < haystackLen && candidate.needleIdx + 1 < needleLen) {
|
|
305
311
|
candidates.set(
|
|
306
|
-
|
|
312
|
+
\`\${candidate.startIdx},\${candidate.needleIdx + 1},\${
|
|
307
313
|
candidate.dist + 1
|
|
308
|
-
}
|
|
314
|
+
}\`,
|
|
309
315
|
{
|
|
310
316
|
startIdx: candidate.startIdx,
|
|
311
317
|
needleIdx: candidate.needleIdx + 1,
|
|
@@ -346,9 +352,9 @@
|
|
|
346
352
|
return "";
|
|
347
353
|
}
|
|
348
354
|
// Replace newlines and tabs with spaces
|
|
349
|
-
let normalized = text.replace(
|
|
355
|
+
let normalized = text.replace(/\\n/g, " ").replace(/\\t/g, " ");
|
|
350
356
|
// Replace multiple spaces with a single space
|
|
351
|
-
normalized = normalized.split(
|
|
357
|
+
normalized = normalized.split(/\\s+/).join(" ");
|
|
352
358
|
return normalized.trim();
|
|
353
359
|
}
|
|
354
360
|
function isMatchExact(data, value) {
|
|
@@ -447,13 +453,13 @@
|
|
|
447
453
|
element.namespaceURI !== "http://www.w3.org/1999/xhtml" // HTML namespace, this will make xpath locator succeed without [name()='']
|
|
448
454
|
) {
|
|
449
455
|
// Element is in a namespace (SVG, MathML, or custom namespace)
|
|
450
|
-
nodeXPath =
|
|
456
|
+
nodeXPath = \`*[name()='\${nodeName}']\`;
|
|
451
457
|
} else {
|
|
452
458
|
// Standard HTML element
|
|
453
|
-
nodeXPath =
|
|
459
|
+
nodeXPath = \`\${nodeName}[\${siblingsCount}]\`;
|
|
454
460
|
}
|
|
455
461
|
|
|
456
|
-
return parentXPath ?
|
|
462
|
+
return parentXPath ? \`\${parentXPath}/\${nodeXPath}\` : nodeXPath;
|
|
457
463
|
}
|
|
458
464
|
function traverseAndPrune(node, conditionFunc) {
|
|
459
465
|
const children = Array.from(node.children ?? []);
|
|
@@ -722,7 +728,7 @@
|
|
|
722
728
|
return lengthDiff <= 0.2 * stringToMatch.length;
|
|
723
729
|
})
|
|
724
730
|
.map((attr) => attr.value)
|
|
725
|
-
.join("
|
|
731
|
+
.join("\\n");
|
|
726
732
|
const {
|
|
727
733
|
found: isFuzzMatchFound,
|
|
728
734
|
matchedValue,
|
|
@@ -813,10 +819,10 @@
|
|
|
813
819
|
}
|
|
814
820
|
|
|
815
821
|
function convertElementToMarkdown(element) {
|
|
816
|
-
const mdCharsMatcher = /([\\
|
|
822
|
+
const mdCharsMatcher = /([\\\\[\\]()])/g;
|
|
817
823
|
function escapeMd(text) {
|
|
818
824
|
// Escapes markdown-sensitive characters within other markdown constructs.
|
|
819
|
-
return text.replace(mdCharsMatcher, "
|
|
825
|
+
return text.replace(mdCharsMatcher, "\\\\$1");
|
|
820
826
|
}
|
|
821
827
|
function listNumberingStart(attrs) {
|
|
822
828
|
const start = attrs.getNamedItem("start")?.value;
|
|
@@ -827,25 +833,25 @@
|
|
|
827
833
|
}
|
|
828
834
|
}
|
|
829
835
|
// Define the characters that require escaping
|
|
830
|
-
const slashChars = "
|
|
836
|
+
const slashChars = "\\\\\`*_{}[]()#+-.!";
|
|
831
837
|
// Escape any special regex characters in slashChars
|
|
832
838
|
const escapedSlashChars = slashChars.replace(
|
|
833
|
-
/[
|
|
834
|
-
"
|
|
839
|
+
/[-/\\\\^$*+?.()|[\\]{}]/g,
|
|
840
|
+
"\\\\$&"
|
|
835
841
|
);
|
|
836
842
|
// Create the regular expression
|
|
837
843
|
const mdBackslashMatcher = new RegExp(
|
|
838
|
-
|
|
844
|
+
\`\\\\\\\\(?=[\${escapedSlashChars}])\`,
|
|
839
845
|
"g"
|
|
840
846
|
);
|
|
841
|
-
const mdDotMatcher = new RegExp(
|
|
842
|
-
const mdPlusMatcher = new RegExp(
|
|
843
|
-
const mdDashMatcher = new RegExp(
|
|
847
|
+
const mdDotMatcher = new RegExp(\`^(\\\\s*\\\\d+)(\\\\.)(?=\\\\s)\`, "gm");
|
|
848
|
+
const mdPlusMatcher = new RegExp(\`^(\\\\s*)(\\\\+)(?=\\\\s)\`, "gm");
|
|
849
|
+
const mdDashMatcher = new RegExp(\`^(\\\\s*)(-)(?=\\\\s|-)\`, "gm");
|
|
844
850
|
function escapeMdSection(text) {
|
|
845
|
-
text = text.replace(mdBackslashMatcher, "
|
|
846
|
-
text = text.replace(mdDotMatcher, "$1
|
|
847
|
-
text = text.replace(mdPlusMatcher, "$1
|
|
848
|
-
text = text.replace(mdDashMatcher, "$1
|
|
851
|
+
text = text.replace(mdBackslashMatcher, "\\\\\\\\");
|
|
852
|
+
text = text.replace(mdDotMatcher, "$1\\\\$2");
|
|
853
|
+
text = text.replace(mdPlusMatcher, "$1\\\\$2");
|
|
854
|
+
text = text.replace(mdDashMatcher, "$1\\\\$2");
|
|
849
855
|
return text;
|
|
850
856
|
}
|
|
851
857
|
function isFirstTbody(element) {
|
|
@@ -854,7 +860,7 @@
|
|
|
854
860
|
element.nodeName === "TBODY" &&
|
|
855
861
|
(!previousSibling ||
|
|
856
862
|
(previousSibling.nodeName === "THEAD" &&
|
|
857
|
-
|
|
863
|
+
/^\\s*$/i.test(previousSibling.textContent ?? "")))
|
|
858
864
|
);
|
|
859
865
|
}
|
|
860
866
|
function isHeadingRow(tr) {
|
|
@@ -908,7 +914,7 @@
|
|
|
908
914
|
this.abbrData += data;
|
|
909
915
|
}
|
|
910
916
|
if (pureData && !this.pre) {
|
|
911
|
-
data = data.replace(
|
|
917
|
+
data = data.replace(/\\s+/g, " ");
|
|
912
918
|
if (data && data[0] === " ") {
|
|
913
919
|
this.space = 1;
|
|
914
920
|
data = data.substring(1);
|
|
@@ -916,8 +922,8 @@
|
|
|
916
922
|
}
|
|
917
923
|
if (!data && force !== "end") return;
|
|
918
924
|
if (this.startPre) {
|
|
919
|
-
if (!data.startsWith("
|
|
920
|
-
data = "
|
|
925
|
+
if (!data.startsWith("\\n")) {
|
|
926
|
+
data = "\\n" + data;
|
|
921
927
|
}
|
|
922
928
|
}
|
|
923
929
|
let newLineIndent = ">".repeat(this.blockquote ?? 0);
|
|
@@ -932,7 +938,7 @@
|
|
|
932
938
|
newLineIndent += " ";
|
|
933
939
|
}
|
|
934
940
|
}
|
|
935
|
-
data = data.replace(
|
|
941
|
+
data = data.replace(/\\n/g, \`\\n\${newLineIndent}\`);
|
|
936
942
|
}
|
|
937
943
|
if (this.startPre) {
|
|
938
944
|
this.startPre = false;
|
|
@@ -947,11 +953,11 @@
|
|
|
947
953
|
}
|
|
948
954
|
if (force === "end") {
|
|
949
955
|
this.p_p = 0;
|
|
950
|
-
this.out("
|
|
956
|
+
this.out("\\n");
|
|
951
957
|
this.space = 0;
|
|
952
958
|
}
|
|
953
959
|
if (this.p_p) {
|
|
954
|
-
this.out((this.breakToggle + "
|
|
960
|
+
this.out((this.breakToggle + "\\n" + newLineIndent).repeat(this.p_p));
|
|
955
961
|
this.space = 0;
|
|
956
962
|
this.breakToggle = "";
|
|
957
963
|
}
|
|
@@ -963,7 +969,7 @@
|
|
|
963
969
|
}
|
|
964
970
|
if (this.a && force === "end") {
|
|
965
971
|
if (force === "end") {
|
|
966
|
-
this.out("
|
|
972
|
+
this.out("\\n");
|
|
967
973
|
}
|
|
968
974
|
const newA = this.a.filter((link) => {
|
|
969
975
|
if (this.outCount > link.outcount) {
|
|
@@ -976,19 +982,19 @@
|
|
|
976
982
|
if (link.title) {
|
|
977
983
|
this.out(" (" + link.title + ")");
|
|
978
984
|
}
|
|
979
|
-
this.out("
|
|
985
|
+
this.out("\\n");
|
|
980
986
|
return false;
|
|
981
987
|
}
|
|
982
988
|
return true;
|
|
983
989
|
});
|
|
984
990
|
if (this.a.length !== newA.length) {
|
|
985
|
-
this.out("
|
|
991
|
+
this.out("\\n");
|
|
986
992
|
}
|
|
987
993
|
this.a = newA;
|
|
988
994
|
}
|
|
989
995
|
if (this.abbrList && force === "end") {
|
|
990
996
|
for (const [abbr, definition] of Object.entries(this.abbrList)) {
|
|
991
|
-
this.out("
|
|
997
|
+
this.out("\\n *[" + abbr + "]: " + definition + "\\n");
|
|
992
998
|
}
|
|
993
999
|
}
|
|
994
1000
|
this.p_p = 0;
|
|
@@ -998,7 +1004,7 @@
|
|
|
998
1004
|
out(string) {
|
|
999
1005
|
this.outTextList.push(string);
|
|
1000
1006
|
if (string) {
|
|
1001
|
-
this.lastWasNewLine = string.charAt(string.length - 1) === "
|
|
1007
|
+
this.lastWasNewLine = string.charAt(string.length - 1) === "\\n";
|
|
1002
1008
|
}
|
|
1003
1009
|
}
|
|
1004
1010
|
getResult() {
|
|
@@ -1030,7 +1036,7 @@
|
|
|
1030
1036
|
href?.value === node.nodeValue &&
|
|
1031
1037
|
this.absoluteUrlMatcher.test(href.value)
|
|
1032
1038
|
) {
|
|
1033
|
-
this.processOutput(
|
|
1039
|
+
this.processOutput(\`<\${node.nodeValue}>\`);
|
|
1034
1040
|
return;
|
|
1035
1041
|
} else {
|
|
1036
1042
|
this.processOutput("[");
|
|
@@ -1053,7 +1059,7 @@
|
|
|
1053
1059
|
this.padding();
|
|
1054
1060
|
this.processOutput("#".repeat(this.getHeadingLevel(tag)) + " ");
|
|
1055
1061
|
}
|
|
1056
|
-
if (tag == "br") this.processOutput("
|
|
1062
|
+
if (tag == "br") this.processOutput(" \\n");
|
|
1057
1063
|
if (tag == "hr") {
|
|
1058
1064
|
this.padding();
|
|
1059
1065
|
this.processOutput("---");
|
|
@@ -1092,7 +1098,7 @@
|
|
|
1092
1098
|
this.padding();
|
|
1093
1099
|
}
|
|
1094
1100
|
if (["code", "tt"].includes(nodeName)) {
|
|
1095
|
-
this.processOutput("
|
|
1101
|
+
this.processOutput("\`");
|
|
1096
1102
|
}
|
|
1097
1103
|
if (["em", "i", "u"].includes(nodeName)) {
|
|
1098
1104
|
this.processOutput(this.emphasis_mark);
|
|
@@ -1191,14 +1197,14 @@
|
|
|
1191
1197
|
borderCells += cell(border, node.childNodes[i]);
|
|
1192
1198
|
}
|
|
1193
1199
|
}
|
|
1194
|
-
this.processOutput(borderCells ? "
|
|
1200
|
+
this.processOutput(borderCells ? "\\n" + borderCells + "\\n" : "\\n");
|
|
1195
1201
|
}
|
|
1196
1202
|
if (nodeName === "pre") {
|
|
1197
1203
|
this.pre = false;
|
|
1198
1204
|
this.padding();
|
|
1199
1205
|
}
|
|
1200
1206
|
if (["code", "tt"].includes(nodeName)) {
|
|
1201
|
-
this.processOutput("
|
|
1207
|
+
this.processOutput("\`");
|
|
1202
1208
|
}
|
|
1203
1209
|
if (["em", "i", "u"].includes(nodeName)) {
|
|
1204
1210
|
this.processOutput(this.emphasis_mark);
|
|
@@ -1232,7 +1238,7 @@
|
|
|
1232
1238
|
this.maybeAutomaticLink = null;
|
|
1233
1239
|
} else if (a) {
|
|
1234
1240
|
this.processOutput(
|
|
1235
|
-
|
|
1241
|
+
\`](\${escapeMd(a.getNamedItem("href")?.value || "")})\`
|
|
1236
1242
|
);
|
|
1237
1243
|
}
|
|
1238
1244
|
}
|
|
@@ -1376,14 +1382,14 @@
|
|
|
1376
1382
|
indentLevel
|
|
1377
1383
|
);
|
|
1378
1384
|
if (overriddenElementProcessing) {
|
|
1379
|
-
debugLog(
|
|
1385
|
+
debugLog(\`Element Processing Overridden: '\${childElement.nodeType}'\`);
|
|
1380
1386
|
result.push(...overriddenElementProcessing);
|
|
1381
1387
|
} else if (childElement.nodeType === ElementNode_1$1._Node.TEXT_NODE) {
|
|
1382
1388
|
const textContent = escapeMarkdownCharacters(
|
|
1383
1389
|
childElement.textContent?.trim() ?? ""
|
|
1384
1390
|
);
|
|
1385
1391
|
if (textContent && !!childElement.textContent) {
|
|
1386
|
-
debugLog(
|
|
1392
|
+
debugLog(\`Text Node: '\${textContent}'\`);
|
|
1387
1393
|
// preserve whitespaces when text childElement is not empty
|
|
1388
1394
|
result.push({
|
|
1389
1395
|
type: "text",
|
|
@@ -1398,19 +1404,19 @@
|
|
|
1398
1404
|
elem.textContent || ""
|
|
1399
1405
|
).trim();
|
|
1400
1406
|
if (content) {
|
|
1401
|
-
debugLog(
|
|
1407
|
+
debugLog(\`Heading \${level}: '\${elem.textContent}'\`);
|
|
1402
1408
|
result.push({ type: "heading", level, content });
|
|
1403
1409
|
}
|
|
1404
1410
|
} else if (elem.tagName.toLowerCase() === "p") {
|
|
1405
1411
|
debugLog("Paragraph");
|
|
1406
1412
|
result.push(...htmlToMarkdownAST(elem, options));
|
|
1407
1413
|
// Add a new line after the paragraph
|
|
1408
|
-
result.push({ type: "text", content: "
|
|
1414
|
+
result.push({ type: "text", content: "\\n\\n" });
|
|
1409
1415
|
} else if (elem.tagName.toLowerCase() === "a") {
|
|
1410
1416
|
debugLog(
|
|
1411
|
-
|
|
1417
|
+
\`Link: '\${elem.getAttribute("href")}' with text '\${
|
|
1412
1418
|
elem.textContent
|
|
1413
|
-
}'
|
|
1419
|
+
}'\`
|
|
1414
1420
|
);
|
|
1415
1421
|
// Check if the href is a data URL for an image
|
|
1416
1422
|
if (
|
|
@@ -1456,7 +1462,7 @@
|
|
|
1456
1462
|
}
|
|
1457
1463
|
}
|
|
1458
1464
|
} else if (elem.tagName.toLowerCase() === "img") {
|
|
1459
|
-
debugLog(
|
|
1465
|
+
debugLog(\`Image: src='\${elem.src}', alt='\${elem.alt}'\`);
|
|
1460
1466
|
if (elem.src?.startsWith("data:image")) {
|
|
1461
1467
|
result.push({
|
|
1462
1468
|
type: "image",
|
|
@@ -1477,7 +1483,7 @@
|
|
|
1477
1483
|
}
|
|
1478
1484
|
} else if (elem.tagName.toLowerCase() === "video") {
|
|
1479
1485
|
debugLog(
|
|
1480
|
-
|
|
1486
|
+
\`Video: src='\${elem.src}', poster='\${elem.poster}', controls='\${elem.controls}'\`
|
|
1481
1487
|
);
|
|
1482
1488
|
result.push({
|
|
1483
1489
|
type: "video",
|
|
@@ -1490,9 +1496,9 @@
|
|
|
1490
1496
|
elem.tagName.toLowerCase() === "ol"
|
|
1491
1497
|
) {
|
|
1492
1498
|
debugLog(
|
|
1493
|
-
|
|
1499
|
+
\`\${
|
|
1494
1500
|
elem.tagName.toLowerCase() === "ul" ? "Unordered" : "Ordered"
|
|
1495
|
-
} List
|
|
1501
|
+
} List\`
|
|
1496
1502
|
);
|
|
1497
1503
|
result.push({
|
|
1498
1504
|
type: "list",
|
|
@@ -1504,7 +1510,7 @@
|
|
|
1504
1510
|
});
|
|
1505
1511
|
} else if (elem.tagName.toLowerCase() === "br") {
|
|
1506
1512
|
debugLog("Line Break");
|
|
1507
|
-
result.push({ type: "text", content: "
|
|
1513
|
+
result.push({ type: "text", content: "\\n" });
|
|
1508
1514
|
} else if (elem.tagName.toLowerCase() === "table") {
|
|
1509
1515
|
debugLog("Table");
|
|
1510
1516
|
let colIds = [];
|
|
@@ -1512,7 +1518,7 @@
|
|
|
1512
1518
|
// Generate unique column IDs
|
|
1513
1519
|
const headerCells = Array.from(elem.querySelectorAll("th, td"));
|
|
1514
1520
|
headerCells.forEach((_, index) => {
|
|
1515
|
-
colIds.push(
|
|
1521
|
+
colIds.push(\`col-\${index}\`);
|
|
1516
1522
|
});
|
|
1517
1523
|
}
|
|
1518
1524
|
const tableRows = Array.from(elem.querySelectorAll("tr"));
|
|
@@ -1638,7 +1644,7 @@
|
|
|
1638
1644
|
case "strong":
|
|
1639
1645
|
case "b":
|
|
1640
1646
|
if (content) {
|
|
1641
|
-
debugLog(
|
|
1647
|
+
debugLog(\`Bold: '\${content}'\`);
|
|
1642
1648
|
result.push({
|
|
1643
1649
|
type: "bold",
|
|
1644
1650
|
content: htmlToMarkdownAST(elem, options, indentLevel + 1),
|
|
@@ -1648,7 +1654,7 @@
|
|
|
1648
1654
|
case "em":
|
|
1649
1655
|
case "i":
|
|
1650
1656
|
if (content) {
|
|
1651
|
-
debugLog(
|
|
1657
|
+
debugLog(\`Italic: '\${content}'\`);
|
|
1652
1658
|
result.push({
|
|
1653
1659
|
type: "italic",
|
|
1654
1660
|
content: htmlToMarkdownAST(elem, options, indentLevel + 1),
|
|
@@ -1658,7 +1664,7 @@
|
|
|
1658
1664
|
case "s":
|
|
1659
1665
|
case "strike":
|
|
1660
1666
|
if (content) {
|
|
1661
|
-
debugLog(
|
|
1667
|
+
debugLog(\`Strikethrough: '\${content}'\`);
|
|
1662
1668
|
result.push({
|
|
1663
1669
|
type: "strikethrough",
|
|
1664
1670
|
content: htmlToMarkdownAST(elem, options, indentLevel + 1),
|
|
@@ -1672,7 +1678,7 @@
|
|
|
1672
1678
|
elem.parentNode &&
|
|
1673
1679
|
elem.parentNode.nodeName.toLowerCase() === "pre";
|
|
1674
1680
|
debugLog(
|
|
1675
|
-
|
|
1681
|
+
\`\${isCodeBlock ? "Code Block" : "Inline Code"}: '\${content}'\`
|
|
1676
1682
|
);
|
|
1677
1683
|
const languageClass = elem.className
|
|
1678
1684
|
?.split(" ")
|
|
@@ -1689,7 +1695,7 @@
|
|
|
1689
1695
|
}
|
|
1690
1696
|
break;
|
|
1691
1697
|
case "blockquote":
|
|
1692
|
-
debugLog(
|
|
1698
|
+
debugLog(\`Blockquote\`);
|
|
1693
1699
|
result.push({
|
|
1694
1700
|
type: "blockquote",
|
|
1695
1701
|
content: htmlToMarkdownAST(elem, options),
|
|
@@ -1708,7 +1714,7 @@
|
|
|
1708
1714
|
case "section":
|
|
1709
1715
|
case "summary":
|
|
1710
1716
|
case "time":
|
|
1711
|
-
debugLog(
|
|
1717
|
+
debugLog(\`Semantic HTML Element: '\${elem.tagName}'\`);
|
|
1712
1718
|
result.push({
|
|
1713
1719
|
type: "semanticHtml",
|
|
1714
1720
|
htmlType: elem.tagName.toLowerCase(),
|
|
@@ -1719,10 +1725,10 @@
|
|
|
1719
1725
|
const unhandledElementProcessing =
|
|
1720
1726
|
options?.processUnhandledElement?.(elem, options, indentLevel);
|
|
1721
1727
|
if (unhandledElementProcessing) {
|
|
1722
|
-
debugLog(
|
|
1728
|
+
debugLog(\`Processing Unhandled Element: '\${elem.tagName}'\`);
|
|
1723
1729
|
result.push(...unhandledElementProcessing);
|
|
1724
1730
|
} else {
|
|
1725
|
-
debugLog(
|
|
1731
|
+
debugLog(\`Generic HTMLElement: '\${elem.tagName}'\`);
|
|
1726
1732
|
result.push(
|
|
1727
1733
|
...htmlToMarkdownAST(elem, options, indentLevel + 1)
|
|
1728
1734
|
);
|
|
@@ -1745,7 +1751,7 @@
|
|
|
1745
1751
|
.replace(/</g, "<")
|
|
1746
1752
|
.replace(/>/g, ">");
|
|
1747
1753
|
// Then escape characters that have special meaning in Markdown
|
|
1748
|
-
escapedText = escapedText.replace(/([
|
|
1754
|
+
escapedText = escapedText.replace(/([\\\\\`*_{}[\\]#+!|])/g, "\\\\$1");
|
|
1749
1755
|
return escapedText;
|
|
1750
1756
|
}
|
|
1751
1757
|
|
|
@@ -1769,7 +1775,7 @@
|
|
|
1769
1775
|
let markdownString = "";
|
|
1770
1776
|
if (options?.includeMetaData) {
|
|
1771
1777
|
// include meta-data
|
|
1772
|
-
markdownString += "
|
|
1778
|
+
markdownString += "---\\n";
|
|
1773
1779
|
const node = (0, index_1.findInMarkdownAST)(
|
|
1774
1780
|
nodes,
|
|
1775
1781
|
(_) => _.type === "meta"
|
|
@@ -1777,49 +1783,49 @@
|
|
|
1777
1783
|
if (node?.type === "meta") {
|
|
1778
1784
|
if (node.content.standard) {
|
|
1779
1785
|
Object.keys(node.content.standard).forEach((key) => {
|
|
1780
|
-
markdownString +=
|
|
1786
|
+
markdownString += \`\${key}: "\${node.content.standard[key]}"\\n\`;
|
|
1781
1787
|
});
|
|
1782
1788
|
}
|
|
1783
1789
|
if (options.includeMetaData === "extended") {
|
|
1784
1790
|
if (node.content.openGraph) {
|
|
1785
1791
|
if (Object.keys(node.content.openGraph).length > 0) {
|
|
1786
|
-
markdownString += "openGraph
|
|
1792
|
+
markdownString += "openGraph:\\n";
|
|
1787
1793
|
for (const [key, value] of Object.entries(
|
|
1788
1794
|
node.content.openGraph
|
|
1789
1795
|
)) {
|
|
1790
|
-
markdownString +=
|
|
1796
|
+
markdownString += \` \${key}: "\${value}"\\n\`;
|
|
1791
1797
|
}
|
|
1792
1798
|
}
|
|
1793
1799
|
}
|
|
1794
1800
|
if (node.content.twitter) {
|
|
1795
1801
|
if (Object.keys(node.content.twitter).length > 0) {
|
|
1796
|
-
markdownString += "twitter
|
|
1802
|
+
markdownString += "twitter:\\n";
|
|
1797
1803
|
for (const [key, value] of Object.entries(
|
|
1798
1804
|
node.content.twitter
|
|
1799
1805
|
)) {
|
|
1800
|
-
markdownString +=
|
|
1806
|
+
markdownString += \` \${key}: "\${value}"\\n\`;
|
|
1801
1807
|
}
|
|
1802
1808
|
}
|
|
1803
1809
|
}
|
|
1804
1810
|
if (node.content.jsonLd && node.content.jsonLd.length > 0) {
|
|
1805
|
-
markdownString += "schema
|
|
1811
|
+
markdownString += "schema:\\n";
|
|
1806
1812
|
node.content.jsonLd.forEach((item) => {
|
|
1807
1813
|
const {
|
|
1808
1814
|
"@context": jldContext,
|
|
1809
1815
|
"@type": jldType,
|
|
1810
1816
|
...semanticData
|
|
1811
1817
|
} = item;
|
|
1812
|
-
markdownString +=
|
|
1818
|
+
markdownString += \` \${jldType ?? "(unknown type)"}:\\n\`;
|
|
1813
1819
|
Object.keys(semanticData).forEach((key) => {
|
|
1814
|
-
markdownString +=
|
|
1820
|
+
markdownString += \` \${key}: \${JSON.stringify(
|
|
1815
1821
|
semanticData[key]
|
|
1816
|
-
)}
|
|
1822
|
+
)}\\n\`;
|
|
1817
1823
|
});
|
|
1818
1824
|
});
|
|
1819
1825
|
}
|
|
1820
1826
|
}
|
|
1821
1827
|
}
|
|
1822
|
-
markdownString += "
|
|
1828
|
+
markdownString += "---\\n\\n";
|
|
1823
1829
|
}
|
|
1824
1830
|
return markdownString;
|
|
1825
1831
|
}
|
|
@@ -1850,10 +1856,10 @@
|
|
|
1850
1856
|
);
|
|
1851
1857
|
}
|
|
1852
1858
|
const isMarkdownStringNotEmpty = markdownString.length > 0;
|
|
1853
|
-
const isFirstCharOfContentWhitespace =
|
|
1859
|
+
const isFirstCharOfContentWhitespace = /\\s/.test(
|
|
1854
1860
|
content.slice(0, 1)
|
|
1855
1861
|
);
|
|
1856
|
-
const isLastCharOfMarkdownWhitespace =
|
|
1862
|
+
const isLastCharOfMarkdownWhitespace = /\\s/.test(
|
|
1857
1863
|
markdownString.slice(-1)
|
|
1858
1864
|
);
|
|
1859
1865
|
const isContentPunctuation =
|
|
@@ -1867,14 +1873,14 @@
|
|
|
1867
1873
|
markdownString += " ";
|
|
1868
1874
|
}
|
|
1869
1875
|
if (node.type === "text") {
|
|
1870
|
-
markdownString +=
|
|
1876
|
+
markdownString += \`\${indent}\${content}\`;
|
|
1871
1877
|
} else {
|
|
1872
1878
|
if (node.type === "bold") {
|
|
1873
|
-
markdownString +=
|
|
1879
|
+
markdownString += \`**\${content}**\`;
|
|
1874
1880
|
} else if (node.type === "italic") {
|
|
1875
|
-
markdownString +=
|
|
1881
|
+
markdownString += \`*\${content}*\`;
|
|
1876
1882
|
} else if (node.type === "strikethrough") {
|
|
1877
|
-
markdownString +=
|
|
1883
|
+
markdownString += \`~~\${content}~~\`;
|
|
1878
1884
|
} else if (node.type === "link") {
|
|
1879
1885
|
// check if the link contains only text
|
|
1880
1886
|
if (
|
|
@@ -1882,52 +1888,52 @@
|
|
|
1882
1888
|
node.content[0].type === "text"
|
|
1883
1889
|
) {
|
|
1884
1890
|
// use native markdown syntax for text-only links
|
|
1885
|
-
markdownString +=
|
|
1891
|
+
markdownString += \`[\${content}](\${encodeURI(node.href)})\`;
|
|
1886
1892
|
} else {
|
|
1887
1893
|
// Use HTML <a> tag for links with rich content
|
|
1888
|
-
markdownString +=
|
|
1894
|
+
markdownString += \`<a href="\${node.href}">\${content}</a>\`;
|
|
1889
1895
|
}
|
|
1890
1896
|
}
|
|
1891
1897
|
}
|
|
1892
1898
|
break;
|
|
1893
1899
|
case "heading":
|
|
1894
|
-
const isEndsWithNewLine = markdownString.slice(-1) === "
|
|
1900
|
+
const isEndsWithNewLine = markdownString.slice(-1) === "\\n";
|
|
1895
1901
|
if (!isEndsWithNewLine) {
|
|
1896
|
-
markdownString += "
|
|
1902
|
+
markdownString += "\\n";
|
|
1897
1903
|
}
|
|
1898
|
-
markdownString +=
|
|
1904
|
+
markdownString += \`\${"#".repeat(node.level)} \${node.content}\\n\\n\`;
|
|
1899
1905
|
break;
|
|
1900
1906
|
case "image":
|
|
1901
1907
|
if (!node.alt?.trim() || !!node.src?.trim()) {
|
|
1902
|
-
markdownString +=
|
|
1908
|
+
markdownString += \`\`;
|
|
1903
1909
|
}
|
|
1904
1910
|
break;
|
|
1905
1911
|
case "list":
|
|
1906
1912
|
node.items.forEach((item, i) => {
|
|
1907
|
-
const listItemPrefix = node.ordered ?
|
|
1913
|
+
const listItemPrefix = node.ordered ? \`\${i + 1}.\` : "-";
|
|
1908
1914
|
const contents = markdownContentASTToString(
|
|
1909
1915
|
item.content,
|
|
1910
1916
|
options,
|
|
1911
1917
|
indentLevel + 1
|
|
1912
1918
|
).trim();
|
|
1913
|
-
if (markdownString.slice(-1) !== "
|
|
1914
|
-
markdownString += "
|
|
1919
|
+
if (markdownString.slice(-1) !== "\\n") {
|
|
1920
|
+
markdownString += "\\n";
|
|
1915
1921
|
}
|
|
1916
1922
|
if (contents) {
|
|
1917
|
-
markdownString +=
|
|
1923
|
+
markdownString += \`\${indent}\${listItemPrefix} \${contents}\\n\`;
|
|
1918
1924
|
}
|
|
1919
1925
|
});
|
|
1920
|
-
markdownString += "
|
|
1926
|
+
markdownString += "\\n";
|
|
1921
1927
|
break;
|
|
1922
1928
|
case "video":
|
|
1923
|
-
markdownString +=
|
|
1929
|
+
markdownString += \`\\n\\n\`;
|
|
1924
1930
|
if (node.poster) {
|
|
1925
|
-
markdownString +=
|
|
1931
|
+
markdownString += \`\\n\`;
|
|
1926
1932
|
}
|
|
1927
1933
|
if (node.controls) {
|
|
1928
|
-
markdownString +=
|
|
1934
|
+
markdownString += \`Controls: \${node.controls}\\n\`;
|
|
1929
1935
|
}
|
|
1930
|
-
markdownString += "
|
|
1936
|
+
markdownString += "\\n";
|
|
1931
1937
|
break;
|
|
1932
1938
|
case "table":
|
|
1933
1939
|
const maxColumns = Math.max(
|
|
@@ -1947,15 +1953,15 @@
|
|
|
1947
1953
|
indentLevel + 1
|
|
1948
1954
|
).trim();
|
|
1949
1955
|
if (cell.colId) {
|
|
1950
|
-
cellContent +=
|
|
1956
|
+
cellContent += \` <!-- \${cell.colId} -->\`;
|
|
1951
1957
|
}
|
|
1952
1958
|
if (cell.colspan && cell.colspan > 1) {
|
|
1953
|
-
cellContent +=
|
|
1959
|
+
cellContent += \` <!-- colspan: \${cell.colspan} -->\`;
|
|
1954
1960
|
}
|
|
1955
1961
|
if (cell.rowspan && cell.rowspan > 1) {
|
|
1956
|
-
cellContent +=
|
|
1962
|
+
cellContent += \` <!-- rowspan: \${cell.rowspan} -->\`;
|
|
1957
1963
|
}
|
|
1958
|
-
markdownString +=
|
|
1964
|
+
markdownString += \`| \${cellContent} \`;
|
|
1959
1965
|
currentColumn += cell.colspan || 1;
|
|
1960
1966
|
// Add empty cells for colspan
|
|
1961
1967
|
for (let i = 1; i < (cell.colspan || 1); i++) {
|
|
@@ -1967,29 +1973,29 @@
|
|
|
1967
1973
|
markdownString += "| ";
|
|
1968
1974
|
currentColumn++;
|
|
1969
1975
|
}
|
|
1970
|
-
markdownString += "
|
|
1976
|
+
markdownString += "|\\n";
|
|
1971
1977
|
});
|
|
1972
|
-
markdownString += "
|
|
1978
|
+
markdownString += "\\n";
|
|
1973
1979
|
break;
|
|
1974
1980
|
case "code":
|
|
1975
1981
|
if (node.inline) {
|
|
1976
|
-
const isLsatWhitespace =
|
|
1982
|
+
const isLsatWhitespace = /\\s/.test(markdownString.slice(-1));
|
|
1977
1983
|
if (!isLsatWhitespace) {
|
|
1978
1984
|
markdownString += " ";
|
|
1979
1985
|
}
|
|
1980
|
-
markdownString +=
|
|
1986
|
+
markdownString += \`\\\`\${node.content}\\\`\`;
|
|
1981
1987
|
} else {
|
|
1982
1988
|
// For code blocks, we do not escape characters and preserve formatting
|
|
1983
|
-
markdownString += "
|
|
1984
|
-
markdownString +=
|
|
1985
|
-
markdownString += "
|
|
1989
|
+
markdownString += "\\n\`\`\`" + (node.language ?? "") + "\\n";
|
|
1990
|
+
markdownString += \`\${node.content}\\n\`;
|
|
1991
|
+
markdownString += "\`\`\`\\n\\n";
|
|
1986
1992
|
}
|
|
1987
1993
|
break;
|
|
1988
1994
|
case "blockquote":
|
|
1989
|
-
markdownString +=
|
|
1995
|
+
markdownString += \`> \${markdownContentASTToString(
|
|
1990
1996
|
node.content,
|
|
1991
1997
|
options
|
|
1992
|
-
).trim()}
|
|
1998
|
+
).trim()}\\n\\n\`;
|
|
1993
1999
|
break;
|
|
1994
2000
|
case "meta":
|
|
1995
2001
|
// already handled
|
|
@@ -1998,7 +2004,7 @@
|
|
|
1998
2004
|
switch (node.htmlType) {
|
|
1999
2005
|
case "article":
|
|
2000
2006
|
markdownString +=
|
|
2001
|
-
"
|
|
2007
|
+
"\\n\\n" + markdownContentASTToString(node.content, options);
|
|
2002
2008
|
break;
|
|
2003
2009
|
case "summary":
|
|
2004
2010
|
case "time":
|
|
@@ -2012,18 +2018,18 @@
|
|
|
2012
2018
|
case "details":
|
|
2013
2019
|
case "figure":
|
|
2014
2020
|
markdownString +=
|
|
2015
|
-
|
|
2021
|
+
\`\\n\\n<-\${node.htmlType}->\\n\` +
|
|
2016
2022
|
markdownContentASTToString(node.content, options) +
|
|
2017
|
-
|
|
2023
|
+
\`\\n\\n</-\${node.htmlType}->\\n\`;
|
|
2018
2024
|
break;
|
|
2019
2025
|
case "section":
|
|
2020
|
-
markdownString += "
|
|
2026
|
+
markdownString += "---\\n\\n";
|
|
2021
2027
|
markdownString += markdownContentASTToString(
|
|
2022
2028
|
node.content,
|
|
2023
2029
|
options
|
|
2024
2030
|
);
|
|
2025
|
-
markdownString += "
|
|
2026
|
-
markdownString += "
|
|
2031
|
+
markdownString += "\\n\\n";
|
|
2032
|
+
markdownString += "---\\n\\n";
|
|
2027
2033
|
break;
|
|
2028
2034
|
}
|
|
2029
2035
|
break;
|
|
@@ -2099,17 +2105,17 @@
|
|
|
2099
2105
|
) {
|
|
2100
2106
|
bestIndependentCandidate = candidates[i];
|
|
2101
2107
|
debugMessage(
|
|
2102
|
-
|
|
2108
|
+
\`New best independent candidate found: \${elementToString(
|
|
2103
2109
|
bestIndependentCandidate
|
|
2104
|
-
)}
|
|
2110
|
+
)}\`
|
|
2105
2111
|
);
|
|
2106
2112
|
}
|
|
2107
2113
|
}
|
|
2108
2114
|
}
|
|
2109
2115
|
debugMessage(
|
|
2110
|
-
|
|
2116
|
+
\`Final main content candidate: \${elementToString(
|
|
2111
2117
|
bestIndependentCandidate
|
|
2112
|
-
)}
|
|
2118
|
+
)}\`
|
|
2113
2119
|
);
|
|
2114
2120
|
return bestIndependentCandidate;
|
|
2115
2121
|
}
|
|
@@ -2117,16 +2123,16 @@
|
|
|
2117
2123
|
if (!element) {
|
|
2118
2124
|
return "No element";
|
|
2119
2125
|
}
|
|
2120
|
-
return
|
|
2126
|
+
return \`\${element.tagName}#\${element.id || "no-id"}.\${Array.from(
|
|
2121
2127
|
element.classList
|
|
2122
|
-
).join(".")}
|
|
2128
|
+
).join(".")}\`;
|
|
2123
2129
|
}
|
|
2124
2130
|
function collectCandidates(element, candidates, minScore) {
|
|
2125
2131
|
const score = calculateScore(element);
|
|
2126
2132
|
if (score >= minScore) {
|
|
2127
2133
|
candidates.push(element);
|
|
2128
2134
|
debugMessage(
|
|
2129
|
-
|
|
2135
|
+
\`Candidate found: \${elementToString(element)}, score: \${score}\`
|
|
2130
2136
|
);
|
|
2131
2137
|
}
|
|
2132
2138
|
Array.from(element.children).forEach((child) => {
|
|
@@ -2148,7 +2154,7 @@
|
|
|
2148
2154
|
if (element.classList.contains(attr) || element.id.includes(attr)) {
|
|
2149
2155
|
score += 10;
|
|
2150
2156
|
scoreLog.push(
|
|
2151
|
-
|
|
2157
|
+
\`High impact attribute found: \${attr}, score increased by 10\`
|
|
2152
2158
|
);
|
|
2153
2159
|
}
|
|
2154
2160
|
});
|
|
@@ -2157,7 +2163,7 @@
|
|
|
2157
2163
|
if (highImpactTags.includes(element.tagName.toLowerCase())) {
|
|
2158
2164
|
score += 5;
|
|
2159
2165
|
scoreLog.push(
|
|
2160
|
-
|
|
2166
|
+
\`High impact tag found: \${element.tagName}, score increased by 5\`
|
|
2161
2167
|
);
|
|
2162
2168
|
}
|
|
2163
2169
|
// Paragraph count
|
|
@@ -2166,7 +2172,7 @@
|
|
|
2166
2172
|
if (paragraphScore > 0) {
|
|
2167
2173
|
score += paragraphScore;
|
|
2168
2174
|
scoreLog.push(
|
|
2169
|
-
|
|
2175
|
+
\`Paragraph count: \${paragraphCount}, score increased by \${paragraphScore}\`
|
|
2170
2176
|
);
|
|
2171
2177
|
}
|
|
2172
2178
|
// Text content length
|
|
@@ -2175,7 +2181,7 @@
|
|
|
2175
2181
|
const textScore = Math.min(Math.floor(textContentLength / 200), 5);
|
|
2176
2182
|
score += textScore;
|
|
2177
2183
|
scoreLog.push(
|
|
2178
|
-
|
|
2184
|
+
\`Text content length: \${textContentLength}, score increased by \${textScore}\`
|
|
2179
2185
|
);
|
|
2180
2186
|
}
|
|
2181
2187
|
// Link density
|
|
@@ -2183,7 +2189,7 @@
|
|
|
2183
2189
|
if (linkDensity < 0.3) {
|
|
2184
2190
|
score += 5;
|
|
2185
2191
|
scoreLog.push(
|
|
2186
|
-
|
|
2192
|
+
\`Link density: \${linkDensity.toFixed(2)}, score increased by 5\`
|
|
2187
2193
|
);
|
|
2188
2194
|
}
|
|
2189
2195
|
// Data attributes
|
|
@@ -2204,7 +2210,7 @@
|
|
|
2204
2210
|
);
|
|
2205
2211
|
}
|
|
2206
2212
|
if (scoreLog.length > 0) {
|
|
2207
|
-
debugMessage(
|
|
2213
|
+
debugMessage(\`Scoring for \${elementToString(element)}:\`);
|
|
2208
2214
|
}
|
|
2209
2215
|
return score;
|
|
2210
2216
|
}
|
|
@@ -2302,7 +2308,7 @@
|
|
|
2302
2308
|
const parts = url.split("/"); // Split URL keeping the slash before text
|
|
2303
2309
|
const prefix = parts.slice(0, -1).join("/"); // Get the prefix by removing last part
|
|
2304
2310
|
const refPrefix = addRefPrefix(prefix, prefixesToRefs);
|
|
2305
|
-
return
|
|
2311
|
+
return \`\${refPrefix}://\${parts.slice(-1).join("")}\`;
|
|
2306
2312
|
} else {
|
|
2307
2313
|
if (url.split("/").length > 4) {
|
|
2308
2314
|
return addRefPrefix(url, prefixesToRefs);
|
|
@@ -2360,12 +2366,12 @@
|
|
|
2360
2366
|
exports.findAllInAST = findAllInAST;
|
|
2361
2367
|
const getMainContent = (markdownStr) => {
|
|
2362
2368
|
if (markdownStr.includes("<-main->")) {
|
|
2363
|
-
const regex = /(?<=<-main->)[
|
|
2369
|
+
const regex = /(?<=<-main->)[\\s\\S]*?(?=<\\/-main->)/;
|
|
2364
2370
|
const match = markdownStr.match(regex);
|
|
2365
2371
|
return match?.[0] ?? "";
|
|
2366
2372
|
} else {
|
|
2367
2373
|
const removeSectionsRegex =
|
|
2368
|
-
/(<-nav->[
|
|
2374
|
+
/(<-nav->[\\s\\S]*?<\\/-nav->)|(<-footer->[\\s\\S]*?<\\/-footer->)|(<-header->[\\s\\S]*?<\\/-header->)|(<-aside->[\\s\\S]*?<\\/-aside->)/g;
|
|
2369
2375
|
return markdownStr.replace(removeSectionsRegex, "");
|
|
2370
2376
|
}
|
|
2371
2377
|
};
|
|
@@ -2532,7 +2538,7 @@
|
|
|
2532
2538
|
) {
|
|
2533
2539
|
// content container was found and extracted, re-attaching the head for meta-data extraction
|
|
2534
2540
|
element = parser.parseFromString(
|
|
2535
|
-
|
|
2541
|
+
\`<html>\${doc.head.outerHTML}\${element.outerHTML}\`,
|
|
2536
2542
|
"text/html"
|
|
2537
2543
|
).documentElement;
|
|
2538
2544
|
}
|
|
@@ -2594,3 +2600,4 @@
|
|
|
2594
2600
|
getElementXPath: getElementXPath,
|
|
2595
2601
|
};
|
|
2596
2602
|
})();
|
|
2603
|
+
`;
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
Object.defineProperty(exports, "__esModule", {
|
|
4
4
|
value: true
|
|
5
5
|
});
|
|
6
|
-
exports.
|
|
6
|
+
exports.clickUntilExhausted = exports.clickButtonAndWait = void 0;
|
|
7
7
|
var _withNetworkSettledWait = require("./withNetworkSettledWait");
|
|
8
8
|
var _Logger = require("../common/Logger");
|
|
9
9
|
const getContainerState = async container => {
|
|
@@ -35,7 +35,7 @@ const clickButtonAndWait = async input => {
|
|
|
35
35
|
});
|
|
36
36
|
};
|
|
37
37
|
exports.clickButtonAndWait = clickButtonAndWait;
|
|
38
|
-
const
|
|
38
|
+
const clickUntilExhausted = async input => {
|
|
39
39
|
const {
|
|
40
40
|
page,
|
|
41
41
|
buttonLocator,
|
|
@@ -82,4 +82,4 @@ const clickButtonUntilNoChange = async input => {
|
|
|
82
82
|
timeoutInMs: 30000
|
|
83
83
|
});
|
|
84
84
|
};
|
|
85
|
-
exports.
|
|
85
|
+
exports.clickUntilExhausted = clickUntilExhausted;
|
package/dist/helpers/export.d.ts
CHANGED
|
@@ -367,13 +367,13 @@ export declare function clickButtonAndWait(input: {
|
|
|
367
367
|
*
|
|
368
368
|
* @example
|
|
369
369
|
* ```typescript Load All Items
|
|
370
|
-
* import {
|
|
370
|
+
* import { clickUntilExhausted } from "@intuned/browser";
|
|
371
371
|
* export default async function handler(params, page, context){
|
|
372
372
|
* await page.goto("https://example.com/products");
|
|
373
373
|
* const loadMoreButton = page.locator("button:has-text('Load More')");
|
|
374
374
|
*
|
|
375
375
|
* // Click until button disappears or is disabled
|
|
376
|
-
* await
|
|
376
|
+
* await clickUntilExhausted({
|
|
377
377
|
* page,
|
|
378
378
|
* buttonLocator: loadMoreButton,
|
|
379
379
|
* maxClicks: 20
|
|
@@ -383,14 +383,14 @@ export declare function clickButtonAndWait(input: {
|
|
|
383
383
|
*
|
|
384
384
|
* @example
|
|
385
385
|
* ```typescript Track Container Changes
|
|
386
|
-
* import {
|
|
386
|
+
* import { clickUntilExhausted } from "@intuned/browser";
|
|
387
387
|
* export default async function handler(params, page, context){
|
|
388
388
|
* await page.goto("https://example.com/products");
|
|
389
389
|
* const loadMoreButton = page.locator("#load-more");
|
|
390
390
|
* const productsContainer = page.locator("#products-list");
|
|
391
391
|
*
|
|
392
392
|
* let clickCount = 0;
|
|
393
|
-
* await
|
|
393
|
+
* await clickUntilExhausted({
|
|
394
394
|
* page,
|
|
395
395
|
* buttonLocator: loadMoreButton,
|
|
396
396
|
* containerLocator: productsContainer,
|
|
@@ -405,7 +405,7 @@ export declare function clickButtonAndWait(input: {
|
|
|
405
405
|
* }
|
|
406
406
|
* ```
|
|
407
407
|
*/
|
|
408
|
-
export declare function
|
|
408
|
+
export declare function clickUntilExhausted(input: {
|
|
409
409
|
page: Page;
|
|
410
410
|
buttonLocator: Locator;
|
|
411
411
|
heartbeat?: CallableFunction;
|
package/dist/helpers/index.d.ts
CHANGED
|
@@ -367,13 +367,13 @@ export declare function clickButtonAndWait(input: {
|
|
|
367
367
|
*
|
|
368
368
|
* @example
|
|
369
369
|
* ```typescript Load All Items
|
|
370
|
-
* import {
|
|
370
|
+
* import { clickUntilExhausted } from "@intuned/browser";
|
|
371
371
|
* export default async function handler(params, page, context){
|
|
372
372
|
* await page.goto("https://example.com/products");
|
|
373
373
|
* const loadMoreButton = page.locator("button:has-text('Load More')");
|
|
374
374
|
*
|
|
375
375
|
* // Click until button disappears or is disabled
|
|
376
|
-
* await
|
|
376
|
+
* await clickUntilExhausted({
|
|
377
377
|
* page,
|
|
378
378
|
* buttonLocator: loadMoreButton,
|
|
379
379
|
* maxClicks: 20
|
|
@@ -383,14 +383,14 @@ export declare function clickButtonAndWait(input: {
|
|
|
383
383
|
*
|
|
384
384
|
* @example
|
|
385
385
|
* ```typescript Track Container Changes
|
|
386
|
-
* import {
|
|
386
|
+
* import { clickUntilExhausted } from "@intuned/browser";
|
|
387
387
|
* export default async function handler(params, page, context){
|
|
388
388
|
* await page.goto("https://example.com/products");
|
|
389
389
|
* const loadMoreButton = page.locator("#load-more");
|
|
390
390
|
* const productsContainer = page.locator("#products-list");
|
|
391
391
|
*
|
|
392
392
|
* let clickCount = 0;
|
|
393
|
-
* await
|
|
393
|
+
* await clickUntilExhausted({
|
|
394
394
|
* page,
|
|
395
395
|
* buttonLocator: loadMoreButton,
|
|
396
396
|
* containerLocator: productsContainer,
|
|
@@ -405,7 +405,7 @@ export declare function clickButtonAndWait(input: {
|
|
|
405
405
|
* }
|
|
406
406
|
* ```
|
|
407
407
|
*/
|
|
408
|
-
export declare function
|
|
408
|
+
export declare function clickUntilExhausted(input: {
|
|
409
409
|
page: Page;
|
|
410
410
|
buttonLocator: Locator;
|
|
411
411
|
heartbeat?: CallableFunction;
|
package/dist/helpers/index.js
CHANGED
|
@@ -21,16 +21,10 @@ Object.defineProperty(exports, "CustomTypeValidator", {
|
|
|
21
21
|
return _types.CustomTypeValidator;
|
|
22
22
|
}
|
|
23
23
|
});
|
|
24
|
-
Object.defineProperty(exports, "
|
|
24
|
+
Object.defineProperty(exports, "clickUntilExhausted", {
|
|
25
25
|
enumerable: true,
|
|
26
26
|
get: function () {
|
|
27
|
-
return _clickUntilExhausted.
|
|
28
|
-
}
|
|
29
|
-
});
|
|
30
|
-
Object.defineProperty(exports, "clickButtonUntilNoChange", {
|
|
31
|
-
enumerable: true,
|
|
32
|
-
get: function () {
|
|
33
|
-
return _clickUntilExhausted.clickButtonUntilNoChange;
|
|
27
|
+
return _clickUntilExhausted.clickUntilExhausted;
|
|
34
28
|
}
|
|
35
29
|
});
|
|
36
30
|
Object.defineProperty(exports, "downloadFile", {
|
|
@@ -223,28 +223,13 @@ const noChangeThresholdHtml = `
|
|
|
223
223
|
(0, _extendedTest.afterEach)(async () => {
|
|
224
224
|
await page.close();
|
|
225
225
|
});
|
|
226
|
-
(0, _extendedTest.describe)("
|
|
227
|
-
(0, _extendedTest.test)("should click button and wait for content", async () => {
|
|
228
|
-
await page.setContent(basicClickHtml);
|
|
229
|
-
const initialCount = await page.locator(".item").count();
|
|
230
|
-
(0, _extendedTest.expect)(initialCount).toBe(3);
|
|
231
|
-
const buttonLocator = page.locator("#load-more");
|
|
232
|
-
await (0, _.clickButtonAndWait)({
|
|
233
|
-
page,
|
|
234
|
-
buttonLocator,
|
|
235
|
-
clickDelay: 0.1
|
|
236
|
-
});
|
|
237
|
-
const finalCount = await page.locator(".item").count();
|
|
238
|
-
(0, _extendedTest.expect)(finalCount).toBe(4);
|
|
239
|
-
});
|
|
240
|
-
});
|
|
241
|
-
(0, _extendedTest.describe)("clickButtonUntilNoChange", () => {
|
|
226
|
+
(0, _extendedTest.describe)("clickUntilExhausted", () => {
|
|
242
227
|
(0, _extendedTest.test)("should click button until max items loaded with default parameters", async () => {
|
|
243
228
|
await page.setContent(basicClickHtml);
|
|
244
229
|
const initialCount = await page.locator(".item").count();
|
|
245
230
|
(0, _extendedTest.expect)(initialCount).toBe(3);
|
|
246
231
|
const buttonLocator = page.locator("#load-more");
|
|
247
|
-
await (0, _.
|
|
232
|
+
await (0, _.clickUntilExhausted)({
|
|
248
233
|
page,
|
|
249
234
|
buttonLocator
|
|
250
235
|
});
|
|
@@ -256,7 +241,7 @@ const noChangeThresholdHtml = `
|
|
|
256
241
|
const initialCount = await page.locator(".item").count();
|
|
257
242
|
(0, _extendedTest.expect)(initialCount).toBe(3);
|
|
258
243
|
const buttonLocator = page.locator("#load-more");
|
|
259
|
-
await (0, _.
|
|
244
|
+
await (0, _.clickUntilExhausted)({
|
|
260
245
|
page,
|
|
261
246
|
buttonLocator,
|
|
262
247
|
maxClicks: 3,
|
|
@@ -272,7 +257,7 @@ const noChangeThresholdHtml = `
|
|
|
272
257
|
heartbeatCalls.push(1);
|
|
273
258
|
};
|
|
274
259
|
const buttonLocator = page.locator("#load-more");
|
|
275
|
-
await (0, _.
|
|
260
|
+
await (0, _.clickUntilExhausted)({
|
|
276
261
|
page,
|
|
277
262
|
buttonLocator,
|
|
278
263
|
heartbeat: onHeartbeat,
|
|
@@ -286,7 +271,7 @@ const noChangeThresholdHtml = `
|
|
|
286
271
|
const initialCount = await page.locator(".item").count();
|
|
287
272
|
(0, _extendedTest.expect)(initialCount).toBe(1);
|
|
288
273
|
const buttonLocator = page.locator("#load-more");
|
|
289
|
-
await (0, _.
|
|
274
|
+
await (0, _.clickUntilExhausted)({
|
|
290
275
|
page,
|
|
291
276
|
buttonLocator,
|
|
292
277
|
maxClicks: 20,
|
|
@@ -302,7 +287,7 @@ const noChangeThresholdHtml = `
|
|
|
302
287
|
const initialCount = await page.locator(".item").count();
|
|
303
288
|
(0, _extendedTest.expect)(initialCount).toBe(1);
|
|
304
289
|
const buttonLocator = page.locator("#load-more");
|
|
305
|
-
await (0, _.
|
|
290
|
+
await (0, _.clickUntilExhausted)({
|
|
306
291
|
page,
|
|
307
292
|
buttonLocator,
|
|
308
293
|
maxClicks: 20,
|
|
@@ -319,7 +304,7 @@ const noChangeThresholdHtml = `
|
|
|
319
304
|
(0, _extendedTest.expect)(initialCount).toBe(2);
|
|
320
305
|
const buttonLocator = page.locator("#load-more");
|
|
321
306
|
const containerLocator = page.locator("#container");
|
|
322
|
-
await (0, _.
|
|
307
|
+
await (0, _.clickUntilExhausted)({
|
|
323
308
|
page,
|
|
324
309
|
buttonLocator,
|
|
325
310
|
containerLocator,
|
|
@@ -335,7 +320,7 @@ const noChangeThresholdHtml = `
|
|
|
335
320
|
(0, _extendedTest.expect)(initialCount).toBe(1);
|
|
336
321
|
const buttonLocator = page.locator("#load-more");
|
|
337
322
|
const containerLocator = page.locator("#content");
|
|
338
|
-
await (0, _.
|
|
323
|
+
await (0, _.clickUntilExhausted)({
|
|
339
324
|
page,
|
|
340
325
|
buttonLocator,
|
|
341
326
|
containerLocator,
|
|
@@ -351,7 +336,7 @@ const noChangeThresholdHtml = `
|
|
|
351
336
|
const initialCount = await page.locator(".item").count();
|
|
352
337
|
(0, _extendedTest.expect)(initialCount).toBe(3);
|
|
353
338
|
const buttonLocator = page.locator("#load-more");
|
|
354
|
-
await (0, _.
|
|
339
|
+
await (0, _.clickUntilExhausted)({
|
|
355
340
|
page,
|
|
356
341
|
buttonLocator,
|
|
357
342
|
maxClicks: 3,
|
|
@@ -370,7 +355,7 @@ const noChangeThresholdHtml = `
|
|
|
370
355
|
(0, _extendedTest.expect)(initialCount).toBe(2);
|
|
371
356
|
const buttonLocator = page.locator("#load-more");
|
|
372
357
|
const containerLocator = page.locator("#container");
|
|
373
|
-
await (0, _.
|
|
358
|
+
await (0, _.clickUntilExhausted)({
|
|
374
359
|
page,
|
|
375
360
|
buttonLocator,
|
|
376
361
|
heartbeat: onHeartbeat,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@intuned/browser-dev",
|
|
3
|
-
"version": "2.2.3-test-build.
|
|
3
|
+
"version": "2.2.3-test-build.2",
|
|
4
4
|
"description": "runner package for intuned functions",
|
|
5
5
|
"types": "./dist/index.d.ts",
|
|
6
6
|
"typesVersions": {
|
|
@@ -35,7 +35,9 @@
|
|
|
35
35
|
"author": "Intuned Team",
|
|
36
36
|
"license": "Elastic-2.0",
|
|
37
37
|
"scripts": {
|
|
38
|
-
"
|
|
38
|
+
"generate-browser-script": "node scripts/generate-browser-script.js",
|
|
39
|
+
"postinstall": "yarn generate-browser-script",
|
|
40
|
+
"build": "rm -rf dist && yarn generate-browser-script && tsc -p tsconfig.json && yarn copy-dts && babel src --out-dir dist --extensions '.ts' && node scripts/ensure-index-types.js",
|
|
39
41
|
"ensure-types": "node scripts/ensure-index-types.js",
|
|
40
42
|
"test": "vitest run",
|
|
41
43
|
"test:dev": "vitest",
|