@intuned/runtime-dev 1.7.0-dev-52-10 → 1.8.0-dev-52
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.babelrc +2 -2
- package/dist/commands/api/run.js +2 -2
- package/dist/commands/cli-auth-sessions/create.js +1 -1
- package/dist/commands/cli-auth-sessions/utils.js +2 -1
- package/dist/commands/common/browserUtils.d.ts +1 -1
- package/dist/commands/common/browserUtils.js +1 -1
- package/dist/commands/common/getFirstLineNumber.js +4 -2
- package/dist/commands/deploy/utils.js +2 -1
- package/dist/commands/interface/run.js +6 -5
- package/dist/commands/run-api-cli/utils.js +6 -6
- package/dist/common/assets/browser_scripts.js +2143 -2509
- package/dist/common/contextStorageStateHelpers.d.ts +1 -1
- package/dist/common/getPlaywrightConstructs.d.ts +1 -1
- package/dist/common/getPlaywrightConstructs.js +12 -12
- package/dist/common/jwtTokenManager.js +5 -3
- package/dist/common/runApi/errors.js +5 -4
- package/dist/common/runApi/index.d.ts +1 -1
- package/dist/common/runApi/index.js +11 -8
- package/dist/runtime/executionHelpers.test.js +4 -3
- package/dist/runtime/export.d.ts +0 -15
- package/dist/runtime/extendPayload.js +1 -1
- package/package.json +3 -2
|
@@ -1,2580 +1,2214 @@
|
|
|
1
1
|
(function () {
|
|
2
|
-
|
|
2
|
+
'use strict';
|
|
3
3
|
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
4
|
+
var MatchSource;
|
|
5
|
+
(function (MatchSource) {
|
|
6
|
+
MatchSource["ATTRIBUTE"] = "attribute";
|
|
7
|
+
MatchSource["TEXT_CONTENT"] = "text_content";
|
|
8
|
+
MatchSource["DIRECT_TEXT_NODE"] = "direct_text_node";
|
|
9
|
+
})(MatchSource || (MatchSource = {}));
|
|
10
|
+
var MatchMode;
|
|
11
|
+
(function (MatchMode) {
|
|
12
|
+
MatchMode["FULL"] = "full";
|
|
13
|
+
MatchMode["PARTIAL"] = "partial";
|
|
14
|
+
MatchMode["FUZZY"] = "fuzzy";
|
|
15
|
+
})(MatchMode || (MatchMode = {}));
|
|
16
16
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
17
|
+
function* searchExact(needle, haystack, startIndex = 0, endIndex = null) {
|
|
18
|
+
const needleLen = needle.length;
|
|
19
|
+
if (needleLen === 0)
|
|
20
|
+
return;
|
|
21
|
+
if (endIndex === null) {
|
|
22
|
+
endIndex = haystack.length;
|
|
23
|
+
}
|
|
24
|
+
let index;
|
|
25
|
+
while ((index = haystack.indexOf(needle, startIndex)) > -1) {
|
|
26
|
+
if (index + needle.length > endIndex)
|
|
27
|
+
break;
|
|
28
|
+
yield index;
|
|
29
|
+
startIndex = index + 1;
|
|
30
|
+
}
|
|
22
31
|
}
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
if (index + needle.length > endIndex) break;
|
|
26
|
-
yield index;
|
|
27
|
-
startIndex = index + 1;
|
|
32
|
+
function reverse(string) {
|
|
33
|
+
return string.split("").reverse().join("");
|
|
28
34
|
}
|
|
29
|
-
}
|
|
30
|
-
function reverse(string) {
|
|
31
|
-
return string.split("").reverse().join("");
|
|
32
|
-
}
|
|
33
35
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
}
|
|
41
|
-
function* fuzzySearch(needle, haystack, maxDist) {
|
|
42
|
-
if (needle.length > haystack.length + maxDist) return;
|
|
43
|
-
const ngramLen = Math.floor(needle.length / (maxDist + 1));
|
|
44
|
-
if (maxDist === 0) {
|
|
45
|
-
for (const index of searchExact(needle, haystack)) {
|
|
46
|
-
yield {
|
|
47
|
-
start: index,
|
|
48
|
-
end: index + needle.length,
|
|
49
|
-
dist: 0,
|
|
50
|
-
};
|
|
51
|
-
}
|
|
52
|
-
} else if (ngramLen >= 10) {
|
|
53
|
-
yield* fuzzySearchNgrams(needle, haystack, maxDist);
|
|
54
|
-
} else {
|
|
55
|
-
yield* fuzzySearchCandidates(needle, haystack, maxDist);
|
|
56
|
-
}
|
|
57
|
-
}
|
|
58
|
-
function _expand(needle, haystack, maxDist) {
|
|
59
|
-
maxDist = +maxDist;
|
|
60
|
-
let firstDiff;
|
|
61
|
-
for (
|
|
62
|
-
firstDiff = 0;
|
|
63
|
-
firstDiff < Math.min(needle.length, haystack.length);
|
|
64
|
-
firstDiff++
|
|
65
|
-
) {
|
|
66
|
-
if (needle.charCodeAt(firstDiff) !== haystack.charCodeAt(firstDiff))
|
|
67
|
-
break;
|
|
68
|
-
}
|
|
69
|
-
if (firstDiff) {
|
|
70
|
-
needle = needle.slice(firstDiff);
|
|
71
|
-
haystack = haystack.slice(firstDiff);
|
|
72
|
-
}
|
|
73
|
-
if (!needle) {
|
|
74
|
-
return [0, firstDiff];
|
|
75
|
-
} else if (!haystack) {
|
|
76
|
-
if (needle.length <= maxDist) {
|
|
77
|
-
return [needle.length, firstDiff];
|
|
78
|
-
} else {
|
|
79
|
-
return [null, null];
|
|
80
|
-
}
|
|
81
|
-
}
|
|
82
|
-
if (maxDist === 0) return [null, null];
|
|
83
|
-
let scores = new Array(needle.length + 1);
|
|
84
|
-
for (let i = 0; i <= maxDist; i++) {
|
|
85
|
-
scores[i] = i;
|
|
86
|
-
}
|
|
87
|
-
let newScores = new Array(needle.length + 1);
|
|
88
|
-
let minScore = null;
|
|
89
|
-
let minScoreIdx = null;
|
|
90
|
-
let maxGoodScore = maxDist;
|
|
91
|
-
let firstGoodScoreIdx = 0;
|
|
92
|
-
let lastGoodScoreIdx = needle.length - 1;
|
|
93
|
-
for (let haystackIdx = 0; haystackIdx < haystack.length; haystackIdx++) {
|
|
94
|
-
const char = haystack.charCodeAt(haystackIdx);
|
|
95
|
-
const needleIdxStart = Math.max(0, firstGoodScoreIdx - 1);
|
|
96
|
-
const needleIdxLimit = Math.min(
|
|
97
|
-
haystackIdx + maxDist,
|
|
98
|
-
needle.length - 1,
|
|
99
|
-
lastGoodScoreIdx
|
|
100
|
-
);
|
|
101
|
-
newScores[0] = scores[0] + 1;
|
|
102
|
-
firstGoodScoreIdx = newScores[0] <= maxGoodScore ? 0 : null;
|
|
103
|
-
lastGoodScoreIdx = newScores[0] <= maxGoodScore ? 0 : -1;
|
|
104
|
-
let needleIdx;
|
|
105
|
-
for (
|
|
106
|
-
needleIdx = needleIdxStart;
|
|
107
|
-
needleIdx < needleIdxLimit;
|
|
108
|
-
needleIdx++
|
|
109
|
-
) {
|
|
110
|
-
const score = (newScores[needleIdx + 1] = Math.min(
|
|
111
|
-
scores[needleIdx] + +(char !== needle.charCodeAt(needleIdx)),
|
|
112
|
-
scores[needleIdx + 1] + 1,
|
|
113
|
-
newScores[needleIdx] + 1
|
|
114
|
-
));
|
|
115
|
-
if (score <= maxGoodScore) {
|
|
116
|
-
if (firstGoodScoreIdx === null) firstGoodScoreIdx = needleIdx + 1;
|
|
117
|
-
lastGoodScoreIdx = Math.max(
|
|
118
|
-
lastGoodScoreIdx,
|
|
119
|
-
needleIdx + 1 + (maxGoodScore - score)
|
|
120
|
-
);
|
|
121
|
-
}
|
|
122
|
-
}
|
|
123
|
-
const lastScore = (newScores[needleIdx + 1] = Math.min(
|
|
124
|
-
scores[needleIdx] + +(char !== needle.charCodeAt(needleIdx)),
|
|
125
|
-
newScores[needleIdx] + 1
|
|
126
|
-
));
|
|
127
|
-
if (lastScore <= maxGoodScore) {
|
|
128
|
-
if (firstGoodScoreIdx === null) firstGoodScoreIdx = needleIdx + 1;
|
|
129
|
-
lastGoodScoreIdx = needleIdx + 1;
|
|
130
|
-
}
|
|
131
|
-
if (
|
|
132
|
-
needleIdx === needle.length - 1 &&
|
|
133
|
-
(minScore === null || lastScore <= minScore)
|
|
134
|
-
) {
|
|
135
|
-
minScore = lastScore;
|
|
136
|
-
minScoreIdx = haystackIdx;
|
|
137
|
-
if (minScore < maxGoodScore) maxGoodScore = minScore;
|
|
138
|
-
}
|
|
139
|
-
[scores, newScores] = [newScores, scores];
|
|
140
|
-
if (firstGoodScoreIdx === null) break;
|
|
141
|
-
}
|
|
142
|
-
if (minScore !== null && minScore <= maxDist) {
|
|
143
|
-
return [minScore, minScoreIdx + 1 + firstDiff];
|
|
144
|
-
} else {
|
|
145
|
-
return [null, null];
|
|
36
|
+
function makeChar2needleIdx(needle, maxDist) {
|
|
37
|
+
const res = {};
|
|
38
|
+
for (let i = Math.min(needle.length - 1, maxDist); i >= 0; i--) {
|
|
39
|
+
res[needle[i]] = i;
|
|
40
|
+
}
|
|
41
|
+
return res;
|
|
146
42
|
}
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
);
|
|
167
|
-
for (const haystackMatchIdx of searchExact(
|
|
168
|
-
ngram,
|
|
169
|
-
haystack,
|
|
170
|
-
startIdx,
|
|
171
|
-
endIdx
|
|
172
|
-
)) {
|
|
173
|
-
// try to expand left
|
|
174
|
-
const [distRight, rightExpandSize] = _expand(
|
|
175
|
-
needleAfter,
|
|
176
|
-
haystack.slice(
|
|
177
|
-
haystackMatchIdx + ngramLen,
|
|
178
|
-
haystackMatchIdx - ngramStartIdx + needleLen + maxDist
|
|
179
|
-
),
|
|
180
|
-
maxDist
|
|
181
|
-
);
|
|
182
|
-
if (distRight === null) continue;
|
|
183
|
-
const [distLeft, leftExpandSize] = _expand(
|
|
184
|
-
needleBeforeReversed,
|
|
185
|
-
reverse(
|
|
186
|
-
haystack.slice(
|
|
187
|
-
Math.max(
|
|
188
|
-
0,
|
|
189
|
-
haystackMatchIdx - ngramStartIdx - (maxDist - distRight)
|
|
190
|
-
),
|
|
191
|
-
haystackMatchIdx
|
|
192
|
-
)
|
|
193
|
-
),
|
|
194
|
-
maxDist - distRight
|
|
195
|
-
);
|
|
196
|
-
if (distLeft === null) continue;
|
|
197
|
-
yield {
|
|
198
|
-
start: haystackMatchIdx - leftExpandSize,
|
|
199
|
-
end: haystackMatchIdx + ngramLen + rightExpandSize,
|
|
200
|
-
dist: distLeft + distRight,
|
|
201
|
-
};
|
|
202
|
-
}
|
|
43
|
+
function* fuzzySearch(needle, haystack, maxDist) {
|
|
44
|
+
if (needle.length > haystack.length + maxDist)
|
|
45
|
+
return;
|
|
46
|
+
const ngramLen = Math.floor(needle.length / (maxDist + 1));
|
|
47
|
+
if (maxDist === 0) {
|
|
48
|
+
for (const index of searchExact(needle, haystack)) {
|
|
49
|
+
yield {
|
|
50
|
+
start: index,
|
|
51
|
+
end: index + needle.length,
|
|
52
|
+
dist: 0,
|
|
53
|
+
};
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
else if (ngramLen >= 10) {
|
|
57
|
+
yield* fuzzySearchNgrams(needle, haystack, maxDist);
|
|
58
|
+
}
|
|
59
|
+
else {
|
|
60
|
+
yield* fuzzySearchCandidates(needle, haystack, maxDist);
|
|
61
|
+
}
|
|
203
62
|
}
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
};
|
|
225
|
-
} else {
|
|
226
|
-
candidates.set(`${i},${needleIdx + 1},${needleIdx}`, {
|
|
227
|
-
startIdx: i,
|
|
228
|
-
needleIdx: needleIdx + 1,
|
|
229
|
-
dist: needleIdx,
|
|
230
|
-
});
|
|
231
|
-
}
|
|
232
|
-
}
|
|
233
|
-
for (const [, candidate] of prevCandidates) {
|
|
234
|
-
// if this sequence char is the candidate's next expected char
|
|
235
|
-
if (needle[candidate.needleIdx] === haystackChar) {
|
|
236
|
-
// if reached the end of the needle, return a match
|
|
237
|
-
if (candidate.needleIdx + 1 === needleLen) {
|
|
238
|
-
yield {
|
|
239
|
-
start: candidate.startIdx,
|
|
240
|
-
end: i + 1,
|
|
241
|
-
dist: candidate.dist,
|
|
242
|
-
};
|
|
243
|
-
} else {
|
|
244
|
-
// otherwise, update the candidate's needleIdx and keep it
|
|
245
|
-
candidates.set(
|
|
246
|
-
`${candidate.startIdx},${candidate.needleIdx + 1},${
|
|
247
|
-
candidate.dist
|
|
248
|
-
}`,
|
|
249
|
-
{
|
|
250
|
-
startIdx: candidate.startIdx,
|
|
251
|
-
needleIdx: candidate.needleIdx + 1,
|
|
252
|
-
dist: candidate.dist,
|
|
253
|
-
}
|
|
254
|
-
);
|
|
255
|
-
}
|
|
256
|
-
} else {
|
|
257
|
-
if (candidate.dist === maxDist) continue;
|
|
258
|
-
candidates.set(
|
|
259
|
-
`${candidate.startIdx},${candidate.needleIdx},${
|
|
260
|
-
candidate.dist + 1
|
|
261
|
-
}`,
|
|
262
|
-
{
|
|
263
|
-
startIdx: candidate.startIdx,
|
|
264
|
-
needleIdx: candidate.needleIdx,
|
|
265
|
-
dist: candidate.dist + 1,
|
|
63
|
+
function _expand(needle, haystack, maxDist) {
|
|
64
|
+
maxDist = +maxDist;
|
|
65
|
+
let firstDiff;
|
|
66
|
+
for (firstDiff = 0; firstDiff < Math.min(needle.length, haystack.length); firstDiff++) {
|
|
67
|
+
if (needle.charCodeAt(firstDiff) !== haystack.charCodeAt(firstDiff))
|
|
68
|
+
break;
|
|
69
|
+
}
|
|
70
|
+
if (firstDiff) {
|
|
71
|
+
needle = needle.slice(firstDiff);
|
|
72
|
+
haystack = haystack.slice(firstDiff);
|
|
73
|
+
}
|
|
74
|
+
if (!needle) {
|
|
75
|
+
return [0, firstDiff];
|
|
76
|
+
}
|
|
77
|
+
else if (!haystack) {
|
|
78
|
+
if (needle.length <= maxDist) {
|
|
79
|
+
return [needle.length, firstDiff];
|
|
80
|
+
}
|
|
81
|
+
else {
|
|
82
|
+
return [null, null];
|
|
266
83
|
}
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
)
|
|
283
|
-
|
|
84
|
+
}
|
|
85
|
+
if (maxDist === 0)
|
|
86
|
+
return [null, null];
|
|
87
|
+
let scores = new Array(needle.length + 1);
|
|
88
|
+
for (let i = 0; i <= maxDist; i++) {
|
|
89
|
+
scores[i] = i;
|
|
90
|
+
}
|
|
91
|
+
let newScores = new Array(needle.length + 1);
|
|
92
|
+
let minScore = null;
|
|
93
|
+
let minScoreIdx = null;
|
|
94
|
+
let maxGoodScore = maxDist;
|
|
95
|
+
let firstGoodScoreIdx = 0;
|
|
96
|
+
let lastGoodScoreIdx = needle.length - 1;
|
|
97
|
+
for (let haystackIdx = 0; haystackIdx < haystack.length; haystackIdx++) {
|
|
98
|
+
const char = haystack.charCodeAt(haystackIdx);
|
|
99
|
+
const needleIdxStart = Math.max(0, firstGoodScoreIdx - 1);
|
|
100
|
+
const needleIdxLimit = Math.min(haystackIdx + maxDist, needle.length - 1, lastGoodScoreIdx);
|
|
101
|
+
newScores[0] = scores[0] + 1;
|
|
102
|
+
firstGoodScoreIdx = newScores[0] <= maxGoodScore ? 0 : null;
|
|
103
|
+
lastGoodScoreIdx = newScores[0] <= maxGoodScore ? 0 : -1;
|
|
104
|
+
let needleIdx;
|
|
105
|
+
for (needleIdx = needleIdxStart; needleIdx < needleIdxLimit; needleIdx++) {
|
|
106
|
+
const score = (newScores[needleIdx + 1] = Math.min(scores[needleIdx] + +(char !== needle.charCodeAt(needleIdx)), scores[needleIdx + 1] + 1, newScores[needleIdx] + 1));
|
|
107
|
+
if (score <= maxGoodScore) {
|
|
108
|
+
if (firstGoodScoreIdx === null)
|
|
109
|
+
firstGoodScoreIdx = needleIdx + 1;
|
|
110
|
+
lastGoodScoreIdx = Math.max(lastGoodScoreIdx, needleIdx + 1 + (maxGoodScore - score));
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
const lastScore = (newScores[needleIdx + 1] = Math.min(scores[needleIdx] + +(char !== needle.charCodeAt(needleIdx)), newScores[needleIdx] + 1));
|
|
114
|
+
if (lastScore <= maxGoodScore) {
|
|
115
|
+
if (firstGoodScoreIdx === null)
|
|
116
|
+
firstGoodScoreIdx = needleIdx + 1;
|
|
117
|
+
lastGoodScoreIdx = needleIdx + 1;
|
|
118
|
+
}
|
|
119
|
+
if (needleIdx === needle.length - 1 &&
|
|
120
|
+
(minScore === null || lastScore <= minScore)) {
|
|
121
|
+
minScore = lastScore;
|
|
122
|
+
minScoreIdx = haystackIdx;
|
|
123
|
+
if (minScore < maxGoodScore)
|
|
124
|
+
maxGoodScore = minScore;
|
|
125
|
+
}
|
|
126
|
+
[scores, newScores] = [newScores, scores];
|
|
127
|
+
if (firstGoodScoreIdx === null)
|
|
128
|
+
break;
|
|
129
|
+
}
|
|
130
|
+
if (minScore !== null && minScore <= maxDist) {
|
|
131
|
+
return [minScore, minScoreIdx + 1 + firstDiff];
|
|
132
|
+
}
|
|
133
|
+
else {
|
|
134
|
+
return [null, null];
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
function* fuzzySearchNgrams(needle, haystack, maxDist) {
|
|
138
|
+
// use n-gram search
|
|
139
|
+
const ngramLen = Math.floor(needle.length / (maxDist + 1));
|
|
140
|
+
const needleLen = needle.length;
|
|
141
|
+
const haystackLen = haystack.length;
|
|
142
|
+
for (let ngramStartIdx = 0; ngramStartIdx <= needle.length - ngramLen; ngramStartIdx += ngramLen) {
|
|
143
|
+
const ngram = needle.slice(ngramStartIdx, ngramStartIdx + ngramLen);
|
|
144
|
+
const ngramEnd = ngramStartIdx + ngramLen;
|
|
145
|
+
const needleBeforeReversed = reverse(needle.slice(0, ngramStartIdx));
|
|
146
|
+
const needleAfter = needle.slice(ngramEnd);
|
|
147
|
+
const startIdx = Math.max(0, ngramStartIdx - maxDist);
|
|
148
|
+
const endIdx = Math.min(haystackLen, haystackLen - needleLen + ngramEnd + maxDist);
|
|
149
|
+
for (const haystackMatchIdx of searchExact(ngram, haystack, startIdx, endIdx)) {
|
|
150
|
+
// try to expand left
|
|
151
|
+
const [distRight, rightExpandSize] = _expand(needleAfter, haystack.slice(haystackMatchIdx + ngramLen, haystackMatchIdx - ngramStartIdx + needleLen + maxDist), maxDist);
|
|
152
|
+
if (distRight === null)
|
|
153
|
+
continue;
|
|
154
|
+
const [distLeft, leftExpandSize] = _expand(needleBeforeReversed, reverse(haystack.slice(Math.max(0, haystackMatchIdx - ngramStartIdx - (maxDist - distRight)), haystackMatchIdx)), maxDist - distRight);
|
|
155
|
+
if (distLeft === null)
|
|
156
|
+
continue;
|
|
284
157
|
yield {
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
158
|
+
start: haystackMatchIdx - leftExpandSize,
|
|
159
|
+
end: haystackMatchIdx + ngramLen + rightExpandSize,
|
|
160
|
+
dist: distLeft + distRight,
|
|
288
161
|
};
|
|
289
|
-
} else {
|
|
290
|
-
candidates.set(
|
|
291
|
-
`${candidate.startIdx},${
|
|
292
|
-
candidate.needleIdx + 1 + nSkipped
|
|
293
|
-
},${candidate.dist + nSkipped}`,
|
|
294
|
-
{
|
|
295
|
-
startIdx: candidate.startIdx,
|
|
296
|
-
needleIdx: candidate.needleIdx + 1 + nSkipped,
|
|
297
|
-
dist: candidate.dist + nSkipped,
|
|
298
|
-
}
|
|
299
|
-
);
|
|
300
|
-
}
|
|
301
|
-
break;
|
|
302
162
|
}
|
|
303
|
-
|
|
304
|
-
if (i + 1 < haystackLen && candidate.needleIdx + 1 < needleLen) {
|
|
305
|
-
candidates.set(
|
|
306
|
-
`${candidate.startIdx},${candidate.needleIdx + 1},${
|
|
307
|
-
candidate.dist + 1
|
|
308
|
-
}`,
|
|
309
|
-
{
|
|
310
|
-
startIdx: candidate.startIdx,
|
|
311
|
-
needleIdx: candidate.needleIdx + 1,
|
|
312
|
-
dist: candidate.dist + 1,
|
|
313
|
-
}
|
|
314
|
-
);
|
|
315
|
-
}
|
|
316
|
-
}
|
|
317
|
-
}
|
|
163
|
+
}
|
|
318
164
|
}
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
165
|
+
function* fuzzySearchCandidates(needle, haystack, maxDist) {
|
|
166
|
+
const needleLen = needle.length;
|
|
167
|
+
const haystackLen = haystack.length;
|
|
168
|
+
if (needleLen > haystackLen + maxDist)
|
|
169
|
+
return;
|
|
170
|
+
const char2needleIdx = makeChar2needleIdx(needle, maxDist);
|
|
171
|
+
let prevCandidates = new Map(); // candidates from the last iteration
|
|
172
|
+
let candidates = new Map(); // new candidates from the current iteration
|
|
173
|
+
// iterate over the chars in the haystack, updating the candidates for each
|
|
174
|
+
for (let i = 0; i < haystack.length; i++) {
|
|
175
|
+
const haystackChar = haystack[i];
|
|
176
|
+
prevCandidates = candidates;
|
|
177
|
+
candidates = new Map();
|
|
178
|
+
const needleIdx = char2needleIdx[haystackChar];
|
|
179
|
+
if (needleIdx !== undefined) {
|
|
180
|
+
if (needleIdx + 1 === needleLen) {
|
|
181
|
+
yield {
|
|
182
|
+
start: i,
|
|
183
|
+
end: i + 1,
|
|
184
|
+
dist: needleIdx,
|
|
185
|
+
};
|
|
186
|
+
}
|
|
187
|
+
else {
|
|
188
|
+
candidates.set(`${i},${needleIdx + 1},${needleIdx}`, {
|
|
189
|
+
startIdx: i,
|
|
190
|
+
needleIdx: needleIdx + 1,
|
|
191
|
+
dist: needleIdx,
|
|
192
|
+
});
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
for (const [, candidate] of prevCandidates) {
|
|
196
|
+
// if this sequence char is the candidate's next expected char
|
|
197
|
+
if (needle[candidate.needleIdx] === haystackChar) {
|
|
198
|
+
// if reached the end of the needle, return a match
|
|
199
|
+
if (candidate.needleIdx + 1 === needleLen) {
|
|
200
|
+
yield {
|
|
201
|
+
start: candidate.startIdx,
|
|
202
|
+
end: i + 1,
|
|
203
|
+
dist: candidate.dist,
|
|
204
|
+
};
|
|
205
|
+
}
|
|
206
|
+
else {
|
|
207
|
+
// otherwise, update the candidate's needleIdx and keep it
|
|
208
|
+
candidates.set(`${candidate.startIdx},${candidate.needleIdx + 1},${candidate.dist}`, {
|
|
209
|
+
startIdx: candidate.startIdx,
|
|
210
|
+
needleIdx: candidate.needleIdx + 1,
|
|
211
|
+
dist: candidate.dist,
|
|
212
|
+
});
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
else {
|
|
216
|
+
if (candidate.dist === maxDist)
|
|
217
|
+
continue;
|
|
218
|
+
candidates.set(`${candidate.startIdx},${candidate.needleIdx},${candidate.dist + 1}`, {
|
|
219
|
+
startIdx: candidate.startIdx,
|
|
220
|
+
needleIdx: candidate.needleIdx,
|
|
221
|
+
dist: candidate.dist + 1,
|
|
222
|
+
});
|
|
223
|
+
for (let nSkipped = 1; nSkipped <= maxDist - candidate.dist; nSkipped++) {
|
|
224
|
+
if (candidate.needleIdx + nSkipped === needleLen) {
|
|
225
|
+
yield {
|
|
226
|
+
start: candidate.startIdx,
|
|
227
|
+
end: i + 1,
|
|
228
|
+
dist: candidate.dist + nSkipped,
|
|
229
|
+
};
|
|
230
|
+
break;
|
|
231
|
+
}
|
|
232
|
+
else if (needle[candidate.needleIdx + nSkipped] === haystackChar) {
|
|
233
|
+
if (candidate.needleIdx + nSkipped + 1 === needleLen) {
|
|
234
|
+
yield {
|
|
235
|
+
start: candidate.startIdx,
|
|
236
|
+
end: i + 1,
|
|
237
|
+
dist: candidate.dist + nSkipped,
|
|
238
|
+
};
|
|
239
|
+
}
|
|
240
|
+
else {
|
|
241
|
+
candidates.set(`${candidate.startIdx},${candidate.needleIdx + 1 + nSkipped},${candidate.dist + nSkipped}`, {
|
|
242
|
+
startIdx: candidate.startIdx,
|
|
243
|
+
needleIdx: candidate.needleIdx + 1 + nSkipped,
|
|
244
|
+
dist: candidate.dist + nSkipped,
|
|
245
|
+
});
|
|
246
|
+
}
|
|
247
|
+
break;
|
|
248
|
+
}
|
|
249
|
+
}
|
|
250
|
+
if (i + 1 < haystackLen && candidate.needleIdx + 1 < needleLen) {
|
|
251
|
+
candidates.set(`${candidate.startIdx},${candidate.needleIdx + 1},${candidate.dist + 1}`, {
|
|
252
|
+
startIdx: candidate.startIdx,
|
|
253
|
+
needleIdx: candidate.needleIdx + 1,
|
|
254
|
+
dist: candidate.dist + 1,
|
|
255
|
+
});
|
|
256
|
+
}
|
|
257
|
+
}
|
|
258
|
+
}
|
|
259
|
+
}
|
|
260
|
+
for (const [, candidate] of candidates) {
|
|
261
|
+
candidate.dist += needle.length - candidate.needleIdx;
|
|
262
|
+
if (candidate.dist <= maxDist) {
|
|
263
|
+
yield {
|
|
264
|
+
start: candidate.startIdx,
|
|
265
|
+
end: haystack.length,
|
|
266
|
+
dist: candidate.dist,
|
|
267
|
+
};
|
|
268
|
+
}
|
|
269
|
+
}
|
|
328
270
|
}
|
|
329
|
-
}
|
|
330
271
|
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
matchedSourceValue: normalizedContent,
|
|
406
|
-
distance: match.dist,
|
|
407
|
-
};
|
|
408
|
-
}
|
|
409
|
-
function hasNonFuzzyOrCloseFuzzyMatch(matches) {
|
|
410
|
-
const hasNonFuzzyMatch = matches.some(
|
|
411
|
-
(match) => match.match_mode !== MatchMode.FUZZY
|
|
412
|
-
);
|
|
413
|
-
const hasVeryCloseFuzzyMatch = matches.some(
|
|
414
|
-
(match) =>
|
|
415
|
-
match.match_mode === MatchMode.FUZZY &&
|
|
416
|
-
match.fuzzy_distance &&
|
|
417
|
-
match.fuzzy_distance < 5
|
|
418
|
-
);
|
|
419
|
-
return hasNonFuzzyMatch || hasVeryCloseFuzzyMatch;
|
|
420
|
-
}
|
|
421
|
-
function getElementXPath(element) {
|
|
422
|
-
if (!element || !element.parentNode || element.nodeName === "#document") {
|
|
423
|
-
return null;
|
|
272
|
+
function findClosestMatch(searchTerm, content, maxLDist) {
|
|
273
|
+
const results = [];
|
|
274
|
+
for (const result of fuzzySearch(searchTerm, content, maxLDist)) {
|
|
275
|
+
results.push(result);
|
|
276
|
+
}
|
|
277
|
+
results.sort((a, b) => {
|
|
278
|
+
if (a.dist === b.dist) {
|
|
279
|
+
return b.end - b.start - (a.end - a.start); // Sort by match length if distances are equal
|
|
280
|
+
}
|
|
281
|
+
return a.dist - b.dist; // Sort by distance
|
|
282
|
+
});
|
|
283
|
+
return results[0];
|
|
284
|
+
}
|
|
285
|
+
function normalizeSpacing(text) {
|
|
286
|
+
if (!text) {
|
|
287
|
+
return "";
|
|
288
|
+
}
|
|
289
|
+
// Replace newlines and tabs with spaces
|
|
290
|
+
let normalized = text.replace(/\n/g, " ").replace(/\t/g, " ");
|
|
291
|
+
// Replace multiple spaces with a single space
|
|
292
|
+
normalized = normalized.split(/\s+/).join(" ");
|
|
293
|
+
return normalized.trim();
|
|
294
|
+
}
|
|
295
|
+
function isMatchExact(data, value) {
|
|
296
|
+
if (!data || !value) {
|
|
297
|
+
return [false, null];
|
|
298
|
+
}
|
|
299
|
+
const normalizedData = normalizeSpacing(data);
|
|
300
|
+
const normalizedValue = normalizeSpacing(value);
|
|
301
|
+
return [normalizedData === normalizedValue, normalizedValue];
|
|
302
|
+
}
|
|
303
|
+
function calculateMaxLDist(value) {
|
|
304
|
+
const length = value.length;
|
|
305
|
+
const Pmax = 0.2;
|
|
306
|
+
const Pmin = 0.05;
|
|
307
|
+
const lengthAtPmax = 10;
|
|
308
|
+
let percentage;
|
|
309
|
+
if (length <= lengthAtPmax) {
|
|
310
|
+
percentage = Pmax;
|
|
311
|
+
}
|
|
312
|
+
else {
|
|
313
|
+
const k = -Math.log(Pmin / Pmax) / (600 - lengthAtPmax);
|
|
314
|
+
percentage = Pmax * Math.exp(-k * (length - lengthAtPmax));
|
|
315
|
+
}
|
|
316
|
+
percentage = Math.max(Pmin, percentage);
|
|
317
|
+
return Math.max(1, Math.floor(length * percentage));
|
|
318
|
+
}
|
|
319
|
+
function isFuzzMatch(searchTerm, content) {
|
|
320
|
+
if (!searchTerm || !content) {
|
|
321
|
+
return {
|
|
322
|
+
found: false,
|
|
323
|
+
matchedValue: null,
|
|
324
|
+
distance: null,
|
|
325
|
+
matchedSourceValue: null,
|
|
326
|
+
};
|
|
327
|
+
}
|
|
328
|
+
const maxLDist = calculateMaxLDist(searchTerm);
|
|
329
|
+
const normalizedSearchTerm = normalizeSpacing(searchTerm);
|
|
330
|
+
const normalizedContent = normalizeSpacing(content);
|
|
331
|
+
const match = findClosestMatch(normalizedSearchTerm.toLowerCase(), normalizedContent.toLowerCase(), maxLDist);
|
|
332
|
+
if (!match) {
|
|
333
|
+
return {
|
|
334
|
+
found: false,
|
|
335
|
+
matchedValue: null,
|
|
336
|
+
distance: null,
|
|
337
|
+
matchedSourceValue: null,
|
|
338
|
+
};
|
|
339
|
+
}
|
|
340
|
+
return {
|
|
341
|
+
found: true,
|
|
342
|
+
matchedValue: normalizedContent.slice(match.start, match.end),
|
|
343
|
+
matchedSourceValue: normalizedContent,
|
|
344
|
+
distance: match.dist,
|
|
345
|
+
};
|
|
424
346
|
}
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
siblingsCount
|
|
437
|
-
|
|
347
|
+
function hasNonFuzzyOrCloseFuzzyMatch(matches) {
|
|
348
|
+
const hasNonFuzzyMatch = matches.some((match) => match.match_mode !== MatchMode.FUZZY);
|
|
349
|
+
const hasVeryCloseFuzzyMatch = matches.some((match) => match.match_mode === MatchMode.FUZZY &&
|
|
350
|
+
match.fuzzy_distance &&
|
|
351
|
+
match.fuzzy_distance < 5);
|
|
352
|
+
return hasNonFuzzyMatch || hasVeryCloseFuzzyMatch;
|
|
353
|
+
}
|
|
354
|
+
function getElementXPath(element) {
|
|
355
|
+
if (!element || !element.parentNode || element.nodeName === "#document") {
|
|
356
|
+
return null;
|
|
357
|
+
}
|
|
358
|
+
let siblingsCount = 1;
|
|
359
|
+
const parent = element.parentNode;
|
|
360
|
+
const nodeName = element.nodeName.toLowerCase();
|
|
361
|
+
const siblings = Array.from(parent.childNodes).filter((node) => node.nodeType === 1 // Node.ELEMENT_NODE
|
|
362
|
+
);
|
|
363
|
+
for (const sibling of siblings) {
|
|
364
|
+
if (sibling === element) {
|
|
365
|
+
break;
|
|
366
|
+
}
|
|
367
|
+
if (sibling.nodeName.toLowerCase() === nodeName) {
|
|
368
|
+
siblingsCount++;
|
|
369
|
+
}
|
|
370
|
+
}
|
|
371
|
+
const parentXPath = getElementXPath(parent);
|
|
372
|
+
if (element.nodeName === "#text") {
|
|
373
|
+
return parentXPath;
|
|
374
|
+
}
|
|
375
|
+
return parentXPath
|
|
376
|
+
? `${parentXPath}/${nodeName}[${siblingsCount}]`
|
|
377
|
+
: `${nodeName}[${siblingsCount}]`;
|
|
438
378
|
}
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
379
|
+
function traverseAndPrune(node, conditionFunc) {
|
|
380
|
+
const children = Array.from(node.children ?? []);
|
|
381
|
+
children.forEach((child) => {
|
|
382
|
+
if (child.children) {
|
|
383
|
+
if (!conditionFunc(child)) {
|
|
384
|
+
traverseAndPrune(child, conditionFunc);
|
|
385
|
+
}
|
|
386
|
+
}
|
|
387
|
+
});
|
|
442
388
|
}
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
}
|
|
457
|
-
function isPartOfString(input, dom) {
|
|
458
|
-
if (!input || !dom) {
|
|
459
|
-
return [false, null, null];
|
|
389
|
+
function isPartOfString(input, dom) {
|
|
390
|
+
if (!input || !dom) {
|
|
391
|
+
return [false, null, null];
|
|
392
|
+
}
|
|
393
|
+
const normalizedInput = normalizeSpacing(input);
|
|
394
|
+
const normalizedDom = normalizeSpacing(dom);
|
|
395
|
+
const matchIndex = normalizedDom
|
|
396
|
+
.toLowerCase()
|
|
397
|
+
.indexOf(normalizedInput.toLowerCase());
|
|
398
|
+
const matchedText = matchIndex !== -1
|
|
399
|
+
? normalizedDom.substring(matchIndex, matchIndex + normalizedInput.length)
|
|
400
|
+
: null;
|
|
401
|
+
return [matchIndex !== -1, matchedText, normalizedDom];
|
|
460
402
|
}
|
|
461
|
-
const normalizedInput = normalizeSpacing(input);
|
|
462
|
-
const normalizedDom = normalizeSpacing(dom);
|
|
463
|
-
const matchIndex = normalizedDom
|
|
464
|
-
.toLowerCase()
|
|
465
|
-
.indexOf(normalizedInput.toLowerCase());
|
|
466
|
-
const matchedText =
|
|
467
|
-
matchIndex !== -1
|
|
468
|
-
? normalizedDom.substring(
|
|
469
|
-
matchIndex,
|
|
470
|
-
matchIndex + normalizedInput.length
|
|
471
|
-
)
|
|
472
|
-
: null;
|
|
473
|
-
return [matchIndex !== -1, matchedText, normalizedDom];
|
|
474
|
-
}
|
|
475
403
|
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
404
|
+
function matchStringsWithDomContent(domNode, stringsList) {
|
|
405
|
+
const exactMatchedMap = matchExactStrings(domNode, stringsList);
|
|
406
|
+
const stringsWithNoExactMatch = stringsList.filter((data) => !hasNonFuzzyOrCloseFuzzyMatch(exactMatchedMap[data]));
|
|
407
|
+
if (stringsWithNoExactMatch.length === 0) {
|
|
408
|
+
return exactMatchedMap;
|
|
409
|
+
}
|
|
410
|
+
const fuzzMatchedMap = matchFuzzyStrings(domNode, stringsWithNoExactMatch);
|
|
411
|
+
for (const [data, fuzzyMatches] of Object.entries(fuzzMatchedMap)) {
|
|
412
|
+
if (data in exactMatchedMap) {
|
|
413
|
+
exactMatchedMap[data].push(...fuzzyMatches);
|
|
414
|
+
}
|
|
415
|
+
else {
|
|
416
|
+
exactMatchedMap[data] = fuzzyMatches;
|
|
417
|
+
}
|
|
418
|
+
}
|
|
419
|
+
// attributes to try fuzzy match attributes on
|
|
420
|
+
const stringsWithNoMatch = stringsList.filter((data) => !hasNonFuzzyOrCloseFuzzyMatch(exactMatchedMap[data]));
|
|
421
|
+
const attributesFuzzyMatchedMap = matchFuzzyAttributes(domNode, stringsWithNoMatch);
|
|
422
|
+
for (const [data, attributeFuzzyMatches] of Object.entries(attributesFuzzyMatchedMap)) {
|
|
423
|
+
if (data in exactMatchedMap) {
|
|
424
|
+
exactMatchedMap[data].push(...attributeFuzzyMatches);
|
|
425
|
+
}
|
|
426
|
+
else {
|
|
427
|
+
exactMatchedMap[data] = attributeFuzzyMatches;
|
|
428
|
+
}
|
|
429
|
+
}
|
|
430
|
+
return exactMatchedMap;
|
|
431
|
+
}
|
|
432
|
+
function matchExactStrings(domNode, stringsList) {
|
|
433
|
+
const allNodes = [
|
|
434
|
+
domNode,
|
|
435
|
+
...Array.from(domNode.querySelectorAll("*")),
|
|
436
|
+
].reverse();
|
|
437
|
+
const matchesMap = Object.fromEntries(stringsList.map((data) => [data, []]));
|
|
438
|
+
for (const tag of allNodes) {
|
|
439
|
+
const xpath = getElementXPath(tag);
|
|
440
|
+
for (const stringValue of stringsList) {
|
|
441
|
+
const matchesXPaths = matchesMap[stringValue].map((match) => match.xpath || "");
|
|
442
|
+
const xpathIsChildOfMatch = matchesXPaths.some((matchXPath) => matchXPath !== xpath && matchXPath.startsWith(xpath));
|
|
443
|
+
if (xpathIsChildOfMatch)
|
|
444
|
+
continue;
|
|
445
|
+
const attributeNames = tag.getAttributeNames();
|
|
446
|
+
for (const attr of attributeNames) {
|
|
447
|
+
const attributeValue = tag.getAttribute(attr) || "";
|
|
448
|
+
const [isPartOfStringResult, matchedValue] = isPartOfString(stringValue, attributeValue);
|
|
449
|
+
if (isPartOfStringResult) {
|
|
450
|
+
const [isExact] = isMatchExact(stringValue, attributeValue);
|
|
451
|
+
matchesMap[stringValue].push({
|
|
452
|
+
attribute: attr,
|
|
453
|
+
fuzzy_distance: null,
|
|
454
|
+
match_mode: isExact ? MatchMode.FULL : MatchMode.PARTIAL,
|
|
455
|
+
match_source: MatchSource.ATTRIBUTE,
|
|
456
|
+
matched_value: matchedValue,
|
|
457
|
+
matched_source_value: attributeValue,
|
|
458
|
+
tag: tag.tagName.toLowerCase(),
|
|
459
|
+
xpath,
|
|
460
|
+
});
|
|
461
|
+
}
|
|
462
|
+
}
|
|
463
|
+
if (tag["href"]) {
|
|
464
|
+
const result = matchHref(tag, stringValue);
|
|
465
|
+
if (result) {
|
|
466
|
+
matchesMap[stringValue].push(result);
|
|
467
|
+
}
|
|
468
|
+
}
|
|
469
|
+
// Check for direct text nodes
|
|
470
|
+
for (const childNode of tag.childNodes) {
|
|
471
|
+
// Node.TEXT_NODE
|
|
472
|
+
if (childNode.nodeType === 3) {
|
|
473
|
+
const directTextContent = childNode.textContent?.trim() || "";
|
|
474
|
+
if (directTextContent) {
|
|
475
|
+
const [isPartOfStringResult, matchedValue, source_value] = isPartOfString(stringValue, directTextContent);
|
|
476
|
+
if (isPartOfStringResult) {
|
|
477
|
+
const [isExact] = isMatchExact(stringValue, directTextContent);
|
|
478
|
+
matchesMap[stringValue].push({
|
|
479
|
+
attribute: null,
|
|
480
|
+
fuzzy_distance: null,
|
|
481
|
+
match_mode: isExact ? MatchMode.FULL : MatchMode.PARTIAL,
|
|
482
|
+
match_source: MatchSource.DIRECT_TEXT_NODE,
|
|
483
|
+
matched_value: matchedValue,
|
|
484
|
+
matched_source_value: source_value,
|
|
485
|
+
tag: tag.tagName.toLowerCase(),
|
|
486
|
+
xpath,
|
|
487
|
+
});
|
|
488
|
+
}
|
|
489
|
+
}
|
|
490
|
+
}
|
|
491
|
+
}
|
|
492
|
+
const tagTextContent = tag.textContent || "";
|
|
493
|
+
const [isPartOfStringResult, matchedValue, source_value] = isPartOfString(stringValue, tagTextContent);
|
|
494
|
+
if (isPartOfStringResult) {
|
|
495
|
+
const [isExact] = isMatchExact(stringValue, tagTextContent);
|
|
496
|
+
matchesMap[stringValue].push({
|
|
497
|
+
attribute: null,
|
|
498
|
+
fuzzy_distance: null,
|
|
499
|
+
match_mode: isExact ? MatchMode.FULL : MatchMode.PARTIAL,
|
|
500
|
+
match_source: MatchSource.TEXT_CONTENT,
|
|
501
|
+
matched_value: matchedValue,
|
|
502
|
+
matched_source_value: source_value,
|
|
503
|
+
tag: tag.tagName.toLowerCase(),
|
|
504
|
+
xpath,
|
|
505
|
+
});
|
|
506
|
+
}
|
|
507
|
+
}
|
|
508
|
+
}
|
|
509
|
+
return matchesMap;
|
|
510
|
+
}
|
|
511
|
+
function matchFuzzyStrings(domNode, stringsToMatch) {
|
|
512
|
+
const matchesMap = Object.fromEntries(stringsToMatch.map((data) => [data, []]));
|
|
513
|
+
const conditionFunc = (stringToMatch, node) => {
|
|
514
|
+
let foundMatch = false;
|
|
515
|
+
const currentXPath = getElementXPath(node);
|
|
516
|
+
for (const attr of node.getAttributeNames()) {
|
|
517
|
+
const attributeValue = node.getAttribute(attr) || "";
|
|
518
|
+
const { found: isFuzzMatchFound, matchedValue, distance: dist, matchedSourceValue, } = isFuzzMatch(stringToMatch, attributeValue);
|
|
519
|
+
if (isFuzzMatchFound) {
|
|
520
|
+
matchesMap[stringToMatch].push({
|
|
521
|
+
attribute: attr,
|
|
522
|
+
fuzzy_distance: dist,
|
|
523
|
+
match_mode: MatchMode.FUZZY,
|
|
524
|
+
match_source: MatchSource.ATTRIBUTE,
|
|
525
|
+
matched_value: matchedValue,
|
|
526
|
+
tag: node.tagName.toLowerCase(),
|
|
527
|
+
xpath: currentXPath,
|
|
528
|
+
matched_source_value: matchedSourceValue,
|
|
529
|
+
});
|
|
530
|
+
foundMatch = true;
|
|
531
|
+
}
|
|
532
|
+
}
|
|
533
|
+
const tagTextContent = node.textContent || "";
|
|
534
|
+
if (tagTextContent) {
|
|
535
|
+
const { found: isFuzzMatchFound, matchedValue, distance: dist, matchedSourceValue, } = isFuzzMatch(stringToMatch, tagTextContent);
|
|
536
|
+
if (isFuzzMatchFound) {
|
|
537
|
+
matchesMap[stringToMatch].push({
|
|
538
|
+
attribute: null,
|
|
539
|
+
fuzzy_distance: dist,
|
|
540
|
+
match_mode: MatchMode.FUZZY,
|
|
541
|
+
match_source: MatchSource.TEXT_CONTENT,
|
|
542
|
+
matched_value: matchedValue,
|
|
543
|
+
tag: node.tagName.toLowerCase(),
|
|
544
|
+
xpath: currentXPath,
|
|
545
|
+
matched_source_value: matchedSourceValue,
|
|
546
|
+
});
|
|
547
|
+
foundMatch = true;
|
|
548
|
+
}
|
|
549
|
+
}
|
|
550
|
+
// Check for direct text nodes
|
|
551
|
+
for (const childNode of node.childNodes) {
|
|
552
|
+
// Node.TEXT_NODE
|
|
553
|
+
if (childNode.nodeType === 3) {
|
|
554
|
+
const directTextContent = childNode.textContent?.trim() || "";
|
|
555
|
+
if (directTextContent) {
|
|
556
|
+
const { found: isFuzzMatchFound, matchedValue, distance: dist, matchedSourceValue, } = isFuzzMatch(stringToMatch, directTextContent);
|
|
557
|
+
if (isFuzzMatchFound) {
|
|
558
|
+
matchesMap[stringToMatch].push({
|
|
559
|
+
attribute: null,
|
|
560
|
+
fuzzy_distance: dist,
|
|
561
|
+
match_mode: MatchMode.FUZZY,
|
|
562
|
+
match_source: MatchSource.DIRECT_TEXT_NODE,
|
|
563
|
+
matched_value: matchedValue,
|
|
564
|
+
tag: node.tagName.toLowerCase(),
|
|
565
|
+
xpath: currentXPath,
|
|
566
|
+
matched_source_value: matchedSourceValue,
|
|
567
|
+
});
|
|
568
|
+
foundMatch = true;
|
|
569
|
+
}
|
|
570
|
+
}
|
|
571
|
+
}
|
|
572
|
+
}
|
|
573
|
+
return !foundMatch;
|
|
574
|
+
};
|
|
575
|
+
for (const stringToMatch of stringsToMatch) {
|
|
576
|
+
conditionFunc(stringToMatch, domNode);
|
|
577
|
+
traverseAndPrune(domNode, (node) => conditionFunc(stringToMatch, node));
|
|
578
|
+
}
|
|
579
|
+
for (const [stringToMatch, matches] of Object.entries(matchesMap)) {
|
|
580
|
+
const matchesToRemove = new Set();
|
|
581
|
+
matches.forEach((match, i) => {
|
|
582
|
+
for (const otherMatch of matches.slice(i + 1)) {
|
|
583
|
+
if ((otherMatch.xpath || "").startsWith((match.xpath || "") + "/")) {
|
|
584
|
+
matchesToRemove.add(i);
|
|
585
|
+
break;
|
|
586
|
+
}
|
|
587
|
+
}
|
|
547
588
|
});
|
|
548
|
-
|
|
549
|
-
}
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
589
|
+
matchesMap[stringToMatch] = matches.filter((_, i) => !matchesToRemove.has(i));
|
|
590
|
+
}
|
|
591
|
+
return matchesMap;
|
|
592
|
+
}
|
|
593
|
+
function matchFuzzyAttributes(domNode, stringsToMatch) {
|
|
594
|
+
const matchesMap = Object.fromEntries(stringsToMatch.map((data) => [data, []]));
|
|
595
|
+
const allAttributes = getAllAttributes(domNode);
|
|
596
|
+
for (const stringToMatch of stringsToMatch) {
|
|
597
|
+
const stringToSearchIn = allAttributes
|
|
598
|
+
.filter((attr) => attr.value.length > 10)
|
|
599
|
+
.filter((attr) => {
|
|
600
|
+
const lengthDiff = Math.abs(attr.value.length - stringToMatch.length);
|
|
601
|
+
return lengthDiff <= 0.2 * stringToMatch.length;
|
|
602
|
+
})
|
|
603
|
+
.map((attr) => attr.value)
|
|
604
|
+
.join("\n");
|
|
605
|
+
const { found: isFuzzMatchFound, matchedValue, distance: dist, } = isFuzzMatch(stringToMatch, stringToSearchIn);
|
|
606
|
+
if (isFuzzMatchFound) {
|
|
607
|
+
const matchLine = allAttributes.find((attr) => matchedValue && attr.value.includes(matchedValue));
|
|
608
|
+
if (!matchLine)
|
|
609
|
+
continue;
|
|
610
|
+
matchesMap[stringToMatch].push({
|
|
611
|
+
attribute: matchLine.attr,
|
|
612
|
+
fuzzy_distance: dist,
|
|
613
|
+
match_mode: MatchMode.FUZZY,
|
|
614
|
+
match_source: MatchSource.ATTRIBUTE,
|
|
615
|
+
matched_value: matchedValue,
|
|
616
|
+
xpath: matchLine.node,
|
|
617
|
+
matched_source_value: matchLine.value,
|
|
618
|
+
tag: matchLine.tag,
|
|
575
619
|
});
|
|
576
|
-
}
|
|
577
620
|
}
|
|
578
|
-
}
|
|
579
621
|
}
|
|
580
|
-
|
|
581
|
-
const [isPartOfStringResult, matchedValue, source_value] =
|
|
582
|
-
isPartOfString(stringValue, tagTextContent);
|
|
583
|
-
if (isPartOfStringResult) {
|
|
584
|
-
const [isExact] = isMatchExact(stringValue, tagTextContent);
|
|
585
|
-
matchesMap[stringValue].push({
|
|
586
|
-
attribute: null,
|
|
587
|
-
fuzzy_distance: null,
|
|
588
|
-
match_mode: isExact ? MatchMode.FULL : MatchMode.PARTIAL,
|
|
589
|
-
match_source: MatchSource.TEXT_CONTENT,
|
|
590
|
-
matched_value: matchedValue,
|
|
591
|
-
matched_source_value: source_value,
|
|
592
|
-
tag: tag.tagName.toLowerCase(),
|
|
593
|
-
xpath,
|
|
594
|
-
});
|
|
595
|
-
}
|
|
596
|
-
}
|
|
622
|
+
return matchesMap;
|
|
597
623
|
}
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
const currentXPath = getElementXPath(node);
|
|
607
|
-
for (const attr of node.getAttributeNames()) {
|
|
608
|
-
const attributeValue = node.getAttribute(attr) || "";
|
|
609
|
-
const {
|
|
610
|
-
found: isFuzzMatchFound,
|
|
611
|
-
matchedValue,
|
|
612
|
-
distance: dist,
|
|
613
|
-
matchedSourceValue,
|
|
614
|
-
} = isFuzzMatch(stringToMatch, attributeValue);
|
|
615
|
-
if (isFuzzMatchFound) {
|
|
616
|
-
matchesMap[stringToMatch].push({
|
|
617
|
-
attribute: attr,
|
|
618
|
-
fuzzy_distance: dist,
|
|
619
|
-
match_mode: MatchMode.FUZZY,
|
|
620
|
-
match_source: MatchSource.ATTRIBUTE,
|
|
621
|
-
matched_value: matchedValue,
|
|
624
|
+
function getAllAttributes(node) {
|
|
625
|
+
const allNodes = [node, ...Array.from(node.querySelectorAll("*"))].reverse();
|
|
626
|
+
return allNodes.flatMap((node) => node
|
|
627
|
+
.getAttributeNames()
|
|
628
|
+
.map((attr) => ({
|
|
629
|
+
node: getElementXPath(node),
|
|
630
|
+
attr,
|
|
631
|
+
value: node.getAttribute(attr) || "",
|
|
622
632
|
tag: node.tagName.toLowerCase(),
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
attribute: null,
|
|
640
|
-
fuzzy_distance: dist,
|
|
641
|
-
match_mode: MatchMode.FUZZY,
|
|
642
|
-
match_source: MatchSource.TEXT_CONTENT,
|
|
643
|
-
matched_value: matchedValue,
|
|
644
|
-
tag: node.tagName.toLowerCase(),
|
|
645
|
-
xpath: currentXPath,
|
|
646
|
-
matched_source_value: matchedSourceValue,
|
|
647
|
-
});
|
|
648
|
-
foundMatch = true;
|
|
649
|
-
}
|
|
650
|
-
}
|
|
651
|
-
// Check for direct text nodes
|
|
652
|
-
for (const childNode of node.childNodes) {
|
|
653
|
-
// Node.TEXT_NODE
|
|
654
|
-
if (childNode.nodeType === 3) {
|
|
655
|
-
const directTextContent = childNode.textContent?.trim() || "";
|
|
656
|
-
if (directTextContent) {
|
|
657
|
-
const {
|
|
658
|
-
found: isFuzzMatchFound,
|
|
659
|
-
matchedValue,
|
|
660
|
-
distance: dist,
|
|
661
|
-
matchedSourceValue,
|
|
662
|
-
} = isFuzzMatch(stringToMatch, directTextContent);
|
|
663
|
-
if (isFuzzMatchFound) {
|
|
664
|
-
matchesMap[stringToMatch].push({
|
|
665
|
-
attribute: null,
|
|
666
|
-
fuzzy_distance: dist,
|
|
667
|
-
match_mode: MatchMode.FUZZY,
|
|
668
|
-
match_source: MatchSource.DIRECT_TEXT_NODE,
|
|
633
|
+
}))
|
|
634
|
+
.filter((i) => i.value.length > 10));
|
|
635
|
+
}
|
|
636
|
+
function matchHref(node, stringToMatch) {
|
|
637
|
+
if (!node["href"] || typeof node["href"] !== "string") {
|
|
638
|
+
return;
|
|
639
|
+
}
|
|
640
|
+
const attributeValue = node["href"] || "";
|
|
641
|
+
let [isPartOfStringResult, matchedValue] = isPartOfString(stringToMatch, attributeValue);
|
|
642
|
+
if (isPartOfStringResult) {
|
|
643
|
+
const [isExact] = isMatchExact(stringToMatch, attributeValue);
|
|
644
|
+
return {
|
|
645
|
+
attribute: "href",
|
|
646
|
+
fuzzy_distance: null,
|
|
647
|
+
match_mode: isExact ? MatchMode.FULL : MatchMode.PARTIAL,
|
|
648
|
+
match_source: MatchSource.ATTRIBUTE,
|
|
669
649
|
matched_value: matchedValue,
|
|
650
|
+
matched_source_value: attributeValue,
|
|
670
651
|
tag: node.tagName.toLowerCase(),
|
|
671
|
-
xpath:
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
652
|
+
xpath: getElementXPath(node),
|
|
653
|
+
};
|
|
654
|
+
}
|
|
655
|
+
let decodedStringToMatch;
|
|
656
|
+
try {
|
|
657
|
+
decodedStringToMatch = decodeURI(stringToMatch);
|
|
658
|
+
}
|
|
659
|
+
catch (e) {
|
|
660
|
+
console.log("failed to decode stringToMatch", stringToMatch);
|
|
661
|
+
return;
|
|
662
|
+
}
|
|
663
|
+
[isPartOfStringResult, matchedValue] = isPartOfString(decodedStringToMatch, attributeValue);
|
|
664
|
+
if (isPartOfStringResult) {
|
|
665
|
+
const [isExact] = isMatchExact(stringToMatch, attributeValue);
|
|
666
|
+
return {
|
|
667
|
+
attribute: "href",
|
|
668
|
+
fuzzy_distance: null,
|
|
669
|
+
match_mode: isExact ? MatchMode.FULL : MatchMode.PARTIAL,
|
|
670
|
+
match_source: MatchSource.ATTRIBUTE,
|
|
671
|
+
matched_value: matchedValue,
|
|
672
|
+
matched_source_value: attributeValue,
|
|
673
|
+
tag: node.tagName.toLowerCase(),
|
|
674
|
+
xpath: getElementXPath(node),
|
|
675
|
+
};
|
|
677
676
|
}
|
|
678
|
-
}
|
|
679
|
-
return !foundMatch;
|
|
680
|
-
};
|
|
681
|
-
for (const stringToMatch of stringsToMatch) {
|
|
682
|
-
conditionFunc(stringToMatch, domNode);
|
|
683
|
-
traverseAndPrune(domNode, (node) => conditionFunc(stringToMatch, node));
|
|
684
|
-
}
|
|
685
|
-
for (const [stringToMatch, matches] of Object.entries(matchesMap)) {
|
|
686
|
-
const matchesToRemove = new Set();
|
|
687
|
-
matches.forEach((match, i) => {
|
|
688
|
-
for (const otherMatch of matches.slice(i + 1)) {
|
|
689
|
-
if ((otherMatch.xpath || "").startsWith((match.xpath || "") + "/")) {
|
|
690
|
-
matchesToRemove.add(i);
|
|
691
|
-
break;
|
|
692
|
-
}
|
|
693
|
-
}
|
|
694
|
-
});
|
|
695
|
-
matchesMap[stringToMatch] = matches.filter(
|
|
696
|
-
(_, i) => !matchesToRemove.has(i)
|
|
697
|
-
);
|
|
698
|
-
}
|
|
699
|
-
return matchesMap;
|
|
700
|
-
}
|
|
701
|
-
function matchFuzzyAttributes(domNode, stringsToMatch) {
|
|
702
|
-
const matchesMap = Object.fromEntries(
|
|
703
|
-
stringsToMatch.map((data) => [data, []])
|
|
704
|
-
);
|
|
705
|
-
const allAttributes = getAllAttributes(domNode);
|
|
706
|
-
for (const stringToMatch of stringsToMatch) {
|
|
707
|
-
const stringToSearchIn = allAttributes
|
|
708
|
-
.filter((attr) => attr.value.length > 10)
|
|
709
|
-
.filter((attr) => {
|
|
710
|
-
const lengthDiff = Math.abs(attr.value.length - stringToMatch.length);
|
|
711
|
-
return lengthDiff <= 0.2 * stringToMatch.length;
|
|
712
|
-
})
|
|
713
|
-
.map((attr) => attr.value)
|
|
714
|
-
.join("\n");
|
|
715
|
-
const {
|
|
716
|
-
found: isFuzzMatchFound,
|
|
717
|
-
matchedValue,
|
|
718
|
-
distance: dist,
|
|
719
|
-
} = isFuzzMatch(stringToMatch, stringToSearchIn);
|
|
720
|
-
if (isFuzzMatchFound) {
|
|
721
|
-
const matchLine = allAttributes.find(
|
|
722
|
-
(attr) => matchedValue && attr.value.includes(matchedValue)
|
|
723
|
-
);
|
|
724
|
-
if (!matchLine) continue;
|
|
725
|
-
matchesMap[stringToMatch].push({
|
|
726
|
-
attribute: matchLine.attr,
|
|
727
|
-
fuzzy_distance: dist,
|
|
728
|
-
match_mode: MatchMode.FUZZY,
|
|
729
|
-
match_source: MatchSource.ATTRIBUTE,
|
|
730
|
-
matched_value: matchedValue,
|
|
731
|
-
xpath: matchLine.node,
|
|
732
|
-
matched_source_value: matchLine.value,
|
|
733
|
-
tag: matchLine.tag,
|
|
734
|
-
});
|
|
735
|
-
}
|
|
736
|
-
}
|
|
737
|
-
return matchesMap;
|
|
738
|
-
}
|
|
739
|
-
function getAllAttributes(node) {
|
|
740
|
-
const allNodes = [
|
|
741
|
-
node,
|
|
742
|
-
...Array.from(node.querySelectorAll("*")),
|
|
743
|
-
].reverse();
|
|
744
|
-
return allNodes.flatMap((node) =>
|
|
745
|
-
node
|
|
746
|
-
.getAttributeNames()
|
|
747
|
-
.map((attr) => ({
|
|
748
|
-
node: getElementXPath(node),
|
|
749
|
-
attr,
|
|
750
|
-
value: node.getAttribute(attr) || "",
|
|
751
|
-
tag: node.tagName.toLowerCase(),
|
|
752
|
-
}))
|
|
753
|
-
.filter((i) => i.value.length > 10)
|
|
754
|
-
);
|
|
755
|
-
}
|
|
756
|
-
function matchHref(node, stringToMatch) {
|
|
757
|
-
if (!node["href"] || typeof node["href"] !== "string") {
|
|
758
|
-
return;
|
|
759
|
-
}
|
|
760
|
-
const attributeValue = node["href"] || "";
|
|
761
|
-
let [isPartOfStringResult, matchedValue] = isPartOfString(
|
|
762
|
-
stringToMatch,
|
|
763
|
-
attributeValue
|
|
764
|
-
);
|
|
765
|
-
if (isPartOfStringResult) {
|
|
766
|
-
const [isExact] = isMatchExact(stringToMatch, attributeValue);
|
|
767
|
-
return {
|
|
768
|
-
attribute: "href",
|
|
769
|
-
fuzzy_distance: null,
|
|
770
|
-
match_mode: isExact ? MatchMode.FULL : MatchMode.PARTIAL,
|
|
771
|
-
match_source: MatchSource.ATTRIBUTE,
|
|
772
|
-
matched_value: matchedValue,
|
|
773
|
-
matched_source_value: attributeValue,
|
|
774
|
-
tag: node.tagName.toLowerCase(),
|
|
775
|
-
xpath: getElementXPath(node),
|
|
776
|
-
};
|
|
777
|
-
}
|
|
778
|
-
let decodedStringToMatch;
|
|
779
|
-
try {
|
|
780
|
-
decodedStringToMatch = decodeURI(stringToMatch);
|
|
781
|
-
} catch (e) {
|
|
782
|
-
console.log("failed to decode stringToMatch", stringToMatch);
|
|
783
|
-
return;
|
|
784
|
-
}
|
|
785
|
-
[isPartOfStringResult, matchedValue] = isPartOfString(
|
|
786
|
-
decodedStringToMatch,
|
|
787
|
-
attributeValue
|
|
788
|
-
);
|
|
789
|
-
if (isPartOfStringResult) {
|
|
790
|
-
const [isExact] = isMatchExact(stringToMatch, attributeValue);
|
|
791
|
-
return {
|
|
792
|
-
attribute: "href",
|
|
793
|
-
fuzzy_distance: null,
|
|
794
|
-
match_mode: isExact ? MatchMode.FULL : MatchMode.PARTIAL,
|
|
795
|
-
match_source: MatchSource.ATTRIBUTE,
|
|
796
|
-
matched_value: matchedValue,
|
|
797
|
-
matched_source_value: attributeValue,
|
|
798
|
-
tag: node.tagName.toLowerCase(),
|
|
799
|
-
xpath: getElementXPath(node),
|
|
800
|
-
};
|
|
801
677
|
}
|
|
802
|
-
}
|
|
803
678
|
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
} else {
|
|
815
|
-
return 0;
|
|
816
|
-
}
|
|
817
|
-
}
|
|
818
|
-
// Define the characters that require escaping
|
|
819
|
-
const slashChars = "\\`*_{}[]()#+-.!";
|
|
820
|
-
// Escape any special regex characters in slashChars
|
|
821
|
-
const escapedSlashChars = slashChars.replace(
|
|
822
|
-
/[-/\\^$*+?.()|[\]{}]/g,
|
|
823
|
-
"\\$&"
|
|
824
|
-
);
|
|
825
|
-
// Create the regular expression
|
|
826
|
-
const mdBackslashMatcher = new RegExp(
|
|
827
|
-
`\\\\(?=[${escapedSlashChars}])`,
|
|
828
|
-
"g"
|
|
829
|
-
);
|
|
830
|
-
const mdDotMatcher = new RegExp(`^(\\s*\\d+)(\\.)(?=\\s)`, "gm");
|
|
831
|
-
const mdPlusMatcher = new RegExp(`^(\\s*)(\\+)(?=\\s)`, "gm");
|
|
832
|
-
const mdDashMatcher = new RegExp(`^(\\s*)(-)(?=\\s|-)`, "gm");
|
|
833
|
-
function escapeMdSection(text) {
|
|
834
|
-
text = text.replace(mdBackslashMatcher, "\\\\");
|
|
835
|
-
text = text.replace(mdDotMatcher, "$1\\$2");
|
|
836
|
-
text = text.replace(mdPlusMatcher, "$1\\$2");
|
|
837
|
-
text = text.replace(mdDashMatcher, "$1\\$2");
|
|
838
|
-
return text;
|
|
839
|
-
}
|
|
840
|
-
function isFirstTbody(element) {
|
|
841
|
-
const previousSibling = element.previousSibling;
|
|
842
|
-
return (
|
|
843
|
-
element.nodeName === "TBODY" &&
|
|
844
|
-
(!previousSibling ||
|
|
845
|
-
(previousSibling.nodeName === "THEAD" &&
|
|
846
|
-
/^\s*$/i.test(previousSibling.textContent ?? "")))
|
|
847
|
-
);
|
|
848
|
-
}
|
|
849
|
-
function isHeadingRow(tr) {
|
|
850
|
-
const parentNode = tr.parentNode;
|
|
851
|
-
return (
|
|
852
|
-
parentNode.nodeName === "THEAD" ||
|
|
853
|
-
(parentNode.firstChild === tr &&
|
|
854
|
-
(parentNode.nodeName === "TABLE" || isFirstTbody(parentNode)) &&
|
|
855
|
-
Array.from(tr.childNodes).every(function (n) {
|
|
856
|
-
return n.nodeName === "TH";
|
|
857
|
-
}))
|
|
858
|
-
);
|
|
859
|
-
}
|
|
860
|
-
class Html2Text {
|
|
861
|
-
p_p = 0;
|
|
862
|
-
abbrData; // last inner HTML (for abbr being defined)
|
|
863
|
-
pre = false;
|
|
864
|
-
code = false;
|
|
865
|
-
startPre = false;
|
|
866
|
-
blockquote = 0;
|
|
867
|
-
list = [];
|
|
868
|
-
start = true;
|
|
869
|
-
breakToggle = "";
|
|
870
|
-
space;
|
|
871
|
-
lastWasNewLine = false;
|
|
872
|
-
a = null;
|
|
873
|
-
outCount = 0;
|
|
874
|
-
baseurl;
|
|
875
|
-
abbrList = {};
|
|
876
|
-
outText = "";
|
|
877
|
-
outTextList = [];
|
|
878
|
-
abbr_title;
|
|
879
|
-
skipInternalLinks = true;
|
|
880
|
-
aStack = [];
|
|
881
|
-
maybeAutomaticLink;
|
|
882
|
-
lastWasList = false;
|
|
883
|
-
absoluteUrlMatcher = new RegExp("^[a-zA-Z+]+://");
|
|
884
|
-
emphasis_mark = "_";
|
|
885
|
-
strong_mark = "**";
|
|
886
|
-
break() {
|
|
887
|
-
if (this.p_p === 0) {
|
|
888
|
-
this.p_p = 1;
|
|
889
|
-
}
|
|
890
|
-
}
|
|
891
|
-
softBreak() {
|
|
892
|
-
this.break();
|
|
893
|
-
this.breakToggle = " ";
|
|
894
|
-
}
|
|
895
|
-
processOutput(data, pureData = 0, force = 0) {
|
|
896
|
-
if (this.abbrData !== undefined) {
|
|
897
|
-
this.abbrData += data;
|
|
898
|
-
}
|
|
899
|
-
if (pureData && !this.pre) {
|
|
900
|
-
data = data.replace(/\s+/g, " ");
|
|
901
|
-
if (data && data[0] === " ") {
|
|
902
|
-
this.space = 1;
|
|
903
|
-
data = data.substring(1);
|
|
904
|
-
}
|
|
905
|
-
}
|
|
906
|
-
if (!data && force !== "end") return;
|
|
907
|
-
if (this.startPre) {
|
|
908
|
-
if (!data.startsWith("\n")) {
|
|
909
|
-
data = "\n" + data;
|
|
910
|
-
}
|
|
911
|
-
}
|
|
912
|
-
let newLineIndent = ">".repeat(this.blockquote ?? 0);
|
|
913
|
-
if (!(force === "end" && data && data[0] === ">") && this.blockquote) {
|
|
914
|
-
newLineIndent += " ";
|
|
915
|
-
}
|
|
916
|
-
if (this.pre) {
|
|
917
|
-
if (this.list.length === 0) {
|
|
918
|
-
newLineIndent += " ";
|
|
919
|
-
} else {
|
|
920
|
-
for (let i = 0; i < this.list.length + 1; i++) {
|
|
921
|
-
newLineIndent += " ";
|
|
679
|
+
function convertElementToMarkdown(element) {
|
|
680
|
+
const mdCharsMatcher = /([\\[\]()])/g;
|
|
681
|
+
function escapeMd(text) {
|
|
682
|
+
// Escapes markdown-sensitive characters within other markdown constructs.
|
|
683
|
+
return text.replace(mdCharsMatcher, "\\$1");
|
|
684
|
+
}
|
|
685
|
+
function listNumberingStart(attrs) {
|
|
686
|
+
const start = attrs.getNamedItem("start")?.value;
|
|
687
|
+
if (start) {
|
|
688
|
+
return parseInt(start, 10) - 1;
|
|
922
689
|
}
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
}
|
|
926
|
-
if (this.startPre) {
|
|
927
|
-
this.startPre = false;
|
|
928
|
-
if (this.list.length > 0) {
|
|
929
|
-
data = data.trimStart();
|
|
930
|
-
}
|
|
931
|
-
}
|
|
932
|
-
if (this.start) {
|
|
933
|
-
this.space = 0;
|
|
934
|
-
this.p_p = 0;
|
|
935
|
-
this.start = false;
|
|
936
|
-
}
|
|
937
|
-
if (force === "end") {
|
|
938
|
-
this.p_p = 0;
|
|
939
|
-
this.out("\n");
|
|
940
|
-
this.space = 0;
|
|
941
|
-
}
|
|
942
|
-
if (this.p_p) {
|
|
943
|
-
this.out((this.breakToggle + "\n" + newLineIndent).repeat(this.p_p));
|
|
944
|
-
this.space = 0;
|
|
945
|
-
this.breakToggle = "";
|
|
946
|
-
}
|
|
947
|
-
if (this.space) {
|
|
948
|
-
if (!this.lastWasNewLine) {
|
|
949
|
-
this.out(" ");
|
|
950
|
-
}
|
|
951
|
-
this.space = 0;
|
|
952
|
-
}
|
|
953
|
-
if (this.a && force === "end") {
|
|
954
|
-
if (force === "end") {
|
|
955
|
-
this.out("\n");
|
|
956
|
-
}
|
|
957
|
-
const newA = this.a.filter((link) => {
|
|
958
|
-
if (this.outCount > link.outcount) {
|
|
959
|
-
this.out(
|
|
960
|
-
" [" +
|
|
961
|
-
link.count +
|
|
962
|
-
"]: " +
|
|
963
|
-
new URL(link.href, this.baseurl).toString()
|
|
964
|
-
);
|
|
965
|
-
if (link.title) {
|
|
966
|
-
this.out(" (" + link.title + ")");
|
|
967
|
-
}
|
|
968
|
-
this.out("\n");
|
|
969
|
-
return false;
|
|
690
|
+
else {
|
|
691
|
+
return 0;
|
|
970
692
|
}
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
}
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
693
|
+
}
|
|
694
|
+
// Define the characters that require escaping
|
|
695
|
+
const slashChars = "\\`*_{}[]()#+-.!";
|
|
696
|
+
// Escape any special regex characters in slashChars
|
|
697
|
+
const escapedSlashChars = slashChars.replace(/[-/\\^$*+?.()|[\]{}]/g, "\\$&");
|
|
698
|
+
// Create the regular expression
|
|
699
|
+
const mdBackslashMatcher = new RegExp(`\\\\(?=[${escapedSlashChars}])`, "g");
|
|
700
|
+
const mdDotMatcher = new RegExp(`^(\\s*\\d+)(\\.)(?=\\s)`, "gm");
|
|
701
|
+
const mdPlusMatcher = new RegExp(`^(\\s*)(\\+)(?=\\s)`, "gm");
|
|
702
|
+
const mdDashMatcher = new RegExp(`^(\\s*)(-)(?=\\s|-)`, "gm");
|
|
703
|
+
function escapeMdSection(text) {
|
|
704
|
+
text = text.replace(mdBackslashMatcher, "\\\\");
|
|
705
|
+
text = text.replace(mdDotMatcher, "$1\\$2");
|
|
706
|
+
text = text.replace(mdPlusMatcher, "$1\\$2");
|
|
707
|
+
text = text.replace(mdDashMatcher, "$1\\$2");
|
|
708
|
+
return text;
|
|
709
|
+
}
|
|
710
|
+
function isFirstTbody(element) {
|
|
711
|
+
const previousSibling = element.previousSibling;
|
|
712
|
+
return (element.nodeName === "TBODY" &&
|
|
713
|
+
(!previousSibling ||
|
|
714
|
+
(previousSibling.nodeName === "THEAD" &&
|
|
715
|
+
/^\s*$/i.test(previousSibling.textContent ?? ""))));
|
|
716
|
+
}
|
|
717
|
+
function isHeadingRow(tr) {
|
|
718
|
+
const parentNode = tr.parentNode;
|
|
719
|
+
return (parentNode.nodeName === "THEAD" ||
|
|
720
|
+
(parentNode.firstChild === tr &&
|
|
721
|
+
(parentNode.nodeName === "TABLE" ||
|
|
722
|
+
isFirstTbody(parentNode)) &&
|
|
723
|
+
Array.from(tr.childNodes).every(function (n) {
|
|
724
|
+
return n.nodeName === "TH";
|
|
725
|
+
})));
|
|
726
|
+
}
|
|
727
|
+
class Html2Text {
|
|
728
|
+
p_p = 0;
|
|
729
|
+
abbrData; // last inner HTML (for abbr being defined)
|
|
730
|
+
pre = false;
|
|
731
|
+
code = false;
|
|
732
|
+
startPre = false;
|
|
733
|
+
blockquote = 0;
|
|
734
|
+
list = [];
|
|
735
|
+
start = true;
|
|
736
|
+
breakToggle = "";
|
|
737
|
+
space;
|
|
738
|
+
lastWasNewLine = false;
|
|
739
|
+
a = null;
|
|
740
|
+
outCount = 0;
|
|
741
|
+
baseurl;
|
|
742
|
+
abbrList = {};
|
|
743
|
+
outText = "";
|
|
744
|
+
outTextList = [];
|
|
745
|
+
abbr_title;
|
|
746
|
+
skipInternalLinks = true;
|
|
747
|
+
aStack = [];
|
|
748
|
+
maybeAutomaticLink;
|
|
749
|
+
lastWasList = false;
|
|
750
|
+
absoluteUrlMatcher = new RegExp("^[a-zA-Z+]+://");
|
|
751
|
+
emphasis_mark = "_";
|
|
752
|
+
strong_mark = "**";
|
|
753
|
+
break() {
|
|
754
|
+
if (this.p_p === 0) {
|
|
755
|
+
this.p_p = 1;
|
|
756
|
+
}
|
|
1005
757
|
}
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
}
|
|
1010
|
-
return 0;
|
|
1011
|
-
}
|
|
1012
|
-
padding() {
|
|
1013
|
-
this.p_p = 2;
|
|
1014
|
-
}
|
|
1015
|
-
handleData(node) {
|
|
1016
|
-
if (this.maybeAutomaticLink) {
|
|
1017
|
-
const href = this.maybeAutomaticLink;
|
|
1018
|
-
if (
|
|
1019
|
-
href?.value === node.nodeValue &&
|
|
1020
|
-
this.absoluteUrlMatcher.test(href.value)
|
|
1021
|
-
) {
|
|
1022
|
-
this.processOutput(`<${node.nodeValue}>`);
|
|
1023
|
-
return;
|
|
1024
|
-
} else {
|
|
1025
|
-
this.processOutput("[");
|
|
1026
|
-
this.maybeAutomaticLink = null;
|
|
1027
|
-
}
|
|
1028
|
-
}
|
|
1029
|
-
if (!this.code && !this.pre && node.nodeValue) {
|
|
1030
|
-
const data = escapeMdSection(node.nodeValue);
|
|
1031
|
-
this.processOutput(data, 1);
|
|
1032
|
-
return;
|
|
1033
|
-
}
|
|
1034
|
-
this.processOutput(node.textContent || "", 1);
|
|
1035
|
-
}
|
|
1036
|
-
handleTag(node) {
|
|
1037
|
-
const tag = node.nodeName.toLowerCase();
|
|
1038
|
-
if (["head", "style", "script"].includes(tag)) {
|
|
1039
|
-
return;
|
|
1040
|
-
}
|
|
1041
|
-
if (this.getHeadingLevel(tag)) {
|
|
1042
|
-
this.padding();
|
|
1043
|
-
this.processOutput("#".repeat(this.getHeadingLevel(tag)) + " ");
|
|
1044
|
-
}
|
|
1045
|
-
if (tag == "br") this.processOutput(" \n");
|
|
1046
|
-
if (tag == "hr") {
|
|
1047
|
-
this.padding();
|
|
1048
|
-
this.processOutput("---");
|
|
1049
|
-
this.padding();
|
|
1050
|
-
}
|
|
1051
|
-
if (tag == "blockquote") {
|
|
1052
|
-
this.padding();
|
|
1053
|
-
this.processOutput("> ", 0, 1);
|
|
1054
|
-
}
|
|
1055
|
-
}
|
|
1056
|
-
handleTagPrefix(node) {
|
|
1057
|
-
const nodeName = node.nodeName.toLowerCase();
|
|
1058
|
-
let attrs =
|
|
1059
|
-
node.nodeType === node.ELEMENT_NODE ? node.attributes : null;
|
|
1060
|
-
if (["table"].includes(nodeName)) {
|
|
1061
|
-
this.padding();
|
|
1062
|
-
}
|
|
1063
|
-
if (nodeName == "td" || nodeName == "th") {
|
|
1064
|
-
const index = Array.from(node.parentNode?.children ?? []).indexOf(
|
|
1065
|
-
node
|
|
1066
|
-
);
|
|
1067
|
-
let prefix = " ";
|
|
1068
|
-
if (index === 0) prefix = "| ";
|
|
1069
|
-
this.processOutput(prefix);
|
|
1070
|
-
// this.break();
|
|
1071
|
-
}
|
|
1072
|
-
if (["div", "p"].includes(nodeName)) {
|
|
1073
|
-
this.padding();
|
|
1074
|
-
}
|
|
1075
|
-
if (nodeName === "blockquote") {
|
|
1076
|
-
this.blockquote += 1;
|
|
1077
|
-
}
|
|
1078
|
-
if (nodeName === "pre") {
|
|
1079
|
-
this.pre = true;
|
|
1080
|
-
this.startPre = true;
|
|
1081
|
-
this.padding();
|
|
1082
|
-
}
|
|
1083
|
-
if (["code", "tt"].includes(nodeName)) {
|
|
1084
|
-
this.processOutput("`");
|
|
1085
|
-
}
|
|
1086
|
-
if (["em", "i", "u"].includes(nodeName)) {
|
|
1087
|
-
this.processOutput(this.emphasis_mark);
|
|
1088
|
-
}
|
|
1089
|
-
if (["strong", "b"].includes(nodeName)) {
|
|
1090
|
-
this.processOutput(this.strong_mark);
|
|
1091
|
-
}
|
|
1092
|
-
if (["del", "strike", "s"].includes(nodeName)) {
|
|
1093
|
-
this.processOutput("<" + nodeName + ">");
|
|
1094
|
-
}
|
|
1095
|
-
if (nodeName === "abbr") {
|
|
1096
|
-
this.abbr_title = null;
|
|
1097
|
-
this.abbrData = "";
|
|
1098
|
-
const title = attrs && attrs.getNamedItem("title");
|
|
1099
|
-
if (attrs && title) {
|
|
1100
|
-
this.abbr_title = title.value;
|
|
1101
|
-
}
|
|
1102
|
-
}
|
|
1103
|
-
if (nodeName === "dl") {
|
|
1104
|
-
this.padding();
|
|
1105
|
-
}
|
|
1106
|
-
if (nodeName === "dd") {
|
|
1107
|
-
this.processOutput(" ");
|
|
1108
|
-
}
|
|
1109
|
-
if (nodeName == "a") {
|
|
1110
|
-
const href = attrs ? attrs.getNamedItem("href") : null;
|
|
1111
|
-
if (href && !(this.skipInternalLinks && href.value.startsWith("#"))) {
|
|
1112
|
-
this.aStack.push(attrs);
|
|
1113
|
-
this.maybeAutomaticLink = href;
|
|
1114
|
-
} else {
|
|
1115
|
-
this.aStack.push(null);
|
|
1116
|
-
}
|
|
1117
|
-
}
|
|
1118
|
-
if (nodeName === "img") {
|
|
1119
|
-
const src = attrs ? attrs.getNamedItem("src") : null;
|
|
1120
|
-
if (src) {
|
|
1121
|
-
node.setAttribute("href", src.value);
|
|
1122
|
-
attrs = node.attributes;
|
|
1123
|
-
const alt = attrs.getNamedItem("alt")?.value;
|
|
1124
|
-
this.processOutput("![" + escapeMd(alt ?? "") + "]");
|
|
1125
|
-
this.processOutput(
|
|
1126
|
-
"(" + escapeMd(attrs.getNamedItem("href")?.value ?? "") + ")"
|
|
1127
|
-
);
|
|
1128
|
-
}
|
|
1129
|
-
}
|
|
1130
|
-
if (["ul", "ol"].includes(nodeName)) {
|
|
1131
|
-
const listStyle = nodeName;
|
|
1132
|
-
const numberingStart = listNumberingStart(node.attributes);
|
|
1133
|
-
this.list.push({ name: listStyle, num: numberingStart });
|
|
1134
|
-
this.lastWasList = true;
|
|
1135
|
-
} else {
|
|
1136
|
-
this.lastWasList = false;
|
|
1137
|
-
}
|
|
1138
|
-
if (nodeName === "li") {
|
|
1139
|
-
let li;
|
|
1140
|
-
this.break();
|
|
1141
|
-
if (this.list.length > 0) {
|
|
1142
|
-
li = this.list[this.list.length - 1];
|
|
1143
|
-
} else {
|
|
1144
|
-
li = { name: "ul", num: 0 };
|
|
1145
|
-
}
|
|
1146
|
-
const nestCount = this.list.length;
|
|
1147
|
-
this.processOutput(" ".repeat(nestCount));
|
|
1148
|
-
if (li["name"] == "ul") this.processOutput("*" + " ");
|
|
1149
|
-
else if (li["name"] == "ol") {
|
|
1150
|
-
li["num"] += 1;
|
|
1151
|
-
this.processOutput(li["num"] + ". ");
|
|
1152
|
-
}
|
|
1153
|
-
this.start = true;
|
|
1154
|
-
}
|
|
1155
|
-
}
|
|
1156
|
-
handleTagSuffix(node) {
|
|
1157
|
-
const nodeName = node.nodeName.toLowerCase();
|
|
1158
|
-
if (nodeName === "blockquote") {
|
|
1159
|
-
this.blockquote -= 1;
|
|
1160
|
-
}
|
|
1161
|
-
if (nodeName == "td" || nodeName == "th") {
|
|
1162
|
-
this.processOutput(" |");
|
|
1163
|
-
}
|
|
1164
|
-
if (nodeName == "tr") {
|
|
1165
|
-
const cell = (content, node) => {
|
|
1166
|
-
const index = Array.from(node.parentNode.childNodes).indexOf(node);
|
|
1167
|
-
let prefix = " ";
|
|
1168
|
-
if (index === 0) prefix = "| ";
|
|
1169
|
-
return prefix + content + " |";
|
|
1170
|
-
};
|
|
1171
|
-
let borderCells = "";
|
|
1172
|
-
const alignMap = { left: ":--", right: "--:", center: ":-:" };
|
|
1173
|
-
if (isHeadingRow(node)) {
|
|
1174
|
-
for (let i = 0; i < node.children.length; i++) {
|
|
1175
|
-
let border = "---";
|
|
1176
|
-
const align = (
|
|
1177
|
-
node.children[i].getAttribute("align") || ""
|
|
1178
|
-
).toLowerCase();
|
|
1179
|
-
if (align) border = alignMap[align] || border;
|
|
1180
|
-
borderCells += cell(border, node.childNodes[i]);
|
|
758
|
+
softBreak() {
|
|
759
|
+
this.break();
|
|
760
|
+
this.breakToggle = " ";
|
|
1181
761
|
}
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
|
|
1225
|
-
|
|
762
|
+
processOutput(data, pureData = 0, force = 0) {
|
|
763
|
+
if (this.abbrData !== undefined) {
|
|
764
|
+
this.abbrData += data;
|
|
765
|
+
}
|
|
766
|
+
if (pureData && !this.pre) {
|
|
767
|
+
data = data.replace(/\s+/g, " ");
|
|
768
|
+
if (data && data[0] === " ") {
|
|
769
|
+
this.space = 1;
|
|
770
|
+
data = data.substring(1);
|
|
771
|
+
}
|
|
772
|
+
}
|
|
773
|
+
if (!data && force !== "end")
|
|
774
|
+
return;
|
|
775
|
+
if (this.startPre) {
|
|
776
|
+
if (!data.startsWith("\n")) {
|
|
777
|
+
data = "\n" + data;
|
|
778
|
+
}
|
|
779
|
+
}
|
|
780
|
+
let newLineIndent = ">".repeat(this.blockquote ?? 0);
|
|
781
|
+
if (!(force === "end" && data && data[0] === ">") && this.blockquote) {
|
|
782
|
+
newLineIndent += " ";
|
|
783
|
+
}
|
|
784
|
+
if (this.pre) {
|
|
785
|
+
if (this.list.length === 0) {
|
|
786
|
+
newLineIndent += " ";
|
|
787
|
+
}
|
|
788
|
+
else {
|
|
789
|
+
for (let i = 0; i < this.list.length + 1; i++) {
|
|
790
|
+
newLineIndent += " ";
|
|
791
|
+
}
|
|
792
|
+
}
|
|
793
|
+
data = data.replace(/\n/g, `\n${newLineIndent}`);
|
|
794
|
+
}
|
|
795
|
+
if (this.startPre) {
|
|
796
|
+
this.startPre = false;
|
|
797
|
+
if (this.list.length > 0) {
|
|
798
|
+
data = data.trimStart();
|
|
799
|
+
}
|
|
800
|
+
}
|
|
801
|
+
if (this.start) {
|
|
802
|
+
this.space = 0;
|
|
803
|
+
this.p_p = 0;
|
|
804
|
+
this.start = false;
|
|
805
|
+
}
|
|
806
|
+
if (force === "end") {
|
|
807
|
+
this.p_p = 0;
|
|
808
|
+
this.out("\n");
|
|
809
|
+
this.space = 0;
|
|
810
|
+
}
|
|
811
|
+
if (this.p_p) {
|
|
812
|
+
this.out((this.breakToggle + "\n" + newLineIndent).repeat(this.p_p));
|
|
813
|
+
this.space = 0;
|
|
814
|
+
this.breakToggle = "";
|
|
815
|
+
}
|
|
816
|
+
if (this.space) {
|
|
817
|
+
if (!this.lastWasNewLine) {
|
|
818
|
+
this.out(" ");
|
|
819
|
+
}
|
|
820
|
+
this.space = 0;
|
|
821
|
+
}
|
|
822
|
+
if (this.a && force === "end") {
|
|
823
|
+
if (force === "end") {
|
|
824
|
+
this.out("\n");
|
|
825
|
+
}
|
|
826
|
+
const newA = this.a.filter((link) => {
|
|
827
|
+
if (this.outCount > link.outcount) {
|
|
828
|
+
this.out(" [" +
|
|
829
|
+
link.count +
|
|
830
|
+
"]: " +
|
|
831
|
+
new URL(link.href, this.baseurl).toString());
|
|
832
|
+
if (link.title) {
|
|
833
|
+
this.out(" (" + link.title + ")");
|
|
834
|
+
}
|
|
835
|
+
this.out("\n");
|
|
836
|
+
return false;
|
|
837
|
+
}
|
|
838
|
+
return true;
|
|
839
|
+
});
|
|
840
|
+
if (this.a.length !== newA.length) {
|
|
841
|
+
this.out("\n");
|
|
842
|
+
}
|
|
843
|
+
this.a = newA;
|
|
844
|
+
}
|
|
845
|
+
if (this.abbrList && force === "end") {
|
|
846
|
+
for (const [abbr, definition] of Object.entries(this.abbrList)) {
|
|
847
|
+
this.out("\n *[" + abbr + "]: " + definition + "\n");
|
|
848
|
+
}
|
|
849
|
+
}
|
|
850
|
+
this.p_p = 0;
|
|
851
|
+
this.out(data);
|
|
852
|
+
this.outCount++;
|
|
1226
853
|
}
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
|
|
1230
|
-
|
|
1231
|
-
|
|
1232
|
-
} else {
|
|
1233
|
-
this.lastWasList = false;
|
|
1234
|
-
}
|
|
1235
|
-
if (nodeName === "li") {
|
|
1236
|
-
this.break();
|
|
1237
|
-
}
|
|
1238
|
-
}
|
|
1239
|
-
previousIndex(attrs) {
|
|
1240
|
-
// Returns the index of a certain set of attributes (of a link) in the
|
|
1241
|
-
// this.a list.
|
|
1242
|
-
// If the set of attributes is not found, returns null.
|
|
1243
|
-
const href = attrs.getNamedItem("href");
|
|
1244
|
-
if (!attrs.getNamedItem("href")) return null;
|
|
1245
|
-
let itemIndex = -1;
|
|
1246
|
-
for (const a of this.a ?? []) {
|
|
1247
|
-
itemIndex += 1;
|
|
1248
|
-
let match = false;
|
|
1249
|
-
if (a.getNamedItem("href") === href) {
|
|
1250
|
-
if (a.getNamedItem("title") || attrs.getNamedItem("title")) {
|
|
1251
|
-
if (
|
|
1252
|
-
a.getNamedItem("title") &&
|
|
1253
|
-
attrs.getNamedItem("title") &&
|
|
1254
|
-
a.getNamedItem("title") === attrs.getNamedItem("title")
|
|
1255
|
-
) {
|
|
1256
|
-
match = true;
|
|
1257
|
-
}
|
|
1258
|
-
} else {
|
|
1259
|
-
match = true;
|
|
854
|
+
out(string) {
|
|
855
|
+
this.outTextList.push(string);
|
|
856
|
+
if (string) {
|
|
857
|
+
this.lastWasNewLine = string.charAt(string.length - 1) === "\n";
|
|
858
|
+
}
|
|
1260
859
|
}
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
|
|
1266
|
-
handle(htmlElement) {
|
|
1267
|
-
// jsdom failed to parse hilton page due to invalid stylesheet
|
|
1268
|
-
// Nodes to be removed
|
|
1269
|
-
const filteredNodes = ["style", "script", "noscript"];
|
|
1270
|
-
for (const node of filteredNodes) {
|
|
1271
|
-
const nodeSelectors = htmlElement.querySelectorAll(node);
|
|
1272
|
-
nodeSelectors.forEach((nodeSelector) => {
|
|
1273
|
-
if (nodeSelector && nodeSelector.parentNode) {
|
|
1274
|
-
nodeSelector.parentNode.removeChild(nodeSelector);
|
|
860
|
+
getResult() {
|
|
861
|
+
this.processOutput("", 0, "end");
|
|
862
|
+
this.outText = this.outTextList.join("");
|
|
863
|
+
this.outText = this.outText.replace(" _place_holder;", " ");
|
|
864
|
+
return this.outText;
|
|
1275
865
|
}
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
|
|
1297
|
-
|
|
1298
|
-
|
|
1299
|
-
|
|
1300
|
-
|
|
1301
|
-
|
|
866
|
+
getHeadingLevel(tag) {
|
|
867
|
+
if (tag[0] === "h" && tag.length === 2) {
|
|
868
|
+
try {
|
|
869
|
+
const n = parseInt(tag[1]);
|
|
870
|
+
if (!isNaN(n) && n >= 1 && n <= 9) {
|
|
871
|
+
return n;
|
|
872
|
+
}
|
|
873
|
+
}
|
|
874
|
+
catch (error) {
|
|
875
|
+
return 0;
|
|
876
|
+
}
|
|
877
|
+
}
|
|
878
|
+
return 0;
|
|
879
|
+
}
|
|
880
|
+
padding() {
|
|
881
|
+
this.p_p = 2;
|
|
882
|
+
}
|
|
883
|
+
handleData(node) {
|
|
884
|
+
if (this.maybeAutomaticLink) {
|
|
885
|
+
const href = this.maybeAutomaticLink;
|
|
886
|
+
if (href?.value === node.nodeValue &&
|
|
887
|
+
this.absoluteUrlMatcher.test(href.value)) {
|
|
888
|
+
this.processOutput(`<${node.nodeValue}>`);
|
|
889
|
+
return;
|
|
890
|
+
}
|
|
891
|
+
else {
|
|
892
|
+
this.processOutput("[");
|
|
893
|
+
this.maybeAutomaticLink = null;
|
|
894
|
+
}
|
|
895
|
+
}
|
|
896
|
+
if (!this.code && !this.pre && node.nodeValue) {
|
|
897
|
+
const data = escapeMdSection(node.nodeValue);
|
|
898
|
+
this.processOutput(data, 1);
|
|
899
|
+
return;
|
|
900
|
+
}
|
|
901
|
+
this.processOutput(node.textContent || "", 1);
|
|
902
|
+
}
|
|
903
|
+
handleTag(node) {
|
|
904
|
+
const tag = node.nodeName.toLowerCase();
|
|
905
|
+
if (["head", "style", "script"].includes(tag)) {
|
|
906
|
+
return;
|
|
907
|
+
}
|
|
908
|
+
if (this.getHeadingLevel(tag)) {
|
|
909
|
+
this.padding();
|
|
910
|
+
this.processOutput("#".repeat(this.getHeadingLevel(tag)) + " ");
|
|
911
|
+
}
|
|
912
|
+
if (tag == "br")
|
|
913
|
+
this.processOutput(" \n");
|
|
914
|
+
if (tag == "hr") {
|
|
915
|
+
this.padding();
|
|
916
|
+
this.processOutput("---");
|
|
917
|
+
this.padding();
|
|
918
|
+
}
|
|
919
|
+
if (tag == "blockquote") {
|
|
920
|
+
this.padding();
|
|
921
|
+
this.processOutput("> ", 0, 1);
|
|
922
|
+
}
|
|
923
|
+
}
|
|
924
|
+
handleTagPrefix(node) {
|
|
925
|
+
const nodeName = node.nodeName.toLowerCase();
|
|
926
|
+
let attrs = node.nodeType === node.ELEMENT_NODE
|
|
927
|
+
? node.attributes
|
|
928
|
+
: null;
|
|
929
|
+
if (["table"].includes(nodeName)) {
|
|
930
|
+
this.padding();
|
|
931
|
+
}
|
|
932
|
+
if (nodeName == "td" || nodeName == "th") {
|
|
933
|
+
const index = Array.from(node.parentNode?.children ?? []).indexOf(node);
|
|
934
|
+
let prefix = " ";
|
|
935
|
+
if (index === 0)
|
|
936
|
+
prefix = "| ";
|
|
937
|
+
this.processOutput(prefix);
|
|
938
|
+
// this.break();
|
|
939
|
+
}
|
|
940
|
+
if (["div", "p"].includes(nodeName)) {
|
|
941
|
+
this.padding();
|
|
942
|
+
}
|
|
943
|
+
if (nodeName === "blockquote") {
|
|
944
|
+
this.blockquote += 1;
|
|
945
|
+
}
|
|
946
|
+
if (nodeName === "pre") {
|
|
947
|
+
this.pre = true;
|
|
948
|
+
this.startPre = true;
|
|
949
|
+
this.padding();
|
|
950
|
+
}
|
|
951
|
+
if (["code", "tt"].includes(nodeName)) {
|
|
952
|
+
this.processOutput("`");
|
|
953
|
+
}
|
|
954
|
+
if (["em", "i", "u"].includes(nodeName)) {
|
|
955
|
+
this.processOutput(this.emphasis_mark);
|
|
956
|
+
}
|
|
957
|
+
if (["strong", "b"].includes(nodeName)) {
|
|
958
|
+
this.processOutput(this.strong_mark);
|
|
959
|
+
}
|
|
960
|
+
if (["del", "strike", "s"].includes(nodeName)) {
|
|
961
|
+
this.processOutput("<" + nodeName + ">");
|
|
962
|
+
}
|
|
963
|
+
if (nodeName === "abbr") {
|
|
964
|
+
this.abbr_title = null;
|
|
965
|
+
this.abbrData = "";
|
|
966
|
+
const title = attrs && attrs.getNamedItem("title");
|
|
967
|
+
if (attrs && title) {
|
|
968
|
+
this.abbr_title = title.value;
|
|
969
|
+
}
|
|
970
|
+
}
|
|
971
|
+
if (nodeName === "dl") {
|
|
972
|
+
this.padding();
|
|
973
|
+
}
|
|
974
|
+
if (nodeName === "dd") {
|
|
975
|
+
this.processOutput(" ");
|
|
976
|
+
}
|
|
977
|
+
if (nodeName == "a") {
|
|
978
|
+
const href = attrs ? attrs.getNamedItem("href") : null;
|
|
979
|
+
if (href && !(this.skipInternalLinks && href.value.startsWith("#"))) {
|
|
980
|
+
this.aStack.push(attrs);
|
|
981
|
+
this.maybeAutomaticLink = href;
|
|
982
|
+
}
|
|
983
|
+
else {
|
|
984
|
+
this.aStack.push(null);
|
|
985
|
+
}
|
|
986
|
+
}
|
|
987
|
+
if (nodeName === "img") {
|
|
988
|
+
const src = attrs ? attrs.getNamedItem("src") : null;
|
|
989
|
+
if (src) {
|
|
990
|
+
node.setAttribute("href", src.value);
|
|
991
|
+
attrs = node.attributes;
|
|
992
|
+
const alt = attrs.getNamedItem("alt")?.value;
|
|
993
|
+
this.processOutput("![" + escapeMd(alt ?? "") + "]");
|
|
994
|
+
this.processOutput("(" + escapeMd(attrs.getNamedItem("href")?.value ?? "") + ")");
|
|
995
|
+
}
|
|
996
|
+
}
|
|
997
|
+
if (["ul", "ol"].includes(nodeName)) {
|
|
998
|
+
const listStyle = nodeName;
|
|
999
|
+
const numberingStart = listNumberingStart(node.attributes);
|
|
1000
|
+
this.list.push({ name: listStyle, num: numberingStart });
|
|
1001
|
+
this.lastWasList = true;
|
|
1002
|
+
}
|
|
1003
|
+
else {
|
|
1004
|
+
this.lastWasList = false;
|
|
1005
|
+
}
|
|
1006
|
+
if (nodeName === "li") {
|
|
1007
|
+
let li;
|
|
1008
|
+
this.break();
|
|
1009
|
+
if (this.list.length > 0) {
|
|
1010
|
+
li = this.list[this.list.length - 1];
|
|
1011
|
+
}
|
|
1012
|
+
else {
|
|
1013
|
+
li = { name: "ul", num: 0 };
|
|
1014
|
+
}
|
|
1015
|
+
const nestCount = this.list.length;
|
|
1016
|
+
this.processOutput(" ".repeat(nestCount));
|
|
1017
|
+
if (li["name"] == "ul")
|
|
1018
|
+
this.processOutput("*" + " ");
|
|
1019
|
+
else if (li["name"] == "ol") {
|
|
1020
|
+
li["num"] += 1;
|
|
1021
|
+
this.processOutput(li["num"] + ". ");
|
|
1022
|
+
}
|
|
1023
|
+
this.start = true;
|
|
1024
|
+
}
|
|
1025
|
+
}
|
|
1026
|
+
handleTagSuffix(node) {
|
|
1027
|
+
const nodeName = node.nodeName.toLowerCase();
|
|
1028
|
+
if (nodeName === "blockquote") {
|
|
1029
|
+
this.blockquote -= 1;
|
|
1030
|
+
}
|
|
1031
|
+
if (nodeName == "td" || nodeName == "th") {
|
|
1032
|
+
this.processOutput(" |");
|
|
1033
|
+
}
|
|
1034
|
+
if (nodeName == "tr") {
|
|
1035
|
+
const cell = (content, node) => {
|
|
1036
|
+
const index = Array.from(node.parentNode.childNodes).indexOf(node);
|
|
1037
|
+
let prefix = " ";
|
|
1038
|
+
if (index === 0)
|
|
1039
|
+
prefix = "| ";
|
|
1040
|
+
return prefix + content + " |";
|
|
1041
|
+
};
|
|
1042
|
+
let borderCells = "";
|
|
1043
|
+
const alignMap = { left: ":--", right: "--:", center: ":-:" };
|
|
1044
|
+
if (isHeadingRow(node)) {
|
|
1045
|
+
for (let i = 0; i < node.children.length; i++) {
|
|
1046
|
+
let border = "---";
|
|
1047
|
+
const align = (node.children[i].getAttribute("align") || "").toLowerCase();
|
|
1048
|
+
if (align)
|
|
1049
|
+
border = alignMap[align] || border;
|
|
1050
|
+
borderCells += cell(border, node.childNodes[i]);
|
|
1051
|
+
}
|
|
1052
|
+
}
|
|
1053
|
+
this.processOutput(borderCells ? "\n" + borderCells + "\n" : "\n");
|
|
1054
|
+
}
|
|
1055
|
+
if (nodeName === "pre") {
|
|
1056
|
+
this.pre = false;
|
|
1057
|
+
this.padding();
|
|
1058
|
+
}
|
|
1059
|
+
if (["code", "tt"].includes(nodeName)) {
|
|
1060
|
+
this.processOutput("`");
|
|
1061
|
+
}
|
|
1062
|
+
if (["em", "i", "u"].includes(nodeName)) {
|
|
1063
|
+
this.processOutput(this.emphasis_mark);
|
|
1064
|
+
}
|
|
1065
|
+
if (["strong", "b"].includes(nodeName)) {
|
|
1066
|
+
this.processOutput(this.strong_mark);
|
|
1067
|
+
}
|
|
1068
|
+
if (["div", "p"].includes(nodeName)) {
|
|
1069
|
+
this.padding();
|
|
1070
|
+
}
|
|
1071
|
+
if (["del", "strike", "s"].includes(nodeName)) {
|
|
1072
|
+
this.processOutput("</" + nodeName + ">");
|
|
1073
|
+
}
|
|
1074
|
+
if (nodeName === "abbr") {
|
|
1075
|
+
if (this.abbr_title && this.abbrData) {
|
|
1076
|
+
this.abbrList[this.abbrData] = this.abbr_title;
|
|
1077
|
+
this.abbr_title = null;
|
|
1078
|
+
}
|
|
1079
|
+
this.abbrData = "";
|
|
1080
|
+
}
|
|
1081
|
+
if (nodeName === "dt") {
|
|
1082
|
+
this.break();
|
|
1083
|
+
}
|
|
1084
|
+
if (nodeName === "dd") {
|
|
1085
|
+
this.break();
|
|
1086
|
+
}
|
|
1087
|
+
if (nodeName === "a") {
|
|
1088
|
+
if (this.aStack.length > 0) {
|
|
1089
|
+
const a = this.aStack.pop();
|
|
1090
|
+
if (this.maybeAutomaticLink) {
|
|
1091
|
+
this.maybeAutomaticLink = null;
|
|
1092
|
+
}
|
|
1093
|
+
else if (a) {
|
|
1094
|
+
this.processOutput(`](${escapeMd(a.getNamedItem("href")?.value || "")})`);
|
|
1095
|
+
}
|
|
1096
|
+
}
|
|
1097
|
+
}
|
|
1098
|
+
if (["ul", "ol"].includes(nodeName)) {
|
|
1099
|
+
if (this.list.length > 0)
|
|
1100
|
+
this.list.pop();
|
|
1101
|
+
this.lastWasList = true;
|
|
1102
|
+
}
|
|
1103
|
+
else {
|
|
1104
|
+
this.lastWasList = false;
|
|
1105
|
+
}
|
|
1106
|
+
if (nodeName === "li") {
|
|
1107
|
+
this.break();
|
|
1108
|
+
}
|
|
1109
|
+
}
|
|
1110
|
+
previousIndex(attrs) {
|
|
1111
|
+
// Returns the index of a certain set of attributes (of a link) in the
|
|
1112
|
+
// this.a list.
|
|
1113
|
+
// If the set of attributes is not found, returns null.
|
|
1114
|
+
const href = attrs.getNamedItem("href");
|
|
1115
|
+
if (!attrs.getNamedItem("href"))
|
|
1116
|
+
return null;
|
|
1117
|
+
let itemIndex = -1;
|
|
1118
|
+
for (const a of this.a ?? []) {
|
|
1119
|
+
itemIndex += 1;
|
|
1120
|
+
let match = false;
|
|
1121
|
+
if (a.getNamedItem("href") === href) {
|
|
1122
|
+
if (a.getNamedItem("title") || attrs.getNamedItem("title")) {
|
|
1123
|
+
if (a.getNamedItem("title") &&
|
|
1124
|
+
attrs.getNamedItem("title") &&
|
|
1125
|
+
a.getNamedItem("title") === attrs.getNamedItem("title")) {
|
|
1126
|
+
match = true;
|
|
1127
|
+
}
|
|
1128
|
+
}
|
|
1129
|
+
else {
|
|
1130
|
+
match = true;
|
|
1131
|
+
}
|
|
1132
|
+
}
|
|
1133
|
+
if (match)
|
|
1134
|
+
return itemIndex;
|
|
1135
|
+
}
|
|
1136
|
+
return null;
|
|
1137
|
+
}
|
|
1138
|
+
handle(htmlElement) {
|
|
1139
|
+
// jsdom failed to parse hilton page due to invalid stylesheet
|
|
1140
|
+
// Nodes to be removed
|
|
1141
|
+
const filteredNodes = ["style", "script", "noscript"];
|
|
1142
|
+
for (const node of filteredNodes) {
|
|
1143
|
+
const nodeSelectors = htmlElement.querySelectorAll(node);
|
|
1144
|
+
nodeSelectors.forEach((nodeSelector) => {
|
|
1145
|
+
if (nodeSelector && nodeSelector.parentNode) {
|
|
1146
|
+
nodeSelector.parentNode.removeChild(nodeSelector);
|
|
1147
|
+
}
|
|
1148
|
+
});
|
|
1149
|
+
}
|
|
1150
|
+
// Get the cleaned-up HTML content
|
|
1151
|
+
const htmlContent = htmlElement.outerHTML;
|
|
1152
|
+
const parser = new DOMParser();
|
|
1153
|
+
const doc = parser.parseFromString(htmlContent, "text/html");
|
|
1154
|
+
const traverseDOM = (node) => {
|
|
1155
|
+
const tag = node.nodeName.toLowerCase();
|
|
1156
|
+
if (node.nodeType === node.TEXT_NODE) {
|
|
1157
|
+
const element = node;
|
|
1158
|
+
this.handleData(element);
|
|
1159
|
+
return;
|
|
1160
|
+
}
|
|
1161
|
+
if (node.nodeType === node.ELEMENT_NODE) {
|
|
1162
|
+
const element = node;
|
|
1163
|
+
this.handleTag(element);
|
|
1164
|
+
}
|
|
1165
|
+
if (!["head", "style", "script"].includes(tag)) {
|
|
1166
|
+
this.handleTagPrefix(node);
|
|
1167
|
+
node.childNodes.forEach((child) => traverseDOM(child));
|
|
1168
|
+
this.handleTagSuffix(node);
|
|
1169
|
+
}
|
|
1170
|
+
};
|
|
1171
|
+
traverseDOM(doc.documentElement);
|
|
1172
|
+
return this.getResult();
|
|
1173
|
+
}
|
|
1174
|
+
}
|
|
1175
|
+
const converter = new Html2Text();
|
|
1176
|
+
const result = converter.handle(element);
|
|
1177
|
+
return result;
|
|
1302
1178
|
}
|
|
1303
|
-
const converter = new Html2Text();
|
|
1304
|
-
const result = converter.handle(element);
|
|
1305
|
-
return result;
|
|
1306
|
-
}
|
|
1307
|
-
|
|
1308
|
-
var node = {};
|
|
1309
1179
|
|
|
1310
|
-
|
|
1180
|
+
var node = {};
|
|
1311
1181
|
|
|
1312
|
-
|
|
1182
|
+
var htmlToMarkdownAST$1 = {};
|
|
1313
1183
|
|
|
1314
|
-
|
|
1315
|
-
ElementNode._Node = void 0;
|
|
1316
|
-
// this is by value copy of the global Node
|
|
1317
|
-
ElementNode._Node = {
|
|
1318
|
-
/** node is an element. */
|
|
1319
|
-
ELEMENT_NODE: 1,
|
|
1320
|
-
ATTRIBUTE_NODE: 2,
|
|
1321
|
-
/** node is a Text node. */
|
|
1322
|
-
TEXT_NODE: 3,
|
|
1323
|
-
/** node is a CDATASection node. */
|
|
1324
|
-
CDATA_SECTION_NODE: 4,
|
|
1325
|
-
ENTITY_REFERENCE_NODE: 5,
|
|
1326
|
-
ENTITY_NODE: 6,
|
|
1327
|
-
/** node is a ProcessingInstruction node. */
|
|
1328
|
-
PROCESSING_INSTRUCTION_NODE: 7,
|
|
1329
|
-
/** node is a Comment node. */
|
|
1330
|
-
COMMENT_NODE: 8,
|
|
1331
|
-
/** node is a document. */
|
|
1332
|
-
DOCUMENT_NODE: 9,
|
|
1333
|
-
/** node is a doctype. */
|
|
1334
|
-
DOCUMENT_TYPE_NODE: 10,
|
|
1335
|
-
/** node is a DocumentFragment node. */
|
|
1336
|
-
DOCUMENT_FRAGMENT_NODE: 11,
|
|
1337
|
-
NOTATION_NODE: 12,
|
|
1338
|
-
/** Set when node and other are not in the same tree. */
|
|
1339
|
-
DOCUMENT_POSITION_DISCONNECTED: 0x01,
|
|
1340
|
-
/** Set when other is preceding node. */
|
|
1341
|
-
DOCUMENT_POSITION_PRECEDING: 0x02,
|
|
1342
|
-
/** Set when other is following node. */
|
|
1343
|
-
DOCUMENT_POSITION_FOLLOWING: 0x04,
|
|
1344
|
-
/** Set when other is an ancestor of node. */
|
|
1345
|
-
DOCUMENT_POSITION_CONTAINS: 0x08,
|
|
1346
|
-
/** Set when other is a descendant of node. */
|
|
1347
|
-
DOCUMENT_POSITION_CONTAINED_BY: 0x10,
|
|
1348
|
-
DOCUMENT_POSITION_IMPLEMENTATION_SPECIFIC: 0x20,
|
|
1349
|
-
};
|
|
1184
|
+
var ElementNode = {};
|
|
1350
1185
|
|
|
1351
|
-
|
|
1352
|
-
|
|
1353
|
-
|
|
1354
|
-
|
|
1355
|
-
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
|
|
1186
|
+
Object.defineProperty(ElementNode, "__esModule", { value: true });
|
|
1187
|
+
ElementNode._Node = void 0;
|
|
1188
|
+
// this is by value copy of the global Node
|
|
1189
|
+
ElementNode._Node = {
|
|
1190
|
+
/** node is an element. */
|
|
1191
|
+
ELEMENT_NODE: 1,
|
|
1192
|
+
ATTRIBUTE_NODE: 2,
|
|
1193
|
+
/** node is a Text node. */
|
|
1194
|
+
TEXT_NODE: 3,
|
|
1195
|
+
/** node is a CDATASection node. */
|
|
1196
|
+
CDATA_SECTION_NODE: 4,
|
|
1197
|
+
ENTITY_REFERENCE_NODE: 5,
|
|
1198
|
+
ENTITY_NODE: 6,
|
|
1199
|
+
/** node is a ProcessingInstruction node. */
|
|
1200
|
+
PROCESSING_INSTRUCTION_NODE: 7,
|
|
1201
|
+
/** node is a Comment node. */
|
|
1202
|
+
COMMENT_NODE: 8,
|
|
1203
|
+
/** node is a document. */
|
|
1204
|
+
DOCUMENT_NODE: 9,
|
|
1205
|
+
/** node is a doctype. */
|
|
1206
|
+
DOCUMENT_TYPE_NODE: 10,
|
|
1207
|
+
/** node is a DocumentFragment node. */
|
|
1208
|
+
DOCUMENT_FRAGMENT_NODE: 11,
|
|
1209
|
+
NOTATION_NODE: 12,
|
|
1210
|
+
/** Set when node and other are not in the same tree. */
|
|
1211
|
+
DOCUMENT_POSITION_DISCONNECTED: 0x01,
|
|
1212
|
+
/** Set when other is preceding node. */
|
|
1213
|
+
DOCUMENT_POSITION_PRECEDING: 0x02,
|
|
1214
|
+
/** Set when other is following node. */
|
|
1215
|
+
DOCUMENT_POSITION_FOLLOWING: 0x04,
|
|
1216
|
+
/** Set when other is an ancestor of node. */
|
|
1217
|
+
DOCUMENT_POSITION_CONTAINS: 0x08,
|
|
1218
|
+
/** Set when other is a descendant of node. */
|
|
1219
|
+
DOCUMENT_POSITION_CONTAINED_BY: 0x10,
|
|
1220
|
+
DOCUMENT_POSITION_IMPLEMENTATION_SPECIFIC: 0x20,
|
|
1360
1221
|
};
|
|
1361
|
-
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
|
|
1365
|
-
|
|
1366
|
-
|
|
1367
|
-
|
|
1368
|
-
|
|
1369
|
-
|
|
1370
|
-
} else if (childElement.nodeType === ElementNode_1$1._Node.TEXT_NODE) {
|
|
1371
|
-
const textContent = escapeMarkdownCharacters(
|
|
1372
|
-
childElement.textContent?.trim() ?? ""
|
|
1373
|
-
);
|
|
1374
|
-
if (textContent && !!childElement.textContent) {
|
|
1375
|
-
debugLog(`Text Node: '${textContent}'`);
|
|
1376
|
-
// preserve whitespaces when text childElement is not empty
|
|
1377
|
-
result.push({
|
|
1378
|
-
type: "text",
|
|
1379
|
-
content: childElement.textContent?.trim(),
|
|
1380
|
-
});
|
|
1381
|
-
}
|
|
1382
|
-
} else if (childElement.nodeType === ElementNode_1$1._Node.ELEMENT_NODE) {
|
|
1383
|
-
const elem = childElement;
|
|
1384
|
-
if (/^h[1-6]$/i.test(elem.tagName)) {
|
|
1385
|
-
const level = parseInt(elem.tagName.substring(1));
|
|
1386
|
-
const content = escapeMarkdownCharacters(
|
|
1387
|
-
elem.textContent || ""
|
|
1388
|
-
).trim();
|
|
1389
|
-
if (content) {
|
|
1390
|
-
debugLog(`Heading ${level}: '${elem.textContent}'`);
|
|
1391
|
-
result.push({ type: "heading", level, content });
|
|
1392
|
-
}
|
|
1393
|
-
} else if (elem.tagName.toLowerCase() === "p") {
|
|
1394
|
-
debugLog("Paragraph");
|
|
1395
|
-
result.push(...htmlToMarkdownAST(elem, options));
|
|
1396
|
-
// Add a new line after the paragraph
|
|
1397
|
-
result.push({ type: "text", content: "\n\n" });
|
|
1398
|
-
} else if (elem.tagName.toLowerCase() === "a") {
|
|
1399
|
-
debugLog(`Link: '${elem.href}' with text '${elem.textContent}'`);
|
|
1400
|
-
// Check if the href is a data URL for an image
|
|
1401
|
-
if (
|
|
1402
|
-
typeof elem.href === "string" &&
|
|
1403
|
-
elem.href.startsWith("data:image")
|
|
1404
|
-
) {
|
|
1405
|
-
// If it's a data URL for an image, skip this link
|
|
1406
|
-
result.push({
|
|
1407
|
-
type: "link",
|
|
1408
|
-
href: "-",
|
|
1409
|
-
content: htmlToMarkdownAST(elem, options),
|
|
1410
|
-
});
|
|
1411
|
-
} else {
|
|
1412
|
-
// Process the link as usual
|
|
1413
|
-
let href = elem.href;
|
|
1414
|
-
if (typeof href === "string") {
|
|
1415
|
-
href =
|
|
1416
|
-
options?.websiteDomain && href.startsWith(options.websiteDomain)
|
|
1417
|
-
? href.substring(options.websiteDomain.length)
|
|
1418
|
-
: href;
|
|
1419
|
-
} else {
|
|
1420
|
-
href = "#"; // Use a default value when href is not a string
|
|
1222
|
+
|
|
1223
|
+
Object.defineProperty(htmlToMarkdownAST$1, "__esModule", { value: true });
|
|
1224
|
+
htmlToMarkdownAST$1.htmlToMarkdownAST = htmlToMarkdownAST;
|
|
1225
|
+
const ElementNode_1$1 = ElementNode;
|
|
1226
|
+
function htmlToMarkdownAST(element, options, indentLevel = 0) {
|
|
1227
|
+
let result = [];
|
|
1228
|
+
const debugLog = (message) => {
|
|
1229
|
+
if (options?.debug) {
|
|
1230
|
+
console.log(message);
|
|
1421
1231
|
}
|
|
1422
|
-
|
|
1423
|
-
|
|
1424
|
-
|
|
1425
|
-
|
|
1426
|
-
|
|
1427
|
-
|
|
1428
|
-
result.push({
|
|
1429
|
-
type: "link",
|
|
1430
|
-
href: href,
|
|
1431
|
-
content: [
|
|
1432
|
-
{ type: "text", content: elem.textContent?.trim() ?? "" },
|
|
1433
|
-
],
|
|
1434
|
-
});
|
|
1435
|
-
} else {
|
|
1436
|
-
result.push({
|
|
1437
|
-
type: "link",
|
|
1438
|
-
href: href,
|
|
1439
|
-
content: htmlToMarkdownAST(elem, options),
|
|
1440
|
-
});
|
|
1232
|
+
};
|
|
1233
|
+
element.childNodes.forEach((childElement) => {
|
|
1234
|
+
const overriddenElementProcessing = options?.overrideElementProcessing?.(childElement, options, indentLevel);
|
|
1235
|
+
if (overriddenElementProcessing) {
|
|
1236
|
+
debugLog(`Element Processing Overridden: '${childElement.nodeType}'`);
|
|
1237
|
+
result.push(...overriddenElementProcessing);
|
|
1441
1238
|
}
|
|
1442
|
-
|
|
1443
|
-
|
|
1444
|
-
|
|
1445
|
-
|
|
1446
|
-
|
|
1447
|
-
|
|
1448
|
-
|
|
1449
|
-
alt: escapeMarkdownCharacters(elem.alt),
|
|
1450
|
-
});
|
|
1451
|
-
} else {
|
|
1452
|
-
const src =
|
|
1453
|
-
options?.websiteDomain &&
|
|
1454
|
-
elem.src?.startsWith(options.websiteDomain)
|
|
1455
|
-
? elem.src?.substring(options.websiteDomain.length)
|
|
1456
|
-
: elem.src;
|
|
1457
|
-
result.push({
|
|
1458
|
-
type: "image",
|
|
1459
|
-
src,
|
|
1460
|
-
alt: escapeMarkdownCharacters(elem.alt),
|
|
1461
|
-
});
|
|
1462
|
-
}
|
|
1463
|
-
} else if (elem.tagName.toLowerCase() === "video") {
|
|
1464
|
-
debugLog(
|
|
1465
|
-
`Video: src='${elem.src}', poster='${elem.poster}', controls='${elem.controls}'`
|
|
1466
|
-
);
|
|
1467
|
-
result.push({
|
|
1468
|
-
type: "video",
|
|
1469
|
-
src: elem.src,
|
|
1470
|
-
poster: escapeMarkdownCharacters(elem.poster),
|
|
1471
|
-
controls: elem.controls,
|
|
1472
|
-
});
|
|
1473
|
-
} else if (
|
|
1474
|
-
elem.tagName.toLowerCase() === "ul" ||
|
|
1475
|
-
elem.tagName.toLowerCase() === "ol"
|
|
1476
|
-
) {
|
|
1477
|
-
debugLog(
|
|
1478
|
-
`${
|
|
1479
|
-
elem.tagName.toLowerCase() === "ul" ? "Unordered" : "Ordered"
|
|
1480
|
-
} List`
|
|
1481
|
-
);
|
|
1482
|
-
result.push({
|
|
1483
|
-
type: "list",
|
|
1484
|
-
ordered: elem.tagName.toLowerCase() === "ol",
|
|
1485
|
-
items: Array.from(elem.children).map((li) => ({
|
|
1486
|
-
type: "listItem",
|
|
1487
|
-
content: htmlToMarkdownAST(li, options, indentLevel + 1),
|
|
1488
|
-
})),
|
|
1489
|
-
});
|
|
1490
|
-
} else if (elem.tagName.toLowerCase() === "br") {
|
|
1491
|
-
debugLog("Line Break");
|
|
1492
|
-
result.push({ type: "text", content: "\n" });
|
|
1493
|
-
} else if (elem.tagName.toLowerCase() === "table") {
|
|
1494
|
-
debugLog("Table");
|
|
1495
|
-
let colIds = [];
|
|
1496
|
-
if (options?.enableTableColumnTracking) {
|
|
1497
|
-
// Generate unique column IDs
|
|
1498
|
-
const headerCells = Array.from(elem.querySelectorAll("th, td"));
|
|
1499
|
-
headerCells.forEach((_, index) => {
|
|
1500
|
-
colIds.push(`col-${index}`);
|
|
1501
|
-
});
|
|
1502
|
-
}
|
|
1503
|
-
const tableRows = Array.from(elem.querySelectorAll("tr"));
|
|
1504
|
-
const markdownTableRows = tableRows.map((row) => {
|
|
1505
|
-
let columnIndex = 0;
|
|
1506
|
-
const cells = Array.from(row.querySelectorAll("th, td")).map(
|
|
1507
|
-
(cell) => {
|
|
1508
|
-
const colspan = parseInt(
|
|
1509
|
-
cell.getAttribute("colspan") || "1",
|
|
1510
|
-
10
|
|
1511
|
-
);
|
|
1512
|
-
const rowspan = parseInt(
|
|
1513
|
-
cell.getAttribute("rowspan") || "1",
|
|
1514
|
-
10
|
|
1515
|
-
);
|
|
1516
|
-
const cellNode = {
|
|
1517
|
-
type: "tableCell",
|
|
1518
|
-
content:
|
|
1519
|
-
cell.nodeType === ElementNode_1$1._Node.TEXT_NODE
|
|
1520
|
-
? escapeMarkdownCharacters(cell.textContent?.trim() ?? "")
|
|
1521
|
-
: htmlToMarkdownAST(cell, options, indentLevel + 1),
|
|
1522
|
-
colId: colIds[columnIndex],
|
|
1523
|
-
colspan: colspan > 1 ? colspan : undefined,
|
|
1524
|
-
rowspan: rowspan > 1 ? rowspan : undefined,
|
|
1525
|
-
};
|
|
1526
|
-
columnIndex += colspan;
|
|
1527
|
-
return cellNode;
|
|
1528
|
-
}
|
|
1529
|
-
);
|
|
1530
|
-
return { type: "tableRow", cells };
|
|
1531
|
-
});
|
|
1532
|
-
if (markdownTableRows.length > 0) {
|
|
1533
|
-
// Check if the first row contains header cells
|
|
1534
|
-
const hasHeaders = tableRows[0].querySelector("th") !== null;
|
|
1535
|
-
if (hasHeaders) {
|
|
1536
|
-
// Create a header separator row
|
|
1537
|
-
const headerSeparatorCells = Array.from(
|
|
1538
|
-
tableRows[0].querySelectorAll("th, td")
|
|
1539
|
-
).map(() => ({
|
|
1540
|
-
type: "tableCell",
|
|
1541
|
-
content: "---",
|
|
1542
|
-
colId: undefined,
|
|
1543
|
-
colspan: undefined,
|
|
1544
|
-
rowspan: undefined,
|
|
1545
|
-
}));
|
|
1546
|
-
const headerSeparatorRow = {
|
|
1547
|
-
type: "tableRow",
|
|
1548
|
-
cells: headerSeparatorCells,
|
|
1549
|
-
};
|
|
1550
|
-
markdownTableRows.splice(1, 0, headerSeparatorRow);
|
|
1239
|
+
else if (childElement.nodeType === ElementNode_1$1._Node.TEXT_NODE) {
|
|
1240
|
+
const textContent = escapeMarkdownCharacters(childElement.textContent?.trim() ?? '');
|
|
1241
|
+
if (textContent && !!childElement.textContent) {
|
|
1242
|
+
debugLog(`Text Node: '${textContent}'`);
|
|
1243
|
+
// preserve whitespaces when text childElement is not empty
|
|
1244
|
+
result.push({ type: 'text', content: childElement.textContent?.trim() });
|
|
1245
|
+
}
|
|
1551
1246
|
}
|
|
1552
|
-
|
|
1553
|
-
|
|
1554
|
-
|
|
1555
|
-
|
|
1556
|
-
|
|
1557
|
-
|
|
1558
|
-
|
|
1559
|
-
|
|
1560
|
-
|
|
1561
|
-
|
|
1562
|
-
|
|
1563
|
-
|
|
1564
|
-
|
|
1565
|
-
|
|
1566
|
-
|
|
1567
|
-
|
|
1568
|
-
|
|
1569
|
-
|
|
1570
|
-
|
|
1571
|
-
|
|
1572
|
-
|
|
1573
|
-
|
|
1574
|
-
|
|
1575
|
-
|
|
1576
|
-
|
|
1577
|
-
|
|
1578
|
-
|
|
1579
|
-
|
|
1580
|
-
|
|
1581
|
-
|
|
1582
|
-
|
|
1583
|
-
|
|
1584
|
-
|
|
1585
|
-
|
|
1586
|
-
|
|
1587
|
-
|
|
1588
|
-
|
|
1589
|
-
|
|
1590
|
-
|
|
1591
|
-
|
|
1247
|
+
else if (childElement.nodeType === ElementNode_1$1._Node.ELEMENT_NODE) {
|
|
1248
|
+
const elem = childElement;
|
|
1249
|
+
if (/^h[1-6]$/i.test(elem.tagName)) {
|
|
1250
|
+
const level = parseInt(elem.tagName.substring(1));
|
|
1251
|
+
const content = escapeMarkdownCharacters(elem.textContent || '').trim();
|
|
1252
|
+
if (content) {
|
|
1253
|
+
debugLog(`Heading ${level}: '${elem.textContent}'`);
|
|
1254
|
+
result.push({ type: 'heading', level, content });
|
|
1255
|
+
}
|
|
1256
|
+
}
|
|
1257
|
+
else if (elem.tagName.toLowerCase() === 'p') {
|
|
1258
|
+
debugLog("Paragraph");
|
|
1259
|
+
result.push(...htmlToMarkdownAST(elem, options));
|
|
1260
|
+
// Add a new line after the paragraph
|
|
1261
|
+
result.push({ type: 'text', content: '\n\n' });
|
|
1262
|
+
}
|
|
1263
|
+
else if (elem.tagName.toLowerCase() === 'a') {
|
|
1264
|
+
debugLog(`Link: '${elem.href}' with text '${elem.textContent}'`);
|
|
1265
|
+
// Check if the href is a data URL for an image
|
|
1266
|
+
if (typeof elem.href === 'string' && elem.href.startsWith("data:image")) {
|
|
1267
|
+
// If it's a data URL for an image, skip this link
|
|
1268
|
+
result.push({
|
|
1269
|
+
type: 'link',
|
|
1270
|
+
href: '-',
|
|
1271
|
+
content: htmlToMarkdownAST(elem, options)
|
|
1272
|
+
});
|
|
1273
|
+
}
|
|
1274
|
+
else {
|
|
1275
|
+
// Process the link as usual
|
|
1276
|
+
let href = elem.href;
|
|
1277
|
+
if (typeof href === 'string') {
|
|
1278
|
+
href = options?.websiteDomain && href.startsWith(options.websiteDomain) ?
|
|
1279
|
+
href.substring(options.websiteDomain.length) : href;
|
|
1280
|
+
}
|
|
1281
|
+
else {
|
|
1282
|
+
href = '#'; // Use a default value when href is not a string
|
|
1283
|
+
}
|
|
1284
|
+
// if all children are text,
|
|
1285
|
+
if (Array.from(elem.childNodes).every(_ => _.nodeType === ElementNode_1$1._Node.TEXT_NODE)) {
|
|
1286
|
+
result.push({
|
|
1287
|
+
type: 'link',
|
|
1288
|
+
href: href,
|
|
1289
|
+
content: [{ type: 'text', content: elem.textContent?.trim() ?? '' }]
|
|
1290
|
+
});
|
|
1291
|
+
}
|
|
1292
|
+
else {
|
|
1293
|
+
result.push({
|
|
1294
|
+
type: 'link',
|
|
1295
|
+
href: href,
|
|
1296
|
+
content: htmlToMarkdownAST(elem, options)
|
|
1297
|
+
});
|
|
1298
|
+
}
|
|
1299
|
+
}
|
|
1300
|
+
}
|
|
1301
|
+
else if (elem.tagName.toLowerCase() === 'img') {
|
|
1302
|
+
debugLog(`Image: src='${elem.src}', alt='${elem.alt}'`);
|
|
1303
|
+
if (elem.src?.startsWith("data:image")) {
|
|
1304
|
+
result.push({
|
|
1305
|
+
type: 'image',
|
|
1306
|
+
src: '-',
|
|
1307
|
+
alt: escapeMarkdownCharacters(elem.alt)
|
|
1308
|
+
});
|
|
1309
|
+
}
|
|
1310
|
+
else {
|
|
1311
|
+
const src = options?.websiteDomain && elem.src?.startsWith(options.websiteDomain) ?
|
|
1312
|
+
elem.src?.substring(options.websiteDomain.length) :
|
|
1313
|
+
elem.src;
|
|
1314
|
+
result.push({ type: 'image', src, alt: escapeMarkdownCharacters(elem.alt) });
|
|
1315
|
+
}
|
|
1316
|
+
}
|
|
1317
|
+
else if (elem.tagName.toLowerCase() === 'video') {
|
|
1318
|
+
debugLog(`Video: src='${elem.src}', poster='${elem.poster}', controls='${elem.controls}'`);
|
|
1319
|
+
result.push({
|
|
1320
|
+
type: 'video',
|
|
1321
|
+
src: elem.src,
|
|
1322
|
+
poster: escapeMarkdownCharacters(elem.poster),
|
|
1323
|
+
controls: elem.controls
|
|
1324
|
+
});
|
|
1325
|
+
}
|
|
1326
|
+
else if (elem.tagName.toLowerCase() === 'ul' || elem.tagName.toLowerCase() === 'ol') {
|
|
1327
|
+
debugLog(`${elem.tagName.toLowerCase() === 'ul' ? 'Unordered' : 'Ordered'} List`);
|
|
1328
|
+
result.push({
|
|
1329
|
+
type: 'list',
|
|
1330
|
+
ordered: elem.tagName.toLowerCase() === 'ol',
|
|
1331
|
+
items: Array.from(elem.children).map(li => ({
|
|
1332
|
+
type: 'listItem',
|
|
1333
|
+
content: htmlToMarkdownAST(li, options, indentLevel + 1)
|
|
1334
|
+
}))
|
|
1335
|
+
});
|
|
1336
|
+
}
|
|
1337
|
+
else if (elem.tagName.toLowerCase() === 'br') {
|
|
1338
|
+
debugLog("Line Break");
|
|
1339
|
+
result.push({ type: 'text', content: '\n' });
|
|
1340
|
+
}
|
|
1341
|
+
else if (elem.tagName.toLowerCase() === 'table') {
|
|
1342
|
+
debugLog("Table");
|
|
1343
|
+
let colIds = [];
|
|
1344
|
+
if (options?.enableTableColumnTracking) {
|
|
1345
|
+
// Generate unique column IDs
|
|
1346
|
+
const headerCells = Array.from(elem.querySelectorAll('th, td'));
|
|
1347
|
+
headerCells.forEach((_, index) => {
|
|
1348
|
+
colIds.push(`col-${index}`);
|
|
1349
|
+
});
|
|
1350
|
+
}
|
|
1351
|
+
const tableRows = Array.from(elem.querySelectorAll('tr'));
|
|
1352
|
+
const markdownTableRows = tableRows.map(row => {
|
|
1353
|
+
let columnIndex = 0;
|
|
1354
|
+
const cells = Array.from(row.querySelectorAll('th, td')).map((cell) => {
|
|
1355
|
+
const colspan = parseInt(cell.getAttribute('colspan') || '1', 10);
|
|
1356
|
+
const rowspan = parseInt(cell.getAttribute('rowspan') || '1', 10);
|
|
1357
|
+
const cellNode = {
|
|
1358
|
+
type: 'tableCell',
|
|
1359
|
+
content: cell.nodeType === ElementNode_1$1._Node.TEXT_NODE
|
|
1360
|
+
? escapeMarkdownCharacters(cell.textContent?.trim() ?? '')
|
|
1361
|
+
: htmlToMarkdownAST(cell, options, indentLevel + 1),
|
|
1362
|
+
colId: colIds[columnIndex],
|
|
1363
|
+
colspan: colspan > 1 ? colspan : undefined,
|
|
1364
|
+
rowspan: rowspan > 1 ? rowspan : undefined
|
|
1365
|
+
};
|
|
1366
|
+
columnIndex += colspan;
|
|
1367
|
+
return cellNode;
|
|
1368
|
+
});
|
|
1369
|
+
return { type: 'tableRow', cells };
|
|
1370
|
+
});
|
|
1371
|
+
if (markdownTableRows.length > 0) {
|
|
1372
|
+
// Check if the first row contains header cells
|
|
1373
|
+
const hasHeaders = tableRows[0].querySelector('th') !== null;
|
|
1374
|
+
if (hasHeaders) {
|
|
1375
|
+
// Create a header separator row
|
|
1376
|
+
const headerSeparatorCells = Array.from(tableRows[0].querySelectorAll('th, td'))
|
|
1377
|
+
.map(() => ({
|
|
1378
|
+
type: 'tableCell',
|
|
1379
|
+
content: '---',
|
|
1380
|
+
colId: undefined,
|
|
1381
|
+
colspan: undefined,
|
|
1382
|
+
rowspan: undefined,
|
|
1383
|
+
}));
|
|
1384
|
+
const headerSeparatorRow = {
|
|
1385
|
+
type: 'tableRow',
|
|
1386
|
+
cells: headerSeparatorCells,
|
|
1387
|
+
};
|
|
1388
|
+
markdownTableRows.splice(1, 0, headerSeparatorRow);
|
|
1389
|
+
}
|
|
1390
|
+
}
|
|
1391
|
+
result.push({ type: 'table', rows: markdownTableRows, colIds });
|
|
1392
|
+
}
|
|
1393
|
+
else if (elem.tagName.toLowerCase() === 'head' && !!options?.includeMetaData) {
|
|
1394
|
+
const node = {
|
|
1395
|
+
type: 'meta',
|
|
1396
|
+
content: {
|
|
1397
|
+
standard: {},
|
|
1398
|
+
openGraph: {},
|
|
1399
|
+
twitter: {},
|
|
1400
|
+
}
|
|
1401
|
+
};
|
|
1402
|
+
elem.querySelectorAll('title')
|
|
1403
|
+
.forEach(titleElem => {
|
|
1404
|
+
node.content.standard['title'] = escapeMarkdownCharacters(titleElem.text);
|
|
1405
|
+
});
|
|
1406
|
+
// Extract meta tags
|
|
1407
|
+
const metaTags = elem.querySelectorAll('meta');
|
|
1408
|
+
const nonSemanticTagNames = [
|
|
1409
|
+
"viewport",
|
|
1410
|
+
"referrer",
|
|
1411
|
+
"Content-Security-Policy"
|
|
1412
|
+
];
|
|
1413
|
+
metaTags.forEach(metaTag => {
|
|
1414
|
+
const name = metaTag.getAttribute('name');
|
|
1415
|
+
const property = metaTag.getAttribute('property');
|
|
1416
|
+
const content = metaTag.getAttribute('content');
|
|
1417
|
+
if (property && property.startsWith('og:') && content) {
|
|
1418
|
+
if (options.includeMetaData === 'extended') {
|
|
1419
|
+
node.content.openGraph[property.substring(3)] = content;
|
|
1420
|
+
}
|
|
1421
|
+
}
|
|
1422
|
+
else if (name && name.startsWith('twitter:') && content) {
|
|
1423
|
+
if (options.includeMetaData === 'extended') {
|
|
1424
|
+
node.content.twitter[name.substring(8)] = content;
|
|
1425
|
+
}
|
|
1426
|
+
}
|
|
1427
|
+
else if (name && !nonSemanticTagNames.includes(name) && content) {
|
|
1428
|
+
node.content.standard[name] = content;
|
|
1429
|
+
}
|
|
1430
|
+
});
|
|
1431
|
+
// Extract JSON-LD data
|
|
1432
|
+
if (options.includeMetaData === 'extended') {
|
|
1433
|
+
const jsonLdData = [];
|
|
1434
|
+
const jsonLDScripts = elem.querySelectorAll('script[type="application/ld+json"]');
|
|
1435
|
+
jsonLDScripts.forEach(script => {
|
|
1436
|
+
try {
|
|
1437
|
+
const jsonContent = script.textContent;
|
|
1438
|
+
if (jsonContent) {
|
|
1439
|
+
const parsedData = JSON.parse(jsonContent);
|
|
1440
|
+
jsonLdData.push(parsedData);
|
|
1441
|
+
}
|
|
1442
|
+
}
|
|
1443
|
+
catch (error) {
|
|
1444
|
+
console.error('Failed to parse JSON-LD', error);
|
|
1445
|
+
}
|
|
1446
|
+
});
|
|
1447
|
+
node.content.jsonLd = jsonLdData;
|
|
1448
|
+
}
|
|
1449
|
+
result.push(node);
|
|
1450
|
+
}
|
|
1451
|
+
else {
|
|
1452
|
+
const content = escapeMarkdownCharacters(elem.textContent || '');
|
|
1453
|
+
switch (elem.tagName.toLowerCase()) {
|
|
1454
|
+
case 'noscript':
|
|
1455
|
+
case 'script':
|
|
1456
|
+
case 'style':
|
|
1457
|
+
case 'html':
|
|
1458
|
+
// blackhole..
|
|
1459
|
+
break;
|
|
1460
|
+
case 'strong':
|
|
1461
|
+
case 'b':
|
|
1462
|
+
if (content) {
|
|
1463
|
+
debugLog(`Bold: '${content}'`);
|
|
1464
|
+
result.push({
|
|
1465
|
+
type: 'bold',
|
|
1466
|
+
content: htmlToMarkdownAST(elem, options, indentLevel + 1)
|
|
1467
|
+
});
|
|
1468
|
+
}
|
|
1469
|
+
break;
|
|
1470
|
+
case 'em':
|
|
1471
|
+
case 'i':
|
|
1472
|
+
if (content) {
|
|
1473
|
+
debugLog(`Italic: '${content}'`);
|
|
1474
|
+
result.push({
|
|
1475
|
+
type: 'italic',
|
|
1476
|
+
content: htmlToMarkdownAST(elem, options, indentLevel + 1)
|
|
1477
|
+
});
|
|
1478
|
+
}
|
|
1479
|
+
break;
|
|
1480
|
+
case 's':
|
|
1481
|
+
case 'strike':
|
|
1482
|
+
if (content) {
|
|
1483
|
+
debugLog(`Strikethrough: '${content}'`);
|
|
1484
|
+
result.push({
|
|
1485
|
+
type: 'strikethrough',
|
|
1486
|
+
content: htmlToMarkdownAST(elem, options, indentLevel + 1)
|
|
1487
|
+
});
|
|
1488
|
+
}
|
|
1489
|
+
break;
|
|
1490
|
+
case 'code':
|
|
1491
|
+
if (content) {
|
|
1492
|
+
// Handling inline code differently
|
|
1493
|
+
const isCodeBlock = elem.parentNode && elem.parentNode.nodeName.toLowerCase() === 'pre';
|
|
1494
|
+
debugLog(`${isCodeBlock ? 'Code Block' : 'Inline Code'}: '${content}'`);
|
|
1495
|
+
const languageClass = elem.className?.split(" ").find(cls => cls.startsWith("language-"));
|
|
1496
|
+
const language = languageClass ? languageClass.replace("language-", "") : "";
|
|
1497
|
+
result.push({
|
|
1498
|
+
type: 'code',
|
|
1499
|
+
content: elem.textContent?.trim() ?? '',
|
|
1500
|
+
language,
|
|
1501
|
+
inline: !isCodeBlock
|
|
1502
|
+
});
|
|
1503
|
+
}
|
|
1504
|
+
break;
|
|
1505
|
+
case 'blockquote':
|
|
1506
|
+
debugLog(`Blockquote`);
|
|
1507
|
+
result.push({
|
|
1508
|
+
type: 'blockquote',
|
|
1509
|
+
content: htmlToMarkdownAST(elem, options)
|
|
1510
|
+
});
|
|
1511
|
+
break;
|
|
1512
|
+
case 'article':
|
|
1513
|
+
case 'aside':
|
|
1514
|
+
case 'details':
|
|
1515
|
+
case 'figcaption':
|
|
1516
|
+
case 'figure':
|
|
1517
|
+
case 'footer':
|
|
1518
|
+
case 'header':
|
|
1519
|
+
case 'main':
|
|
1520
|
+
case 'mark':
|
|
1521
|
+
case 'nav':
|
|
1522
|
+
case 'section':
|
|
1523
|
+
case 'summary':
|
|
1524
|
+
case 'time':
|
|
1525
|
+
debugLog(`Semantic HTML Element: '${elem.tagName}'`);
|
|
1526
|
+
result.push({
|
|
1527
|
+
type: 'semanticHtml',
|
|
1528
|
+
htmlType: elem.tagName.toLowerCase(),
|
|
1529
|
+
content: htmlToMarkdownAST(elem, options)
|
|
1530
|
+
});
|
|
1531
|
+
break;
|
|
1532
|
+
default:
|
|
1533
|
+
const unhandledElementProcessing = options?.processUnhandledElement?.(elem, options, indentLevel);
|
|
1534
|
+
if (unhandledElementProcessing) {
|
|
1535
|
+
debugLog(`Processing Unhandled Element: '${elem.tagName}'`);
|
|
1536
|
+
result.push(...unhandledElementProcessing);
|
|
1537
|
+
}
|
|
1538
|
+
else {
|
|
1539
|
+
debugLog(`Generic HTMLElement: '${elem.tagName}'`);
|
|
1540
|
+
result.push(...htmlToMarkdownAST(elem, options, indentLevel + 1));
|
|
1541
|
+
}
|
|
1542
|
+
break;
|
|
1543
|
+
}
|
|
1544
|
+
}
|
|
1592
1545
|
}
|
|
1593
|
-
|
|
1594
|
-
|
|
1595
|
-
|
|
1596
|
-
|
|
1597
|
-
|
|
1598
|
-
|
|
1599
|
-
|
|
1600
|
-
|
|
1601
|
-
|
|
1602
|
-
|
|
1603
|
-
|
|
1604
|
-
|
|
1605
|
-
|
|
1606
|
-
|
|
1607
|
-
|
|
1608
|
-
console.error("Failed to parse JSON-LD", error);
|
|
1609
|
-
}
|
|
1610
|
-
});
|
|
1611
|
-
node.content.jsonLd = jsonLdData;
|
|
1612
|
-
}
|
|
1613
|
-
result.push(node);
|
|
1614
|
-
} else {
|
|
1615
|
-
const content = escapeMarkdownCharacters(elem.textContent || "");
|
|
1616
|
-
switch (elem.tagName.toLowerCase()) {
|
|
1617
|
-
case "noscript":
|
|
1618
|
-
case "script":
|
|
1619
|
-
case "style":
|
|
1620
|
-
case "html":
|
|
1621
|
-
// blackhole..
|
|
1622
|
-
break;
|
|
1623
|
-
case "strong":
|
|
1624
|
-
case "b":
|
|
1625
|
-
if (content) {
|
|
1626
|
-
debugLog(`Bold: '${content}'`);
|
|
1627
|
-
result.push({
|
|
1628
|
-
type: "bold",
|
|
1629
|
-
content: htmlToMarkdownAST(elem, options, indentLevel + 1),
|
|
1630
|
-
});
|
|
1631
|
-
}
|
|
1632
|
-
break;
|
|
1633
|
-
case "em":
|
|
1634
|
-
case "i":
|
|
1635
|
-
if (content) {
|
|
1636
|
-
debugLog(`Italic: '${content}'`);
|
|
1637
|
-
result.push({
|
|
1638
|
-
type: "italic",
|
|
1639
|
-
content: htmlToMarkdownAST(elem, options, indentLevel + 1),
|
|
1640
|
-
});
|
|
1641
|
-
}
|
|
1642
|
-
break;
|
|
1643
|
-
case "s":
|
|
1644
|
-
case "strike":
|
|
1645
|
-
if (content) {
|
|
1646
|
-
debugLog(`Strikethrough: '${content}'`);
|
|
1647
|
-
result.push({
|
|
1648
|
-
type: "strikethrough",
|
|
1649
|
-
content: htmlToMarkdownAST(elem, options, indentLevel + 1),
|
|
1650
|
-
});
|
|
1651
|
-
}
|
|
1652
|
-
break;
|
|
1653
|
-
case "code":
|
|
1654
|
-
if (content) {
|
|
1655
|
-
// Handling inline code differently
|
|
1656
|
-
const isCodeBlock =
|
|
1657
|
-
elem.parentNode &&
|
|
1658
|
-
elem.parentNode.nodeName.toLowerCase() === "pre";
|
|
1659
|
-
debugLog(
|
|
1660
|
-
`${isCodeBlock ? "Code Block" : "Inline Code"}: '${content}'`
|
|
1661
|
-
);
|
|
1662
|
-
const languageClass = elem.className
|
|
1663
|
-
?.split(" ")
|
|
1664
|
-
.find((cls) => cls.startsWith("language-"));
|
|
1665
|
-
const language = languageClass
|
|
1666
|
-
? languageClass.replace("language-", "")
|
|
1667
|
-
: "";
|
|
1668
|
-
result.push({
|
|
1669
|
-
type: "code",
|
|
1670
|
-
content: elem.textContent?.trim() ?? "",
|
|
1671
|
-
language,
|
|
1672
|
-
inline: !isCodeBlock,
|
|
1673
|
-
});
|
|
1674
|
-
}
|
|
1675
|
-
break;
|
|
1676
|
-
case "blockquote":
|
|
1677
|
-
debugLog(`Blockquote`);
|
|
1678
|
-
result.push({
|
|
1679
|
-
type: "blockquote",
|
|
1680
|
-
content: htmlToMarkdownAST(elem, options),
|
|
1681
|
-
});
|
|
1682
|
-
break;
|
|
1683
|
-
case "article":
|
|
1684
|
-
case "aside":
|
|
1685
|
-
case "details":
|
|
1686
|
-
case "figcaption":
|
|
1687
|
-
case "figure":
|
|
1688
|
-
case "footer":
|
|
1689
|
-
case "header":
|
|
1690
|
-
case "main":
|
|
1691
|
-
case "mark":
|
|
1692
|
-
case "nav":
|
|
1693
|
-
case "section":
|
|
1694
|
-
case "summary":
|
|
1695
|
-
case "time":
|
|
1696
|
-
debugLog(`Semantic HTML Element: '${elem.tagName}'`);
|
|
1697
|
-
result.push({
|
|
1698
|
-
type: "semanticHtml",
|
|
1699
|
-
htmlType: elem.tagName.toLowerCase(),
|
|
1700
|
-
content: htmlToMarkdownAST(elem, options),
|
|
1701
|
-
});
|
|
1702
|
-
break;
|
|
1703
|
-
default:
|
|
1704
|
-
const unhandledElementProcessing =
|
|
1705
|
-
options?.processUnhandledElement?.(elem, options, indentLevel);
|
|
1706
|
-
if (unhandledElementProcessing) {
|
|
1707
|
-
debugLog(`Processing Unhandled Element: '${elem.tagName}'`);
|
|
1708
|
-
result.push(...unhandledElementProcessing);
|
|
1709
|
-
} else {
|
|
1710
|
-
debugLog(`Generic HTMLElement: '${elem.tagName}'`);
|
|
1711
|
-
result.push(
|
|
1712
|
-
...htmlToMarkdownAST(elem, options, indentLevel + 1)
|
|
1713
|
-
);
|
|
1714
|
-
}
|
|
1715
|
-
break;
|
|
1716
|
-
}
|
|
1717
|
-
}
|
|
1718
|
-
}
|
|
1719
|
-
});
|
|
1720
|
-
return result;
|
|
1721
|
-
}
|
|
1722
|
-
function escapeMarkdownCharacters(text, isInlineCode = false) {
|
|
1723
|
-
if (isInlineCode || !text?.trim()) {
|
|
1724
|
-
// In inline code, we don't escape any characters
|
|
1725
|
-
return text;
|
|
1546
|
+
});
|
|
1547
|
+
return result;
|
|
1548
|
+
}
|
|
1549
|
+
/**
 * Escapes text so it can be placed in Markdown output without being read as
 * HTML markup or Markdown syntax.
 *
 * @param {string} text - Raw text to escape.
 * @param {boolean} [isInlineCode=false] - When true, the text is returned untouched.
 * @returns {string} The escaped text. Inline code and blank or nullish input
 *   are returned exactly as received.
 */
function escapeMarkdownCharacters(text, isInlineCode = false) {
    if (isInlineCode || !text?.trim()) {
        // Inline code is emitted verbatim; blank input has nothing to escape.
        return text;
    }
    // `&` has to be converted first: otherwise the ampersands introduced by
    // `&lt;` / `&gt;` would themselves be turned into `&amp;`.
    const htmlEscaped = text
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;');
    // Backslash-escape every character that carries meaning in Markdown.
    return htmlEscaped.replace(/([\\`*_{}[\]#+!|])/g, '\\$1');
}
|
|
1727
|
-
// First, replace special HTML characters with their entity equivalents
|
|
1728
|
-
let escapedText = text
|
|
1729
|
-
.replace(/&/g, "&") // Replace & first
|
|
1730
|
-
.replace(/</g, "<")
|
|
1731
|
-
.replace(/>/g, ">");
|
|
1732
|
-
// Then escape characters that have special meaning in Markdown
|
|
1733
|
-
escapedText = escapedText.replace(/([\\`*_{}[\]#+!|])/g, "\\$1");
|
|
1734
|
-
return escapedText;
|
|
1735
|
-
}
|
|
1736
1562
|
|
|
1737
|
-
|
|
1563
|
+
var markdownASTToString = {};
|
|
1738
1564
|
|
|
1739
|
-
|
|
1565
|
+
var hasRequiredMarkdownASTToString;
|
|
1740
1566
|
|
|
1741
|
-
|
|
1742
|
-
|
|
1743
|
-
|
|
1744
|
-
|
|
1745
|
-
|
|
1746
|
-
|
|
1747
|
-
|
|
1748
|
-
|
|
1749
|
-
|
|
1750
|
-
|
|
1751
|
-
|
|
1567
|
+
function requireMarkdownASTToString () {
|
|
1568
|
+
if (hasRequiredMarkdownASTToString) return markdownASTToString;
|
|
1569
|
+
hasRequiredMarkdownASTToString = 1;
|
|
1570
|
+
Object.defineProperty(markdownASTToString, "__esModule", { value: true });
|
|
1571
|
+
markdownASTToString.markdownASTToString = markdownASTToString$1;
|
|
1572
|
+
const index_1 = requireNode();
|
|
1573
|
+
/**
 * Renders a Markdown AST to a string: the output of markdownMetaASTToString
 * followed by the output of markdownContentASTToString, both called with the
 * same arguments.
 *
 * @param {Array} nodes - AST nodes to render.
 * @param {object} [options] - Conversion options, forwarded unchanged.
 * @param {number} [indentLevel=0] - Indentation level, forwarded unchanged.
 * @returns {string} Metadata section followed by the content section.
 */
function markdownASTToString$1(nodes, options, indentLevel = 0) {
    const metaSection = markdownMetaASTToString(nodes, options, indentLevel);
    const contentSection = markdownContentASTToString(nodes, options, indentLevel);
    return metaSection + contentSection;
}
|
|
1579
|
+
/**
 * Renders the `meta` node of a Markdown AST as a front-matter block delimited
 * by `---` lines.
 *
 * Nothing is emitted unless `options.includeMetaData` is truthy. Standard
 * entries are always written; the `openGraph`, `twitter` and `schema`
 * (JSON-LD) sections are written only when `includeMetaData === 'extended'`.
 * Nested keys are indented by two spaces per level so that they sit under
 * their parent key.
 *
 * @param {Array} nodes - AST nodes searched for the first node of type `meta`.
 * @param {object} [options] - Conversion options; only `includeMetaData` is read.
 * @param {number} [indentLevel=0] - Accepted for signature parity; not used.
 * @returns {string} The front-matter block, or `''` when metadata is disabled.
 */
function markdownMetaASTToString(nodes, options, indentLevel = 0) {
    if (!options?.includeMetaData) {
        return '';
    }
    const lines = ['---'];

    // Writes `title:` followed by one indented `key: "value"` line per entry.
    const appendSection = (title, entries) => {
        if (!entries || Object.keys(entries).length === 0) {
            return;
        }
        lines.push(`${title}:`);
        for (const [key, value] of Object.entries(entries)) {
            lines.push(`  ${key}: "${value}"`);
        }
    };

    const metaNode = (0, index_1.findInMarkdownAST)(nodes, candidate => candidate.type === 'meta');
    if (metaNode?.type === 'meta') {
        const { standard, openGraph, twitter, jsonLd } = metaNode.content;
        if (standard) {
            for (const key of Object.keys(standard)) {
                lines.push(`${key}: "${standard[key]}"`);
            }
        }
        if (options.includeMetaData === 'extended') {
            appendSection('openGraph', openGraph);
            appendSection('twitter', twitter);
            if (jsonLd && jsonLd.length > 0) {
                lines.push('schema:');
                for (const item of jsonLd) {
                    if (item == null) {
                        // A JSON-LD script containing `null` parses fine but cannot be destructured.
                        continue;
                    }
                    // `@context` is dropped; `@type` becomes the section key.
                    const { '@context': _jldContext, '@type': jldType, ...semanticData } = item;
                    lines.push(`  ${jldType ?? '(unknown type)'}:`);
                    for (const key of Object.keys(semanticData)) {
                        // Properties are indented one level deeper than their type.
                        lines.push(`    ${key}: ${JSON.stringify(semanticData[key])}`);
                    }
                }
            }
        }
    }
    // Closing delimiter followed by a blank line.
    lines.push('---', '', '');
    return lines.join('\n');
}
|
|
1624
|
+
/**
 * Renders the content nodes of a Markdown AST to a Markdown string.
 *
 * Each node is first offered to `options.overrideNodeRenderer`; a truthy
 * return value is appended as-is. Otherwise the node is rendered according to
 * its `type`. `meta` nodes are ignored here, and `custom` nodes are delegated
 * to `options.renderCustomNode`.
 *
 * @param {Array} nodes - AST nodes to render, in order.
 * @param {object} [options] - Conversion options (`overrideNodeRenderer`, `renderCustomNode`).
 * @param {number} [indentLevel=0] - Nesting depth; text and list items are
 *   prefixed with `indentLevel * 2` spaces.
 * @returns {string} The rendered Markdown.
 */
function markdownContentASTToString(nodes, options, indentLevel = 0) {
    let markdownString = '';
    nodes.forEach((node) => {
        const indent = ' '.repeat(indentLevel * 2); // Adjust the multiplier for different indent sizes
        const nodeRenderingOverride = options?.overrideNodeRenderer?.(node, options, indentLevel);
        if (nodeRenderingOverride) {
            markdownString += nodeRenderingOverride;
            return;
        }
        switch (node.type) {
            case 'text':
            case 'bold':
            case 'italic':
            case 'strikethrough':
            case 'link': {
                // Inline nodes carry either a string or a nested node array.
                let content = node.content;
                if (Array.isArray(node.content)) {
                    content = markdownContentASTToString(node.content, options, indentLevel);
                }
                // Insert a separating space unless one is already present on
                // either side or the content is a single punctuation mark.
                const isMarkdownStringNotEmpty = markdownString.length > 0;
                const isFirstCharOfContentWhitespace = /\s/.test(content.slice(0, 1));
                const isLastCharOfMarkdownWhitespace = /\s/.test(markdownString.slice(-1));
                const isContentPunctuation = content.length === 1 && /^[.,!?;:]/.test(content);
                if (isMarkdownStringNotEmpty && !isContentPunctuation && !isFirstCharOfContentWhitespace && !isLastCharOfMarkdownWhitespace) {
                    markdownString += ' ';
                }
                if (node.type === 'text') {
                    markdownString += `${indent}${content}`;
                }
                else if (node.type === 'bold') {
                    markdownString += `**${content}**`;
                }
                else if (node.type === 'italic') {
                    markdownString += `*${content}*`;
                }
                else if (node.type === 'strikethrough') {
                    markdownString += `~~${content}~~`;
                }
                else if (node.type === 'link') {
                    if (node.content.length === 1 && node.content[0].type === 'text') {
                        // Text-only links use native Markdown syntax.
                        markdownString += `[${content}](${encodeURI(node.href)})`;
                    }
                    else {
                        // Links with rich content fall back to an HTML anchor.
                        markdownString += `<a href="${node.href}">${content}</a>`;
                    }
                }
                break;
            }
            case 'heading': {
                // Headings always start on their own line.
                const endsWithNewLine = markdownString.slice(-1) === '\n';
                if (!endsWithNewLine) {
                    markdownString += '\n';
                }
                markdownString += `${'#'.repeat(node.level)} ${node.content}\n\n`;
                break;
            }
            case 'image':
                if (!node.alt?.trim() || !!node.src?.trim()) {
                    markdownString += `![${node.alt || ''}](${node.src})`;
                }
                break;
            case 'list':
                node.items.forEach((item, i) => {
                    const listItemPrefix = node.ordered ? `${i + 1}.` : '-';
                    const contents = markdownContentASTToString(item.content, options, indentLevel + 1).trim();
                    if (markdownString.slice(-1) !== '\n') {
                        markdownString += '\n';
                    }
                    // Items that render to nothing are skipped.
                    if (contents) {
                        markdownString += `${indent}${listItemPrefix} ${contents}\n`;
                    }
                });
                markdownString += '\n';
                break;
            case 'video':
                // Videos are represented as an image-style reference plus optional details.
                markdownString += `\n![Video](${node.src})\n`;
                if (node.poster) {
                    markdownString += `![Poster](${node.poster})\n`;
                }
                if (node.controls) {
                    markdownString += `Controls: ${node.controls}\n`;
                }
                markdownString += '\n';
                break;
            case 'table': {
                // Widest row (counting colspans) decides how many columns every row gets.
                const maxColumns = Math.max(...node.rows.map(row => row.cells.reduce((sum, cell) => sum + (cell.colspan || 1), 0)));
                node.rows.forEach((row) => {
                    let currentColumn = 0;
                    row.cells.forEach((cell) => {
                        let cellContent = typeof cell.content === 'string'
                            ? cell.content
                            : markdownContentASTToString(cell.content, options, indentLevel + 1).trim();
                        // Column id and spans are preserved as HTML comments inside the cell.
                        if (cell.colId) {
                            cellContent += ` <!-- ${cell.colId} -->`;
                        }
                        if (cell.colspan && cell.colspan > 1) {
                            cellContent += ` <!-- colspan: ${cell.colspan} -->`;
                        }
                        if (cell.rowspan && cell.rowspan > 1) {
                            cellContent += ` <!-- rowspan: ${cell.rowspan} -->`;
                        }
                        markdownString += `| ${cellContent} `;
                        currentColumn += cell.colspan || 1;
                        // Add empty cells for colspan
                        for (let i = 1; i < (cell.colspan || 1); i++) {
                            markdownString += '| ';
                        }
                    });
                    // Fill remaining columns with empty cells
                    while (currentColumn < maxColumns) {
                        markdownString += '| ';
                        currentColumn++;
                    }
                    markdownString += '|\n';
                });
                markdownString += '\n';
                break;
            }
            case 'code':
                if (node.inline) {
                    const isLastWhitespace = /\s/.test(markdownString.slice(-1));
                    if (!isLastWhitespace) {
                        markdownString += ' ';
                    }
                    markdownString += `\`${node.content}\``;
                }
                else {
                    // For code blocks, we do not escape characters and preserve formatting
                    markdownString += '\n```' + (node.language ?? '') + '\n';
                    markdownString += `${node.content}\n`;
                    markdownString += '```\n\n';
                }
                break;
            case 'blockquote':
                markdownString += `> ${markdownContentASTToString(node.content, options).trim()}\n\n`;
                break;
            case 'meta':
                // Not rendered here.
                break;
            case 'semanticHtml':
                switch (node.htmlType) {
                    case 'article':
                        markdownString += '\n\n' + markdownContentASTToString(node.content, options);
                        break;
                    case 'summary':
                    case 'time':
                    case 'aside':
                    case 'nav':
                    case 'figcaption':
                    case 'main':
                    case 'mark':
                    case 'header':
                    case 'footer':
                    case 'details':
                    case 'figure':
                        // Wrapped in `<-tag->` ... `</-tag->` markers.
                        markdownString += `\n\n<-${node.htmlType}->\n` + markdownContentASTToString(node.content, options) + `\n\n</-${node.htmlType}->\n`;
                        break;
                    case 'section':
                        // Sections are fenced by horizontal rules.
                        markdownString += '---\n\n';
                        markdownString += markdownContentASTToString(node.content, options);
                        markdownString += '\n\n';
                        markdownString += '---\n\n';
                        break;
                }
                break;
            case 'custom': {
                const customNodeRendering = options?.renderCustomNode?.(node, options, indentLevel);
                if (customNodeRendering) {
                    markdownString += customNodeRendering;
                }
                break;
            }
        }
    });
    return markdownString;
}
|
|
1802
|
+
return markdownASTToString;
|
|
1752
1803
|
}
|
|
1753
|
-
|
|
1754
|
-
|
|
1755
|
-
|
|
1756
|
-
|
|
1757
|
-
|
|
1758
|
-
|
|
1759
|
-
|
|
1760
|
-
|
|
1761
|
-
|
|
1762
|
-
|
|
1763
|
-
|
|
1764
|
-
|
|
1765
|
-
|
|
1766
|
-
|
|
1767
|
-
|
|
1768
|
-
|
|
1769
|
-
|
|
1770
|
-
|
|
1771
|
-
|
|
1772
|
-
|
|
1773
|
-
|
|
1774
|
-
|
|
1775
|
-
|
|
1776
|
-
|
|
1777
|
-
|
|
1804
|
+
|
|
1805
|
+
var domUtils = {};
|
|
1806
|
+
|
|
1807
|
+
Object.defineProperty(domUtils, "__esModule", { value: true });
|
|
1808
|
+
domUtils.findMainContent = findMainContent;
|
|
1809
|
+
domUtils.wrapMainContent = wrapMainContent;
|
|
1810
|
+
domUtils.isElementVisible = isElementVisible;
|
|
1811
|
+
domUtils.getVisibleText = getVisibleText;
|
|
1812
|
+
const ElementNode_1 = ElementNode;
|
|
1813
|
+
// No-op logging hook: the message is discarded and nothing is returned.
const debugMessage = (_message) => undefined;
|
|
1815
|
+
/**
 * Picks the element that holds the main content of a page.
 *
 * Resolution order: an existing `<main>` element; otherwise, when the
 * document has no body, the document element; otherwise whatever
 * `detectMainContent` selects starting from the body.
 *
 * @param document The Document object to search.
 * @returns The element chosen as the main content.
 */
function findMainContent(document) {
    const explicitMain = document.querySelector('main');
    if (explicitMain) {
        return explicitMain;
    }
    return document.body
        ? detectMainContent(document.body)
        : document.documentElement;
}
|
|
1830
|
+
/**
 * Wraps the given element in a new `<main id="detected-main-content">`
 * element, unless the element is itself a `<main>`.
 *
 * @param mainContentElement The element to wrap.
 * @param document The Document used to create the wrapper.
 */
function wrapMainContent(mainContentElement, document) {
    const isAlreadyMain = mainContentElement.tagName.toLowerCase() === 'main';
    if (isAlreadyMain) {
        return;
    }
    const wrapper = document.createElement('main');
    // Put the wrapper where the element currently sits, then move the element into it.
    mainContentElement.before(wrapper);
    wrapper.appendChild(mainContentElement);
    wrapper.id = 'detected-main-content';
}
|
|
1838
|
+
/**
 * Chooses the most likely main-content element beneath `rootElement`.
 *
 * Candidates are gathered by `collectCandidates` using a minimum score of 20
 * and ranked by `calculateScore`. The highest-scoring candidate wins; on a
 * tie the candidate collected first is kept, because the sort is stable.
 *
 * @param rootElement Element whose subtree is searched.
 * @returns The best candidate, or `rootElement` when no candidate was collected.
 */
function detectMainContent(rootElement) {
    const minScore = 20;
    const candidates = [];
    collectCandidates(rootElement, candidates, minScore);
    if (candidates.length === 0) {
        return rootElement;
    }
    // Score every candidate exactly once instead of inside each comparator call.
    const scoreByCandidate = new Map(candidates.map((candidate) => [candidate, calculateScore(candidate)]));
    candidates.sort((a, b) => scoreByCandidate.get(b) - scoreByCandidate.get(a));
    // After a descending sort the first entry already holds the maximum score,
    // so no later candidate can strictly exceed it and no further scan is needed.
    const bestCandidate = candidates[0];
    debugMessage(`Final main content candidate: ${elementToString(bestCandidate)}`);
    return bestCandidate;
}
|
|
1858
|
+
/**
 * Formats an element as `TAG#id.class1.class2` for log messages.
 * A missing id is shown as `no-id`; a falsy element yields `'No element'`.
 *
 * @param element The element to describe.
 * @returns The formatted description.
 */
function elementToString(element) {
    if (element) {
        const tag = element.tagName;
        const idPart = element.id || 'no-id';
        const classPart = Array.from(element.classList).join('.');
        return `${tag}#${idPart}.${classPart}`;
    }
    return 'No element';
}
|
|
1864
|
+
/**
 * Walks `element` and its descendants depth-first, appending every element
 * whose `calculateScore` result is at least `minScore` to `candidates`.
 * A parent is appended before its children.
 *
 * @param element Root of the subtree to inspect.
 * @param candidates Array that receives the qualifying elements (mutated).
 * @param minScore Minimum score an element needs to qualify.
 */
function collectCandidates(element, candidates, minScore) {
    const elementScore = calculateScore(element);
    const qualifies = elementScore >= minScore;
    if (qualifies) {
        candidates.push(element);
        debugMessage(`Candidate found: ${elementToString(element)}, score: ${elementScore}`);
    }
    // Iterate over a snapshot of the children.
    for (const child of Array.from(element.children)) {
        collectCandidates(child, candidates, minScore);
    }
}
|
|
1874
|
+
/**
 * Computes a heuristic score for how likely `element` is to be the main
 * content of a page. Points are awarded as follows:
 *   - +10 for each marker name found as a class or as a substring of the id
 *   - +5  for an `article`, `main` or `section` tag
 *   - +1  per descendant `<p>`, capped at 5
 *   - +1  per 200 characters of trimmed text once above 200, capped at 5
 *   - +5  when `calculateLinkDensity` is below 0.3
 *   - +10 for a `data-main` or `data-content` attribute
 *   - +10 when the `role` attribute contains `main`
 *
 * @param element The element to score.
 * @returns The accumulated score.
 */
function calculateScore(element) {
    let total = 0;
    const reasons = [];
    // Adds points and records why they were given.
    const award = (points, reason) => {
        total += points;
        reasons.push(reason);
    };

    // High impact attributes
    for (const marker of ['article', 'content', 'main-container', 'main', 'main-content']) {
        if (element.classList.contains(marker) || element.id.includes(marker)) {
            award(10, `High impact attribute found: ${marker}, score increased by 10`);
        }
    }

    // High impact tags
    if (['article', 'main', 'section'].includes(element.tagName.toLowerCase())) {
        award(5, `High impact tag found: ${element.tagName}, score increased by 5`);
    }

    // Paragraph count
    const paragraphCount = element.getElementsByTagName('p').length;
    const paragraphPoints = Math.min(paragraphCount, 5);
    if (paragraphPoints > 0) {
        award(paragraphPoints, `Paragraph count: ${paragraphCount}, score increased by ${paragraphPoints}`);
    }

    // Text content length
    const textLength = element.textContent?.trim().length || 0;
    if (textLength > 200) {
        const textPoints = Math.min(Math.floor(textLength / 200), 5);
        award(textPoints, `Text content length: ${textLength}, score increased by ${textPoints}`);
    }

    // Link density
    const density = calculateLinkDensity(element);
    if (density < 0.3) {
        award(5, `Link density: ${density.toFixed(2)}, score increased by 5`);
    }

    // Data attributes
    if (element.hasAttribute('data-main') || element.hasAttribute('data-content')) {
        award(10, 'Data attribute for main content found, score increased by 10');
    }

    // Role attribute
    if (element.getAttribute('role')?.includes('main')) {
        award(10, 'Role attribute indicating main content found, score increased by 10');
    }

    if (reasons.length > 0) {
        debugMessage(`Scoring for ${elementToString(element)}:`);
    }
    return total;
}
|
|
1926
|
+
/**
 * Computes the ratio of anchor text length to total text length.
 *
 * @param {Element} element Element whose descendant <a> elements are measured.
 * @returns {number} Summed `textContent` length of all descendant anchors
 *   divided by the element's own `textContent` length (a missing or empty
 *   text counts as length 1 so the division is always defined).
 */
function calculateLinkDensity(element) {
    let anchorChars = 0;
    for (const anchor of Array.from(element.getElementsByTagName('a'))) {
        anchorChars += anchor.textContent?.length || 0;
    }
    // Fall back to 1 so an empty element never divides by zero.
    const totalChars = element.textContent?.length || 1;
    return anchorChars / totalChars;
}
|
|
1932
|
+
/**
 * Reports whether an element is visible according to its own computed style.
 *
 * @param {*} element Node to test.
 * @returns {boolean} `true` for anything that is not an `HTMLElement`;
 *   otherwise `false` when the computed `display` is 'none', `visibility`
 *   is 'hidden', or `opacity` is the string '0'.
 */
function isElementVisible(element) {
    if (element instanceof HTMLElement) {
        const computed = window.getComputedStyle(element);
        const hidden = computed.display === 'none'
            || computed.visibility === 'hidden'
            || computed.opacity === '0';
        return !hidden;
    }
    // Non-HTMLElements are considered visible
    return true;
}
|
|
1939
|
+
/**
 * Concatenates the text of `element`, skipping any element for which
 * `isElementVisible` returns false (together with its whole subtree).
 *
 * @param {Node} element Root node to read text from.
 * @returns {string} The collected text, trimmed; '' when the root itself is
 *   not visible.
 */
function getVisibleText(element) {
    if (!isElementVisible(element)) {
        return '';
    }
    let collected = '';
    for (const child of Array.from(element.childNodes)) {
        switch (child.nodeType) {
            case ElementNode_1._Node.TEXT_NODE:
                collected += child.textContent;
                break;
            case ElementNode_1._Node.ELEMENT_NODE:
                // Recurse so hidden descendants are filtered out as well.
                collected += getVisibleText(child);
                break;
            // Any other node type contributes nothing.
        }
    }
    return collected.trim();
}
|
|
1811
|
-
function markdownContentASTToString(nodes, options, indentLevel = 0) {
|
|
1812
|
-
let markdownString = "";
|
|
1813
|
-
nodes.forEach((node) => {
|
|
1814
|
-
const indent = " ".repeat(indentLevel * 2); // Adjust the multiplier for different indent sizes
|
|
1815
|
-
const nodeRenderingOverride = options?.overrideNodeRenderer?.(
|
|
1816
|
-
node,
|
|
1817
|
-
options,
|
|
1818
|
-
indentLevel
|
|
1819
|
-
);
|
|
1820
|
-
if (nodeRenderingOverride) {
|
|
1821
|
-
markdownString += nodeRenderingOverride;
|
|
1822
|
-
} else {
|
|
1823
|
-
switch (node.type) {
|
|
1824
|
-
case "text":
|
|
1825
|
-
case "bold":
|
|
1826
|
-
case "italic":
|
|
1827
|
-
case "strikethrough":
|
|
1828
|
-
case "link":
|
|
1829
|
-
let content = node.content; // might be a nodes array but we take care of that below
|
|
1830
|
-
if (Array.isArray(node.content)) {
|
|
1831
|
-
content = markdownContentASTToString(
|
|
1832
|
-
node.content,
|
|
1833
|
-
options,
|
|
1834
|
-
indentLevel
|
|
1835
|
-
);
|
|
1836
|
-
}
|
|
1837
|
-
const isMarkdownStringNotEmpty = markdownString.length > 0;
|
|
1838
|
-
const isFirstCharOfContentWhitespace = /\s/.test(
|
|
1839
|
-
content.slice(0, 1)
|
|
1840
|
-
);
|
|
1841
|
-
const isLastCharOfMarkdownWhitespace = /\s/.test(
|
|
1842
|
-
markdownString.slice(-1)
|
|
1843
|
-
);
|
|
1844
|
-
const isContentPunctuation =
|
|
1845
|
-
content.length === 1 && /^[.,!?;:]/.test(content);
|
|
1846
|
-
if (
|
|
1847
|
-
isMarkdownStringNotEmpty &&
|
|
1848
|
-
!isContentPunctuation &&
|
|
1849
|
-
!isFirstCharOfContentWhitespace &&
|
|
1850
|
-
!isLastCharOfMarkdownWhitespace
|
|
1851
|
-
) {
|
|
1852
|
-
markdownString += " ";
|
|
1853
|
-
}
|
|
1854
|
-
if (node.type === "text") {
|
|
1855
|
-
markdownString += `${indent}${content}`;
|
|
1856
|
-
} else {
|
|
1857
|
-
if (node.type === "bold") {
|
|
1858
|
-
markdownString += `**${content}**`;
|
|
1859
|
-
} else if (node.type === "italic") {
|
|
1860
|
-
markdownString += `*${content}*`;
|
|
1861
|
-
} else if (node.type === "strikethrough") {
|
|
1862
|
-
markdownString += `~~${content}~~`;
|
|
1863
|
-
} else if (node.type === "link") {
|
|
1864
|
-
// check if the link contains only text
|
|
1865
|
-
if (
|
|
1866
|
-
node.content.length === 1 &&
|
|
1867
|
-
node.content[0].type === "text"
|
|
1868
|
-
) {
|
|
1869
|
-
// use native markdown syntax for text-only links
|
|
1870
|
-
markdownString += `[${content}](${encodeURI(node.href)})`;
|
|
1871
|
-
} else {
|
|
1872
|
-
// Use HTML <a> tag for links with rich content
|
|
1873
|
-
markdownString += `<a href="${node.href}">${content}</a>`;
|
|
1874
|
-
}
|
|
1875
|
-
}
|
|
1876
|
-
}
|
|
1877
|
-
break;
|
|
1878
|
-
case "heading":
|
|
1879
|
-
const isEndsWithNewLine = markdownString.slice(-1) === "\n";
|
|
1880
|
-
if (!isEndsWithNewLine) {
|
|
1881
|
-
markdownString += "\n";
|
|
1882
|
-
}
|
|
1883
|
-
markdownString += `${"#".repeat(node.level)} ${node.content}\n\n`;
|
|
1884
|
-
break;
|
|
1885
|
-
case "image":
|
|
1886
|
-
if (!node.alt?.trim() || !!node.src?.trim()) {
|
|
1887
|
-
markdownString += ``;
|
|
1888
|
-
}
|
|
1889
|
-
break;
|
|
1890
|
-
case "list":
|
|
1891
|
-
node.items.forEach((item, i) => {
|
|
1892
|
-
const listItemPrefix = node.ordered ? `${i + 1}.` : "-";
|
|
1893
|
-
const contents = markdownContentASTToString(
|
|
1894
|
-
item.content,
|
|
1895
|
-
options,
|
|
1896
|
-
indentLevel + 1
|
|
1897
|
-
).trim();
|
|
1898
|
-
if (markdownString.slice(-1) !== "\n") {
|
|
1899
|
-
markdownString += "\n";
|
|
1900
|
-
}
|
|
1901
|
-
if (contents) {
|
|
1902
|
-
markdownString += `${indent}${listItemPrefix} ${contents}\n`;
|
|
1903
|
-
}
|
|
1904
|
-
});
|
|
1905
|
-
markdownString += "\n";
|
|
1906
|
-
break;
|
|
1907
|
-
case "video":
|
|
1908
|
-
markdownString += `\n\n`;
|
|
1909
|
-
if (node.poster) {
|
|
1910
|
-
markdownString += `\n`;
|
|
1911
|
-
}
|
|
1912
|
-
if (node.controls) {
|
|
1913
|
-
markdownString += `Controls: ${node.controls}\n`;
|
|
1914
|
-
}
|
|
1915
|
-
markdownString += "\n";
|
|
1916
|
-
break;
|
|
1917
|
-
case "table":
|
|
1918
|
-
const maxColumns = Math.max(
|
|
1919
|
-
...node.rows.map((row) =>
|
|
1920
|
-
row.cells.reduce((sum, cell) => sum + (cell.colspan || 1), 0)
|
|
1921
|
-
)
|
|
1922
|
-
);
|
|
1923
|
-
node.rows.forEach((row) => {
|
|
1924
|
-
let currentColumn = 0;
|
|
1925
|
-
row.cells.forEach((cell) => {
|
|
1926
|
-
let cellContent =
|
|
1927
|
-
typeof cell.content === "string"
|
|
1928
|
-
? cell.content
|
|
1929
|
-
: markdownContentASTToString(
|
|
1930
|
-
cell.content,
|
|
1931
|
-
options,
|
|
1932
|
-
indentLevel + 1
|
|
1933
|
-
).trim();
|
|
1934
|
-
if (cell.colId) {
|
|
1935
|
-
cellContent += ` <!-- ${cell.colId} -->`;
|
|
1936
|
-
}
|
|
1937
|
-
if (cell.colspan && cell.colspan > 1) {
|
|
1938
|
-
cellContent += ` <!-- colspan: ${cell.colspan} -->`;
|
|
1939
|
-
}
|
|
1940
|
-
if (cell.rowspan && cell.rowspan > 1) {
|
|
1941
|
-
cellContent += ` <!-- rowspan: ${cell.rowspan} -->`;
|
|
1942
|
-
}
|
|
1943
|
-
markdownString += `| ${cellContent} `;
|
|
1944
|
-
currentColumn += cell.colspan || 1;
|
|
1945
|
-
// Add empty cells for colspan
|
|
1946
|
-
for (let i = 1; i < (cell.colspan || 1); i++) {
|
|
1947
|
-
markdownString += "| ";
|
|
1948
|
-
}
|
|
1949
|
-
});
|
|
1950
|
-
// Fill remaining columns with empty cells
|
|
1951
|
-
while (currentColumn < maxColumns) {
|
|
1952
|
-
markdownString += "| ";
|
|
1953
|
-
currentColumn++;
|
|
1954
|
-
}
|
|
1955
|
-
markdownString += "|\n";
|
|
1956
|
-
});
|
|
1957
|
-
markdownString += "\n";
|
|
1958
|
-
break;
|
|
1959
|
-
case "code":
|
|
1960
|
-
if (node.inline) {
|
|
1961
|
-
const isLsatWhitespace = /\s/.test(markdownString.slice(-1));
|
|
1962
|
-
if (!isLsatWhitespace) {
|
|
1963
|
-
markdownString += " ";
|
|
1964
|
-
}
|
|
1965
|
-
markdownString += `\`${node.content}\``;
|
|
1966
|
-
} else {
|
|
1967
|
-
// For code blocks, we do not escape characters and preserve formatting
|
|
1968
|
-
markdownString += "\n```" + (node.language ?? "") + "\n";
|
|
1969
|
-
markdownString += `${node.content}\n`;
|
|
1970
|
-
markdownString += "```\n\n";
|
|
1971
|
-
}
|
|
1972
|
-
break;
|
|
1973
|
-
case "blockquote":
|
|
1974
|
-
markdownString += `> ${markdownContentASTToString(
|
|
1975
|
-
node.content,
|
|
1976
|
-
options
|
|
1977
|
-
).trim()}\n\n`;
|
|
1978
|
-
break;
|
|
1979
|
-
case "meta":
|
|
1980
|
-
// already handled
|
|
1981
|
-
break;
|
|
1982
|
-
case "semanticHtml":
|
|
1983
|
-
switch (node.htmlType) {
|
|
1984
|
-
case "article":
|
|
1985
|
-
markdownString +=
|
|
1986
|
-
"\n\n" + markdownContentASTToString(node.content, options);
|
|
1987
|
-
break;
|
|
1988
|
-
case "summary":
|
|
1989
|
-
case "time":
|
|
1990
|
-
case "aside":
|
|
1991
|
-
case "nav":
|
|
1992
|
-
case "figcaption":
|
|
1993
|
-
case "main":
|
|
1994
|
-
case "mark":
|
|
1995
|
-
case "header":
|
|
1996
|
-
case "footer":
|
|
1997
|
-
case "details":
|
|
1998
|
-
case "figure":
|
|
1999
|
-
markdownString +=
|
|
2000
|
-
`\n\n<-${node.htmlType}->\n` +
|
|
2001
|
-
markdownContentASTToString(node.content, options) +
|
|
2002
|
-
`\n\n</-${node.htmlType}->\n`;
|
|
2003
|
-
break;
|
|
2004
|
-
case "section":
|
|
2005
|
-
markdownString += "---\n\n";
|
|
2006
|
-
markdownString += markdownContentASTToString(
|
|
2007
|
-
node.content,
|
|
2008
|
-
options
|
|
2009
|
-
);
|
|
2010
|
-
markdownString += "\n\n";
|
|
2011
|
-
markdownString += "---\n\n";
|
|
2012
|
-
break;
|
|
2013
|
-
}
|
|
2014
|
-
break;
|
|
2015
|
-
case "custom":
|
|
2016
|
-
const customNodeRendering = options?.renderCustomNode?.(
|
|
2017
|
-
node,
|
|
2018
|
-
options,
|
|
2019
|
-
indentLevel
|
|
2020
|
-
);
|
|
2021
|
-
if (customNodeRendering) {
|
|
2022
|
-
markdownString += customNodeRendering;
|
|
2023
|
-
}
|
|
2024
|
-
break;
|
|
2025
|
-
}
|
|
2026
|
-
}
|
|
2027
|
-
});
|
|
2028
|
-
return markdownString;
|
|
2029
|
-
}
|
|
2030
|
-
return markdownASTToString;
|
|
2031
|
-
}
|
|
2032
1954
|
|
|
2033
|
-
|
|
1955
|
+
var urlUtils = {};
|
|
2034
1956
|
|
|
2035
|
-
|
|
2036
|
-
|
|
2037
|
-
|
|
2038
|
-
|
|
2039
|
-
|
|
2040
|
-
|
|
2041
|
-
|
|
2042
|
-
/**
|
|
2043
|
-
* Attempts to find the main content of a web page.
|
|
2044
|
-
* @param document The Document object to search.
|
|
2045
|
-
* @returns The Element containing the main content, or the body if no main content is found.
|
|
2046
|
-
*/
|
|
2047
|
-
function findMainContent(document) {
|
|
2048
|
-
const mainElement = document.querySelector("main");
|
|
2049
|
-
if (mainElement) {
|
|
2050
|
-
return mainElement;
|
|
2051
|
-
}
|
|
2052
|
-
if (!document.body) {
|
|
2053
|
-
return document.documentElement;
|
|
2054
|
-
}
|
|
2055
|
-
return detectMainContent(document.body);
|
|
2056
|
-
}
|
|
2057
|
-
function wrapMainContent(mainContentElement, document) {
|
|
2058
|
-
if (mainContentElement.tagName.toLowerCase() !== "main") {
|
|
2059
|
-
const mainElement = document.createElement("main");
|
|
2060
|
-
mainContentElement.before(mainElement);
|
|
2061
|
-
mainElement.appendChild(mainContentElement);
|
|
2062
|
-
mainElement.id = "detected-main-content";
|
|
2063
|
-
}
|
|
2064
|
-
}
|
|
2065
|
-
function detectMainContent(rootElement) {
|
|
2066
|
-
const candidates = [];
|
|
2067
|
-
const minScore = 20;
|
|
2068
|
-
collectCandidates(rootElement, candidates, minScore);
|
|
2069
|
-
if (candidates.length === 0) {
|
|
2070
|
-
return rootElement;
|
|
2071
|
-
}
|
|
2072
|
-
candidates.sort((a, b) => calculateScore(b) - calculateScore(a));
|
|
2073
|
-
let bestIndependentCandidate = candidates[0];
|
|
2074
|
-
for (let i = 1; i < candidates.length; i++) {
|
|
2075
|
-
if (
|
|
2076
|
-
!candidates.some(
|
|
2077
|
-
(otherCandidate, j) =>
|
|
2078
|
-
j !== i && otherCandidate.contains(candidates[i])
|
|
2079
|
-
)
|
|
2080
|
-
) {
|
|
2081
|
-
if (
|
|
2082
|
-
calculateScore(candidates[i]) >
|
|
2083
|
-
calculateScore(bestIndependentCandidate)
|
|
2084
|
-
) {
|
|
2085
|
-
bestIndependentCandidate = candidates[i];
|
|
2086
|
-
debugMessage(
|
|
2087
|
-
`New best independent candidate found: ${elementToString(
|
|
2088
|
-
bestIndependentCandidate
|
|
2089
|
-
)}`
|
|
2090
|
-
);
|
|
2091
|
-
}
|
|
2092
|
-
}
|
|
2093
|
-
}
|
|
2094
|
-
debugMessage(
|
|
2095
|
-
`Final main content candidate: ${elementToString(
|
|
2096
|
-
bestIndependentCandidate
|
|
2097
|
-
)}`
|
|
2098
|
-
);
|
|
2099
|
-
return bestIndependentCandidate;
|
|
2100
|
-
}
|
|
2101
|
-
function elementToString(element) {
|
|
2102
|
-
if (!element) {
|
|
2103
|
-
return "No element";
|
|
2104
|
-
}
|
|
2105
|
-
return `${element.tagName}#${element.id || "no-id"}.${Array.from(
|
|
2106
|
-
element.classList
|
|
2107
|
-
).join(".")}`;
|
|
2108
|
-
}
|
|
2109
|
-
function collectCandidates(element, candidates, minScore) {
|
|
2110
|
-
const score = calculateScore(element);
|
|
2111
|
-
if (score >= minScore) {
|
|
2112
|
-
candidates.push(element);
|
|
2113
|
-
debugMessage(
|
|
2114
|
-
`Candidate found: ${elementToString(element)}, score: ${score}`
|
|
2115
|
-
);
|
|
2116
|
-
}
|
|
2117
|
-
Array.from(element.children).forEach((child) => {
|
|
2118
|
-
collectCandidates(child, candidates, minScore);
|
|
2119
|
-
});
|
|
2120
|
-
}
|
|
2121
|
-
function calculateScore(element) {
|
|
2122
|
-
let score = 0;
|
|
2123
|
-
let scoreLog = [];
|
|
2124
|
-
// High impact attributes
|
|
2125
|
-
const highImpactAttributes = [
|
|
2126
|
-
"article",
|
|
2127
|
-
"content",
|
|
2128
|
-
"main-container",
|
|
2129
|
-
"main",
|
|
2130
|
-
"main-content",
|
|
1957
|
+
Object.defineProperty(urlUtils, "__esModule", { value: true });
|
|
1958
|
+
urlUtils.refifyUrls = refifyUrls;
|
|
1959
|
+
const mediaSuffixes = ["jpeg", "jpg", "png", "gif", "bmp", "tiff", "tif", "svg",
|
|
1960
|
+
"webp", "ico", "avi", "mov", "mp4", "mkv", "flv", "wmv", "webm", "mpeg",
|
|
1961
|
+
"mpg", "mp3", "wav", "aac", "ogg", "flac", "m4a", "pdf", "doc", "docx",
|
|
1962
|
+
"ppt", "pptx", "xls", "xlsx", "txt", "css", "js", "xml", "json",
|
|
1963
|
+
"html", "htm"
|
|
2131
1964
|
];
|
|
2132
|
-
|
|
2133
|
-
|
|
2134
|
-
|
|
2135
|
-
|
|
2136
|
-
|
|
2137
|
-
|
|
2138
|
-
|
|
2139
|
-
|
|
2140
|
-
|
|
2141
|
-
|
|
2142
|
-
|
|
2143
|
-
|
|
2144
|
-
|
|
2145
|
-
|
|
2146
|
-
|
|
2147
|
-
|
|
2148
|
-
|
|
2149
|
-
|
|
2150
|
-
|
|
2151
|
-
|
|
2152
|
-
|
|
2153
|
-
|
|
2154
|
-
|
|
2155
|
-
|
|
2156
|
-
|
|
2157
|
-
|
|
2158
|
-
|
|
2159
|
-
|
|
2160
|
-
|
|
2161
|
-
|
|
2162
|
-
|
|
2163
|
-
|
|
2164
|
-
|
|
2165
|
-
|
|
2166
|
-
|
|
2167
|
-
|
|
2168
|
-
|
|
2169
|
-
|
|
2170
|
-
|
|
2171
|
-
|
|
2172
|
-
|
|
2173
|
-
|
|
2174
|
-
|
|
2175
|
-
|
|
2176
|
-
|
|
2177
|
-
|
|
2178
|
-
|
|
2179
|
-
|
|
2180
|
-
|
|
2181
|
-
|
|
2182
|
-
|
|
2183
|
-
|
|
2184
|
-
|
|
2185
|
-
|
|
2186
|
-
|
|
2187
|
-
scoreLog.push(
|
|
2188
|
-
"Role attribute indicating main content found, score increased by 10"
|
|
2189
|
-
);
|
|
2190
|
-
}
|
|
2191
|
-
if (scoreLog.length > 0) {
|
|
2192
|
-
debugMessage(`Scoring for ${elementToString(element)}:`);
|
|
2193
|
-
}
|
|
2194
|
-
return score;
|
|
2195
|
-
}
|
|
2196
|
-
function calculateLinkDensity(element) {
|
|
2197
|
-
const linkLength = Array.from(element.getElementsByTagName("a")).reduce(
|
|
2198
|
-
(sum, link) => sum + (link.textContent?.length || 0),
|
|
2199
|
-
0
|
|
2200
|
-
);
|
|
2201
|
-
const textLength = element.textContent?.length || 1; // Avoid division by zero
|
|
2202
|
-
return linkLength / textLength;
|
|
2203
|
-
}
|
|
2204
|
-
function isElementVisible(element) {
|
|
2205
|
-
if (!(element instanceof HTMLElement)) {
|
|
2206
|
-
return true; // Non-HTMLElements are considered visible
|
|
2207
|
-
}
|
|
2208
|
-
const style = window.getComputedStyle(element);
|
|
2209
|
-
return (
|
|
2210
|
-
style.display !== "none" &&
|
|
2211
|
-
style.visibility !== "hidden" &&
|
|
2212
|
-
style.opacity !== "0"
|
|
2213
|
-
);
|
|
2214
|
-
}
|
|
2215
|
-
function getVisibleText(element) {
|
|
2216
|
-
if (!isElementVisible(element)) {
|
|
2217
|
-
return "";
|
|
2218
|
-
}
|
|
2219
|
-
let text = "";
|
|
2220
|
-
for (const child of Array.from(element.childNodes)) {
|
|
2221
|
-
if (child.nodeType === ElementNode_1._Node.TEXT_NODE) {
|
|
2222
|
-
text += child.textContent;
|
|
2223
|
-
} else if (child.nodeType === ElementNode_1._Node.ELEMENT_NODE) {
|
|
2224
|
-
text += getVisibleText(child);
|
|
2225
|
-
}
|
|
1965
|
+
/**
 * Returns the short reference registered for `prefix`, registering a new one
 * (`ref0`, `ref1`, ... numbered by the current number of entries) on first use.
 *
 * @param {string} prefix Key to look up (a URL or URL prefix).
 * @param {Object<string, string>} prefixesToRefs Prefix-to-ref map; mutated
 *   when `prefix` has no entry yet.
 * @returns {string} The reference associated with `prefix`.
 */
const addRefPrefix = (prefix, prefixesToRefs) => {
    const known = prefixesToRefs[prefix];
    if (known) {
        return known;
    }
    const created = `ref${Object.values(prefixesToRefs).length}`;
    prefixesToRefs[prefix] = created;
    return created;
};
|
|
1971
|
+
/**
 * Shortens an absolute URL by replacing it, or its directory part, with a
 * reference registered in `prefixesToRefs`.
 *
 *  - Values that are not strings, or do not start with "http", are returned
 *    unchanged. (The non-string guard avoids a TypeError for nodes that carry
 *    no `href`/`src`.)
 *  - When the text after the last "." is one of `mediaSuffixes`, everything
 *    before the last "/" is registered as a prefix and the result is
 *    `<ref>://<last path segment>`.
 *  - Otherwise, a URL that splits into more than 4 "/"-separated parts is
 *    registered whole and replaced by its bare reference; shorter URLs are
 *    returned unchanged.
 *
 * @param {*} url URL to shorten.
 * @param {Object<string, string>} prefixesToRefs Prefix-to-ref map; may be mutated.
 * @returns {*} The shortened URL, or `url` itself when nothing applies.
 */
const processUrl = (url, prefixesToRefs) => {
    if (typeof url !== 'string' || !url.startsWith('http')) {
        return url;
    }
    const mediaSuffix = url.split('.').slice(-1)[0];
    if (mediaSuffix && mediaSuffixes.includes(mediaSuffix)) {
        const parts = url.split('/'); // Split URL keeping the slash before text
        const prefix = parts.slice(0, -1).join('/'); // Get the prefix by removing last part
        const refPrefix = addRefPrefix(prefix, prefixesToRefs);
        return `${refPrefix}://${parts.slice(-1).join('')}`;
    }
    if (url.split('/').length > 4) {
        return addRefPrefix(url, prefixesToRefs);
    }
    return url;
};
|
|
1993
|
+
/**
 * Rewrites the URLs of link, image and video nodes in place (via
 * `processUrl`) throughout a Markdown AST node or array of nodes.
 *
 * Recurses into link content, list item content, non-string table cell
 * content, and blockquote / semanticHtml content; other node types are left
 * untouched.
 *
 * @param {Object|Object[]} markdownElement Node or array of nodes; mutated.
 * @param {Object<string, string>} [prefixesToRefs={}] Prefix-to-ref map
 *   shared across the whole traversal.
 * @returns {Object<string, string>} The same `prefixesToRefs` object.
 */
function refifyUrls(markdownElement, prefixesToRefs = {}) {
    if (Array.isArray(markdownElement)) {
        markdownElement.forEach(child => {
            refifyUrls(child, prefixesToRefs);
        });
        return prefixesToRefs;
    }
    const kind = markdownElement.type;
    if (kind === 'link') {
        markdownElement.href = processUrl(markdownElement.href, prefixesToRefs);
        refifyUrls(markdownElement.content, prefixesToRefs);
    }
    else if (kind === 'image' || kind === 'video') {
        markdownElement.src = processUrl(markdownElement.src, prefixesToRefs);
    }
    else if (kind === 'list') {
        markdownElement.items.forEach(item => {
            item.content.forEach(entry => {
                refifyUrls(entry, prefixesToRefs);
            });
        });
    }
    else if (kind === 'table') {
        markdownElement.rows.forEach(row => {
            row.cells.forEach(cell => {
                // Plain-string cells carry no nested nodes to rewrite.
                if (typeof cell.content !== 'string') {
                    refifyUrls(cell.content, prefixesToRefs);
                }
            });
        });
    }
    else if (kind === 'blockquote' || kind === 'semanticHtml') {
        refifyUrls(markdownElement.content, prefixesToRefs);
    }
    return prefixesToRefs;
}
|
|
2227
|
-
return text.trim();
|
|
2228
|
-
}
|
|
2229
2021
|
|
|
2230
|
-
|
|
2022
|
+
var astUtils = {};
|
|
2231
2023
|
|
|
2232
|
-
|
|
2233
|
-
|
|
2234
|
-
|
|
2235
|
-
|
|
2236
|
-
|
|
2237
|
-
|
|
2238
|
-
|
|
2239
|
-
|
|
2240
|
-
|
|
2241
|
-
|
|
2242
|
-
|
|
2243
|
-
|
|
2244
|
-
|
|
2245
|
-
|
|
2246
|
-
|
|
2247
|
-
|
|
2248
|
-
|
|
2249
|
-
|
|
2250
|
-
|
|
2251
|
-
|
|
2252
|
-
|
|
2253
|
-
|
|
2254
|
-
|
|
2255
|
-
|
|
2256
|
-
|
|
2257
|
-
|
|
2258
|
-
|
|
2259
|
-
|
|
2260
|
-
|
|
2261
|
-
|
|
2262
|
-
|
|
2263
|
-
|
|
2264
|
-
|
|
2265
|
-
|
|
2266
|
-
|
|
2267
|
-
|
|
2268
|
-
|
|
2269
|
-
|
|
2270
|
-
|
|
2271
|
-
|
|
2272
|
-
|
|
2273
|
-
|
|
2274
|
-
|
|
2275
|
-
|
|
2276
|
-
|
|
2277
|
-
|
|
2278
|
-
|
|
2279
|
-
|
|
2280
|
-
|
|
2281
|
-
|
|
2282
|
-
|
|
2283
|
-
|
|
2284
|
-
|
|
2285
|
-
|
|
2286
|
-
|
|
2287
|
-
|
|
2288
|
-
|
|
2289
|
-
|
|
2290
|
-
|
|
2291
|
-
|
|
2292
|
-
|
|
2293
|
-
|
|
2294
|
-
|
|
2295
|
-
|
|
2296
|
-
|
|
2297
|
-
|
|
2298
|
-
|
|
2299
|
-
|
|
2300
|
-
|
|
2301
|
-
|
|
2302
|
-
|
|
2303
|
-
|
|
2304
|
-
|
|
2305
|
-
|
|
2306
|
-
|
|
2307
|
-
|
|
2308
|
-
|
|
2309
|
-
|
|
2310
|
-
|
|
2311
|
-
|
|
2312
|
-
|
|
2313
|
-
|
|
2314
|
-
|
|
2315
|
-
|
|
2316
|
-
|
|
2317
|
-
|
|
2318
|
-
|
|
2319
|
-
|
|
2320
|
-
|
|
2321
|
-
|
|
2322
|
-
|
|
2323
|
-
|
|
2324
|
-
typeof cell.content === "string"
|
|
2325
|
-
? null
|
|
2326
|
-
: refifyUrls(cell.content, prefixesToRefs)
|
|
2327
|
-
)
|
|
2328
|
-
);
|
|
2329
|
-
break;
|
|
2330
|
-
case "blockquote":
|
|
2331
|
-
case "semanticHtml":
|
|
2332
|
-
refifyUrls(markdownElement.content, prefixesToRefs);
|
|
2333
|
-
break;
|
|
2334
|
-
}
|
|
2335
|
-
}
|
|
2336
|
-
return prefixesToRefs;
|
|
2337
|
-
}
|
|
2024
|
+
(function (exports) {
|
|
2025
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
2026
|
+
exports.isNot = exports.getMainContent = void 0;
|
|
2027
|
+
exports.findInAST = findInAST;
|
|
2028
|
+
exports.findAllInAST = findAllInAST;
|
|
2029
|
+
/**
 * Extracts the main content from a rendered Markdown string.
 *
 * If the string contains a `<-main->` marker, returns the text between the
 * first `<-main->` and the next `</-main->` ('' when there is no closing
 * marker). Otherwise returns the string with every `<-nav->`, `<-footer->`,
 * `<-header->` and `<-aside->` section (markers included) removed.
 *
 * @param {string} markdownStr Markdown produced with semantic markers.
 * @returns {string} The main-content portion of the Markdown.
 */
const getMainContent = (markdownStr) => {
    if (!markdownStr.includes('<-main->')) {
        const removeSectionsRegex = /(<-nav->[\s\S]*?<\/-nav->)|(<-footer->[\s\S]*?<\/-footer->)|(<-header->[\s\S]*?<\/-header->)|(<-aside->[\s\S]*?<\/-aside->)/g;
        return markdownStr.replace(removeSectionsRegex, '');
    }
    const mainBody = markdownStr.match(/(?<=<-main->)[\s\S]*?(?=<\/-main->)/);
    return mainBody === null ? '' : mainBody[0];
};
|
|
2040
|
+
exports.getMainContent = getMainContent;
|
|
2041
|
+
/**
 * Negates a predicate.
 *
 * @param {function(*): *} tPred Predicate to invert.
 * @returns {function(*): boolean} Function returning `!tPred(t)`.
 */
const isNot = (tPred) => {
    return (t) => !tPred(t);
};
|
|
2042
|
+
exports.isNot = isNot;
|
|
2043
|
+
/**
 * Tests for a primitive string.
 *
 * @param {*} x Value to test.
 * @returns {boolean} `true` when `typeof x` is "string".
 */
const isString = (x) => {
    return typeof x === "string";
};
|
|
2044
|
+
/**
 * Depth-first search for the first AST node accepted by `checker`.
 *
 * A node is tested before its children. Children are searched for link,
 * list, table (non-string cell content only), blockquote and semanticHtml
 * nodes; all other node types are treated as leaves.
 *
 * @param {Object|Object[]} markdownElement Node or array of nodes to search.
 * @param {function(Object): *} checker Predicate; a truthy result selects the node.
 * @returns {Object|undefined} The first matching node, or `undefined`.
 */
function findInAST(markdownElement, checker) {
    // Returns the first hit among `nodes` (each searched recursively).
    const firstMatchIn = (nodes) => {
        for (const candidate of nodes) {
            const hit = findInAST(candidate, checker);
            if (hit) {
                return hit;
            }
        }
        return undefined;
    };
    if (Array.isArray(markdownElement)) {
        return firstMatchIn(markdownElement);
    }
    if (checker(markdownElement)) {
        return markdownElement;
    }
    const kind = markdownElement.type;
    if (kind === 'link' || kind === 'blockquote' || kind === 'semanticHtml') {
        return firstMatchIn(markdownElement.content);
    }
    if (kind === 'list') {
        return firstMatchIn(markdownElement.items.flatMap(item => item.content));
    }
    if (kind === 'table') {
        // One entry per non-string cell; each entry is that cell's node array.
        const cellContents = markdownElement.rows.flatMap(row => row.cells
            .map(cell => cell.content)
            .filter((0, exports.isNot)(isString)));
        return firstMatchIn(cellContents);
    }
    return undefined;
}
|
|
2080
|
+
/**
 * Collects, in depth-first document order, every AST node accepted by
 * `checker`.
 *
 * A node that matches is returned as-is and its children are NOT searched,
 * so matches nested inside another match are not reported. Children are
 * searched for link, list, table (non-string cell content only), blockquote
 * and semanticHtml nodes; all other node types are treated as leaves.
 *
 * @param {Object|Object[]} markdownElement Node or array of nodes to search.
 * @param {function(Object): *} checker Predicate; a truthy result selects the node.
 * @returns {Object[]} All matching nodes (empty when there are none).
 */
function findAllInAST(markdownElement, checker) {
    // Appends into one accumulator instead of re-spreading it on every
    // iteration, which copied the growing array each time.
    const loopCheck = (z) => {
        const out = [];
        for (const element of z) {
            for (const found of findAllInAST(element, checker)) {
                out.push(found);
            }
        }
        return out;
    };
    if (Array.isArray(markdownElement)) {
        return loopCheck(markdownElement);
    }
    if (checker(markdownElement)) {
        return [markdownElement];
    }
    switch (markdownElement.type) {
        case 'link':
        case 'blockquote':
        case 'semanticHtml':
            return loopCheck(markdownElement.content);
        case 'list':
            return loopCheck(markdownElement.items
                .map(_ => _.content)
                .flat());
        case 'table':
            return loopCheck(markdownElement.rows
                .map(row => row.cells.map(_ => _.content)
                .filter((0, exports.isNot)(isString)))
                .flat());
    }
    return [];
}
|
|
2115
|
+
} (astUtils));
|
|
2338
2116
|
|
|
2339
|
-
|
|
2117
|
+
var hasRequiredNode;
|
|
2340
2118
|
|
|
2341
|
-
|
|
2342
|
-
|
|
2343
|
-
|
|
2344
|
-
|
|
2345
|
-
|
|
2346
|
-
|
|
2347
|
-
|
|
2348
|
-
|
|
2349
|
-
|
|
2350
|
-
|
|
2351
|
-
|
|
2352
|
-
|
|
2353
|
-
|
|
2354
|
-
|
|
2355
|
-
|
|
2356
|
-
|
|
2357
|
-
|
|
2358
|
-
|
|
2359
|
-
|
|
2360
|
-
|
|
2361
|
-
|
|
2362
|
-
|
|
2363
|
-
|
|
2364
|
-
|
|
2365
|
-
|
|
2366
|
-
|
|
2367
|
-
|
|
2368
|
-
|
|
2369
|
-
|
|
2370
|
-
|
|
2371
|
-
|
|
2372
|
-
|
|
2373
|
-
|
|
2374
|
-
|
|
2375
|
-
|
|
2376
|
-
|
|
2377
|
-
|
|
2378
|
-
|
|
2379
|
-
|
|
2380
|
-
|
|
2381
|
-
|
|
2382
|
-
|
|
2383
|
-
|
|
2384
|
-
|
|
2385
|
-
|
|
2386
|
-
|
|
2387
|
-
|
|
2388
|
-
|
|
2389
|
-
|
|
2390
|
-
|
|
2391
|
-
|
|
2392
|
-
|
|
2393
|
-
|
|
2394
|
-
|
|
2395
|
-
|
|
2396
|
-
|
|
2397
|
-
|
|
2398
|
-
|
|
2399
|
-
|
|
2119
|
+
function requireNode () {
|
|
2120
|
+
if (hasRequiredNode) return node;
|
|
2121
|
+
hasRequiredNode = 1;
|
|
2122
|
+
(function (exports) {
|
|
2123
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
2124
|
+
exports.wrapMainContent = exports.refifyUrls = exports.findMainContent = exports.markdownASTToString = exports.htmlToMarkdownAST = void 0;
|
|
2125
|
+
exports.convertHtmlToMarkdown = convertHtmlToMarkdown;
|
|
2126
|
+
exports.convertElementToMarkdown = convertElementToMarkdown;
|
|
2127
|
+
exports.findInMarkdownAST = findInMarkdownAST;
|
|
2128
|
+
exports.findAllInMarkdownAST = findAllInMarkdownAST;
|
|
2129
|
+
const htmlToMarkdownAST_1 = htmlToMarkdownAST$1;
|
|
2130
|
+
Object.defineProperty(exports, "htmlToMarkdownAST", { enumerable: true, get: function () { return htmlToMarkdownAST_1.htmlToMarkdownAST; } });
|
|
2131
|
+
const markdownASTToString_1 = requireMarkdownASTToString();
|
|
2132
|
+
Object.defineProperty(exports, "markdownASTToString", { enumerable: true, get: function () { return markdownASTToString_1.markdownASTToString; } });
|
|
2133
|
+
const domUtils_1 = domUtils;
|
|
2134
|
+
Object.defineProperty(exports, "findMainContent", { enumerable: true, get: function () { return domUtils_1.findMainContent; } });
|
|
2135
|
+
Object.defineProperty(exports, "wrapMainContent", { enumerable: true, get: function () { return domUtils_1.wrapMainContent; } });
|
|
2136
|
+
const urlUtils_1 = urlUtils;
|
|
2137
|
+
Object.defineProperty(exports, "refifyUrls", { enumerable: true, get: function () { return urlUtils_1.refifyUrls; } });
|
|
2138
|
+
const astUtils_1 = astUtils;
|
|
2139
|
+
/**
 * Parses an HTML string and converts the resulting DOM to Markdown.
 *
 * Root selection:
 *  - with `options.extractMainContent`, the element returned by
 *    `findMainContent`; if `options.includeMetaData` is also set, the
 *    document has a non-empty <head>, and that element contains no <head>,
 *    the head is re-attached by re-parsing head + element together;
 *  - otherwise the document element when `options.includeMetaData` is set and
 *    the <head> is non-empty, else the body (or the document element when
 *    there is no body).
 *
 * @param html The HTML string to convert.
 * @param options Conversion options; `overrideDOMParser` supplies the parser
 *   when no global `DOMParser` exists.
 * @returns The converted Markdown string.
 * @throws {Error} When neither `options.overrideDOMParser` nor a global
 *   `DOMParser` is available.
 */
function convertHtmlToMarkdown(html, options) {
    let parser = options?.overrideDOMParser;
    if (parser === null || parser === undefined) {
        parser = typeof DOMParser !== 'undefined' ? new DOMParser() : null;
    }
    if (!parser) {
        throw new Error('DOMParser is not available. Please provide an overrideDOMParser in options.');
    }
    const doc = parser.parseFromString(html, 'text/html');
    const headHasContent = () => !!doc.querySelector('head')?.innerHTML;
    let element;
    if (options?.extractMainContent) {
        element = (0, domUtils_1.findMainContent)(doc);
        const mustReattachHead = options.includeMetaData && headHasContent() && !element.querySelector('head');
        if (mustReattachHead) {
            // content container was found and extracted, re-attaching the head for meta-data extraction
            const combined = `<html>${doc.head.outerHTML}${element.outerHTML}`;
            element = parser.parseFromString(combined, 'text/html').documentElement;
        }
    }
    else if (options?.includeMetaData && headHasContent()) {
        element = doc.documentElement;
    }
    else {
        // If there's a body, use it; otherwise, use the document element
        element = doc.body || doc.documentElement;
    }
    return convertElementToMarkdown(element, options);
}
|
|
2170
|
+
/**
 * Converts a DOM element to Markdown: builds the Markdown AST, optionally
 * shortens its URLs, then renders the AST to a string.
 *
 * When `options.refifyUrls` is truthy, the AST's URLs are rewritten in place
 * and the resulting prefix-to-ref map is stored on `options.urlMap` before
 * rendering.
 *
 * @param element The HTML Element to convert.
 * @param options Conversion options; mutated (`urlMap`) when `refifyUrls` is set.
 * @returns The converted Markdown string.
 */
function convertElementToMarkdown(element, options) {
    const ast = (0, htmlToMarkdownAST_1.htmlToMarkdownAST)(element, options);
    const wantsRefs = options?.refifyUrls;
    if (wantsRefs) {
        const urlMap = (0, urlUtils_1.refifyUrls)(ast);
        options.urlMap = urlMap;
    }
    return (0, markdownASTToString_1.markdownASTToString)(ast, options);
}
|
|
2183
|
+
/**
 * Looks up the first node in a Markdown AST that satisfies `predicate`
 * (delegates to `findInAST`).
 *
 * @param ast The Markdown AST (node or array of nodes) to search.
 * @param predicate Function returning a truthy value for the wanted node.
 * @returns The first matching node, or undefined if there is none.
 */
function findInMarkdownAST(ast, predicate) {
    const { findInAST } = astUtils_1;
    return findInAST(ast, predicate);
}
|
|
2192
|
+
/**
 * Collects every node in a Markdown AST that satisfies `predicate`
 * (delegates to `findAllInAST`).
 *
 * @param ast The Markdown AST (node or array of nodes) to search.
 * @param predicate Function returning a truthy value for wanted nodes.
 * @returns An array of all matching nodes.
 */
function findAllInMarkdownAST(ast, predicate) {
    const { findAllInAST } = astUtils_1;
    return findAllInAST(ast, predicate);
}
|
|
2201
|
+
} (node));
|
|
2202
|
+
return node;
|
|
2400
2203
|
}
|
|
2401
|
-
function findAllInAST(markdownElement, checker) {
|
|
2402
|
-
const loopCheck = (z) => {
|
|
2403
|
-
let out = [];
|
|
2404
|
-
for (const element of z) {
|
|
2405
|
-
const found = findAllInAST(element, checker);
|
|
2406
|
-
out = [...out, ...found];
|
|
2407
|
-
}
|
|
2408
|
-
return out;
|
|
2409
|
-
};
|
|
2410
|
-
if (Array.isArray(markdownElement)) {
|
|
2411
|
-
return loopCheck(markdownElement);
|
|
2412
|
-
} else {
|
|
2413
|
-
if (checker(markdownElement)) {
|
|
2414
|
-
return [markdownElement];
|
|
2415
|
-
}
|
|
2416
|
-
switch (markdownElement.type) {
|
|
2417
|
-
case "link":
|
|
2418
|
-
return loopCheck(markdownElement.content);
|
|
2419
|
-
case "list":
|
|
2420
|
-
return loopCheck(
|
|
2421
|
-
markdownElement.items.map((_) => _.content).flat()
|
|
2422
|
-
);
|
|
2423
|
-
case "table":
|
|
2424
|
-
return loopCheck(
|
|
2425
|
-
markdownElement.rows
|
|
2426
|
-
.map((row) =>
|
|
2427
|
-
row.cells
|
|
2428
|
-
.map((_) => _.content)
|
|
2429
|
-
.filter((0, exports.isNot)(isString))
|
|
2430
|
-
)
|
|
2431
|
-
.flat()
|
|
2432
|
-
);
|
|
2433
|
-
case "blockquote":
|
|
2434
|
-
case "semanticHtml":
|
|
2435
|
-
return loopCheck(markdownElement.content);
|
|
2436
|
-
}
|
|
2437
|
-
return [];
|
|
2438
|
-
}
|
|
2439
|
-
}
|
|
2440
|
-
})(astUtils);
|
|
2441
|
-
|
|
2442
|
-
var hasRequiredNode;
|
|
2443
2204
|
|
|
2444
|
-
|
|
2445
|
-
if (hasRequiredNode) return node;
|
|
2446
|
-
hasRequiredNode = 1;
|
|
2447
|
-
(function (exports) {
|
|
2448
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
2449
|
-
exports.wrapMainContent =
|
|
2450
|
-
exports.refifyUrls =
|
|
2451
|
-
exports.findMainContent =
|
|
2452
|
-
exports.markdownASTToString =
|
|
2453
|
-
exports.htmlToMarkdownAST =
|
|
2454
|
-
void 0;
|
|
2455
|
-
exports.convertHtmlToMarkdown = convertHtmlToMarkdown;
|
|
2456
|
-
exports.convertElementToMarkdown = convertElementToMarkdown;
|
|
2457
|
-
exports.findInMarkdownAST = findInMarkdownAST;
|
|
2458
|
-
exports.findAllInMarkdownAST = findAllInMarkdownAST;
|
|
2459
|
-
const htmlToMarkdownAST_1 = htmlToMarkdownAST$1;
|
|
2460
|
-
Object.defineProperty(exports, "htmlToMarkdownAST", {
|
|
2461
|
-
enumerable: true,
|
|
2462
|
-
get: function () {
|
|
2463
|
-
return htmlToMarkdownAST_1.htmlToMarkdownAST;
|
|
2464
|
-
},
|
|
2465
|
-
});
|
|
2466
|
-
const markdownASTToString_1 = requireMarkdownASTToString();
|
|
2467
|
-
Object.defineProperty(exports, "markdownASTToString", {
|
|
2468
|
-
enumerable: true,
|
|
2469
|
-
get: function () {
|
|
2470
|
-
return markdownASTToString_1.markdownASTToString;
|
|
2471
|
-
},
|
|
2472
|
-
});
|
|
2473
|
-
const domUtils_1 = domUtils;
|
|
2474
|
-
Object.defineProperty(exports, "findMainContent", {
|
|
2475
|
-
enumerable: true,
|
|
2476
|
-
get: function () {
|
|
2477
|
-
return domUtils_1.findMainContent;
|
|
2478
|
-
},
|
|
2479
|
-
});
|
|
2480
|
-
Object.defineProperty(exports, "wrapMainContent", {
|
|
2481
|
-
enumerable: true,
|
|
2482
|
-
get: function () {
|
|
2483
|
-
return domUtils_1.wrapMainContent;
|
|
2484
|
-
},
|
|
2485
|
-
});
|
|
2486
|
-
const urlUtils_1 = urlUtils;
|
|
2487
|
-
Object.defineProperty(exports, "refifyUrls", {
|
|
2488
|
-
enumerable: true,
|
|
2489
|
-
get: function () {
|
|
2490
|
-
return urlUtils_1.refifyUrls;
|
|
2491
|
-
},
|
|
2492
|
-
});
|
|
2493
|
-
const astUtils_1 = astUtils;
|
|
2494
|
-
/**
|
|
2495
|
-
* Converts an HTML string to Markdown.
|
|
2496
|
-
* @param html The HTML string to convert.
|
|
2497
|
-
* @param options Conversion options.
|
|
2498
|
-
* @returns The converted Markdown string.
|
|
2499
|
-
*/
|
|
2500
|
-
function convertHtmlToMarkdown(html, options) {
|
|
2501
|
-
const parser =
|
|
2502
|
-
options?.overrideDOMParser ??
|
|
2503
|
-
(typeof DOMParser !== "undefined" ? new DOMParser() : null);
|
|
2504
|
-
if (!parser) {
|
|
2505
|
-
throw new Error(
|
|
2506
|
-
"DOMParser is not available. Please provide an overrideDOMParser in options."
|
|
2507
|
-
);
|
|
2508
|
-
}
|
|
2509
|
-
const doc = parser.parseFromString(html, "text/html");
|
|
2510
|
-
let element;
|
|
2511
|
-
if (options?.extractMainContent) {
|
|
2512
|
-
element = (0, domUtils_1.findMainContent)(doc);
|
|
2513
|
-
if (
|
|
2514
|
-
options.includeMetaData &&
|
|
2515
|
-
!!doc.querySelector("head")?.innerHTML &&
|
|
2516
|
-
!element.querySelector("head")
|
|
2517
|
-
) {
|
|
2518
|
-
// content container was found and extracted, re-attaching the head for meta-data extraction
|
|
2519
|
-
element = parser.parseFromString(
|
|
2520
|
-
`<html>${doc.head.outerHTML}${element.outerHTML}`,
|
|
2521
|
-
"text/html"
|
|
2522
|
-
).documentElement;
|
|
2523
|
-
}
|
|
2524
|
-
} else {
|
|
2525
|
-
// If there's a body, use it; otherwise, use the document element
|
|
2526
|
-
if (
|
|
2527
|
-
options?.includeMetaData &&
|
|
2528
|
-
!!doc.querySelector("head")?.innerHTML
|
|
2529
|
-
) {
|
|
2530
|
-
element = doc.documentElement;
|
|
2531
|
-
} else {
|
|
2532
|
-
element = doc.body || doc.documentElement;
|
|
2533
|
-
}
|
|
2534
|
-
}
|
|
2535
|
-
return convertElementToMarkdown(element, options);
|
|
2536
|
-
}
|
|
2537
|
-
/**
|
|
2538
|
-
* Converts an HTML Element to Markdown.
|
|
2539
|
-
* @param element The HTML Element to convert.
|
|
2540
|
-
* @param options Conversion options.
|
|
2541
|
-
* @returns The converted Markdown string.
|
|
2542
|
-
*/
|
|
2543
|
-
function convertElementToMarkdown(element, options) {
|
|
2544
|
-
let ast = (0, htmlToMarkdownAST_1.htmlToMarkdownAST)(element, options);
|
|
2545
|
-
if (options?.refifyUrls) {
|
|
2546
|
-
options.urlMap = (0, urlUtils_1.refifyUrls)(ast);
|
|
2547
|
-
}
|
|
2548
|
-
return (0, markdownASTToString_1.markdownASTToString)(ast, options);
|
|
2549
|
-
}
|
|
2550
|
-
/**
|
|
2551
|
-
* Finds a node in the Markdown AST that matches the given predicate.
|
|
2552
|
-
* @param ast The Markdown AST to search.
|
|
2553
|
-
* @param predicate A function that returns true for the desired node.
|
|
2554
|
-
* @returns The first matching node, or undefined if not found.
|
|
2555
|
-
*/
|
|
2556
|
-
function findInMarkdownAST(ast, predicate) {
|
|
2557
|
-
return (0, astUtils_1.findInAST)(ast, predicate);
|
|
2558
|
-
}
|
|
2559
|
-
/**
|
|
2560
|
-
* Finds all nodes in the Markdown AST that match the given predicate.
|
|
2561
|
-
* @param ast The Markdown AST to search.
|
|
2562
|
-
* @param predicate A function that returns true for the desired nodes.
|
|
2563
|
-
* @returns An array of all matching nodes.
|
|
2564
|
-
*/
|
|
2565
|
-
function findAllInMarkdownAST(ast, predicate) {
|
|
2566
|
-
return (0, astUtils_1.findAllInAST)(ast, predicate);
|
|
2567
|
-
}
|
|
2568
|
-
})(node);
|
|
2569
|
-
return node;
|
|
2570
|
-
}
|
|
2205
|
+
var nodeExports = requireNode();
|
|
2571
2206
|
|
|
2572
|
-
|
|
2207
|
+
//@ts-ignore
|
|
2208
|
+
window.__INTUNED__ = {
|
|
2209
|
+
matchStringsWithDomContent,
|
|
2210
|
+
convertElementToMarkdown,
|
|
2211
|
+
convertHtmlStringToSemanticMarkdown: nodeExports.convertHtmlToMarkdown,
|
|
2212
|
+
};
|
|
2573
2213
|
|
|
2574
|
-
//@ts-ignore
|
|
2575
|
-
window.__INTUNED__ = {
|
|
2576
|
-
matchStringsWithDomContent,
|
|
2577
|
-
convertElementToMarkdown,
|
|
2578
|
-
convertHtmlStringToSemanticMarkdown: nodeExports.convertHtmlToMarkdown,
|
|
2579
|
-
};
|
|
2580
2214
|
})();
|