node-html-parser 4.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintignore +3 -0
- package/.eslintrc.json +226 -0
- package/.mocharc.yaml +1 -0
- package/.prettierrc +7 -0
- package/LICENSE +7 -0
- package/README.md +255 -0
- package/dist/back.d.ts +1 -0
- package/dist/back.js +6 -0
- package/dist/esm/back.js +3 -0
- package/dist/esm/index.js +7 -0
- package/dist/esm/matcher.js +101 -0
- package/dist/esm/nodes/comment.js +23 -0
- package/dist/esm/nodes/html.js +1048 -0
- package/dist/esm/nodes/node.js +25 -0
- package/dist/esm/nodes/text.js +95 -0
- package/dist/esm/nodes/type.js +7 -0
- package/dist/esm/parse.js +1 -0
- package/dist/esm/valid.js +9 -0
- package/dist/index.d.ts +7 -0
- package/dist/index.js +21 -0
- package/dist/main.js +1542 -0
- package/dist/matcher.d.ts +6 -0
- package/dist/matcher.js +106 -0
- package/dist/nodes/comment.d.ts +18 -0
- package/dist/nodes/comment.js +51 -0
- package/dist/nodes/html.d.ts +206 -0
- package/dist/nodes/html.js +1188 -0
- package/dist/nodes/node.d.ts +18 -0
- package/dist/nodes/node.js +38 -0
- package/dist/nodes/text.d.ts +42 -0
- package/dist/nodes/text.js +139 -0
- package/dist/nodes/type.d.ts +6 -0
- package/dist/nodes/type.js +9 -0
- package/dist/parse.d.ts +1 -0
- package/dist/parse.js +5 -0
- package/dist/valid.d.ts +6 -0
- package/dist/valid.js +13 -0
- package/package.json +88 -0
|
@@ -0,0 +1,1048 @@
|
|
|
1
|
+
import he from 'he';
|
|
2
|
+
import { selectAll, selectOne } from 'css-select';
|
|
3
|
+
import Node from './node';
|
|
4
|
+
import NodeType from './type';
|
|
5
|
+
import TextNode from './text';
|
|
6
|
+
import Matcher from '../matcher';
|
|
7
|
+
import arr_back from '../back';
|
|
8
|
+
import CommentNode from './comment';
|
|
9
|
+
/**
 * Decode the HTML entities contained in a string.
 *
 * The decoded value is round-tripped through JSON, which the original code
 * labels "clone string": the caller receives a freshly built string.
 * @param {string} val text that may contain HTML entities
 * @return {string} decoded text
 */
function decode(val) {
    const decoded = he.decode(val);
    const serialized = JSON.stringify(decoded);
    return JSON.parse(serialized);
}
|
|
13
|
+
// Tag names that structuredText treats as block-level elements.
// https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
const Htags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup'];
const Dtags = ['details', 'dialog', 'dd', 'div', 'dt'];
const Ftags = ['fieldset', 'figcaption', 'figure', 'footer', 'form'];
const tableTags = ['table', 'td', 'tr'];
const htmlTags = ['address', 'article', 'aside', 'blockquote', 'br', 'hr', 'li', 'main', 'nav', 'ol', 'p', 'pre', 'section', 'ul'];
const kBlockElements = new Set();
/**
 * Register every tag of the given lists in kBlockElements, once as written
 * and once upper-cased.
 * @param {...string[]} args lists of tag names
 */
function addToKBlockElement(...args) {
    for (const tags of args) {
        for (const tag of tags) {
            kBlockElements.add(tag);
            kBlockElements.add(tag.toUpperCase());
        }
    }
}
addToKBlockElement(Htags, Dtags, Ftags, tableTags, htmlTags);
|
|
32
|
+
/**
 * Small stand-in for the DOM's DOMTokenList: an insertion-ordered set of
 * tokens that invokes a callback after it has been modified.
 */
class DOMTokenList {
    /**
     * @param {Iterable<string>} [valuesInit] initial tokens
     * @param {Function} [afterUpdate] invoked with this list after a change
     */
    constructor(valuesInit = [], afterUpdate = () => null) {
        this._set = new Set(valuesInit);
        this._afterUpdate = afterUpdate;
    }
    /**
     * Throw when the token contains any whitespace character.
     * @param {string} c token to check
     */
    _validate(c) {
        const hasWhitespace = /\s/.test(c);
        if (!hasWhitespace) {
            return;
        }
        throw new Error(`DOMException in DOMTokenList.add: The token '${c}' contains HTML space characters, which are not valid in tokens.`);
    }
    /** Add a token and notify the callback. */
    add(c) {
        this._validate(c);
        const tokens = this._set;
        tokens.add(c);
        this._afterUpdate(this);
    }
    /** Drop c1 (if present), add c2 and notify the callback. */
    replace(c1, c2) {
        this._validate(c2);
        const tokens = this._set;
        tokens.delete(c1);
        tokens.add(c2);
        this._afterUpdate(this);
    }
    /** Remove a token; the callback only runs when the token was present. */
    remove(c) {
        const wasPresent = this._set.delete(c);
        if (wasPresent) {
            this._afterUpdate(this);
        }
    }
    /** Remove the token when present, add it otherwise; always notifies. */
    toggle(c) {
        this._validate(c);
        const tokens = this._set;
        // Set#delete reports whether the token existed, so a failed delete means "add".
        if (!tokens.delete(c)) {
            tokens.add(c);
        }
        this._afterUpdate(this);
    }
    /** @return {boolean} whether the token is in the list */
    contains(c) {
        return this._set.has(c);
    }
    /** Number of tokens in the list. */
    get length() {
        return this._set.size;
    }
    /** @return {Iterator<string>} iterator over the tokens in insertion order */
    values() {
        return this._set.values();
    }
    /** Tokens as a new array, in insertion order. */
    get value() {
        return [...this._set.values()];
    }
    /** @return {string} tokens joined by a single space */
    toString() {
        return [...this._set.values()].join(' ');
    }
}
|
|
80
|
+
/**
|
|
81
|
+
* HTMLElement, which contains a set of children.
|
|
82
|
+
*
|
|
83
|
+
* Note: this is a minimalist implementation; only a subset of the DOM
* tree API (such as parentNode and nextSibling) is provided.
|
|
86
|
+
* @class HTMLElement
|
|
87
|
+
* @extends {Node}
|
|
88
|
+
*/
|
|
89
|
+
export default class HTMLElement extends Node {
|
|
90
|
+
/**
|
|
91
|
+
* Creates an instance of HTMLElement.
|
|
92
|
+
* @param keyAttrs id and class attribute
|
|
93
|
+
* @param [rawAttrs] attributes in string
|
|
94
|
+
*
|
|
95
|
+
* @memberof HTMLElement
|
|
96
|
+
*/
|
|
97
|
+
constructor(tagName, keyAttrs, rawAttrs = '', parentNode, range) {
|
|
98
|
+
super(parentNode, range);
|
|
99
|
+
this.rawAttrs = rawAttrs;
|
|
100
|
+
/**
|
|
101
|
+
* Node Type declaration.
|
|
102
|
+
*/
|
|
103
|
+
this.nodeType = NodeType.ELEMENT_NODE;
|
|
104
|
+
this.rawTagName = tagName;
|
|
105
|
+
this.rawAttrs = rawAttrs || '';
|
|
106
|
+
this.id = keyAttrs.id || '';
|
|
107
|
+
this.childNodes = [];
|
|
108
|
+
this.classList = new DOMTokenList(keyAttrs.class ? keyAttrs.class.split(/\s+/) : [], (classList) => this.setAttribute('class', classList.toString()) // eslint-disable-line @typescript-eslint/no-unsafe-member-access, @typescript-eslint/no-unsafe-call
|
|
109
|
+
);
|
|
110
|
+
if (keyAttrs.id) {
|
|
111
|
+
if (!rawAttrs) {
|
|
112
|
+
this.rawAttrs = `id="${keyAttrs.id}"`;
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
if (keyAttrs.class) {
|
|
116
|
+
if (!rawAttrs) {
|
|
117
|
+
const cls = `class="${this.classList.toString()}"`;
|
|
118
|
+
if (this.rawAttrs) {
|
|
119
|
+
this.rawAttrs += ` ${cls}`;
|
|
120
|
+
}
|
|
121
|
+
else {
|
|
122
|
+
this.rawAttrs = cls;
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
/**
|
|
128
|
+
* Quote attribute values
|
|
129
|
+
* @param attr attribute value
|
|
130
|
+
* @returns {string} quoted value
|
|
131
|
+
*/
|
|
132
|
+
quoteAttribute(attr) {
|
|
133
|
+
if (attr === null) {
|
|
134
|
+
return 'null';
|
|
135
|
+
}
|
|
136
|
+
return JSON.stringify(attr.replace(/"/g, '"'));
|
|
137
|
+
}
|
|
138
|
+
/**
|
|
139
|
+
* Remove current element
|
|
140
|
+
*/
|
|
141
|
+
remove() {
|
|
142
|
+
if (this.parentNode) {
|
|
143
|
+
const children = this.parentNode.childNodes;
|
|
144
|
+
this.parentNode.childNodes = children.filter((child) => {
|
|
145
|
+
return this !== child;
|
|
146
|
+
});
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
/**
|
|
150
|
+
* Remove Child element from childNodes array
|
|
151
|
+
* @param {HTMLElement} node node to remove
|
|
152
|
+
*/
|
|
153
|
+
removeChild(node) {
|
|
154
|
+
this.childNodes = this.childNodes.filter((child) => {
|
|
155
|
+
return child !== node;
|
|
156
|
+
});
|
|
157
|
+
}
|
|
158
|
+
/**
|
|
159
|
+
* Exchanges given child with new child
|
|
160
|
+
* @param {HTMLElement} oldNode node to exchange
|
|
161
|
+
* @param {HTMLElement} newNode new node
|
|
162
|
+
*/
|
|
163
|
+
exchangeChild(oldNode, newNode) {
|
|
164
|
+
const children = this.childNodes;
|
|
165
|
+
this.childNodes = children.map((child) => {
|
|
166
|
+
if (child === oldNode) {
|
|
167
|
+
return newNode;
|
|
168
|
+
}
|
|
169
|
+
return child;
|
|
170
|
+
});
|
|
171
|
+
}
|
|
172
|
+
get tagName() {
|
|
173
|
+
return this.rawTagName ? this.rawTagName.toUpperCase() : this.rawTagName;
|
|
174
|
+
}
|
|
175
|
+
get localName() {
|
|
176
|
+
return this.rawTagName.toLowerCase();
|
|
177
|
+
}
|
|
178
|
+
/**
|
|
179
|
+
* Get escpaed (as-it) text value of current node and its children.
|
|
180
|
+
* @return {string} text content
|
|
181
|
+
*/
|
|
182
|
+
get rawText() {
|
|
183
|
+
return this.childNodes.reduce((pre, cur) => {
|
|
184
|
+
return (pre += cur.rawText);
|
|
185
|
+
}, '');
|
|
186
|
+
}
|
|
187
|
+
get textContent() {
|
|
188
|
+
return decode(this.rawText);
|
|
189
|
+
}
|
|
190
|
+
set textContent(val) {
|
|
191
|
+
const content = [new TextNode(val, this)];
|
|
192
|
+
this.childNodes = content;
|
|
193
|
+
}
|
|
194
|
+
/**
|
|
195
|
+
* Get unescaped text value of current node and its children.
|
|
196
|
+
* @return {string} text content
|
|
197
|
+
*/
|
|
198
|
+
get text() {
|
|
199
|
+
return decode(this.rawText);
|
|
200
|
+
}
|
|
201
|
+
/**
|
|
202
|
+
* Get structured Text (with '\n' etc.)
|
|
203
|
+
* @return {string} structured text
|
|
204
|
+
*/
|
|
205
|
+
get structuredText() {
|
|
206
|
+
let currentBlock = [];
|
|
207
|
+
const blocks = [currentBlock];
|
|
208
|
+
function dfs(node) {
|
|
209
|
+
if (node.nodeType === NodeType.ELEMENT_NODE) {
|
|
210
|
+
if (kBlockElements.has(node.rawTagName)) {
|
|
211
|
+
if (currentBlock.length > 0) {
|
|
212
|
+
blocks.push((currentBlock = []));
|
|
213
|
+
}
|
|
214
|
+
node.childNodes.forEach(dfs);
|
|
215
|
+
if (currentBlock.length > 0) {
|
|
216
|
+
blocks.push((currentBlock = []));
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
else {
|
|
220
|
+
node.childNodes.forEach(dfs);
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
else if (node.nodeType === NodeType.TEXT_NODE) {
|
|
224
|
+
if (node.isWhitespace) {
|
|
225
|
+
// Whitespace node, postponed output
|
|
226
|
+
currentBlock.prependWhitespace = true;
|
|
227
|
+
}
|
|
228
|
+
else {
|
|
229
|
+
let text = node.trimmedText;
|
|
230
|
+
if (currentBlock.prependWhitespace) {
|
|
231
|
+
text = ` ${text}`;
|
|
232
|
+
currentBlock.prependWhitespace = false;
|
|
233
|
+
}
|
|
234
|
+
currentBlock.push(text);
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
}
|
|
238
|
+
dfs(this);
|
|
239
|
+
return blocks
|
|
240
|
+
.map((block) => {
|
|
241
|
+
return block.join('').replace(/\s{2,}/g, ' '); // Normalize each line's whitespace
|
|
242
|
+
})
|
|
243
|
+
.join('\n')
|
|
244
|
+
.replace(/\s+$/, ''); // trimRight;
|
|
245
|
+
}
|
|
246
|
+
toString() {
|
|
247
|
+
const tag = this.rawTagName;
|
|
248
|
+
if (tag) {
|
|
249
|
+
// const void_tags = new Set('area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr'.split('|'));
|
|
250
|
+
// const is_void = void_tags.has(tag);
|
|
251
|
+
const is_void = /^(area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)$/i.test(tag);
|
|
252
|
+
const attrs = this.rawAttrs ? ` ${this.rawAttrs}` : '';
|
|
253
|
+
if (is_void) {
|
|
254
|
+
return `<${tag}${attrs}>`;
|
|
255
|
+
}
|
|
256
|
+
return `<${tag}${attrs}>${this.innerHTML}</${tag}>`;
|
|
257
|
+
}
|
|
258
|
+
return this.innerHTML;
|
|
259
|
+
}
|
|
260
|
+
get innerHTML() {
|
|
261
|
+
return this.childNodes
|
|
262
|
+
.map((child) => {
|
|
263
|
+
return child.toString();
|
|
264
|
+
})
|
|
265
|
+
.join('');
|
|
266
|
+
}
|
|
267
|
+
set innerHTML(content) {
|
|
268
|
+
//const r = parse(content, global.options); // TODO global.options ?
|
|
269
|
+
const r = parse(content);
|
|
270
|
+
this.childNodes = r.childNodes.length ? r.childNodes : [new TextNode(content, this)];
|
|
271
|
+
}
|
|
272
|
+
set_content(content, options = {}) {
|
|
273
|
+
if (content instanceof Node) {
|
|
274
|
+
content = [content];
|
|
275
|
+
}
|
|
276
|
+
else if (typeof content == 'string') {
|
|
277
|
+
const r = parse(content, options);
|
|
278
|
+
content = r.childNodes.length ? r.childNodes : [new TextNode(content, this)];
|
|
279
|
+
}
|
|
280
|
+
this.childNodes = content;
|
|
281
|
+
}
|
|
282
|
+
replaceWith(...nodes) {
|
|
283
|
+
const content = nodes
|
|
284
|
+
.map((node) => {
|
|
285
|
+
if (node instanceof Node) {
|
|
286
|
+
return [node];
|
|
287
|
+
}
|
|
288
|
+
else if (typeof node == 'string') {
|
|
289
|
+
// const r = parse(content, global.options); // TODO global.options ?
|
|
290
|
+
const r = parse(node);
|
|
291
|
+
return r.childNodes.length ? r.childNodes : [new TextNode(node, this)];
|
|
292
|
+
}
|
|
293
|
+
return [];
|
|
294
|
+
})
|
|
295
|
+
.flat();
|
|
296
|
+
const idx = this.parentNode.childNodes.findIndex((child) => {
|
|
297
|
+
return child === this;
|
|
298
|
+
});
|
|
299
|
+
this.parentNode.childNodes = [
|
|
300
|
+
...this.parentNode.childNodes.slice(0, idx),
|
|
301
|
+
...content,
|
|
302
|
+
...this.parentNode.childNodes.slice(idx + 1),
|
|
303
|
+
];
|
|
304
|
+
}
|
|
305
|
+
get outerHTML() {
|
|
306
|
+
return this.toString();
|
|
307
|
+
}
|
|
308
|
+
/**
|
|
309
|
+
* Trim element from right (in block) after seeing pattern in a TextNode.
|
|
310
|
+
* @param {RegExp} pattern pattern to find
|
|
311
|
+
* @return {HTMLElement} reference to current node
|
|
312
|
+
*/
|
|
313
|
+
trimRight(pattern) {
|
|
314
|
+
for (let i = 0; i < this.childNodes.length; i++) {
|
|
315
|
+
const childNode = this.childNodes[i];
|
|
316
|
+
if (childNode.nodeType === NodeType.ELEMENT_NODE) {
|
|
317
|
+
childNode.trimRight(pattern);
|
|
318
|
+
}
|
|
319
|
+
else {
|
|
320
|
+
const index = childNode.rawText.search(pattern);
|
|
321
|
+
if (index > -1) {
|
|
322
|
+
childNode.rawText = childNode.rawText.substr(0, index);
|
|
323
|
+
// trim all following nodes.
|
|
324
|
+
this.childNodes.length = i + 1;
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
}
|
|
328
|
+
return this;
|
|
329
|
+
}
|
|
330
|
+
/**
|
|
331
|
+
* Get DOM structure
|
|
332
|
+
* @return {string} strucutre
|
|
333
|
+
*/
|
|
334
|
+
get structure() {
|
|
335
|
+
const res = [];
|
|
336
|
+
let indention = 0;
|
|
337
|
+
function write(str) {
|
|
338
|
+
res.push(' '.repeat(indention) + str);
|
|
339
|
+
}
|
|
340
|
+
function dfs(node) {
|
|
341
|
+
const idStr = node.id ? `#${node.id}` : '';
|
|
342
|
+
const classStr = node.classList.length ? `.${node.classList.value.join('.')}` : ''; // eslint-disable-line @typescript-eslint/no-unsafe-member-access, @typescript-eslint/no-unsafe-member-access, @typescript-eslint/restrict-template-expressions, @typescript-eslint/no-unsafe-call
|
|
343
|
+
write(`${node.rawTagName}${idStr}${classStr}`);
|
|
344
|
+
indention++;
|
|
345
|
+
node.childNodes.forEach((childNode) => {
|
|
346
|
+
if (childNode.nodeType === NodeType.ELEMENT_NODE) {
|
|
347
|
+
dfs(childNode);
|
|
348
|
+
}
|
|
349
|
+
else if (childNode.nodeType === NodeType.TEXT_NODE) {
|
|
350
|
+
if (!childNode.isWhitespace) {
|
|
351
|
+
write('#text');
|
|
352
|
+
}
|
|
353
|
+
}
|
|
354
|
+
});
|
|
355
|
+
indention--;
|
|
356
|
+
}
|
|
357
|
+
dfs(this);
|
|
358
|
+
return res.join('\n');
|
|
359
|
+
}
|
|
360
|
+
/**
|
|
361
|
+
* Remove whitespaces in this sub tree.
|
|
362
|
+
* @return {HTMLElement} pointer to this
|
|
363
|
+
*/
|
|
364
|
+
removeWhitespace() {
|
|
365
|
+
let o = 0;
|
|
366
|
+
this.childNodes.forEach((node) => {
|
|
367
|
+
if (node.nodeType === NodeType.TEXT_NODE) {
|
|
368
|
+
if (node.isWhitespace) {
|
|
369
|
+
return;
|
|
370
|
+
}
|
|
371
|
+
node.rawText = node.trimmedRawText;
|
|
372
|
+
}
|
|
373
|
+
else if (node.nodeType === NodeType.ELEMENT_NODE) {
|
|
374
|
+
node.removeWhitespace();
|
|
375
|
+
}
|
|
376
|
+
this.childNodes[o++] = node;
|
|
377
|
+
});
|
|
378
|
+
this.childNodes.length = o;
|
|
379
|
+
return this;
|
|
380
|
+
}
|
|
381
|
+
/**
|
|
382
|
+
* Query CSS selector to find matching nodes.
|
|
383
|
+
* @param {string} selector Simplified CSS selector
|
|
384
|
+
* @return {HTMLElement[]} matching elements
|
|
385
|
+
*/
|
|
386
|
+
querySelectorAll(selector) {
|
|
387
|
+
return selectAll(selector, this, {
|
|
388
|
+
xmlMode: true,
|
|
389
|
+
adapter: Matcher,
|
|
390
|
+
});
|
|
391
|
+
// let matcher: Matcher;
|
|
392
|
+
// if (selector instanceof Matcher) {
|
|
393
|
+
// matcher = selector;
|
|
394
|
+
// matcher.reset();
|
|
395
|
+
// } else {
|
|
396
|
+
// if (selector.includes(',')) {
|
|
397
|
+
// const selectors = selector.split(',');
|
|
398
|
+
// return Array.from(selectors.reduce((pre, cur) => {
|
|
399
|
+
// const result = this.querySelectorAll(cur.trim());
|
|
400
|
+
// return result.reduce((p, c) => {
|
|
401
|
+
// return p.add(c);
|
|
402
|
+
// }, pre);
|
|
403
|
+
// }, new Set<HTMLElement>()));
|
|
404
|
+
// }
|
|
405
|
+
// matcher = new Matcher(selector);
|
|
406
|
+
// }
|
|
407
|
+
// interface IStack {
|
|
408
|
+
// 0: Node; // node
|
|
409
|
+
// 1: number; // children
|
|
410
|
+
// 2: boolean; // found flag
|
|
411
|
+
// }
|
|
412
|
+
// const stack = [] as IStack[];
|
|
413
|
+
// return this.childNodes.reduce((res, cur) => {
|
|
414
|
+
// stack.push([cur, 0, false]);
|
|
415
|
+
// while (stack.length) {
|
|
416
|
+
// const state = arr_back(stack); // get last element
|
|
417
|
+
// const el = state[0];
|
|
418
|
+
// if (state[1] === 0) {
|
|
419
|
+
// // Seen for first time.
|
|
420
|
+
// if (el.nodeType !== NodeType.ELEMENT_NODE) {
|
|
421
|
+
// stack.pop();
|
|
422
|
+
// continue;
|
|
423
|
+
// }
|
|
424
|
+
// const html_el = el as HTMLElement;
|
|
425
|
+
// state[2] = matcher.advance(html_el);
|
|
426
|
+
// if (state[2]) {
|
|
427
|
+
// if (matcher.matched) {
|
|
428
|
+
// res.push(html_el);
|
|
429
|
+
// res.push(...(html_el.querySelectorAll(selector)));
|
|
430
|
+
// // no need to go further.
|
|
431
|
+
// matcher.rewind();
|
|
432
|
+
// stack.pop();
|
|
433
|
+
// continue;
|
|
434
|
+
// }
|
|
435
|
+
// }
|
|
436
|
+
// }
|
|
437
|
+
// if (state[1] < el.childNodes.length) {
|
|
438
|
+
// stack.push([el.childNodes[state[1]++], 0, false]);
|
|
439
|
+
// } else {
|
|
440
|
+
// if (state[2]) {
|
|
441
|
+
// matcher.rewind();
|
|
442
|
+
// }
|
|
443
|
+
// stack.pop();
|
|
444
|
+
// }
|
|
445
|
+
// }
|
|
446
|
+
// return res;
|
|
447
|
+
// }, [] as HTMLElement[]);
|
|
448
|
+
}
|
|
449
|
+
/**
|
|
450
|
+
* Query CSS Selector to find matching node.
|
|
451
|
+
* @param {string} selector Simplified CSS selector
|
|
452
|
+
* @return {HTMLElement} matching node
|
|
453
|
+
*/
|
|
454
|
+
querySelector(selector) {
|
|
455
|
+
return selectOne(selector, this, {
|
|
456
|
+
xmlMode: true,
|
|
457
|
+
adapter: Matcher,
|
|
458
|
+
});
|
|
459
|
+
// let matcher: Matcher;
|
|
460
|
+
// if (selector instanceof Matcher) {
|
|
461
|
+
// matcher = selector;
|
|
462
|
+
// matcher.reset();
|
|
463
|
+
// } else {
|
|
464
|
+
// matcher = new Matcher(selector);
|
|
465
|
+
// }
|
|
466
|
+
// const stack = [] as { 0: Node; 1: 0 | 1; 2: boolean }[];
|
|
467
|
+
// for (const node of this.childNodes) {
|
|
468
|
+
// stack.push([node, 0, false]);
|
|
469
|
+
// while (stack.length) {
|
|
470
|
+
// const state = arr_back(stack);
|
|
471
|
+
// const el = state[0];
|
|
472
|
+
// if (state[1] === 0) {
|
|
473
|
+
// // Seen for first time.
|
|
474
|
+
// if (el.nodeType !== NodeType.ELEMENT_NODE) {
|
|
475
|
+
// stack.pop();
|
|
476
|
+
// continue;
|
|
477
|
+
// }
|
|
478
|
+
// state[2] = matcher.advance(el as HTMLElement);
|
|
479
|
+
// if (state[2]) {
|
|
480
|
+
// if (matcher.matched) {
|
|
481
|
+
// return el as HTMLElement;
|
|
482
|
+
// }
|
|
483
|
+
// }
|
|
484
|
+
// }
|
|
485
|
+
// if (state[1] < el.childNodes.length) {
|
|
486
|
+
// stack.push([el.childNodes[state[1]++], 0, false]);
|
|
487
|
+
// } else {
|
|
488
|
+
// if (state[2]) {
|
|
489
|
+
// matcher.rewind();
|
|
490
|
+
// }
|
|
491
|
+
// stack.pop();
|
|
492
|
+
// }
|
|
493
|
+
// }
|
|
494
|
+
// }
|
|
495
|
+
// return null;
|
|
496
|
+
}
|
|
497
|
+
/**
|
|
498
|
+
* traverses the Element and its parents (heading toward the document root) until it finds a node that matches the provided selector string. Will return itself or the matching ancestor. If no such element exists, it returns null.
|
|
499
|
+
* @param selector a DOMString containing a selector list
|
|
500
|
+
*/
|
|
501
|
+
closest(selector) {
|
|
502
|
+
const mapChild = new Map();
|
|
503
|
+
let el = this;
|
|
504
|
+
let old = null;
|
|
505
|
+
function findOne(test, elems) {
|
|
506
|
+
let elem = null;
|
|
507
|
+
for (let i = 0, l = elems.length; i < l && !elem; i++) {
|
|
508
|
+
const el = elems[i];
|
|
509
|
+
if (test(el)) {
|
|
510
|
+
elem = el;
|
|
511
|
+
}
|
|
512
|
+
else {
|
|
513
|
+
const child = mapChild.get(el);
|
|
514
|
+
if (child) {
|
|
515
|
+
elem = findOne(test, [child]);
|
|
516
|
+
}
|
|
517
|
+
}
|
|
518
|
+
}
|
|
519
|
+
return elem;
|
|
520
|
+
}
|
|
521
|
+
while (el) {
|
|
522
|
+
mapChild.set(el, old);
|
|
523
|
+
old = el;
|
|
524
|
+
el = el.parentNode;
|
|
525
|
+
}
|
|
526
|
+
el = this;
|
|
527
|
+
while (el) {
|
|
528
|
+
const e = selectOne(selector, el, {
|
|
529
|
+
xmlMode: true,
|
|
530
|
+
adapter: {
|
|
531
|
+
...Matcher,
|
|
532
|
+
getChildren(node) {
|
|
533
|
+
const child = mapChild.get(node);
|
|
534
|
+
return child && [child];
|
|
535
|
+
},
|
|
536
|
+
getSiblings(node) {
|
|
537
|
+
return [node];
|
|
538
|
+
},
|
|
539
|
+
findOne,
|
|
540
|
+
findAll() {
|
|
541
|
+
return [];
|
|
542
|
+
},
|
|
543
|
+
},
|
|
544
|
+
});
|
|
545
|
+
if (e) {
|
|
546
|
+
return e;
|
|
547
|
+
}
|
|
548
|
+
el = el.parentNode;
|
|
549
|
+
}
|
|
550
|
+
return null;
|
|
551
|
+
}
|
|
552
|
+
/**
|
|
553
|
+
* Append a child node to childNodes
|
|
554
|
+
* @param {Node} node node to append
|
|
555
|
+
* @return {Node} node appended
|
|
556
|
+
*/
|
|
557
|
+
appendChild(node) {
|
|
558
|
+
// node.parentNode = this;
|
|
559
|
+
this.childNodes.push(node);
|
|
560
|
+
node.parentNode = this;
|
|
561
|
+
return node;
|
|
562
|
+
}
|
|
563
|
+
/**
|
|
564
|
+
* Get first child node
|
|
565
|
+
* @return {Node} first child node
|
|
566
|
+
*/
|
|
567
|
+
get firstChild() {
|
|
568
|
+
return this.childNodes[0];
|
|
569
|
+
}
|
|
570
|
+
/**
|
|
571
|
+
* Get last child node
|
|
572
|
+
* @return {Node} last child node
|
|
573
|
+
*/
|
|
574
|
+
get lastChild() {
|
|
575
|
+
return arr_back(this.childNodes);
|
|
576
|
+
}
|
|
577
|
+
/**
|
|
578
|
+
* Get attributes
|
|
579
|
+
* @access private
|
|
580
|
+
* @return {Object} parsed and unescaped attributes
|
|
581
|
+
*/
|
|
582
|
+
get attrs() {
|
|
583
|
+
if (this._attrs) {
|
|
584
|
+
return this._attrs;
|
|
585
|
+
}
|
|
586
|
+
this._attrs = {};
|
|
587
|
+
const attrs = this.rawAttributes;
|
|
588
|
+
for (const key in attrs) {
|
|
589
|
+
const val = attrs[key] || '';
|
|
590
|
+
this._attrs[key.toLowerCase()] = decode(val);
|
|
591
|
+
}
|
|
592
|
+
return this._attrs;
|
|
593
|
+
}
|
|
594
|
+
get attributes() {
|
|
595
|
+
const ret_attrs = {};
|
|
596
|
+
const attrs = this.rawAttributes;
|
|
597
|
+
for (const key in attrs) {
|
|
598
|
+
const val = attrs[key] || '';
|
|
599
|
+
ret_attrs[key] = decode(val);
|
|
600
|
+
}
|
|
601
|
+
return ret_attrs;
|
|
602
|
+
}
|
|
603
|
+
/**
|
|
604
|
+
* Get escaped (as-it) attributes
|
|
605
|
+
* @return {Object} parsed attributes
|
|
606
|
+
*/
|
|
607
|
+
get rawAttributes() {
|
|
608
|
+
if (this._rawAttrs) {
|
|
609
|
+
return this._rawAttrs;
|
|
610
|
+
}
|
|
611
|
+
const attrs = {};
|
|
612
|
+
if (this.rawAttrs) {
|
|
613
|
+
const re = /([a-z()#][a-z0-9-_:()#]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/gi;
|
|
614
|
+
let match;
|
|
615
|
+
while ((match = re.exec(this.rawAttrs))) {
|
|
616
|
+
attrs[match[1]] = match[2] || match[3] || match[4] || null;
|
|
617
|
+
}
|
|
618
|
+
}
|
|
619
|
+
this._rawAttrs = attrs;
|
|
620
|
+
return attrs;
|
|
621
|
+
}
|
|
622
|
+
removeAttribute(key) {
|
|
623
|
+
const attrs = this.rawAttributes;
|
|
624
|
+
delete attrs[key];
|
|
625
|
+
// Update this.attribute
|
|
626
|
+
if (this._attrs) {
|
|
627
|
+
delete this._attrs[key];
|
|
628
|
+
}
|
|
629
|
+
// Update rawString
|
|
630
|
+
this.rawAttrs = Object.keys(attrs)
|
|
631
|
+
.map((name) => {
|
|
632
|
+
const val = JSON.stringify(attrs[name]);
|
|
633
|
+
if (val === undefined || val === 'null') {
|
|
634
|
+
return name;
|
|
635
|
+
}
|
|
636
|
+
return `${name}=${val}`;
|
|
637
|
+
})
|
|
638
|
+
.join(' ');
|
|
639
|
+
// Update this.id
|
|
640
|
+
if (key === 'id') {
|
|
641
|
+
this.id = '';
|
|
642
|
+
}
|
|
643
|
+
}
|
|
644
|
+
hasAttribute(key) {
|
|
645
|
+
return key.toLowerCase() in this.attrs;
|
|
646
|
+
}
|
|
647
|
+
/**
|
|
648
|
+
* Get an attribute
|
|
649
|
+
* @return {string} value of the attribute
|
|
650
|
+
*/
|
|
651
|
+
getAttribute(key) {
|
|
652
|
+
return this.attrs[key.toLowerCase()];
|
|
653
|
+
}
|
|
654
|
+
/**
|
|
655
|
+
* Set an attribute value to the HTMLElement
|
|
656
|
+
* @param {string} key The attribute name
|
|
657
|
+
* @param {string} value The value to set, or null / undefined to remove an attribute
|
|
658
|
+
*/
|
|
659
|
+
setAttribute(key, value) {
|
|
660
|
+
if (arguments.length < 2) {
|
|
661
|
+
throw new Error("Failed to execute 'setAttribute' on 'Element'");
|
|
662
|
+
}
|
|
663
|
+
const k2 = key.toLowerCase();
|
|
664
|
+
const attrs = this.rawAttributes;
|
|
665
|
+
for (const k in attrs) {
|
|
666
|
+
if (k.toLowerCase() === k2) {
|
|
667
|
+
key = k;
|
|
668
|
+
break;
|
|
669
|
+
}
|
|
670
|
+
}
|
|
671
|
+
attrs[key] = String(value);
|
|
672
|
+
// update this.attrs
|
|
673
|
+
if (this._attrs) {
|
|
674
|
+
this._attrs[k2] = decode(attrs[key]);
|
|
675
|
+
}
|
|
676
|
+
// Update rawString
|
|
677
|
+
this.rawAttrs = Object.keys(attrs)
|
|
678
|
+
.map((name) => {
|
|
679
|
+
const val = this.quoteAttribute(attrs[name]);
|
|
680
|
+
if (val === 'null' || val === '""')
|
|
681
|
+
return name;
|
|
682
|
+
return `${name}=${val}`;
|
|
683
|
+
})
|
|
684
|
+
.join(' ');
|
|
685
|
+
// Update this.id
|
|
686
|
+
if (key === 'id') {
|
|
687
|
+
this.id = value;
|
|
688
|
+
}
|
|
689
|
+
}
|
|
690
|
+
/**
|
|
691
|
+
* Replace all the attributes of the HTMLElement by the provided attributes
|
|
692
|
+
* @param {Attributes} attributes the new attribute set
|
|
693
|
+
*/
|
|
694
|
+
setAttributes(attributes) {
|
|
695
|
+
// Invalidate current this.attributes
|
|
696
|
+
if (this._attrs) {
|
|
697
|
+
delete this._attrs;
|
|
698
|
+
}
|
|
699
|
+
// Invalidate current this.rawAttributes
|
|
700
|
+
if (this._rawAttrs) {
|
|
701
|
+
delete this._rawAttrs;
|
|
702
|
+
}
|
|
703
|
+
// Update rawString
|
|
704
|
+
this.rawAttrs = Object.keys(attributes)
|
|
705
|
+
.map((name) => {
|
|
706
|
+
const val = attributes[name];
|
|
707
|
+
if (val === 'null' || val === '""')
|
|
708
|
+
return name;
|
|
709
|
+
return `${name}=${this.quoteAttribute(String(val))}`;
|
|
710
|
+
})
|
|
711
|
+
.join(' ');
|
|
712
|
+
}
|
|
713
|
+
insertAdjacentHTML(where, html) {
|
|
714
|
+
if (arguments.length < 2) {
|
|
715
|
+
throw new Error('2 arguments required');
|
|
716
|
+
}
|
|
717
|
+
const p = parse(html);
|
|
718
|
+
if (where === 'afterend') {
|
|
719
|
+
const idx = this.parentNode.childNodes.findIndex((child) => {
|
|
720
|
+
return child === this;
|
|
721
|
+
});
|
|
722
|
+
this.parentNode.childNodes.splice(idx + 1, 0, ...p.childNodes);
|
|
723
|
+
p.childNodes.forEach((n) => {
|
|
724
|
+
if (n instanceof HTMLElement) {
|
|
725
|
+
n.parentNode = this.parentNode;
|
|
726
|
+
}
|
|
727
|
+
});
|
|
728
|
+
}
|
|
729
|
+
else if (where === 'afterbegin') {
|
|
730
|
+
this.childNodes.unshift(...p.childNodes);
|
|
731
|
+
}
|
|
732
|
+
else if (where === 'beforeend') {
|
|
733
|
+
p.childNodes.forEach((n) => {
|
|
734
|
+
this.appendChild(n);
|
|
735
|
+
});
|
|
736
|
+
}
|
|
737
|
+
else if (where === 'beforebegin') {
|
|
738
|
+
const idx = this.parentNode.childNodes.findIndex((child) => {
|
|
739
|
+
return child === this;
|
|
740
|
+
});
|
|
741
|
+
this.parentNode.childNodes.splice(idx, 0, ...p.childNodes);
|
|
742
|
+
p.childNodes.forEach((n) => {
|
|
743
|
+
if (n instanceof HTMLElement) {
|
|
744
|
+
n.parentNode = this.parentNode;
|
|
745
|
+
}
|
|
746
|
+
});
|
|
747
|
+
}
|
|
748
|
+
else {
|
|
749
|
+
throw new Error(`The value provided ('${where}') is not one of 'beforebegin', 'afterbegin', 'beforeend', or 'afterend'`);
|
|
750
|
+
}
|
|
751
|
+
// if (!where || html === undefined || html === null) {
|
|
752
|
+
// return;
|
|
753
|
+
// }
|
|
754
|
+
}
|
|
755
|
+
get nextSibling() {
|
|
756
|
+
if (this.parentNode) {
|
|
757
|
+
const children = this.parentNode.childNodes;
|
|
758
|
+
let i = 0;
|
|
759
|
+
while (i < children.length) {
|
|
760
|
+
const child = children[i++];
|
|
761
|
+
if (this === child)
|
|
762
|
+
return children[i] || null;
|
|
763
|
+
}
|
|
764
|
+
return null;
|
|
765
|
+
}
|
|
766
|
+
}
|
|
767
|
+
get nextElementSibling() {
|
|
768
|
+
if (this.parentNode) {
|
|
769
|
+
const children = this.parentNode.childNodes;
|
|
770
|
+
let i = 0;
|
|
771
|
+
let find = false;
|
|
772
|
+
while (i < children.length) {
|
|
773
|
+
const child = children[i++];
|
|
774
|
+
if (find) {
|
|
775
|
+
if (child instanceof HTMLElement) {
|
|
776
|
+
return child || null;
|
|
777
|
+
}
|
|
778
|
+
}
|
|
779
|
+
else if (this === child) {
|
|
780
|
+
find = true;
|
|
781
|
+
}
|
|
782
|
+
}
|
|
783
|
+
return null;
|
|
784
|
+
}
|
|
785
|
+
}
|
|
786
|
+
get classNames() {
|
|
787
|
+
return this.classList.toString();
|
|
788
|
+
}
|
|
789
|
+
}
|
|
790
|
+
// https://html.spec.whatwg.org/multipage/custom-elements.html#valid-custom-element-name
|
|
791
|
+
const kMarkupPattern = /<!--[^]*?(?=-->)-->|<(\/?)([a-z][-.:0-9_a-z]*)\s*((?=[/>]*?)|(?:.*?[\s\d/'"])|(?:.*?[\w]))(\/?)>/gi;
|
|
792
|
+
// <(?<tag>[^\s]*)(.*)>(.*)</\k<tag>>
|
|
793
|
+
// <([a-z][-.:0-9_a-z]*)\s*\/>
|
|
794
|
+
// <(area|base|br|col|hr|img|input|link|meta|source)\s*(.*)\/?>
|
|
795
|
+
// <(area|base|br|col|hr|img|input|link|meta|source)\s*(.*)\/?>|<(?<tag>[^\s]*)(.*)>(.*)</\k<tag>>
|
|
796
|
+
const kAttributePattern = /(^|\s)(id|class)\s*=\s*("([^"]*)"|'([^']*)'|(\S+))/gi;
|
|
797
|
+
const kSelfClosingElements = {
|
|
798
|
+
area: true,
|
|
799
|
+
AREA: true,
|
|
800
|
+
base: true,
|
|
801
|
+
BASE: true,
|
|
802
|
+
br: true,
|
|
803
|
+
BR: true,
|
|
804
|
+
col: true,
|
|
805
|
+
COL: true,
|
|
806
|
+
hr: true,
|
|
807
|
+
HR: true,
|
|
808
|
+
img: true,
|
|
809
|
+
IMG: true,
|
|
810
|
+
input: true,
|
|
811
|
+
INPUT: true,
|
|
812
|
+
link: true,
|
|
813
|
+
LINK: true,
|
|
814
|
+
meta: true,
|
|
815
|
+
META: true,
|
|
816
|
+
source: true,
|
|
817
|
+
SOURCE: true,
|
|
818
|
+
embed: true,
|
|
819
|
+
EMBED: true,
|
|
820
|
+
param: true,
|
|
821
|
+
PARAM: true,
|
|
822
|
+
track: true,
|
|
823
|
+
TRACK: true,
|
|
824
|
+
wbr: true,
|
|
825
|
+
WBR: true,
|
|
826
|
+
};
|
|
827
|
+
const kElementsClosedByOpening = {
|
|
828
|
+
li: { li: true, LI: true },
|
|
829
|
+
LI: { li: true, LI: true },
|
|
830
|
+
p: { p: true, div: true, P: true, DIV: true },
|
|
831
|
+
P: { p: true, div: true, P: true, DIV: true },
|
|
832
|
+
b: { div: true, DIV: true },
|
|
833
|
+
B: { div: true, DIV: true },
|
|
834
|
+
td: { td: true, th: true, TD: true, TH: true },
|
|
835
|
+
TD: { td: true, th: true, TD: true, TH: true },
|
|
836
|
+
th: { td: true, th: true, TD: true, TH: true },
|
|
837
|
+
TH: { td: true, th: true, TD: true, TH: true },
|
|
838
|
+
h1: { h1: true, H1: true },
|
|
839
|
+
H1: { h1: true, H1: true },
|
|
840
|
+
h2: { h2: true, H2: true },
|
|
841
|
+
H2: { h2: true, H2: true },
|
|
842
|
+
h3: { h3: true, H3: true },
|
|
843
|
+
H3: { h3: true, H3: true },
|
|
844
|
+
h4: { h4: true, H4: true },
|
|
845
|
+
H4: { h4: true, H4: true },
|
|
846
|
+
h5: { h5: true, H5: true },
|
|
847
|
+
H5: { h5: true, H5: true },
|
|
848
|
+
h6: { h6: true, H6: true },
|
|
849
|
+
H6: { h6: true, H6: true },
|
|
850
|
+
};
|
|
851
|
+
// Implicit-close rules applied when a tag CLOSES: the outer key is the tag
// name of the element that is currently open, the inner keys are the closing
// tags that cause base_parse to close that element as well.
// Both the outer and the inner names are registered in lower and upper case.
const kElementsClosedByClosing = (() => {
    // Builds { name: true, ..., NAME: true, ... } (all lower-case keys first).
    const inBothCases = (names) => {
        const flags = {};
        names.forEach((name) => {
            flags[name] = true;
        });
        names.forEach((name) => {
            flags[name.toUpperCase()] = true;
        });
        return flags;
    };
    const rules = [
        ['li', ['ul', 'ol']],
        ['a', ['div']],
        ['b', ['div']],
        ['i', ['div']],
        ['p', ['div']],
        ['td', ['tr', 'table']],
        ['th', ['tr', 'table']],
    ];
    const table = {};
    for (const [openTag, closedBy] of rules) {
        // Each entry gets its own object, as in a hand-written literal.
        table[openTag] = inBothCases(closedBy);
        table[openTag.toUpperCase()] = inBothCases(closedBy);
    }
    return table;
})();
|
|
867
|
+
// Tag name of the synthetic wrapper element that base_parse puts around the
// input before tokenizing and then skips when it is matched
// (see https://github.com/taoqf/node-html-parser/issues/38).
const frameflag = 'documentfragmentcontainer';
|
|
868
|
+
/**
 * Tokenizes a chunk of HTML source and builds the node tree.
 *
 * The function does not repair unbalanced markup: it returns the stack of
 * elements that were still open when the input ended, so that callers can
 * decide how to handle them.
 *
 * @param {string} data html
 * @param {object} [options]
 * @param {boolean} [options.lowerCaseTagName] lower-case every tag name before it is stored
 * @param {boolean} [options.comment] keep comments as CommentNode children
 * @param {Object<string, boolean>} [options.blockTextElements] tag names whose
 *   content is not parsed as markup; a truthy value keeps the raw content as a
 *   single TextNode, a falsy value drops it
 * @return {HTMLElement[]} stack of still-open elements; the first entry is always the root element
 */
export function base_parse(data, options = { lowerCaseTagName: false, comment: false }) {
    const elements = options.blockTextElements || {
        script: true,
        noscript: true,
        style: true,
        pre: true,
    };
    const element_names = Object.keys(elements);
    // Anchor the patterns so that only the exact tag name matches; an
    // unanchored /script/i would also swallow e.g. <description>.
    const kBlockTextElements = element_names.map((it) => new RegExp(`^${it}$`, 'i'));
    const kIgnoreElements = element_names.filter((it) => elements[it]).map((it) => new RegExp(`^${it}$`, 'i'));
    // True when the raw content of the block-text element `tag` is kept as text.
    function element_should_be_ignore(tag) {
        return kIgnoreElements.some((it) => it.test(tag));
    }
    // True when the content of `tag` must not be parsed as markup.
    function is_block_text_element(tag) {
        return kBlockTextElements.some((it) => it.test(tag));
    }
    // Converts positions in the wrapped string back to positions in the caller's input.
    const createRange = (startPos, endPos) => [startPos - frameFlagOffset, endPos - frameFlagOffset];
    const root = new HTMLElement(null, {}, '', null, [0, data.length]);
    let currentParent = root;
    const stack = [root];
    let lastTextPos = -1;
    // Stack index of the innermost open <a>, used to prevent nested anchors.
    let noNestedTagIndex = undefined;
    let match;
    // https://github.com/taoqf/node-html-parser/issues/38
    data = `<${frameflag}>${data}</${frameflag}>`;
    // Position where the caller's input ends, i.e. where the closing wrapper
    // `</frameflag>` (frameflag.length + 3 characters) starts.
    const dataEndPos = data.length - (frameflag.length + 3);
    // Length of the opening wrapper `<frameflag>`.
    const frameFlagOffset = frameflag.length + 2;
    while ((match = kMarkupPattern.exec(data))) {
        const tagStartPos = kMarkupPattern.lastIndex - match[0].length;
        const tagEndPos = kMarkupPattern.lastIndex;
        // Add TextNode if there is content between the previous tag and this one
        if (lastTextPos > -1) {
            if (lastTextPos + match[0].length < tagEndPos) {
                const text = data.substring(lastTextPos, tagStartPos);
                currentParent.appendChild(new TextNode(text, currentParent, createRange(lastTextPos, tagStartPos)));
            }
        }
        lastTextPos = kMarkupPattern.lastIndex;
        // https://github.com/taoqf/node-html-parser/issues/38
        // Skip frameflag node
        if (match[2] === frameflag)
            continue;
        // Handle comments
        if (match[0][1] === '!') {
            if (options.comment) {
                // Only keep what is in between <!-- and -->
                const text = data.substring(tagStartPos + 4, tagEndPos - 3);
                currentParent.appendChild(new CommentNode(text, currentParent, createRange(tagStartPos, tagEndPos)));
            }
            continue;
        }
        /* -- Handle tag matching -- */
        // Fix tag casing if necessary
        if (options.lowerCaseTagName)
            match[2] = match[2].toLowerCase();
        // Handle opening tags (ie. <this> not </that>)
        if (!match[1]) {
            /* Populate attributes */
            const attrs = {};
            for (let attMatch; (attMatch = kAttributePattern.exec(match[3]));) {
                attrs[attMatch[2].toLowerCase()] = attMatch[4] || attMatch[5] || attMatch[6];
            }
            // Implicitly close the current element if this opening tag ends it (e.g. <li> after <li>)
            const tagName = currentParent.rawTagName;
            if (!match[4] && kElementsClosedByOpening[tagName]) {
                if (kElementsClosedByOpening[tagName][match[2]]) {
                    stack.pop();
                    currentParent = arr_back(stack);
                }
            }
            // Prevent nested A tags by terminating the last A and starting a new one : see issue #144
            if (match[2] === 'a' || match[2] === 'A') {
                if (noNestedTagIndex !== undefined) {
                    stack.splice(noNestedTagIndex);
                    currentParent = arr_back(stack);
                }
                noNestedTagIndex = stack.length;
            }
            currentParent = currentParent.appendChild(
            // Initialize range (end position updated later for closed tags)
            new HTMLElement(match[2], attrs, match[3], null, createRange(tagStartPos, tagEndPos)));
            stack.push(currentParent);
            if (is_block_text_element(match[2])) {
                // Find closing tag
                const closeMarkup = `</${match[2]}>`;
                const closeIndex = options.lowerCaseTagName
                    ? data.toLocaleLowerCase().indexOf(closeMarkup, kMarkupPattern.lastIndex)
                    : data.indexOf(closeMarkup, kMarkupPattern.lastIndex);
                // Without a closing tag the text runs to the end of the caller's input
                const textEndPos = closeIndex === -1 ? dataEndPos : closeIndex;
                if (element_should_be_ignore(match[2])) {
                    const text = data.substring(tagEndPos, textEndPos);
                    if (text.length > 0 && /\S/.test(text)) {
                        currentParent.appendChild(new TextNode(text, currentParent, createRange(tagEndPos, textEndPos)));
                    }
                }
                if (closeIndex === -1) {
                    // No closing tag: move past the end of the data so the next exec() fails and the loop ends
                    lastTextPos = kMarkupPattern.lastIndex = data.length + 1;
                }
                else {
                    lastTextPos = kMarkupPattern.lastIndex = closeIndex + closeMarkup.length;
                    // The closing tag has just been consumed, so flag this match as
                    // a closing tag to make the code below pop the element.
                    match[1] = 'true';
                }
            }
        }
        // Handle closing tags or self-closed elements (ie </tag> or <br>)
        if (match[1] || match[4] || kSelfClosingElements[match[2]]) {
            while (true) {
                if (match[2] === 'a' || match[2] === 'A')
                    noNestedTagIndex = undefined;
                if (currentParent.rawTagName === match[2]) {
                    // Update range end for closed tag
                    currentParent.range[1] = createRange(-1, Math.max(lastTextPos, tagEndPos))[1];
                    stack.pop();
                    currentParent = arr_back(stack);
                    break;
                }
                else {
                    const tagName = currentParent.tagName;
                    // Trying to close current tag, and move on
                    if (kElementsClosedByClosing[tagName]) {
                        if (kElementsClosedByClosing[tagName][match[2]]) {
                            stack.pop();
                            currentParent = arr_back(stack);
                            continue;
                        }
                    }
                    // Use aggressive strategy to handle unmatching markups.
                    break;
                }
            }
        }
    }
    return stack;
}
|
|
1010
|
+
/**
 * Parses a chunk of HTML source and returns the root element.
 *
 * Elements that base_parse left open (missing closing tag) are removed from
 * the tree and their children are re-attached one level up.
 */
export function parse(data, options = { lowerCaseTagName: false, comment: false }) {
    const openElements = base_parse(data, options);
    const root = openElements[0];
    while (openElements.length > 1) {
        // Handle each unclosed element, innermost first.
        const unclosed = openElements.pop();
        const enclosing = arr_back(openElements);
        const hasGrandParent = unclosed.parentNode && unclosed.parentNode.parentNode;
        if (!hasGrandParent) {
            // Direct child of the root: leave it where it is.
            continue;
        }
        // Pair error  <h3> <h3>         : children move next to the outer <h3>.
        // Single error <div> <h3> </div>: children move into the <div>.
        const repeatsEnclosingTag = unclosed.parentNode === enclosing && unclosed.tagName === enclosing.tagName;
        enclosing.removeChild(unclosed);
        unclosed.childNodes.forEach((child) => {
            const target = repeatsEnclosingTag ? enclosing.parentNode : enclosing;
            target.appendChild(child);
        });
        if (repeatsEnclosingTag) {
            // The outer element of the pair is handled together with the inner one.
            openElements.pop();
        }
    }
    return root;
}
|