node-html-parser 1.3.2 → 1.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -0
- package/dist/esm/back.js +3 -0
- package/dist/esm/index.js +5 -0
- package/dist/esm/matcher.js +251 -0
- package/dist/esm/nodes/comment.js +23 -0
- package/dist/esm/nodes/html.js +787 -0
- package/dist/esm/nodes/node.js +11 -0
- package/dist/esm/nodes/text.js +34 -0
- package/dist/esm/nodes/type.js +7 -0
- package/dist/main.js +39 -22
- package/dist/nodes/html.d.ts +8 -5
- package/dist/nodes/html.js +39 -22
- package/package.json +5 -2
package/README.md
CHANGED
|
@@ -75,6 +75,12 @@ Parse given data, and return root of the generated DOM.
|
|
|
75
75
|
{
|
|
76
76
|
lowerCaseTagName: false, // convert tag names to lower case (hurts performance heavily)
|
|
77
77
|
comment: false // retrieve comments (hurts performance slightly)
|
|
78
|
+
blockTextElements: {
|
|
79
|
+
script: true, // keep text content when parsing
|
|
80
|
+
noscript: true, // keep text content when parsing
|
|
81
|
+
style: true, // keep text content when parsing
|
|
82
|
+
pre: true // keep text content when parsing
|
|
83
|
+
}
|
|
78
84
|
}
|
|
79
85
|
```
|
|
80
86
|
|
package/dist/esm/back.js
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
export { default as CommentNode } from './nodes/comment';
|
|
2
|
+
export { default as HTMLElement, parse, parse as default } from './nodes/html';
|
|
3
|
+
export { default as Node } from './nodes/node';
|
|
4
|
+
export { default as TextNode } from './nodes/text';
|
|
5
|
+
export { default as NodeType } from './nodes/type';
|
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
/**
 * Cache of compiled matcher descriptors, keyed by a single selector part
 * (one space-separated token of a selector).
 *
 * Created without a prototype so that selector parts such as `constructor`
 * or `toString` are not mistaken for cached entries inherited from
 * Object.prototype.
 * @type {Object}
 */
let pMatchFunctionCache = Object.create(null);
|
|
6
|
+
/**
 * Compare two tag names case-insensitively.
 *
 * Two missing/empty names are considered equal; a missing name never equals
 * a present one.
 * @param {string} tag1 first tag name (may be empty, null or undefined)
 * @param {string} tag2 second tag name (may be empty, null or undefined)
 * @return {boolean} true when both names denote the same tag
 */
function compare_tagname(tag1, tag2) {
    const firstMissing = !tag1;
    const secondMissing = !tag2;
    if (firstMissing || secondMissing) {
        return firstMissing && secondMissing;
    }
    return tag1.toLowerCase() === tag2.toLowerCase();
}
|
|
15
|
+
/** True when the element id equals `tagName` without its leading '#'. */
function matcher_has_id(el, tagName) {
    return el.id === (tagName || '').substr(1);
}
/** True when the element carries every class name listed in `classes`. */
function matcher_has_all_classes(el, classes) {
    return (classes || []).every((cls) => el.classNames.indexOf(cls) !== -1);
}
/** True when the element has its own attribute named exactly `attr_key`. */
function matcher_has_attribute(el, attr_key) {
    return Object.prototype.hasOwnProperty.call(el.attributes, attr_key || '');
}
/** True when an attribute named `attr_key` (case-insensitive) has exactly `value`. */
function matcher_has_attribute_value(el, attr_key, value) {
    const wantedKey = (attr_key || '').toLowerCase();
    const wantedValue = value || '';
    const attrs = el.attributes;
    return Object.keys(attrs).some((key) => key.toLowerCase() === wantedKey && attrs[key] === wantedValue);
}
/**
 * Match functions, keyed by a code that Matcher builds from the shape of a
 * selector part: '1' = #id, '2' = [key=value], '3' = tag name,
 * '4' = has classes, '5' = terminator (or, directly after 'f', [key]).
 *
 * Every function takes (el, tagName, classes, attr_key, value) and returns
 * a boolean telling whether `el` satisfies that selector part.
 */
const functionCache = {
    // #id.class
    f145(el, tagName, classes) {
        return matcher_has_id(el, tagName) && matcher_has_all_classes(el, classes);
    },
    // .class (optionally prefixed with *)
    f45(el, tagName, classes) {
        return matcher_has_all_classes(el, classes);
    },
    // #id
    f15(el, tagName) {
        return matcher_has_id(el, tagName);
    },
    // #id; previously fell through without returning true on a match
    f1(el, tagName) {
        return matcher_has_id(el, tagName);
    },
    // empty part or *
    f5() {
        return true;
    },
    // [key]
    f55(el, tagName, classes, attr_key) {
        return matcher_has_attribute(el, attr_key);
    },
    // [key].class; this combination was previously missing
    f545(el, tagName, classes, attr_key) {
        return matcher_has_attribute(el, attr_key) && matcher_has_all_classes(el, classes);
    },
    // [key=value].class; the class list was previously ignored
    f245(el, tagName, classes, attr_key, value) {
        return matcher_has_attribute_value(el, attr_key, value) && matcher_has_all_classes(el, classes);
    },
    // [key=value]
    f25(el, tagName, classes, attr_key, value) {
        return matcher_has_attribute_value(el, attr_key, value);
    },
    // [key=value]
    f2(el, tagName, classes, attr_key, value) {
        return matcher_has_attribute_value(el, attr_key, value);
    },
    // tag.class
    f345(el, tagName, classes) {
        return compare_tagname(el.tagName, tagName || '') && matcher_has_all_classes(el, classes);
    },
    // tag
    f35(el, tagName) {
        return compare_tagname(el.tagName, tagName || '');
    },
    // tag
    f3(el, tagName) {
        return compare_tagname(el.tagName, tagName || '');
    }
};
|
|
138
|
+
/**
 * Matcher class to make CSS match.
 *
 * A selector is split on whitespace into parts; each part is compiled into a
 * descriptor ({ func, tagName, classes, attr_key, value }) and cached in
 * pMatchFunctionCache. `nextMatch` points at the part that has to match next.
 *
 * @class Matcher
 */
export default class Matcher {
    /**
     * Creates an instance of Matcher.
     * @param {string} selector simplified CSS selector (descendant combinator only)
     * @throws {Error} when a selector part has a shape no match function exists for
     *
     * @memberof Matcher
     */
    constructor(selector) {
        this.nextMatch = 0;
        // Split on runs of whitespace: with split(' ') every extra space produced
        // an empty part, which compiles to a match-everything step.
        this.matchers = selector.trim().split(/\s+/).map((matcher) => {
            // Own-property lookup so parts named like Object.prototype members
            // ("constructor", "toString", ...) are not treated as cache hits.
            if (Object.prototype.hasOwnProperty.call(pMatchFunctionCache, matcher)) {
                return pMatchFunctionCache[matcher];
            }
            const parts = matcher.split('.');
            const tagName = parts[0];
            const classes = parts.slice(1).sort();
            let function_name = 'f';
            let attr_key = '';
            let value = '';
            if (tagName && tagName !== '*') {
                if (tagName.startsWith('#')) {
                    function_name += '1'; // #id
                }
                else {
                    // NOTE: the pattern also accepts '!=', but functionCache has no
                    // negated matcher, so only equality is implemented.
                    let reg = /^\[\s*(\S+)\s*(=|!=)\s*((((["'])([^\6]*)\6))|(\S*?))\]\s*/.exec(tagName);
                    if (reg) {
                        attr_key = reg[1];
                        value = reg[7] || reg[8];
                        function_name += '2'; // [key=value]
                    }
                    else if ((reg = /^\[(.*?)\]/.exec(tagName))) {
                        attr_key = reg[1];
                        function_name += '5'; // [key]
                    }
                    else {
                        function_name += '3'; // tag name
                    }
                }
            }
            if (classes.length > 0) {
                function_name += '4'; // class list
            }
            function_name += '5'; // terminator
            const func = functionCache[function_name];
            if (typeof func !== 'function') {
                // Fail at construction instead of with a TypeError inside advance().
                throw new Error(`Selector not supported: ${matcher}`);
            }
            const obj = {
                func,
                tagName: tagName || '',
                classes,
                attr_key: attr_key || '',
                value: value || ''
            };
            return (pMatchFunctionCache[matcher] = obj);
        });
    }
    /**
     * Trying to advance match pointer
     * @param {HTMLElement} el element to make the match
     * @return {bool} true when pointer advanced.
     */
    advance(el) {
        if (this.nextMatch >= this.matchers.length) {
            return false;
        }
        const { func, tagName, classes, attr_key, value } = this.matchers[this.nextMatch];
        if (!func(el, tagName, classes, attr_key, value)) {
            return false;
        }
        this.nextMatch++;
        return true;
    }
    /**
     * Rewind the match pointer by one step.
     */
    rewind() {
        this.nextMatch--;
    }
    /**
     * Trying to determine if match made.
     * @return {bool} true when every selector part has matched
     */
    get matched() {
        return this.nextMatch === this.matchers.length;
    }
    /**
     * Reset match pointer to the first selector part.
     */
    reset() {
        this.nextMatch = 0;
    }
    /**
     * Flush the module-wide descriptor cache to free memory.
     */
    flushCache() {
        pMatchFunctionCache = {};
    }
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import Node from './node';
|
|
2
|
+
import NodeType from './type';
|
|
3
|
+
/**
 * Node holding the raw text of an HTML comment.
 * @class CommentNode
 * @extends {Node}
 */
export default class CommentNode extends Node {
    /**
     * @param {string} rawText text between the comment delimiters
     */
    constructor(rawText) {
        super();
        this.rawText = rawText;
        /**
         * Node Type declaration.
         * @type {Number}
         */
        this.nodeType = NodeType.COMMENT_NODE;
    }
    /**
     * Text of the comment, returned exactly as stored (no decoding).
     * @return {string} text content
     */
    get text() {
        const { rawText } = this;
        return rawText;
    }
    /**
     * Serialize the comment back to markup.
     * @return {string} the raw text wrapped in comment delimiters
     */
    toString() {
        return '<!--'.concat(this.rawText, '-->');
    }
}
|
|
@@ -0,0 +1,787 @@
|
|
|
1
|
+
import { decode } from 'he';
|
|
2
|
+
import Node from './node';
|
|
3
|
+
import NodeType from './type';
|
|
4
|
+
import TextNode from './text';
|
|
5
|
+
import Matcher from '../matcher';
|
|
6
|
+
import arr_back from '../back';
|
|
7
|
+
import CommentNode from './comment';
|
|
8
|
+
// Tag names, in both upper and lower case, that structuredText treats as
// block boundaries. ul, ol, table and tr are not included.
const kBlockElements = new Map(
    ['div', 'p', 'li', 'td', 'section', 'br'].flatMap((tag) => [
        [tag.toUpperCase(), true],
        [tag, true]
    ])
);
|
|
25
|
+
/**
|
|
26
|
+
* HTMLElement, which contains a set of children.
|
|
27
|
+
*
|
|
28
|
+
* Note: this is a minimalist implementation, no complete tree
|
|
29
|
+
* structure provided (no parentNode, nextSibling,
|
|
30
|
+
* previousSibling etc).
|
|
31
|
+
* @class HTMLElement
|
|
32
|
+
* @extends {Node}
|
|
33
|
+
*/
|
|
34
|
+
export default class HTMLElement extends Node {
|
|
35
|
+
/**
|
|
36
|
+
* Creates an instance of HTMLElement.
|
|
37
|
+
* @param keyAttrs id and class attribute
|
|
38
|
+
* @param [rawAttrs] attributes in string
|
|
39
|
+
*
|
|
40
|
+
* @memberof HTMLElement
|
|
41
|
+
*/
|
|
42
|
+
constructor(tagName, keyAttrs, rawAttrs = '', parentNode = null) {
|
|
43
|
+
super();
|
|
44
|
+
this.rawAttrs = rawAttrs;
|
|
45
|
+
this.parentNode = parentNode;
|
|
46
|
+
this.classNames = [];
|
|
47
|
+
/**
|
|
48
|
+
* Node Type declaration.
|
|
49
|
+
*/
|
|
50
|
+
this.nodeType = NodeType.ELEMENT_NODE;
|
|
51
|
+
this.rawTagName = tagName;
|
|
52
|
+
this.rawAttrs = rawAttrs || '';
|
|
53
|
+
this.parentNode = parentNode || null;
|
|
54
|
+
this.childNodes = [];
|
|
55
|
+
if (keyAttrs.id) {
|
|
56
|
+
this.id = keyAttrs.id;
|
|
57
|
+
if (!rawAttrs) {
|
|
58
|
+
this.rawAttrs = `id="${keyAttrs.id}"`;
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
if (keyAttrs.class) {
|
|
62
|
+
this.classNames = keyAttrs.class.split(/\s+/);
|
|
63
|
+
if (!rawAttrs) {
|
|
64
|
+
const cls = `class="${this.classNames.join(' ')}"`;
|
|
65
|
+
if (this.rawAttrs) {
|
|
66
|
+
this.rawAttrs += ` ${cls}`;
|
|
67
|
+
}
|
|
68
|
+
else {
|
|
69
|
+
this.rawAttrs = cls;
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
/**
|
|
75
|
+
* Remove Child element from childNodes array
|
|
76
|
+
* @param {HTMLElement} node node to remove
|
|
77
|
+
*/
|
|
78
|
+
removeChild(node) {
|
|
79
|
+
this.childNodes = this.childNodes.filter((child) => {
|
|
80
|
+
return (child !== node);
|
|
81
|
+
});
|
|
82
|
+
}
|
|
83
|
+
/**
|
|
84
|
+
* Exchanges given child with new child
|
|
85
|
+
* @param {HTMLElement} oldNode node to exchange
|
|
86
|
+
* @param {HTMLElement} newNode new node
|
|
87
|
+
*/
|
|
88
|
+
exchangeChild(oldNode, newNode) {
|
|
89
|
+
let idx = -1;
|
|
90
|
+
for (let i = 0; i < this.childNodes.length; i++) {
|
|
91
|
+
if (this.childNodes[i] === oldNode) {
|
|
92
|
+
idx = i;
|
|
93
|
+
break;
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
this.childNodes[idx] = newNode;
|
|
97
|
+
}
|
|
98
|
+
get tagName() {
|
|
99
|
+
return this.rawTagName?.toUpperCase();
|
|
100
|
+
}
|
|
101
|
+
/**
|
|
102
|
+
* Get escpaed (as-it) text value of current node and its children.
|
|
103
|
+
* @return {string} text content
|
|
104
|
+
*/
|
|
105
|
+
get rawText() {
|
|
106
|
+
return this.childNodes.reduce((pre, cur) => {
|
|
107
|
+
return (pre += cur.rawText);
|
|
108
|
+
}, '');
|
|
109
|
+
}
|
|
110
|
+
/**
|
|
111
|
+
* Get unescaped text value of current node and its children.
|
|
112
|
+
* @return {string} text content
|
|
113
|
+
*/
|
|
114
|
+
get text() {
|
|
115
|
+
return decode(this.rawText);
|
|
116
|
+
}
|
|
117
|
+
/**
|
|
118
|
+
* Get structured Text (with '\n' etc.)
|
|
119
|
+
* @return {string} structured text
|
|
120
|
+
*/
|
|
121
|
+
get structuredText() {
|
|
122
|
+
let currentBlock = [];
|
|
123
|
+
const blocks = [currentBlock];
|
|
124
|
+
function dfs(node) {
|
|
125
|
+
if (node.nodeType === NodeType.ELEMENT_NODE) {
|
|
126
|
+
if (kBlockElements.get(node.rawTagName)) {
|
|
127
|
+
if (currentBlock.length > 0) {
|
|
128
|
+
blocks.push(currentBlock = []);
|
|
129
|
+
}
|
|
130
|
+
node.childNodes.forEach(dfs);
|
|
131
|
+
if (currentBlock.length > 0) {
|
|
132
|
+
blocks.push(currentBlock = []);
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
else {
|
|
136
|
+
node.childNodes.forEach(dfs);
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
else if (node.nodeType === NodeType.TEXT_NODE) {
|
|
140
|
+
if (node.isWhitespace) {
|
|
141
|
+
// Whitespace node, postponed output
|
|
142
|
+
currentBlock.prependWhitespace = true;
|
|
143
|
+
}
|
|
144
|
+
else {
|
|
145
|
+
let text = node.text;
|
|
146
|
+
if (currentBlock.prependWhitespace) {
|
|
147
|
+
text = ` ${text}`;
|
|
148
|
+
currentBlock.prependWhitespace = false;
|
|
149
|
+
}
|
|
150
|
+
currentBlock.push(text);
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
dfs(this);
|
|
155
|
+
return blocks.map((block) => {
|
|
156
|
+
// Normalize each line's whitespace
|
|
157
|
+
return block.join('').trim().replace(/\s{2,}/g, ' ');
|
|
158
|
+
})
|
|
159
|
+
.join('\n').replace(/\s+$/, ''); // trimRight;
|
|
160
|
+
}
|
|
161
|
+
toString() {
|
|
162
|
+
const tag = this.rawTagName;
|
|
163
|
+
if (tag) {
|
|
164
|
+
const is_void = /^(area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)$/i.test(tag);
|
|
165
|
+
const attrs = this.rawAttrs ? ` ${this.rawAttrs}` : '';
|
|
166
|
+
if (is_void) {
|
|
167
|
+
return `<${tag}${attrs}>`;
|
|
168
|
+
}
|
|
169
|
+
return `<${tag}${attrs}>${this.innerHTML}</${tag}>`;
|
|
170
|
+
}
|
|
171
|
+
return this.innerHTML;
|
|
172
|
+
}
|
|
173
|
+
get innerHTML() {
|
|
174
|
+
return this.childNodes.map((child) => {
|
|
175
|
+
return child.toString();
|
|
176
|
+
}).join('');
|
|
177
|
+
}
|
|
178
|
+
set_content(content, options = {}) {
|
|
179
|
+
if (content instanceof Node) {
|
|
180
|
+
content = [content];
|
|
181
|
+
}
|
|
182
|
+
else if (typeof content == 'string') {
|
|
183
|
+
const r = parse(content, options);
|
|
184
|
+
content = r.childNodes.length ? r.childNodes : [new TextNode(content)];
|
|
185
|
+
}
|
|
186
|
+
this.childNodes = content;
|
|
187
|
+
}
|
|
188
|
+
get outerHTML() {
|
|
189
|
+
return this.toString();
|
|
190
|
+
}
|
|
191
|
+
/**
|
|
192
|
+
* Trim element from right (in block) after seeing pattern in a TextNode.
|
|
193
|
+
* @param {RegExp} pattern pattern to find
|
|
194
|
+
* @return {HTMLElement} reference to current node
|
|
195
|
+
*/
|
|
196
|
+
trimRight(pattern) {
|
|
197
|
+
for (let i = 0; i < this.childNodes.length; i++) {
|
|
198
|
+
const childNode = this.childNodes[i];
|
|
199
|
+
if (childNode.nodeType === NodeType.ELEMENT_NODE) {
|
|
200
|
+
childNode.trimRight(pattern);
|
|
201
|
+
}
|
|
202
|
+
else {
|
|
203
|
+
const index = childNode.rawText.search(pattern);
|
|
204
|
+
if (index > -1) {
|
|
205
|
+
childNode.rawText = childNode.rawText.substr(0, index);
|
|
206
|
+
// trim all following nodes.
|
|
207
|
+
this.childNodes.length = i + 1;
|
|
208
|
+
}
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
return this;
|
|
212
|
+
}
|
|
213
|
+
/**
|
|
214
|
+
* Get DOM structure
|
|
215
|
+
* @return {string} strucutre
|
|
216
|
+
*/
|
|
217
|
+
get structure() {
|
|
218
|
+
const res = [];
|
|
219
|
+
let indention = 0;
|
|
220
|
+
function write(str) {
|
|
221
|
+
res.push(' '.repeat(indention) + str);
|
|
222
|
+
}
|
|
223
|
+
function dfs(node) {
|
|
224
|
+
const idStr = node.id ? (`#${node.id}`) : '';
|
|
225
|
+
const classStr = node.classNames.length ? (`.${node.classNames.join('.')}`) : '';
|
|
226
|
+
write(node.rawTagName + idStr + classStr);
|
|
227
|
+
indention++;
|
|
228
|
+
node.childNodes.forEach((childNode) => {
|
|
229
|
+
if (childNode.nodeType === NodeType.ELEMENT_NODE) {
|
|
230
|
+
dfs(childNode);
|
|
231
|
+
}
|
|
232
|
+
else if (childNode.nodeType === NodeType.TEXT_NODE) {
|
|
233
|
+
if (!childNode.isWhitespace) {
|
|
234
|
+
write('#text');
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
});
|
|
238
|
+
indention--;
|
|
239
|
+
}
|
|
240
|
+
dfs(this);
|
|
241
|
+
return res.join('\n');
|
|
242
|
+
}
|
|
243
|
+
/**
|
|
244
|
+
* Remove whitespaces in this sub tree.
|
|
245
|
+
* @return {HTMLElement} pointer to this
|
|
246
|
+
*/
|
|
247
|
+
removeWhitespace() {
|
|
248
|
+
let o = 0;
|
|
249
|
+
this.childNodes.forEach((node) => {
|
|
250
|
+
if (node.nodeType === NodeType.TEXT_NODE) {
|
|
251
|
+
if (node.isWhitespace) {
|
|
252
|
+
return;
|
|
253
|
+
}
|
|
254
|
+
node.rawText = node.rawText.trim();
|
|
255
|
+
}
|
|
256
|
+
else if (node.nodeType === NodeType.ELEMENT_NODE) {
|
|
257
|
+
node.removeWhitespace();
|
|
258
|
+
}
|
|
259
|
+
this.childNodes[o++] = node;
|
|
260
|
+
});
|
|
261
|
+
this.childNodes.length = o;
|
|
262
|
+
return this;
|
|
263
|
+
}
|
|
264
|
+
/**
|
|
265
|
+
* Query CSS selector to find matching nodes.
|
|
266
|
+
* @param {string} selector Simplified CSS selector
|
|
267
|
+
* @param {Matcher} selector A Matcher instance
|
|
268
|
+
* @return {HTMLElement[]} matching elements
|
|
269
|
+
*/
|
|
270
|
+
querySelectorAll(selector) {
|
|
271
|
+
let matcher;
|
|
272
|
+
if (selector instanceof Matcher) {
|
|
273
|
+
matcher = selector;
|
|
274
|
+
matcher.reset();
|
|
275
|
+
}
|
|
276
|
+
else {
|
|
277
|
+
if (selector.includes(',')) {
|
|
278
|
+
const selectors = selector.split(',');
|
|
279
|
+
return Array.from(selectors.reduce((pre, cur) => {
|
|
280
|
+
const result = this.querySelectorAll(cur.trim());
|
|
281
|
+
return result.reduce((p, c) => {
|
|
282
|
+
return p.add(c);
|
|
283
|
+
}, pre);
|
|
284
|
+
}, new Set()));
|
|
285
|
+
}
|
|
286
|
+
matcher = new Matcher(selector);
|
|
287
|
+
}
|
|
288
|
+
const stack = [];
|
|
289
|
+
return this.childNodes.reduce((res, cur) => {
|
|
290
|
+
stack.push([cur, 0, false]);
|
|
291
|
+
while (stack.length) {
|
|
292
|
+
const state = arr_back(stack); // get last element
|
|
293
|
+
const el = state[0];
|
|
294
|
+
if (state[1] === 0) {
|
|
295
|
+
// Seen for first time.
|
|
296
|
+
if (el.nodeType !== NodeType.ELEMENT_NODE) {
|
|
297
|
+
stack.pop();
|
|
298
|
+
continue;
|
|
299
|
+
}
|
|
300
|
+
const html_el = el;
|
|
301
|
+
state[2] = matcher.advance(html_el);
|
|
302
|
+
if (state[2]) {
|
|
303
|
+
if (matcher.matched) {
|
|
304
|
+
res.push(html_el);
|
|
305
|
+
res.push(...(html_el.querySelectorAll(selector)));
|
|
306
|
+
// no need to go further.
|
|
307
|
+
matcher.rewind();
|
|
308
|
+
stack.pop();
|
|
309
|
+
continue;
|
|
310
|
+
}
|
|
311
|
+
}
|
|
312
|
+
}
|
|
313
|
+
if (state[1] < el.childNodes.length) {
|
|
314
|
+
stack.push([el.childNodes[state[1]++], 0, false]);
|
|
315
|
+
}
|
|
316
|
+
else {
|
|
317
|
+
if (state[2]) {
|
|
318
|
+
matcher.rewind();
|
|
319
|
+
}
|
|
320
|
+
stack.pop();
|
|
321
|
+
}
|
|
322
|
+
}
|
|
323
|
+
return res;
|
|
324
|
+
}, []);
|
|
325
|
+
}
|
|
326
|
+
/**
|
|
327
|
+
* Query CSS Selector to find matching node.
|
|
328
|
+
* @param {string} selector Simplified CSS selector
|
|
329
|
+
* @param {Matcher} selector A Matcher instance
|
|
330
|
+
* @return {HTMLElement} matching node
|
|
331
|
+
*/
|
|
332
|
+
querySelector(selector) {
|
|
333
|
+
let matcher;
|
|
334
|
+
if (selector instanceof Matcher) {
|
|
335
|
+
matcher = selector;
|
|
336
|
+
matcher.reset();
|
|
337
|
+
}
|
|
338
|
+
else {
|
|
339
|
+
matcher = new Matcher(selector);
|
|
340
|
+
}
|
|
341
|
+
const stack = [];
|
|
342
|
+
for (const node of this.childNodes) {
|
|
343
|
+
stack.push([node, 0, false]);
|
|
344
|
+
while (stack.length) {
|
|
345
|
+
const state = arr_back(stack);
|
|
346
|
+
const el = state[0];
|
|
347
|
+
if (state[1] === 0) {
|
|
348
|
+
// Seen for first time.
|
|
349
|
+
if (el.nodeType !== NodeType.ELEMENT_NODE) {
|
|
350
|
+
stack.pop();
|
|
351
|
+
continue;
|
|
352
|
+
}
|
|
353
|
+
state[2] = matcher.advance(el);
|
|
354
|
+
if (state[2]) {
|
|
355
|
+
if (matcher.matched) {
|
|
356
|
+
return el;
|
|
357
|
+
}
|
|
358
|
+
}
|
|
359
|
+
}
|
|
360
|
+
if (state[1] < el.childNodes.length) {
|
|
361
|
+
stack.push([el.childNodes[state[1]++], 0, false]);
|
|
362
|
+
}
|
|
363
|
+
else {
|
|
364
|
+
if (state[2]) {
|
|
365
|
+
matcher.rewind();
|
|
366
|
+
}
|
|
367
|
+
stack.pop();
|
|
368
|
+
}
|
|
369
|
+
}
|
|
370
|
+
}
|
|
371
|
+
return null;
|
|
372
|
+
}
|
|
373
|
+
/**
|
|
374
|
+
* Append a child node to childNodes
|
|
375
|
+
* @param {Node} node node to append
|
|
376
|
+
* @return {Node} node appended
|
|
377
|
+
*/
|
|
378
|
+
appendChild(node) {
|
|
379
|
+
// node.parentNode = this;
|
|
380
|
+
this.childNodes.push(node);
|
|
381
|
+
if (node instanceof HTMLElement) {
|
|
382
|
+
node.parentNode = this;
|
|
383
|
+
}
|
|
384
|
+
return node;
|
|
385
|
+
}
|
|
386
|
+
/**
|
|
387
|
+
* Get first child node
|
|
388
|
+
* @return {Node} first child node
|
|
389
|
+
*/
|
|
390
|
+
get firstChild() {
|
|
391
|
+
return this.childNodes[0];
|
|
392
|
+
}
|
|
393
|
+
/**
|
|
394
|
+
* Get last child node
|
|
395
|
+
* @return {Node} last child node
|
|
396
|
+
*/
|
|
397
|
+
get lastChild() {
|
|
398
|
+
return arr_back(this.childNodes);
|
|
399
|
+
}
|
|
400
|
+
/**
|
|
401
|
+
* Get attributes
|
|
402
|
+
* @return {Object} parsed and unescaped attributes
|
|
403
|
+
*/
|
|
404
|
+
get attributes() {
|
|
405
|
+
if (this._attrs) {
|
|
406
|
+
return this._attrs;
|
|
407
|
+
}
|
|
408
|
+
this._attrs = {};
|
|
409
|
+
const attrs = this.rawAttributes;
|
|
410
|
+
for (const key in attrs) {
|
|
411
|
+
const val = attrs[key] || '';
|
|
412
|
+
this._attrs[key] = decode(val);
|
|
413
|
+
}
|
|
414
|
+
return this._attrs;
|
|
415
|
+
}
|
|
416
|
+
/**
|
|
417
|
+
* Get escaped (as-it) attributes
|
|
418
|
+
* @return {Object} parsed attributes
|
|
419
|
+
*/
|
|
420
|
+
get rawAttributes() {
|
|
421
|
+
if (this._rawAttrs) {
|
|
422
|
+
return this._rawAttrs;
|
|
423
|
+
}
|
|
424
|
+
const attrs = {};
|
|
425
|
+
if (this.rawAttrs) {
|
|
426
|
+
const re = /\b([a-z][a-z0-9-]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/ig;
|
|
427
|
+
let match;
|
|
428
|
+
while ((match = re.exec(this.rawAttrs))) {
|
|
429
|
+
attrs[match[1]] = match[2] || match[3] || match[4] || null;
|
|
430
|
+
}
|
|
431
|
+
}
|
|
432
|
+
this._rawAttrs = attrs;
|
|
433
|
+
return attrs;
|
|
434
|
+
}
|
|
435
|
+
removeAttribute(key) {
|
|
436
|
+
const attrs = this.rawAttributes;
|
|
437
|
+
delete attrs[key];
|
|
438
|
+
// Update this.attribute
|
|
439
|
+
if (this._attrs) {
|
|
440
|
+
delete this._attrs[key];
|
|
441
|
+
}
|
|
442
|
+
// Update rawString
|
|
443
|
+
this.rawAttrs = Object.keys(attrs).map((name) => {
|
|
444
|
+
const val = JSON.stringify(attrs[name]);
|
|
445
|
+
if (val === undefined || val === 'null') {
|
|
446
|
+
return name;
|
|
447
|
+
}
|
|
448
|
+
return `${name}=${val}`;
|
|
449
|
+
}).join(' ');
|
|
450
|
+
}
|
|
451
|
+
hasAttribute(key) {
|
|
452
|
+
return key in this.attributes;
|
|
453
|
+
}
|
|
454
|
+
/**
|
|
455
|
+
* Get an attribute
|
|
456
|
+
* @return {string} value of the attribute
|
|
457
|
+
*/
|
|
458
|
+
getAttribute(key) {
|
|
459
|
+
return this.attributes[key];
|
|
460
|
+
}
|
|
461
|
+
/**
|
|
462
|
+
* Set an attribute value to the HTMLElement
|
|
463
|
+
* @param {string} key The attribute name
|
|
464
|
+
* @param {string} value The value to set, or null / undefined to remove an attribute
|
|
465
|
+
*/
|
|
466
|
+
setAttribute(key, value) {
|
|
467
|
+
if (arguments.length < 2) {
|
|
468
|
+
throw new Error('Failed to execute \'setAttribute\' on \'Element\'');
|
|
469
|
+
}
|
|
470
|
+
const attrs = this.rawAttributes;
|
|
471
|
+
attrs[key] = String(value);
|
|
472
|
+
if (this._attrs) {
|
|
473
|
+
this._attrs[key] = decode(attrs[key]);
|
|
474
|
+
}
|
|
475
|
+
// Update rawString
|
|
476
|
+
this.rawAttrs = Object.keys(attrs).map((name) => {
|
|
477
|
+
const val = JSON.stringify(attrs[name]);
|
|
478
|
+
if (val === 'null' || val === '""') {
|
|
479
|
+
return name;
|
|
480
|
+
}
|
|
481
|
+
return `${name}=${val}`;
|
|
482
|
+
}).join(' ');
|
|
483
|
+
}
|
|
484
|
+
/**
|
|
485
|
+
* Replace all the attributes of the HTMLElement by the provided attributes
|
|
486
|
+
* @param {Attributes} attributes the new attribute set
|
|
487
|
+
*/
|
|
488
|
+
setAttributes(attributes) {
|
|
489
|
+
// Invalidate current this.attributes
|
|
490
|
+
if (this._attrs) {
|
|
491
|
+
delete this._attrs;
|
|
492
|
+
}
|
|
493
|
+
// Invalidate current this.rawAttributes
|
|
494
|
+
if (this._rawAttrs) {
|
|
495
|
+
delete this._rawAttrs;
|
|
496
|
+
}
|
|
497
|
+
// Update rawString
|
|
498
|
+
this.rawAttrs = Object.keys(attributes).map((name) => {
|
|
499
|
+
const val = attributes[name];
|
|
500
|
+
if (val === 'null' || val === '""') {
|
|
501
|
+
return name;
|
|
502
|
+
}
|
|
503
|
+
return `${name}=${JSON.stringify(String(val))}`;
|
|
504
|
+
}).join(' ');
|
|
505
|
+
}
|
|
506
|
+
insertAdjacentHTML(where, html) {
|
|
507
|
+
if (arguments.length < 2) {
|
|
508
|
+
throw new Error('2 arguments required');
|
|
509
|
+
}
|
|
510
|
+
const p = parse(html);
|
|
511
|
+
if (where === 'afterend') {
|
|
512
|
+
const idx = this.parentNode.childNodes.findIndex((child) => {
|
|
513
|
+
return child === this;
|
|
514
|
+
});
|
|
515
|
+
this.parentNode.childNodes.splice(idx + 1, 0, ...p.childNodes);
|
|
516
|
+
p.childNodes.forEach((n) => {
|
|
517
|
+
if (n instanceof HTMLElement) {
|
|
518
|
+
n.parentNode = this.parentNode;
|
|
519
|
+
}
|
|
520
|
+
});
|
|
521
|
+
}
|
|
522
|
+
else if (where === 'afterbegin') {
|
|
523
|
+
this.childNodes.unshift(...p.childNodes);
|
|
524
|
+
}
|
|
525
|
+
else if (where === 'beforeend') {
|
|
526
|
+
p.childNodes.forEach((n) => {
|
|
527
|
+
this.appendChild(n);
|
|
528
|
+
});
|
|
529
|
+
}
|
|
530
|
+
else if (where === 'beforebegin') {
|
|
531
|
+
const idx = this.parentNode.childNodes.findIndex((child) => {
|
|
532
|
+
return child === this;
|
|
533
|
+
});
|
|
534
|
+
this.parentNode.childNodes.splice(idx, 0, ...p.childNodes);
|
|
535
|
+
p.childNodes.forEach((n) => {
|
|
536
|
+
if (n instanceof HTMLElement) {
|
|
537
|
+
n.parentNode = this.parentNode;
|
|
538
|
+
}
|
|
539
|
+
});
|
|
540
|
+
}
|
|
541
|
+
else {
|
|
542
|
+
throw new Error(`The value provided ('${where}') is not one of 'beforebegin', 'afterbegin', 'beforeend', or 'afterend'`);
|
|
543
|
+
}
|
|
544
|
+
// if (!where || html === undefined || html === null) {
|
|
545
|
+
// return;
|
|
546
|
+
// }
|
|
547
|
+
}
|
|
548
|
+
}
|
|
549
|
+
// Matches either a whole comment or a single tag. For a tag the groups are:
// 1 = '/' of a closing tag, 2 = tag name, 3 = attribute text, 4 = trailing '/'.
// Tag-name characters follow
// https://html.spec.whatwg.org/multipage/custom-elements.html#valid-custom-element-name
const kMarkupPattern = /<!--[^]*?(?=-->)-->|<(\/?)([a-z][-.:0-9_a-z]*)\s*([^>]*?)(\/?)>/ig;
// Alternative patterns kept for reference (not used):
// <(?<tag>[^\s]*)(.*)>(.*)</\k<tag>>
// <([a-z][-.:0-9_a-z]*)\s*\/>
// <(area|base|br|col|hr|img|input|link|meta|source)\s*(.*)\/?>
// <(area|base|br|col|hr|img|input|link|meta|source)\s*(.*)\/?>|<(?<tag>[^\s]*)(.*)>(.*)</\k<tag>>
// Matches an id or class attribute. Group 2 = name; the value is in group 4
// (double-quoted), 5 (single-quoted) or 6 (unquoted).
const kAttributePattern = /(^|\s)(id|class)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig;
|
|
556
|
+
// Lookup table of self-closing tag names, keyed in both lower and upper case.
const kSelfClosingElements = {};
for (const tag of ['area', 'base', 'br', 'col', 'hr', 'img', 'input', 'link', 'meta', 'source']) {
    kSelfClosingElements[tag] = true;
    kSelfClosingElements[tag.toUpperCase()] = true;
}
|
|
578
|
+
// Two-level lookup table keyed by tag name, then by a second tag name, with
// both levels present in lower and upper case. Judging by the name, the outer
// key is an open element that gets closed when the inner tag opens; the
// consumer is outside this view, so confirm against `parse`.
const kElementsClosedByOpening = Object.fromEntries(
    [
        ['li', ['li']],
        ['p', ['p', 'div']],
        ['b', ['div']],
        ['td', ['td', 'th']],
        ['th', ['td', 'th']],
        ['h1', ['h1']],
        ['h2', ['h2']],
        ['h3', ['h3']],
        ['h4', ['h4']],
        ['h5', ['h5']],
        ['h6', ['h6']]
    ].flatMap(([tag, others]) => {
        // Lower-case names first, then their upper-case forms; a fresh object
        // per key so no two entries share state.
        const table = () => Object.fromEntries(
            [...others, ...others.map((name) => name.toUpperCase())].map((name) => [name, true])
        );
        return [[tag, table()], [tag.toUpperCase(), table()]];
    })
);
|
|
602
|
+
const kElementsClosedByClosing = {
|
|
603
|
+
li: { ul: true, ol: true, UL: true, OL: true },
|
|
604
|
+
LI: { ul: true, ol: true, UL: true, OL: true },
|
|
605
|
+
a: { div: true, DIV: true },
|
|
606
|
+
A: { div: true, DIV: true },
|
|
607
|
+
b: { div: true, DIV: true },
|
|
608
|
+
B: { div: true, DIV: true },
|
|
609
|
+
i: { div: true, DIV: true },
|
|
610
|
+
I: { div: true, DIV: true },
|
|
611
|
+
p: { div: true, DIV: true },
|
|
612
|
+
P: { div: true, DIV: true },
|
|
613
|
+
td: { tr: true, table: true, TR: true, TABLE: true },
|
|
614
|
+
TD: { tr: true, table: true, TR: true, TABLE: true },
|
|
615
|
+
th: { tr: true, table: true, TR: true, TABLE: true },
|
|
616
|
+
TH: { tr: true, table: true, TR: true, TABLE: true }
|
|
617
|
+
};
|
|
618
|
+
const frameflag = 'documentfragmentcontainer';
|
|
619
|
+
/**
 * Parses HTML and returns a root element.
 *
 * The input is wrapped in a synthetic `<documentfragmentcontainer>` element and
 * scanned with `kMarkupPattern`; text between markup becomes `TextNode`s,
 * comments become `CommentNode`s (only when `options.comment` is set) and tags
 * become `HTMLElement`s.
 *
 * @param {string} data html
 * @param {Object} [options]
 * @param {boolean} [options.lowerCaseTagName] convert tag names to lower case
 * @param {boolean} [options.comment] keep comments as `CommentNode`s
 * @param {Object} [options.blockTextElements] tag name -> boolean. The content of
 *   every listed element is not parsed as markup; when the value is truthy the
 *   raw content is kept as a single `TextNode`, otherwise it is dropped.
 *   Defaults to `script`, `noscript`, `style` and `pre`, all kept.
 * @param {boolean} [options.noFix] when truthy, no repair is attempted and a
 *   `TextNode` holding the scanned string is returned instead of the tree
 * @return {HTMLElement|TextNode} the root element (or the `TextNode` described
 *   above), with a boolean `valid` property that is `true` when every opened
 *   element had been closed by the end of the input
 */
export function parse(data, options = { lowerCaseTagName: false, comment: false }) {
    const elements = options.blockTextElements || {
        script: true,
        noscript: true,
        style: true,
        pre: true
    };
    const element_names = Object.keys(elements);
    // Anchor the pattern so only the exact tag name matches. Unanchored, `pre`
    // also matched tags such as <prefix>, and `script` matched <noscript>.
    const toTagPattern = (name) => new RegExp(`^(?:${name})$`, 'i');
    const kBlockTextElements = element_names.map((it) => toTagPattern(it));
    const kIgnoreElements = element_names
        .filter((it) => elements[it])
        .map((it) => toTagPattern(it));
    // True when the raw content of a block text element must be kept as text.
    function element_should_be_ignore(tag) {
        return kIgnoreElements.some((it) => it.test(tag));
    }
    // True when the content of `tag` must not be parsed as markup.
    function is_block_text_element(tag) {
        return kBlockTextElements.some((it) => it.test(tag));
    }
    const root = new HTMLElement(null, {});
    let currentParent = root;
    const stack = [root];
    let lastTextPos = -1;
    let match;
    // Lower-cased copy of `data`, built lazily and at most once.
    let lowerCasedData;
    // https://github.com/taoqf/node-html-parser/issues/38
    const closingFrame = `</${frameflag}>`;
    data = `<${frameflag}>${data}${closingFrame}`;
    // Offset at which the synthetic closing wrapper starts.
    const contentEnd = data.length - closingFrame.length;
    while ((match = kMarkupPattern.exec(data))) {
        const markupStart = kMarkupPattern.lastIndex - match[0].length;
        if (lastTextPos > -1 && lastTextPos < markupStart) {
            // There is text between the previous markup and this one.
            currentParent.appendChild(new TextNode(data.substring(lastTextPos, markupStart)));
        }
        lastTextPos = kMarkupPattern.lastIndex;
        if (match[2] === frameflag) {
            // ignore container tag we add above
            continue;
        }
        if (match[0][1] === '!') {
            // this is a comment
            if (options.comment) {
                // Only keep what is in between <!-- and -->
                const text = data.substring(markupStart + 4, lastTextPos - 3);
                currentParent.appendChild(new CommentNode(text));
            }
            continue;
        }
        if (options.lowerCaseTagName) {
            match[2] = match[2].toLowerCase();
        }
        if (!match[1]) {
            // not </ tags
            const attrs = {};
            for (let attMatch; (attMatch = kAttributePattern.exec(match[3]));) {
                attrs[attMatch[2].toLowerCase()] = attMatch[4] || attMatch[5] || attMatch[6];
            }
            const closedByOpening = kElementsClosedByOpening[currentParent.rawTagName];
            if (!match[4] && closedByOpening && closedByOpening[match[2]]) {
                // The new tag implicitly closes the currently open element.
                stack.pop();
                currentParent = arr_back(stack);
            }
            currentParent = currentParent.appendChild(new HTMLElement(match[2], attrs, match[3]));
            stack.push(currentParent);
            if (is_block_text_element(match[2])) {
                // a little test to find next </script> or </style> ...
                const closeMarkup = `</${match[2]}>`;
                const contentStart = kMarkupPattern.lastIndex;
                let index;
                if (options.lowerCaseTagName) {
                    if (lowerCasedData === undefined) {
                        lowerCasedData = data.toLocaleLowerCase();
                    }
                    index = lowerCasedData.indexOf(closeMarkup, contentStart);
                }
                else {
                    index = data.indexOf(closeMarkup, contentStart);
                }
                if (element_should_be_ignore(match[2])) {
                    // Without a matching end tag the text runs to the end of the
                    // input, but must stop before the synthetic closing wrapper.
                    // Math.max guards the case where the opening tag itself
                    // already extends into the wrapper.
                    const textEnd = index === -1 ? Math.max(contentStart, contentEnd) : index;
                    const text = data.substring(contentStart, textEnd);
                    if (text.length > 0) {
                        currentParent.appendChild(new TextNode(text));
                    }
                }
                if (index === -1) {
                    // Nothing left to scan: move past the end so the loop stops.
                    lastTextPos = kMarkupPattern.lastIndex = data.length + 1;
                }
                else {
                    lastTextPos = kMarkupPattern.lastIndex = index + closeMarkup.length;
                    // The end tag was consumed here; handle it as a closing tag below.
                    match[1] = 'true';
                }
            }
        }
        if (match[1] || match[4] || kSelfClosingElements[match[2]]) {
            // </ or /> or <br> etc.
            while (true) {
                if (currentParent.rawTagName === match[2]) {
                    stack.pop();
                    currentParent = arr_back(stack);
                    break;
                }
                // Trying to close current tag, and move on
                const closedByClosing = kElementsClosedByClosing[currentParent.tagName];
                if (closedByClosing && closedByClosing[match[2]]) {
                    stack.pop();
                    currentParent = arr_back(stack);
                    continue;
                }
                // Use aggressive strategy to handle unmatching markups.
                break;
            }
        }
    }
    // Only the root is left on the stack when every element was closed.
    const valid = stack.length === 1;
    if (!options.noFix) {
        const response = root;
        response.valid = valid;
        while (stack.length > 1) {
            // Handle each error elements.
            const last = stack.pop();
            const oneBefore = arr_back(stack);
            // If it's final element (no grandparent) just skip.
            if (last.parentNode && last.parentNode.parentNode) {
                if (last.parentNode === oneBefore && last.tagName === oneBefore.tagName) {
                    // Pair error case <h3> <h3> handle : Fixes to <h3> </h3>
                    oneBefore.removeChild(last);
                    last.childNodes.forEach((child) => {
                        oneBefore.parentNode.appendChild(child);
                    });
                    stack.pop();
                }
                else {
                    // Single error <div> <h3> </div> handle: Just removes <h3>
                    oneBefore.removeChild(last);
                    last.childNodes.forEach((child) => {
                        oneBefore.appendChild(child);
                    });
                }
            }
        }
        // Detach the top-level elements from the root.
        response.childNodes.forEach((node) => {
            if (node instanceof HTMLElement) {
                node.parentNode = null;
            }
        });
        return response;
    }
    // NOTE(review): `data` here still includes the synthetic wrapper tags added
    // above - confirm whether callers of `noFix` expect that.
    const response = new TextNode(data);
    response.valid = valid;
    return response;
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import NodeType from './type';
|
|
2
|
+
import Node from './node';
|
|
3
|
+
/**
 * TextNode to contain a text element in DOM tree.
 * @param {string} rawText text content of the node, stored as given
 */
export default class TextNode extends Node {
    constructor(rawText) {
        super();
        this.rawText = rawText;
        /**
         * Node Type declaration.
         * @type {Number}
         */
        this.nodeType = NodeType.TEXT_NODE;
    }
    /**
     * Get the text value of current node.
     * @return {string} text content (the stored raw text, returned unchanged)
     */
    get text() {
        return this.rawText;
    }
    /**
     * Detect if the node contains only white space.
     * @return {bool}
     */
    get isWhitespace() {
        // `\s` already covers both the plain space and the no-break space, so a
        // second alternative for a space character only made the group ambiguous
        // and caused exponential backtracking on long whitespace runs followed by
        // a non-whitespace character. This single class accepts the same strings.
        // NOTE(review): the dropped alternative may originally have been the
        // literal entity "&nbsp;" - confirm against the upstream source.
        return /^\s*$/.test(this.rawText);
    }
    toString() {
        return this.text;
    }
}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
/**
 * Node type codes, stored as a two-way map: each name maps to its numeric
 * code and each numeric code maps back to its name.
 */
const NodeType = {};
[
    ['ELEMENT_NODE', 1],
    ['TEXT_NODE', 3],
    ['COMMENT_NODE', 8]
].forEach(([label, code]) => {
    NodeType[label] = code;
    NodeType[code] = label;
});
export default NodeType;
|
package/dist/main.js
CHANGED
|
@@ -1084,19 +1084,34 @@ define("nodes/html", ["require", "exports", "he", "nodes/node", "nodes/type", "n
|
|
|
1084
1084
|
th: { tr: true, table: true, TR: true, TABLE: true },
|
|
1085
1085
|
TH: { tr: true, table: true, TR: true, TABLE: true }
|
|
1086
1086
|
};
|
|
1087
|
-
var kBlockTextElements = {
|
|
1088
|
-
script: true,
|
|
1089
|
-
SCRIPT: true,
|
|
1090
|
-
noscript: true,
|
|
1091
|
-
NOSCRIPT: true,
|
|
1092
|
-
style: true,
|
|
1093
|
-
STYLE: true,
|
|
1094
|
-
pre: true,
|
|
1095
|
-
PRE: true
|
|
1096
|
-
};
|
|
1097
1087
|
var frameflag = 'documentfragmentcontainer';
|
|
1098
1088
|
function parse(data, options) {
|
|
1099
|
-
if (options === void 0) { options = {}; }
|
|
1089
|
+
if (options === void 0) { options = { lowerCaseTagName: false, comment: false }; }
|
|
1090
|
+
var elements = options.blockTextElements || {
|
|
1091
|
+
script: true,
|
|
1092
|
+
noscript: true,
|
|
1093
|
+
style: true,
|
|
1094
|
+
pre: true
|
|
1095
|
+
};
|
|
1096
|
+
var element_names = Object.keys(elements);
|
|
1097
|
+
var kBlockTextElements = element_names.map(function (it) {
|
|
1098
|
+
return new RegExp(it, 'i');
|
|
1099
|
+
});
|
|
1100
|
+
var kIgnoreElements = element_names.filter(function (it) {
|
|
1101
|
+
return elements[it];
|
|
1102
|
+
}).map(function (it) {
|
|
1103
|
+
return new RegExp(it, 'i');
|
|
1104
|
+
});
|
|
1105
|
+
function element_should_be_ignore(tag) {
|
|
1106
|
+
return kIgnoreElements.some(function (it) {
|
|
1107
|
+
return it.test(tag);
|
|
1108
|
+
});
|
|
1109
|
+
}
|
|
1110
|
+
function is_block_text_element(tag) {
|
|
1111
|
+
return kBlockTextElements.some(function (it) {
|
|
1112
|
+
return it.test(tag);
|
|
1113
|
+
});
|
|
1114
|
+
}
|
|
1100
1115
|
var root = new HTMLElement(null, {});
|
|
1101
1116
|
var currentParent = root;
|
|
1102
1117
|
var stack = [root];
|
|
@@ -1145,7 +1160,7 @@ define("nodes/html", ["require", "exports", "he", "nodes/node", "nodes/type", "n
|
|
|
1145
1160
|
// https://github.com/taoqf/node-html-parser/issues/38
|
|
1146
1161
|
currentParent = currentParent.appendChild(new HTMLElement(match[2], attrs, match[3]));
|
|
1147
1162
|
stack.push(currentParent);
|
|
1148
|
-
if (
|
|
1163
|
+
if (is_block_text_element(match[2])) {
|
|
1149
1164
|
// a little test to find next </script> or </style> ...
|
|
1150
1165
|
var closeMarkup_1 = "</" + match[2] + ">";
|
|
1151
1166
|
var index = (function () {
|
|
@@ -1154,16 +1169,18 @@ define("nodes/html", ["require", "exports", "he", "nodes/node", "nodes/type", "n
|
|
|
1154
1169
|
}
|
|
1155
1170
|
return data.indexOf(closeMarkup_1, kMarkupPattern.lastIndex);
|
|
1156
1171
|
})();
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1172
|
+
if (element_should_be_ignore(match[2])) {
|
|
1173
|
+
var text = void 0;
|
|
1174
|
+
if (index === -1) {
|
|
1175
|
+
// there is no matching ending for the text element.
|
|
1176
|
+
text = data.substr(kMarkupPattern.lastIndex);
|
|
1177
|
+
}
|
|
1178
|
+
else {
|
|
1179
|
+
text = data.substring(kMarkupPattern.lastIndex, index);
|
|
1180
|
+
}
|
|
1181
|
+
if (text.length > 0) {
|
|
1182
|
+
currentParent.appendChild(new text_1.default(text));
|
|
1183
|
+
}
|
|
1167
1184
|
}
|
|
1168
1185
|
if (index === -1) {
|
|
1169
1186
|
lastTextPos = kMarkupPattern.lastIndex = data.length + 1;
|
package/dist/nodes/html.d.ts
CHANGED
|
@@ -150,8 +150,11 @@ export default class HTMLElement extends Node {
|
|
|
150
150
|
insertAdjacentHTML(where: InsertPosition, html: string): void;
|
|
151
151
|
}
|
|
152
152
|
export interface Options {
|
|
153
|
-
lowerCaseTagName
|
|
154
|
-
comment
|
|
153
|
+
lowerCaseTagName: boolean;
|
|
154
|
+
comment: boolean;
|
|
155
|
+
blockTextElements: {
|
|
156
|
+
[tag: string]: boolean;
|
|
157
|
+
};
|
|
155
158
|
}
|
|
156
159
|
/**
|
|
157
160
|
* Parses HTML and returns a root element
|
|
@@ -159,15 +162,15 @@ export interface Options {
|
|
|
159
162
|
* @param {string} data html
|
|
160
163
|
* @return {HTMLElement} root element
|
|
161
164
|
*/
|
|
162
|
-
export declare function parse(data: string, options?: Options): HTMLElement & {
|
|
165
|
+
export declare function parse(data: string, options?: Partial<Options>): HTMLElement & {
|
|
163
166
|
valid: boolean;
|
|
164
167
|
};
|
|
165
|
-
export declare function parse(data: string, options?: Options & {
|
|
168
|
+
export declare function parse(data: string, options?: Partial<Options> & {
|
|
166
169
|
noFix: false;
|
|
167
170
|
}): HTMLElement & {
|
|
168
171
|
valid: boolean;
|
|
169
172
|
};
|
|
170
|
-
export declare function parse(data: string, options?: Options & {
|
|
173
|
+
export declare function parse(data: string, options?: Partial<Options> & {
|
|
171
174
|
noFix: true;
|
|
172
175
|
}): (HTMLElement | TextNode) & {
|
|
173
176
|
valid: boolean;
|
package/dist/nodes/html.js
CHANGED
|
@@ -698,19 +698,34 @@ var kElementsClosedByClosing = {
|
|
|
698
698
|
th: { tr: true, table: true, TR: true, TABLE: true },
|
|
699
699
|
TH: { tr: true, table: true, TR: true, TABLE: true }
|
|
700
700
|
};
|
|
701
|
-
var kBlockTextElements = {
|
|
702
|
-
script: true,
|
|
703
|
-
SCRIPT: true,
|
|
704
|
-
noscript: true,
|
|
705
|
-
NOSCRIPT: true,
|
|
706
|
-
style: true,
|
|
707
|
-
STYLE: true,
|
|
708
|
-
pre: true,
|
|
709
|
-
PRE: true
|
|
710
|
-
};
|
|
711
701
|
var frameflag = 'documentfragmentcontainer';
|
|
712
702
|
function parse(data, options) {
|
|
713
|
-
if (options === void 0) { options = {}; }
|
|
703
|
+
if (options === void 0) { options = { lowerCaseTagName: false, comment: false }; }
|
|
704
|
+
var elements = options.blockTextElements || {
|
|
705
|
+
script: true,
|
|
706
|
+
noscript: true,
|
|
707
|
+
style: true,
|
|
708
|
+
pre: true
|
|
709
|
+
};
|
|
710
|
+
var element_names = Object.keys(elements);
|
|
711
|
+
var kBlockTextElements = element_names.map(function (it) {
|
|
712
|
+
return new RegExp(it, 'i');
|
|
713
|
+
});
|
|
714
|
+
var kIgnoreElements = element_names.filter(function (it) {
|
|
715
|
+
return elements[it];
|
|
716
|
+
}).map(function (it) {
|
|
717
|
+
return new RegExp(it, 'i');
|
|
718
|
+
});
|
|
719
|
+
function element_should_be_ignore(tag) {
|
|
720
|
+
return kIgnoreElements.some(function (it) {
|
|
721
|
+
return it.test(tag);
|
|
722
|
+
});
|
|
723
|
+
}
|
|
724
|
+
function is_block_text_element(tag) {
|
|
725
|
+
return kBlockTextElements.some(function (it) {
|
|
726
|
+
return it.test(tag);
|
|
727
|
+
});
|
|
728
|
+
}
|
|
714
729
|
var root = new HTMLElement(null, {});
|
|
715
730
|
var currentParent = root;
|
|
716
731
|
var stack = [root];
|
|
@@ -759,7 +774,7 @@ function parse(data, options) {
|
|
|
759
774
|
// https://github.com/taoqf/node-html-parser/issues/38
|
|
760
775
|
currentParent = currentParent.appendChild(new HTMLElement(match[2], attrs, match[3]));
|
|
761
776
|
stack.push(currentParent);
|
|
762
|
-
if (
|
|
777
|
+
if (is_block_text_element(match[2])) {
|
|
763
778
|
// a little test to find next </script> or </style> ...
|
|
764
779
|
var closeMarkup_1 = "</" + match[2] + ">";
|
|
765
780
|
var index = (function () {
|
|
@@ -768,16 +783,18 @@ function parse(data, options) {
|
|
|
768
783
|
}
|
|
769
784
|
return data.indexOf(closeMarkup_1, kMarkupPattern.lastIndex);
|
|
770
785
|
})();
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
786
|
+
if (element_should_be_ignore(match[2])) {
|
|
787
|
+
var text = void 0;
|
|
788
|
+
if (index === -1) {
|
|
789
|
+
// there is no matching ending for the text element.
|
|
790
|
+
text = data.substr(kMarkupPattern.lastIndex);
|
|
791
|
+
}
|
|
792
|
+
else {
|
|
793
|
+
text = data.substring(kMarkupPattern.lastIndex, index);
|
|
794
|
+
}
|
|
795
|
+
if (text.length > 0) {
|
|
796
|
+
currentParent.appendChild(new text_1.default(text));
|
|
797
|
+
}
|
|
781
798
|
}
|
|
782
799
|
if (index === -1) {
|
|
783
800
|
lastTextPos = kMarkupPattern.lastIndex = data.length + 1;
|
package/package.json
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "node-html-parser",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.4.2",
|
|
4
4
|
"description": "A very fast HTML parser, generating a simplified DOM, with basic element query support.",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
|
+
"module": "dist/esm/index.js",
|
|
7
|
+
"browser": "dist/main.js",
|
|
6
8
|
"types": "dist/index.d.ts",
|
|
7
9
|
"scripts": {
|
|
8
10
|
"test": "mocha",
|
|
@@ -10,7 +12,8 @@
|
|
|
10
12
|
"clean": "del-cli ./dist/",
|
|
11
13
|
"ts:cjs": "tsc -m commonjs",
|
|
12
14
|
"ts:umd": "tsc -t es5 -m amd -d false --outFile ./dist/main.js",
|
|
13
|
-
"
|
|
15
|
+
"ts:esm": "tsc -t esnext -m esnext -d false --outDir ./dist/esm/",
|
|
16
|
+
"build": "npm run lint && npm run clean && npm run ts:cjs && npm run ts:umd && npm run ts:esm",
|
|
14
17
|
"dev": "tsc -w",
|
|
15
18
|
"pretest": "tsc -m commonjs"
|
|
16
19
|
},
|