node-html-parser 5.3.3 → 5.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -11
- package/dist/main.js +42 -7
- package/dist/nodes/html.d.ts +13 -1
- package/dist/nodes/html.js +10 -6
- package/dist/void-tag.d.ts +7 -0
- package/dist/void-tag.js +29 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -19,18 +19,20 @@ npm install --save node-html-parser
|
|
|
19
19
|
|
|
20
20
|
## Performance
|
|
21
21
|
|
|
22
|
+
-- 2022-08-10
|
|
23
|
+
|
|
22
24
|
```shell
|
|
23
|
-
html-parser :24.
|
|
24
|
-
htmljs-parser :4.
|
|
25
|
-
html-dom-parser :2.
|
|
26
|
-
html5parser :1.
|
|
27
|
-
cheerio :12.
|
|
28
|
-
parse5 :6.
|
|
29
|
-
htmlparser2 :2.
|
|
30
|
-
htmlparser :17.
|
|
31
|
-
high5 :3.
|
|
32
|
-
node-html-parser:2.
|
|
33
|
-
node-html-parser (last release):2.
|
|
25
|
+
html-parser :24.1595 ms/file ± 18.7667
|
|
26
|
+
htmljs-parser :4.72064 ms/file ± 5.67689
|
|
27
|
+
html-dom-parser :2.18055 ms/file ± 2.96136
|
|
28
|
+
html5parser :1.69639 ms/file ± 2.17111
|
|
29
|
+
cheerio :12.2122 ms/file ± 8.10916
|
|
30
|
+
parse5 :6.50626 ms/file ± 4.02352
|
|
31
|
+
htmlparser2 :2.38179 ms/file ± 3.42389
|
|
32
|
+
htmlparser :17.4820 ms/file ± 128.041
|
|
33
|
+
high5 :3.95188 ms/file ± 2.52313
|
|
34
|
+
node-html-parser:2.04288 ms/file ± 1.25203
|
|
35
|
+
node-html-parser (last release):2.00527 ms/file ± 1.21317
|
|
34
36
|
```
|
|
35
37
|
|
|
36
38
|
Tested with [htmlparser-benchmark](https://github.com/AndreasMadsen/htmlparser-benchmark).
|
|
@@ -82,6 +84,10 @@ Parse the data provided, and return the root of the generated DOM.
|
|
|
82
84
|
{
|
|
83
85
|
lowerCaseTagName: false, // convert tag name to lower case (hurts performance heavily)
|
|
84
86
|
comment: false, // retrieve comments (hurts performance slightly)
|
|
87
|
+
voidTag:{
|
|
88
|
+
tags: ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'], // optional and case insensitive, default value is ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr']
|
|
89
|
+
closingSlash: true // optional, default false. void tag serialisation, add a final slash <br/>
|
|
90
|
+
},
|
|
85
91
|
blockTextElements: {
|
|
86
92
|
script: true, // keep text content when parsing
|
|
87
93
|
noscript: true, // keep text content when parsing
|
package/dist/main.js
CHANGED
|
@@ -213,6 +213,37 @@ define("matcher", ["require", "exports", "nodes/type"], function (require, expor
|
|
|
213
213
|
findAll: findAll
|
|
214
214
|
};
|
|
215
215
|
});
|
|
216
|
+
define("void-tag", ["require", "exports"], function (require, exports) {
|
|
217
|
+
"use strict";
|
|
218
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
219
|
+
var VoidTag = /** @class */ (function () {
|
|
220
|
+
function VoidTag(addClosingSlash, tags) {
|
|
221
|
+
if (addClosingSlash === void 0) { addClosingSlash = false; }
|
|
222
|
+
this.addClosingSlash = addClosingSlash;
|
|
223
|
+
if (Array.isArray(tags)) {
|
|
224
|
+
this.voidTags = tags.reduce(function (set, tag) {
|
|
225
|
+
return set.add(tag.toLowerCase());
|
|
226
|
+
}, new Set());
|
|
227
|
+
}
|
|
228
|
+
else {
|
|
229
|
+
this.voidTags = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'].reduce(function (set, tag) {
|
|
230
|
+
return set.add(tag);
|
|
231
|
+
}, new Set());
|
|
232
|
+
}
|
|
233
|
+
}
|
|
234
|
+
VoidTag.prototype.formatNode = function (tag, attrs, innerHTML) {
|
|
235
|
+
var addClosingSlash = this.addClosingSlash;
|
|
236
|
+
var closingSpace = (addClosingSlash && attrs && !attrs.endsWith(' ')) ? ' ' : '';
|
|
237
|
+
var closingSlash = addClosingSlash ? "".concat(closingSpace, "/") : '';
|
|
238
|
+
return this.isVoidElement(tag.toLowerCase()) ? "<".concat(tag).concat(attrs).concat(closingSlash, ">") : "<".concat(tag).concat(attrs, ">").concat(innerHTML, "</").concat(tag, ">");
|
|
239
|
+
};
|
|
240
|
+
VoidTag.prototype.isVoidElement = function (tag) {
|
|
241
|
+
return this.voidTags.has(tag);
|
|
242
|
+
};
|
|
243
|
+
return VoidTag;
|
|
244
|
+
}());
|
|
245
|
+
exports.default = VoidTag;
|
|
246
|
+
});
|
|
216
247
|
define("nodes/text", ["require", "exports", "he", "nodes/node", "nodes/type"], function (require, exports, he_2, node_1, type_2) {
|
|
217
248
|
"use strict";
|
|
218
249
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
@@ -338,18 +369,18 @@ define("nodes/text", ["require", "exports", "he", "nodes/node", "nodes/type"], f
|
|
|
338
369
|
return (hasLeadingSpace ? ' ' : '') + text.slice(startPos, endPos + 1) + (hasTrailingSpace ? ' ' : '');
|
|
339
370
|
}
|
|
340
371
|
});
|
|
341
|
-
define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher", "nodes/comment", "nodes/node", "nodes/text", "nodes/type"], function (require, exports, css_select_1, he_3, back_1, matcher_1, comment_1, node_2, text_1, type_3) {
|
|
372
|
+
define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher", "void-tag", "nodes/comment", "nodes/node", "nodes/text", "nodes/type"], function (require, exports, css_select_1, he_3, back_1, matcher_1, void_tag_1, comment_1, node_2, text_1, type_3) {
|
|
342
373
|
"use strict";
|
|
343
374
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
344
375
|
exports.parse = exports.base_parse = void 0;
|
|
345
376
|
he_3 = __importDefault(he_3);
|
|
346
377
|
back_1 = __importDefault(back_1);
|
|
347
378
|
matcher_1 = __importDefault(matcher_1);
|
|
379
|
+
void_tag_1 = __importDefault(void_tag_1);
|
|
348
380
|
comment_1 = __importDefault(comment_1);
|
|
349
381
|
node_2 = __importDefault(node_2);
|
|
350
382
|
text_1 = __importDefault(text_1);
|
|
351
383
|
type_3 = __importDefault(type_3);
|
|
352
|
-
var voidTags = new Set(['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr']);
|
|
353
384
|
function decode(val) {
|
|
354
385
|
// clone string
|
|
355
386
|
return JSON.parse(JSON.stringify(he_3.default.decode(val)));
|
|
@@ -456,10 +487,12 @@ define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher
|
|
|
456
487
|
*
|
|
457
488
|
* @memberof HTMLElement
|
|
458
489
|
*/
|
|
459
|
-
function HTMLElement(tagName, keyAttrs, rawAttrs, parentNode, range) {
|
|
490
|
+
function HTMLElement(tagName, keyAttrs, rawAttrs, parentNode, range, voidTag) {
|
|
460
491
|
if (rawAttrs === void 0) { rawAttrs = ''; }
|
|
492
|
+
if (voidTag === void 0) { voidTag = new void_tag_1.default(); }
|
|
461
493
|
var _this = _super.call(this, parentNode, range) || this;
|
|
462
494
|
_this.rawAttrs = rawAttrs;
|
|
495
|
+
_this.voidTag = voidTag;
|
|
463
496
|
/**
|
|
464
497
|
* Node Type declaration.
|
|
465
498
|
*/
|
|
@@ -543,7 +576,7 @@ define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher
|
|
|
543
576
|
});
|
|
544
577
|
Object.defineProperty(HTMLElement.prototype, "isVoidElement", {
|
|
545
578
|
get: function () {
|
|
546
|
-
return voidTags.has(this.localName);
|
|
579
|
+
return this.voidTag.isVoidElement(this.localName);
|
|
547
580
|
},
|
|
548
581
|
enumerable: false,
|
|
549
582
|
configurable: true
|
|
@@ -636,7 +669,7 @@ define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher
|
|
|
636
669
|
var tag = this.rawTagName;
|
|
637
670
|
if (tag) {
|
|
638
671
|
var attrs = this.rawAttrs ? " ".concat(this.rawAttrs) : '';
|
|
639
|
-
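return this.isVoidElement ? "<".concat(tag).concat(attrs, ">") : "<".concat(tag).concat(attrs, ">").concat(this.innerHTML, "</").concat(tag, ">");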
|
|
672
|
+
return this.voidTag.formatNode(tag, attrs, this.innerHTML);
|
|
640
673
|
}
|
|
641
674
|
return this.innerHTML;
|
|
642
675
|
};
|
|
@@ -1333,7 +1366,9 @@ define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher
|
|
|
1333
1366
|
* @return {HTMLElement} root element
|
|
1334
1367
|
*/
|
|
1335
1368
|
function base_parse(data, options) {
|
|
1369
|
+
var _a, _b;
|
|
1336
1370
|
if (options === void 0) { options = { lowerCaseTagName: false, comment: false }; }
|
|
1371
|
+
var voidTag = new void_tag_1.default((_a = options === null || options === void 0 ? void 0 : options.voidTag) === null || _a === void 0 ? void 0 : _a.closingSlash, (_b = options === null || options === void 0 ? void 0 : options.voidTag) === null || _b === void 0 ? void 0 : _b.tags);
|
|
1337
1372
|
var elements = options.blockTextElements || {
|
|
1338
1373
|
script: true,
|
|
1339
1374
|
noscript: true,
|
|
@@ -1350,7 +1385,7 @@ define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher
|
|
|
1350
1385
|
return kBlockTextElements.some(function (it) { return it.test(tag); });
|
|
1351
1386
|
}
|
|
1352
1387
|
var createRange = function (startPos, endPos) { return [startPos - frameFlagOffset, endPos - frameFlagOffset]; };
|
|
1353
|
-
var root = new HTMLElement(null, {}, '', null, [0, data.length]);
|
|
1388
|
+
var root = new HTMLElement(null, {}, '', null, [0, data.length], voidTag);
|
|
1354
1389
|
var currentParent = root;
|
|
1355
1390
|
var stack = [root];
|
|
1356
1391
|
var lastTextPos = -1;
|
|
@@ -1421,7 +1456,7 @@ define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher
|
|
|
1421
1456
|
var tagStartPos_1 = tagEndPos_1 - matchLength;
|
|
1422
1457
|
currentParent = currentParent.appendChild(
|
|
1423
1458
|
// Initialize range (end position updated later for closed tags)
|
|
1424
|
-
new HTMLElement(tagName, attrs, attributes.slice(1), null, createRange(tagStartPos_1, tagEndPos_1)));
|
|
1459
|
+
new HTMLElement(tagName, attrs, attributes.slice(1), null, createRange(tagStartPos_1, tagEndPos_1), voidTag));
|
|
1425
1460
|
stack.push(currentParent);
|
|
1426
1461
|
if (is_block_text_element(tagName)) {
|
|
1427
1462
|
// Find closing tag
|
package/dist/nodes/html.d.ts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import VoidTag from '../void-tag';
|
|
1
2
|
import Node from './node';
|
|
2
3
|
import NodeType from './type';
|
|
3
4
|
export interface KeyAttributes {
|
|
@@ -37,6 +38,7 @@ declare class DOMTokenList {
|
|
|
37
38
|
*/
|
|
38
39
|
export default class HTMLElement extends Node {
|
|
39
40
|
rawAttrs: string;
|
|
41
|
+
private voidTag;
|
|
40
42
|
private _attrs;
|
|
41
43
|
private _rawAttrs;
|
|
42
44
|
rawTagName: string;
|
|
@@ -59,7 +61,7 @@ export default class HTMLElement extends Node {
|
|
|
59
61
|
*
|
|
60
62
|
* @memberof HTMLElement
|
|
61
63
|
*/
|
|
62
|
-
constructor(tagName: string, keyAttrs: KeyAttributes, rawAttrs: string, parentNode: HTMLElement | null, range: [number, number]);
|
|
64
|
+
constructor(tagName: string, keyAttrs: KeyAttributes, rawAttrs: string, parentNode: HTMLElement | null, range: [number, number], voidTag?: VoidTag);
|
|
63
65
|
/**
|
|
64
66
|
* Remove Child element from childNodes array
|
|
65
67
|
* @param {HTMLElement} node node to remove
|
|
@@ -205,6 +207,16 @@ export interface Options {
|
|
|
205
207
|
blockTextElements: {
|
|
206
208
|
[tag: string]: boolean;
|
|
207
209
|
};
|
|
210
|
+
voidTag?: {
|
|
211
|
+
/**
|
|
212
|
+
* optional, default value is ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr']
|
|
213
|
+
*/
|
|
214
|
+
tags?: string[];
|
|
215
|
+
/**
|
|
216
|
+
* void tag serialisation, add a final slash <br/>
|
|
217
|
+
*/
|
|
218
|
+
closingSlash?: boolean;
|
|
219
|
+
};
|
|
208
220
|
}
|
|
209
221
|
/**
|
|
210
222
|
* Parses HTML and returns a root element
|
package/dist/nodes/html.js
CHANGED
|
@@ -43,11 +43,11 @@ var css_select_1 = require("css-select");
|
|
|
43
43
|
var he_1 = __importDefault(require("he"));
|
|
44
44
|
var back_1 = __importDefault(require("../back"));
|
|
45
45
|
var matcher_1 = __importDefault(require("../matcher"));
|
|
46
|
+
var void_tag_1 = __importDefault(require("../void-tag"));
|
|
46
47
|
var comment_1 = __importDefault(require("./comment"));
|
|
47
48
|
var node_1 = __importDefault(require("./node"));
|
|
48
49
|
var text_1 = __importDefault(require("./text"));
|
|
49
50
|
var type_1 = __importDefault(require("./type"));
|
|
50
|
-
var voidTags = new Set(['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr']);
|
|
51
51
|
function decode(val) {
|
|
52
52
|
// clone string
|
|
53
53
|
return JSON.parse(JSON.stringify(he_1.default.decode(val)));
|
|
@@ -154,10 +154,12 @@ var HTMLElement = /** @class */ (function (_super) {
|
|
|
154
154
|
*
|
|
155
155
|
* @memberof HTMLElement
|
|
156
156
|
*/
|
|
157
|
-
function HTMLElement(tagName, keyAttrs, rawAttrs, parentNode, range) {
|
|
157
|
+
function HTMLElement(tagName, keyAttrs, rawAttrs, parentNode, range, voidTag) {
|
|
158
158
|
if (rawAttrs === void 0) { rawAttrs = ''; }
|
|
159
|
+
if (voidTag === void 0) { voidTag = new void_tag_1.default(); }
|
|
159
160
|
var _this = _super.call(this, parentNode, range) || this;
|
|
160
161
|
_this.rawAttrs = rawAttrs;
|
|
162
|
+
_this.voidTag = voidTag;
|
|
161
163
|
/**
|
|
162
164
|
* Node Type declaration.
|
|
163
165
|
*/
|
|
@@ -241,7 +243,7 @@ var HTMLElement = /** @class */ (function (_super) {
|
|
|
241
243
|
});
|
|
242
244
|
Object.defineProperty(HTMLElement.prototype, "isVoidElement", {
|
|
243
245
|
get: function () {
|
|
244
|
-
return voidTags.has(this.localName);
|
|
246
|
+
return this.voidTag.isVoidElement(this.localName);
|
|
245
247
|
},
|
|
246
248
|
enumerable: false,
|
|
247
249
|
configurable: true
|
|
@@ -334,7 +336,7 @@ var HTMLElement = /** @class */ (function (_super) {
|
|
|
334
336
|
var tag = this.rawTagName;
|
|
335
337
|
if (tag) {
|
|
336
338
|
var attrs = this.rawAttrs ? " ".concat(this.rawAttrs) : '';
|
|
337
|
-
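return this.isVoidElement ? "<".concat(tag).concat(attrs, ">") : "<".concat(tag).concat(attrs, ">").concat(this.innerHTML, "</").concat(tag, ">");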
|
|
339
|
+
return this.voidTag.formatNode(tag, attrs, this.innerHTML);
|
|
338
340
|
}
|
|
339
341
|
return this.innerHTML;
|
|
340
342
|
};
|
|
@@ -1031,7 +1033,9 @@ var frameflag = 'documentfragmentcontainer';
|
|
|
1031
1033
|
* @return {HTMLElement} root element
|
|
1032
1034
|
*/
|
|
1033
1035
|
function base_parse(data, options) {
|
|
1036
|
+
var _a, _b;
|
|
1034
1037
|
if (options === void 0) { options = { lowerCaseTagName: false, comment: false }; }
|
|
1038
|
+
var voidTag = new void_tag_1.default((_a = options === null || options === void 0 ? void 0 : options.voidTag) === null || _a === void 0 ? void 0 : _a.closingSlash, (_b = options === null || options === void 0 ? void 0 : options.voidTag) === null || _b === void 0 ? void 0 : _b.tags);
|
|
1035
1039
|
var elements = options.blockTextElements || {
|
|
1036
1040
|
script: true,
|
|
1037
1041
|
noscript: true,
|
|
@@ -1048,7 +1052,7 @@ function base_parse(data, options) {
|
|
|
1048
1052
|
return kBlockTextElements.some(function (it) { return it.test(tag); });
|
|
1049
1053
|
}
|
|
1050
1054
|
var createRange = function (startPos, endPos) { return [startPos - frameFlagOffset, endPos - frameFlagOffset]; };
|
|
1051
|
-
var root = new HTMLElement(null, {}, '', null, [0, data.length]);
|
|
1055
|
+
var root = new HTMLElement(null, {}, '', null, [0, data.length], voidTag);
|
|
1052
1056
|
var currentParent = root;
|
|
1053
1057
|
var stack = [root];
|
|
1054
1058
|
var lastTextPos = -1;
|
|
@@ -1119,7 +1123,7 @@ function base_parse(data, options) {
|
|
|
1119
1123
|
var tagStartPos_1 = tagEndPos_1 - matchLength;
|
|
1120
1124
|
currentParent = currentParent.appendChild(
|
|
1121
1125
|
// Initialize range (end position updated later for closed tags)
|
|
1122
|
-
new HTMLElement(tagName, attrs, attributes.slice(1), null, createRange(tagStartPos_1, tagEndPos_1)));
|
|
1126
|
+
new HTMLElement(tagName, attrs, attributes.slice(1), null, createRange(tagStartPos_1, tagEndPos_1), voidTag));
|
|
1123
1127
|
stack.push(currentParent);
|
|
1124
1128
|
if (is_block_text_element(tagName)) {
|
|
1125
1129
|
// Find closing tag
|
package/dist/void-tag.js
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
var VoidTag = /** @class */ (function () {
|
|
4
|
+
function VoidTag(addClosingSlash, tags) {
|
|
5
|
+
if (addClosingSlash === void 0) { addClosingSlash = false; }
|
|
6
|
+
this.addClosingSlash = addClosingSlash;
|
|
7
|
+
if (Array.isArray(tags)) {
|
|
8
|
+
this.voidTags = tags.reduce(function (set, tag) {
|
|
9
|
+
return set.add(tag.toLowerCase());
|
|
10
|
+
}, new Set());
|
|
11
|
+
}
|
|
12
|
+
else {
|
|
13
|
+
this.voidTags = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'].reduce(function (set, tag) {
|
|
14
|
+
return set.add(tag);
|
|
15
|
+
}, new Set());
|
|
16
|
+
}
|
|
17
|
+
}
|
|
18
|
+
VoidTag.prototype.formatNode = function (tag, attrs, innerHTML) {
|
|
19
|
+
var addClosingSlash = this.addClosingSlash;
|
|
20
|
+
var closingSpace = (addClosingSlash && attrs && !attrs.endsWith(' ')) ? ' ' : '';
|
|
21
|
+
var closingSlash = addClosingSlash ? "".concat(closingSpace, "/") : '';
|
|
22
|
+
return this.isVoidElement(tag.toLowerCase()) ? "<".concat(tag).concat(attrs).concat(closingSlash, ">") : "<".concat(tag).concat(attrs, ">").concat(innerHTML, "</").concat(tag, ">");
|
|
23
|
+
};
|
|
24
|
+
VoidTag.prototype.isVoidElement = function (tag) {
|
|
25
|
+
return this.voidTags.has(tag);
|
|
26
|
+
};
|
|
27
|
+
return VoidTag;
|
|
28
|
+
}());
|
|
29
|
+
exports.default = VoidTag;
|
package/package.json
CHANGED