node-html-parser 5.3.1 → 5.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -19,18 +19,20 @@ npm install --save node-html-parser
19
19
 
20
20
  ## Performance
21
21
 
22
+ -- 2022-08-10
23
+
22
24
  ```shell
23
- cheerio :12.0726 ms/file ± 7.31605
24
- parse5 :8.18615 ms/file ± 6.15337
25
- node-html-parser (last release):2.16533 ms/file ± 1.56924
26
- htmlparser :17.0658 ms/file ± 120.901
27
- htmlparser2 :2.62695 ms/file ± 4.17579
28
- node-html-parser:2.14907 ms/file ± 1.66632
29
- html-parser :24.6505 ms/file ± 18.9996
30
- htmljs-parser :5.81797 ms/file ± 6.55537
31
- html-dom-parser :2.52265 ms/file ± 3.54858
32
- html5parser :2.01144 ms/file ± 2.53570
33
- high5 :3.91342 ms/file ± 2.65563
25
+ html-parser :24.1595 ms/file ± 18.7667
26
+ htmljs-parser :4.72064 ms/file ± 5.67689
27
+ html-dom-parser :2.18055 ms/file ± 2.96136
28
+ html5parser :1.69639 ms/file ± 2.17111
29
+ cheerio :12.2122 ms/file ± 8.10916
30
+ parse5 :6.50626 ms/file ± 4.02352
31
+ htmlparser2 :2.38179 ms/file ± 3.42389
32
+ htmlparser :17.4820 ms/file ± 128.041
33
+ high5 :3.95188 ms/file ± 2.52313
34
+ node-html-parser:2.04288 ms/file ± 1.25203
35
+ node-html-parser (last release):2.00527 ms/file ± 1.21317
34
36
  ```
35
37
 
36
38
  Tested with [htmlparser-benchmark](https://github.com/AndreasMadsen/htmlparser-benchmark).
@@ -82,6 +84,10 @@ Parse the data provided, and return the root of the generated DOM.
82
84
  {
83
85
  lowerCaseTagName: false, // convert tag name to lower case (hurts performance heavily)
84
86
  comment: false, // retrieve comments (hurts performance slightly)
87
+ voidTag:{
88
+ tags: ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'], // optional and case insensitive, default value is ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr']
89
+ closingSlash: true // optional, default false. When true, void tags are serialised with a closing slash, e.g. <br/>
90
+ },
85
91
  blockTextElements: {
86
92
  script: true, // keep text content when parsing
87
93
  noscript: true, // keep text content when parsing
package/dist/main.js CHANGED
@@ -73,6 +73,20 @@ define("nodes/node", ["require", "exports", "he"], function (require, exports, h
73
73
  value: range !== null && range !== void 0 ? range : [-1, -1]
74
74
  });
75
75
  }
76
+ /**
77
+ * Remove current node
78
+ */
79
+ Node.prototype.remove = function () {
80
+ var _this = this;
81
+ if (this.parentNode) {
82
+ var children = this.parentNode.childNodes;
83
+ this.parentNode.childNodes = children.filter(function (child) {
84
+ return _this !== child;
85
+ });
86
+ this.parentNode = null;
87
+ }
88
+ return this;
89
+ };
76
90
  Object.defineProperty(Node.prototype, "innerText", {
77
91
  get: function () {
78
92
  return this.rawText;
@@ -199,6 +213,37 @@ define("matcher", ["require", "exports", "nodes/type"], function (require, expor
199
213
  findAll: findAll
200
214
  };
201
215
  });
216
+ define("void-tag", ["require", "exports"], function (require, exports) {
217
+ "use strict";
218
+ Object.defineProperty(exports, "__esModule", { value: true });
219
+ var VoidTag = /** @class */ (function () {
220
+ function VoidTag(addClosingSlash, tags) {
221
+ if (addClosingSlash === void 0) { addClosingSlash = false; }
222
+ this.addClosingSlash = addClosingSlash;
223
+ if (Array.isArray(tags)) {
224
+ this.voidTags = tags.reduce(function (set, tag) {
225
+ return set.add(tag.toLowerCase());
226
+ }, new Set());
227
+ }
228
+ else {
229
+ this.voidTags = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'].reduce(function (set, tag) {
230
+ return set.add(tag);
231
+ }, new Set());
232
+ }
233
+ }
234
+ VoidTag.prototype.formatNode = function (tag, attrs, innerHTML) {
235
+ var addClosingSlash = this.addClosingSlash;
236
+ var closingSpace = (addClosingSlash && attrs && !attrs.endsWith(' ')) ? ' ' : '';
237
+ var closingSlash = addClosingSlash ? "".concat(closingSpace, "/") : '';
238
+ return this.isVoidElement(tag.toLowerCase()) ? "<".concat(tag).concat(attrs).concat(closingSlash, ">") : "<".concat(tag).concat(attrs, ">").concat(innerHTML, "</").concat(tag, ">");
239
+ };
240
+ VoidTag.prototype.isVoidElement = function (tag) {
241
+ return this.voidTags.has(tag);
242
+ };
243
+ return VoidTag;
244
+ }());
245
+ exports.default = VoidTag;
246
+ });
202
247
  define("nodes/text", ["require", "exports", "he", "nodes/node", "nodes/type"], function (require, exports, he_2, node_1, type_2) {
203
248
  "use strict";
204
249
  Object.defineProperty(exports, "__esModule", { value: true });
@@ -324,18 +369,18 @@ define("nodes/text", ["require", "exports", "he", "nodes/node", "nodes/type"], f
324
369
  return (hasLeadingSpace ? ' ' : '') + text.slice(startPos, endPos + 1) + (hasTrailingSpace ? ' ' : '');
325
370
  }
326
371
  });
327
- define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher", "nodes/comment", "nodes/node", "nodes/text", "nodes/type"], function (require, exports, css_select_1, he_3, back_1, matcher_1, comment_1, node_2, text_1, type_3) {
372
+ define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher", "void-tag", "nodes/comment", "nodes/node", "nodes/text", "nodes/type"], function (require, exports, css_select_1, he_3, back_1, matcher_1, void_tag_1, comment_1, node_2, text_1, type_3) {
328
373
  "use strict";
329
374
  Object.defineProperty(exports, "__esModule", { value: true });
330
375
  exports.parse = exports.base_parse = void 0;
331
376
  he_3 = __importDefault(he_3);
332
377
  back_1 = __importDefault(back_1);
333
378
  matcher_1 = __importDefault(matcher_1);
379
+ void_tag_1 = __importDefault(void_tag_1);
334
380
  comment_1 = __importDefault(comment_1);
335
381
  node_2 = __importDefault(node_2);
336
382
  text_1 = __importDefault(text_1);
337
383
  type_3 = __importDefault(type_3);
338
- var voidTags = new Set(['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr']);
339
384
  function decode(val) {
340
385
  // clone string
341
386
  return JSON.parse(JSON.stringify(he_3.default.decode(val)));
@@ -442,10 +487,12 @@ define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher
442
487
  *
443
488
  * @memberof HTMLElement
444
489
  */
445
- function HTMLElement(tagName, keyAttrs, rawAttrs, parentNode, range) {
490
+ function HTMLElement(tagName, keyAttrs, rawAttrs, parentNode, range, voidTag) {
446
491
  if (rawAttrs === void 0) { rawAttrs = ''; }
492
+ if (voidTag === void 0) { voidTag = new void_tag_1.default(); }
447
493
  var _this = _super.call(this, parentNode, range) || this;
448
494
  _this.rawAttrs = rawAttrs;
495
+ _this.voidTag = voidTag;
449
496
  /**
450
497
  * Node Type declaration.
451
498
  */
@@ -485,20 +532,6 @@ define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher
485
532
  }
486
533
  return JSON.stringify(attr.replace(/"/g, '&quot;'));
487
534
  };
488
- /**
489
- * Remove current element
490
- */
491
- HTMLElement.prototype.remove = function () {
492
- var _this = this;
493
- if (this.parentNode) {
494
- var children = this.parentNode.childNodes;
495
- this.parentNode.childNodes = children.filter(function (child) {
496
- return _this !== child;
497
- });
498
- this.parentNode = null;
499
- }
500
- return this;
501
- };
502
535
  /**
503
536
  * Remove Child element from childNodes array
504
537
  * @param {HTMLElement} node node to remove
@@ -543,7 +576,7 @@ define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher
543
576
  });
544
577
  Object.defineProperty(HTMLElement.prototype, "isVoidElement", {
545
578
  get: function () {
546
- return voidTags.has(this.localName);
579
+ return this.voidTag.isVoidElement(this.localName);
547
580
  },
548
581
  enumerable: false,
549
582
  configurable: true
@@ -636,7 +669,7 @@ define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher
636
669
  var tag = this.rawTagName;
637
670
  if (tag) {
638
671
  var attrs = this.rawAttrs ? " ".concat(this.rawAttrs) : '';
639
- return this.isVoidElement ? "<".concat(tag).concat(attrs, ">") : "<".concat(tag).concat(attrs, ">").concat(this.innerHTML, "</").concat(tag, ">");
672
+ return this.voidTag.formatNode(tag, attrs, this.innerHTML);
640
673
  }
641
674
  return this.innerHTML;
642
675
  };
@@ -936,7 +969,8 @@ define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher
936
969
  * @return {Node} node appended
937
970
  */
938
971
  HTMLElement.prototype.appendChild = function (node) {
939
- // node.parentNode = this;
972
+ // remove the node from its parent
973
+ node.remove();
940
974
  this.childNodes.push(node);
941
975
  node.parentNode = this;
942
976
  return node;
@@ -1332,7 +1366,9 @@ define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher
1332
1366
  * @return {HTMLElement} root element
1333
1367
  */
1334
1368
  function base_parse(data, options) {
1369
+ var _a, _b;
1335
1370
  if (options === void 0) { options = { lowerCaseTagName: false, comment: false }; }
1371
+ var voidTag = new void_tag_1.default((_a = options === null || options === void 0 ? void 0 : options.voidTag) === null || _a === void 0 ? void 0 : _a.closingSlash, (_b = options === null || options === void 0 ? void 0 : options.voidTag) === null || _b === void 0 ? void 0 : _b.tags);
1336
1372
  var elements = options.blockTextElements || {
1337
1373
  script: true,
1338
1374
  noscript: true,
@@ -1349,7 +1385,7 @@ define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher
1349
1385
  return kBlockTextElements.some(function (it) { return it.test(tag); });
1350
1386
  }
1351
1387
  var createRange = function (startPos, endPos) { return [startPos - frameFlagOffset, endPos - frameFlagOffset]; };
1352
- var root = new HTMLElement(null, {}, '', null, [0, data.length]);
1388
+ var root = new HTMLElement(null, {}, '', null, [0, data.length], voidTag);
1353
1389
  var currentParent = root;
1354
1390
  var stack = [root];
1355
1391
  var lastTextPos = -1;
@@ -1420,7 +1456,7 @@ define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher
1420
1456
  var tagStartPos_1 = tagEndPos_1 - matchLength;
1421
1457
  currentParent = currentParent.appendChild(
1422
1458
  // Initialize range (end position updated later for closed tags)
1423
- new HTMLElement(tagName, attrs, attributes.slice(1), null, createRange(tagStartPos_1, tagEndPos_1)));
1459
+ new HTMLElement(tagName, attrs, attributes.slice(1), null, createRange(tagStartPos_1, tagEndPos_1), voidTag));
1424
1460
  stack.push(currentParent);
1425
1461
  if (is_block_text_element(tagName)) {
1426
1462
  // Find closing tag
@@ -1,3 +1,4 @@
1
+ import VoidTag from '../void-tag';
1
2
  import Node from './node';
2
3
  import NodeType from './type';
3
4
  export interface KeyAttributes {
@@ -37,6 +38,7 @@ declare class DOMTokenList {
37
38
  */
38
39
  export default class HTMLElement extends Node {
39
40
  rawAttrs: string;
41
+ private voidTag;
40
42
  private _attrs;
41
43
  private _rawAttrs;
42
44
  rawTagName: string;
@@ -59,11 +61,7 @@ export default class HTMLElement extends Node {
59
61
  *
60
62
  * @memberof HTMLElement
61
63
  */
62
- constructor(tagName: string, keyAttrs: KeyAttributes, rawAttrs: string, parentNode: HTMLElement | null, range?: [number, number]);
63
- /**
64
- * Remove current element
65
- */
66
- remove(): this;
64
+ constructor(tagName: string, keyAttrs: KeyAttributes, rawAttrs: string, parentNode: HTMLElement | null, range: [number, number], voidTag?: VoidTag);
67
65
  /**
68
66
  * Remove Child element from childNodes array
69
67
  * @param {HTMLElement} node node to remove
@@ -99,7 +97,7 @@ export default class HTMLElement extends Node {
99
97
  toString(): string;
100
98
  get innerHTML(): string;
101
99
  set innerHTML(content: string);
102
- set_content(content: string | Node | Node[], options?: Options): this;
100
+ set_content(content: string | Node | Node[], options?: Partial<Options>): this;
103
101
  replaceWith(...nodes: (string | Node)[]): void;
104
102
  get outerHTML(): string;
105
103
  /**
@@ -209,6 +207,16 @@ export interface Options {
209
207
  blockTextElements: {
210
208
  [tag: string]: boolean;
211
209
  };
210
+ voidTag?: {
211
+ /**
212
+ * optional and case insensitive, default value is ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr']
213
+ */
214
+ tags?: string[];
215
+ /**
216
+ * void tag serialisation, add a final slash <br/>
217
+ */
218
+ closingSlash?: boolean;
219
+ };
212
220
  }
213
221
  /**
214
222
  * Parses HTML and returns a root element
@@ -43,11 +43,11 @@ var css_select_1 = require("css-select");
43
43
  var he_1 = __importDefault(require("he"));
44
44
  var back_1 = __importDefault(require("../back"));
45
45
  var matcher_1 = __importDefault(require("../matcher"));
46
+ var void_tag_1 = __importDefault(require("../void-tag"));
46
47
  var comment_1 = __importDefault(require("./comment"));
47
48
  var node_1 = __importDefault(require("./node"));
48
49
  var text_1 = __importDefault(require("./text"));
49
50
  var type_1 = __importDefault(require("./type"));
50
- var voidTags = new Set(['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr']);
51
51
  function decode(val) {
52
52
  // clone string
53
53
  return JSON.parse(JSON.stringify(he_1.default.decode(val)));
@@ -154,10 +154,12 @@ var HTMLElement = /** @class */ (function (_super) {
154
154
  *
155
155
  * @memberof HTMLElement
156
156
  */
157
- function HTMLElement(tagName, keyAttrs, rawAttrs, parentNode, range) {
157
+ function HTMLElement(tagName, keyAttrs, rawAttrs, parentNode, range, voidTag) {
158
158
  if (rawAttrs === void 0) { rawAttrs = ''; }
159
+ if (voidTag === void 0) { voidTag = new void_tag_1.default(); }
159
160
  var _this = _super.call(this, parentNode, range) || this;
160
161
  _this.rawAttrs = rawAttrs;
162
+ _this.voidTag = voidTag;
161
163
  /**
162
164
  * Node Type declaration.
163
165
  */
@@ -197,20 +199,6 @@ var HTMLElement = /** @class */ (function (_super) {
197
199
  }
198
200
  return JSON.stringify(attr.replace(/"/g, '&quot;'));
199
201
  };
200
- /**
201
- * Remove current element
202
- */
203
- HTMLElement.prototype.remove = function () {
204
- var _this = this;
205
- if (this.parentNode) {
206
- var children = this.parentNode.childNodes;
207
- this.parentNode.childNodes = children.filter(function (child) {
208
- return _this !== child;
209
- });
210
- this.parentNode = null;
211
- }
212
- return this;
213
- };
214
202
  /**
215
203
  * Remove Child element from childNodes array
216
204
  * @param {HTMLElement} node node to remove
@@ -255,7 +243,7 @@ var HTMLElement = /** @class */ (function (_super) {
255
243
  });
256
244
  Object.defineProperty(HTMLElement.prototype, "isVoidElement", {
257
245
  get: function () {
258
- return voidTags.has(this.localName);
246
+ return this.voidTag.isVoidElement(this.localName);
259
247
  },
260
248
  enumerable: false,
261
249
  configurable: true
@@ -348,7 +336,7 @@ var HTMLElement = /** @class */ (function (_super) {
348
336
  var tag = this.rawTagName;
349
337
  if (tag) {
350
338
  var attrs = this.rawAttrs ? " ".concat(this.rawAttrs) : '';
351
- return this.isVoidElement ? "<".concat(tag).concat(attrs, ">") : "<".concat(tag).concat(attrs, ">").concat(this.innerHTML, "</").concat(tag, ">");
339
+ return this.voidTag.formatNode(tag, attrs, this.innerHTML);
352
340
  }
353
341
  return this.innerHTML;
354
342
  };
@@ -648,7 +636,8 @@ var HTMLElement = /** @class */ (function (_super) {
648
636
  * @return {Node} node appended
649
637
  */
650
638
  HTMLElement.prototype.appendChild = function (node) {
651
- // node.parentNode = this;
639
+ // remove the node from its parent
640
+ node.remove();
652
641
  this.childNodes.push(node);
653
642
  node.parentNode = this;
654
643
  return node;
@@ -1044,7 +1033,9 @@ var frameflag = 'documentfragmentcontainer';
1044
1033
  * @return {HTMLElement} root element
1045
1034
  */
1046
1035
  function base_parse(data, options) {
1036
+ var _a, _b;
1047
1037
  if (options === void 0) { options = { lowerCaseTagName: false, comment: false }; }
1038
+ var voidTag = new void_tag_1.default((_a = options === null || options === void 0 ? void 0 : options.voidTag) === null || _a === void 0 ? void 0 : _a.closingSlash, (_b = options === null || options === void 0 ? void 0 : options.voidTag) === null || _b === void 0 ? void 0 : _b.tags);
1048
1039
  var elements = options.blockTextElements || {
1049
1040
  script: true,
1050
1041
  noscript: true,
@@ -1061,7 +1052,7 @@ function base_parse(data, options) {
1061
1052
  return kBlockTextElements.some(function (it) { return it.test(tag); });
1062
1053
  }
1063
1054
  var createRange = function (startPos, endPos) { return [startPos - frameFlagOffset, endPos - frameFlagOffset]; };
1064
- var root = new HTMLElement(null, {}, '', null, [0, data.length]);
1055
+ var root = new HTMLElement(null, {}, '', null, [0, data.length], voidTag);
1065
1056
  var currentParent = root;
1066
1057
  var stack = [root];
1067
1058
  var lastTextPos = -1;
@@ -1132,7 +1123,7 @@ function base_parse(data, options) {
1132
1123
  var tagStartPos_1 = tagEndPos_1 - matchLength;
1133
1124
  currentParent = currentParent.appendChild(
1134
1125
  // Initialize range (end position updated later for closed tags)
1135
- new HTMLElement(tagName, attrs, attributes.slice(1), null, createRange(tagStartPos_1, tagEndPos_1)));
1126
+ new HTMLElement(tagName, attrs, attributes.slice(1), null, createRange(tagStartPos_1, tagEndPos_1), voidTag));
1136
1127
  stack.push(currentParent);
1137
1128
  if (is_block_text_element(tagName)) {
1138
1129
  // Find closing tag
@@ -13,6 +13,10 @@ export default abstract class Node {
13
13
  abstract toString(): string;
14
14
  abstract clone(): Node;
15
15
  constructor(parentNode?: HTMLElement, range?: [number, number]);
16
+ /**
17
+ * Remove current node
18
+ */
19
+ remove(): this;
16
20
  get innerText(): string;
17
21
  get textContent(): string;
18
22
  set textContent(val: string);
@@ -16,6 +16,20 @@ var Node = /** @class */ (function () {
16
16
  value: range !== null && range !== void 0 ? range : [-1, -1]
17
17
  });
18
18
  }
19
+ /**
20
+ * Remove current node
21
+ */
22
+ Node.prototype.remove = function () {
23
+ var _this = this;
24
+ if (this.parentNode) {
25
+ var children = this.parentNode.childNodes;
26
+ this.parentNode.childNodes = children.filter(function (child) {
27
+ return _this !== child;
28
+ });
29
+ this.parentNode = null;
30
+ }
31
+ return this;
32
+ };
19
33
  Object.defineProperty(Node.prototype, "innerText", {
20
34
  get: function () {
21
35
  return this.rawText;
@@ -0,0 +1,7 @@
1
+ export default class VoidTag {
2
+ addClosingSlash: boolean;
3
+ private voidTags;
4
+ constructor(addClosingSlash?: boolean, tags?: string[]);
5
+ formatNode(tag: string, attrs: string, innerHTML: string): string;
6
+ isVoidElement(tag: string): boolean;
7
+ }
@@ -0,0 +1,29 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ var VoidTag = /** @class */ (function () {
4
+ function VoidTag(addClosingSlash, tags) {
5
+ if (addClosingSlash === void 0) { addClosingSlash = false; }
6
+ this.addClosingSlash = addClosingSlash;
7
+ if (Array.isArray(tags)) {
8
+ this.voidTags = tags.reduce(function (set, tag) {
9
+ return set.add(tag.toLowerCase());
10
+ }, new Set());
11
+ }
12
+ else {
13
+ this.voidTags = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'].reduce(function (set, tag) {
14
+ return set.add(tag);
15
+ }, new Set());
16
+ }
17
+ }
18
+ VoidTag.prototype.formatNode = function (tag, attrs, innerHTML) {
19
+ var addClosingSlash = this.addClosingSlash;
20
+ var closingSpace = (addClosingSlash && attrs && !attrs.endsWith(' ')) ? ' ' : '';
21
+ var closingSlash = addClosingSlash ? "".concat(closingSpace, "/") : '';
22
+ return this.isVoidElement(tag.toLowerCase()) ? "<".concat(tag).concat(attrs).concat(closingSlash, ">") : "<".concat(tag).concat(attrs, ">").concat(innerHTML, "</").concat(tag, ">");
23
+ };
24
+ VoidTag.prototype.isVoidElement = function (tag) {
25
+ return this.voidTags.has(tag);
26
+ };
27
+ return VoidTag;
28
+ }());
29
+ exports.default = VoidTag;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "node-html-parser",
3
- "version": "5.3.1",
3
+ "version": "5.4.0",
4
4
  "description": "A very fast HTML parser, generating a simplified DOM, with basic element query support.",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/index.d.ts",
@@ -31,8 +31,8 @@
31
31
  "prepare": "cd test && yarn install"
32
32
  },
33
33
  "keywords": [
34
- "parser",
35
34
  "html",
35
+ "parser",
36
36
  "nodejs",
37
37
  "typescript"
38
38
  ],