node-html-parser 7.0.1 → 7.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +10 -0
- package/README.md +4 -3
- package/dist/main.js +43 -7
- package/dist/nodes/html.d.ts +3 -1
- package/dist/nodes/html.js +39 -7
- package/package.json +3 -2
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,16 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines.
|
|
4
4
|
|
|
5
|
+
### [7.0.2](https://github.com/taoqf/node-fast-html-parser/compare/v7.0.1...v7.0.2) (2026-01-07)
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
### Bug Fixes
|
|
9
|
+
|
|
10
|
+
* [#227](https://github.com/taoqf/node-fast-html-parser/issues/227) ([51528c4](https://github.com/taoqf/node-fast-html-parser/commit/51528c41ef2648d6c4dc1aecd14ee9d2b0083c4f))
|
|
11
|
+
* [#294](https://github.com/taoqf/node-fast-html-parser/issues/294) Closing tag is missing but valid HTML is still not parseable ([950865f](https://github.com/taoqf/node-fast-html-parser/commit/950865fab5f4df7853b36712869b71c90f4d3a1b))
|
|
12
|
+
* add missing dev dependency: yarn ([6d73ea3](https://github.com/taoqf/node-fast-html-parser/commit/6d73ea37c48f4170c35907869ba410c5122a9a1f))
|
|
13
|
+
* test valid.js ([a81fc46](https://github.com/taoqf/node-fast-html-parser/commit/a81fc46fab2507615b0362150d62568a6f52ee4e))
|
|
14
|
+
|
|
5
15
|
### [7.0.1](https://github.com/taoqf/node-fast-html-parser/compare/v7.0.0...v7.0.1) (2024-12-26)
|
|
6
16
|
|
|
7
17
|
|
package/README.md
CHANGED
|
@@ -43,6 +43,7 @@ import { parse } from 'node-html-parser';
|
|
|
43
43
|
|
|
44
44
|
const root = parse('<ul id="list"><li>Hello World</li></ul>');
|
|
45
45
|
|
|
46
|
+
// parse() adds a wrapper node, so the input data's first node is the root's first child node
|
|
46
47
|
console.log(root.firstChild.structure);
|
|
47
48
|
// ul#list
|
|
48
49
|
// li
|
|
@@ -74,7 +75,7 @@ var root = HTMLParser.parse('<ul id="list"><li>Hello World</li></ul>');
|
|
|
74
75
|
|
|
75
76
|
### parse(data[, options])
|
|
76
77
|
|
|
77
|
-
Parse the data provided, and return the root of the generated DOM.
|
|
78
|
+
Parse the data provided, wrap the result in a new node, and return the root of the generated DOM.
|
|
78
79
|
|
|
79
80
|
- **data**, data to parse
|
|
80
81
|
- **options**, parse options
|
|
@@ -349,11 +350,11 @@ Get all child elements, so all child nodes of type HTMLELement.
|
|
|
349
350
|
|
|
350
351
|
### firstChild
|
|
351
352
|
|
|
352
|
-
Get first child node
|
|
353
|
+
Get first child node of the wrapper node added by `parse()`. `undefined` if the node has no children.
|
|
353
354
|
|
|
354
355
|
### lastChild
|
|
355
356
|
|
|
356
|
-
Get last child node
|
|
357
|
+
Get last child node of the wrapper node added by `parse()`. `undefined` if the node has no children.
|
|
357
358
|
|
|
358
359
|
### firstElementChild
|
|
359
360
|
|
package/dist/main.js
CHANGED
|
@@ -503,7 +503,7 @@ define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher
|
|
|
503
503
|
_this.nodeType = type_3.default.ELEMENT_NODE;
|
|
504
504
|
_this.rawTagName = tagName;
|
|
505
505
|
_this.rawAttrs = rawAttrs || '';
|
|
506
|
-
_this.id = keyAttrs.id || '';
|
|
506
|
+
_this._id = keyAttrs.id || '';
|
|
507
507
|
_this.childNodes = [];
|
|
508
508
|
_this._parseOptions = _parseOptions;
|
|
509
509
|
_this.classList = new DOMTokenList(keyAttrs.class ? keyAttrs.class.split(/\s+/) : [], function (classList) { return _this.setAttribute('class', classList.toString()); } // eslint-disable-line @typescript-eslint/no-unsafe-member-access, @typescript-eslint/no-unsafe-call
|
|
@@ -590,6 +590,16 @@ define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher
|
|
|
590
590
|
enumerable: false,
|
|
591
591
|
configurable: true
|
|
592
592
|
});
|
|
593
|
+
Object.defineProperty(HTMLElement.prototype, "id", {
|
|
594
|
+
get: function () {
|
|
595
|
+
return this._id;
|
|
596
|
+
},
|
|
597
|
+
set: function (newid) {
|
|
598
|
+
this.setAttribute('id', newid);
|
|
599
|
+
},
|
|
600
|
+
enumerable: false,
|
|
601
|
+
configurable: true
|
|
602
|
+
});
|
|
593
603
|
Object.defineProperty(HTMLElement.prototype, "rawText", {
|
|
594
604
|
/**
|
|
595
605
|
* Get escpaed (as-it) text value of current node and its children.
|
|
@@ -786,7 +796,7 @@ define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher
|
|
|
786
796
|
res.push(' '.repeat(indention) + str);
|
|
787
797
|
}
|
|
788
798
|
function dfs(node) {
|
|
789
|
-
var idStr = node.id ? "#".concat(node.id) : '';
|
|
799
|
+
var idStr = node._id ? "#".concat(node._id) : '';
|
|
790
800
|
var classStr = node.classList.length ? ".".concat(node.classList.value.join('.')) : ''; // eslint-disable-line @typescript-eslint/no-unsafe-member-access, @typescript-eslint/no-unsafe-member-access, @typescript-eslint/restrict-template-expressions, @typescript-eslint/no-unsafe-call
|
|
791
801
|
write("".concat(node.rawTagName).concat(idStr).concat(classStr));
|
|
792
802
|
indention++;
|
|
@@ -923,7 +933,7 @@ define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher
|
|
|
923
933
|
continue;
|
|
924
934
|
}
|
|
925
935
|
if (child.nodeType === type_3.default.ELEMENT_NODE) {
|
|
926
|
-
if (child.id === id) {
|
|
936
|
+
if (child._id === id) {
|
|
927
937
|
return child;
|
|
928
938
|
}
|
|
929
939
|
// if children are existing push the current status to the stack and keep searching for elements in the level below
|
|
@@ -1073,9 +1083,9 @@ define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher
|
|
|
1073
1083
|
return "".concat(name, "=").concat(val);
|
|
1074
1084
|
})
|
|
1075
1085
|
.join(' ');
|
|
1076
|
-
// Update this.id
|
|
1086
|
+
// Update this._id
|
|
1077
1087
|
if (key === 'id') {
|
|
1078
|
-
this.id = '';
|
|
1088
|
+
this._id = '';
|
|
1079
1089
|
}
|
|
1080
1090
|
return this;
|
|
1081
1091
|
};
|
|
@@ -1121,9 +1131,9 @@ define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher
|
|
|
1121
1131
|
return "".concat(name, "=").concat(val);
|
|
1122
1132
|
})
|
|
1123
1133
|
.join(' ');
|
|
1124
|
-
// Update this.id
|
|
1134
|
+
// Update this._id
|
|
1125
1135
|
if (key === 'id') {
|
|
1126
|
-
this.id = value;
|
|
1136
|
+
this._id = value;
|
|
1127
1137
|
}
|
|
1128
1138
|
return this;
|
|
1129
1139
|
};
|
|
@@ -1150,6 +1160,10 @@ define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher
|
|
|
1150
1160
|
return "".concat(name, "=").concat(_this.quoteAttribute(String(val)));
|
|
1151
1161
|
})
|
|
1152
1162
|
.join(' ');
|
|
1163
|
+
// Update this._id
|
|
1164
|
+
if ('id' in attributes) {
|
|
1165
|
+
this._id = attributes['id'];
|
|
1166
|
+
}
|
|
1153
1167
|
return this;
|
|
1154
1168
|
};
|
|
1155
1169
|
HTMLElement.prototype.insertAdjacentHTML = function (where, html) {
|
|
@@ -1421,6 +1435,9 @@ define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher
|
|
|
1421
1435
|
th: { tr: true, table: true, TR: true, TABLE: true },
|
|
1422
1436
|
TH: { tr: true, table: true, TR: true, TABLE: true },
|
|
1423
1437
|
};
|
|
1438
|
+
var kElementsClosedByClosingExcept = {
|
|
1439
|
+
p: { a: true, audio: true, del: true, ins: true, map: true, noscript: true, video: true },
|
|
1440
|
+
};
|
|
1424
1441
|
var frameflag = 'documentfragmentcontainer';
|
|
1425
1442
|
/**
|
|
1426
1443
|
* Parses HTML and returns a root element
|
|
@@ -1566,6 +1583,25 @@ define("nodes/html", ["require", "exports", "css-select", "he", "back", "matcher
|
|
|
1566
1583
|
continue;
|
|
1567
1584
|
}
|
|
1568
1585
|
}
|
|
1586
|
+
var openTag = currentParent.rawTagName ?
|
|
1587
|
+
currentParent.rawTagName.toLowerCase() :
|
|
1588
|
+
'';
|
|
1589
|
+
if (kElementsClosedByClosingExcept[openTag]) {
|
|
1590
|
+
var closingTag = tagName.toLowerCase();
|
|
1591
|
+
if (stack.length > 1) {
|
|
1592
|
+
var possibleContainer = stack[stack.length - 2];
|
|
1593
|
+
if (possibleContainer &&
|
|
1594
|
+
possibleContainer.rawTagName &&
|
|
1595
|
+
possibleContainer.rawTagName.toLowerCase() === closingTag &&
|
|
1596
|
+
!kElementsClosedByClosingExcept[openTag][closingTag]) {
|
|
1597
|
+
// Update range end for closed tag
|
|
1598
|
+
currentParent.range[1] = createRange(-1, Math.max(lastTextPos, tagEndPos))[1];
|
|
1599
|
+
stack.pop();
|
|
1600
|
+
currentParent = (0, back_1.default)(stack);
|
|
1601
|
+
continue;
|
|
1602
|
+
}
|
|
1603
|
+
}
|
|
1604
|
+
}
|
|
1569
1605
|
// Use aggressive strategy to handle unmatching markups.
|
|
1570
1606
|
break;
|
|
1571
1607
|
}
|
package/dist/nodes/html.d.ts
CHANGED
|
@@ -43,8 +43,8 @@ export default class HTMLElement extends Node {
|
|
|
43
43
|
private _attrs;
|
|
44
44
|
private _rawAttrs;
|
|
45
45
|
private _parseOptions;
|
|
46
|
+
private _id;
|
|
46
47
|
rawTagName: string;
|
|
47
|
-
id: string;
|
|
48
48
|
classList: DOMTokenList;
|
|
49
49
|
/**
|
|
50
50
|
* Node Type declaration.
|
|
@@ -79,6 +79,8 @@ export default class HTMLElement extends Node {
|
|
|
79
79
|
set tagName(newname: string);
|
|
80
80
|
get localName(): string;
|
|
81
81
|
get isVoidElement(): boolean;
|
|
82
|
+
get id(): string;
|
|
83
|
+
set id(newid: string);
|
|
82
84
|
/**
|
|
83
85
|
* Get escpaed (as-it) text value of current node and its children.
|
|
84
86
|
* @return {string} text content
|
package/dist/nodes/html.js
CHANGED
|
@@ -126,7 +126,7 @@ class HTMLElement extends node_1.default {
|
|
|
126
126
|
this.nodeType = type_1.default.ELEMENT_NODE;
|
|
127
127
|
this.rawTagName = tagName;
|
|
128
128
|
this.rawAttrs = rawAttrs || '';
|
|
129
|
-
this.id = keyAttrs.id || '';
|
|
129
|
+
this._id = keyAttrs.id || '';
|
|
130
130
|
this.childNodes = [];
|
|
131
131
|
this._parseOptions = _parseOptions;
|
|
132
132
|
this.classList = new DOMTokenList(keyAttrs.class ? keyAttrs.class.split(/\s+/) : [], (classList) => this.setAttribute('class', classList.toString()) // eslint-disable-line @typescript-eslint/no-unsafe-member-access, @typescript-eslint/no-unsafe-call
|
|
@@ -185,6 +185,12 @@ class HTMLElement extends node_1.default {
|
|
|
185
185
|
get isVoidElement() {
|
|
186
186
|
return this.voidTag.isVoidElement(this.localName);
|
|
187
187
|
}
|
|
188
|
+
get id() {
|
|
189
|
+
return this._id;
|
|
190
|
+
}
|
|
191
|
+
set id(newid) {
|
|
192
|
+
this.setAttribute('id', newid);
|
|
193
|
+
}
|
|
188
194
|
/**
|
|
189
195
|
* Get escpaed (as-it) text value of current node and its children.
|
|
190
196
|
* @return {string} text content
|
|
@@ -350,7 +356,7 @@ class HTMLElement extends node_1.default {
|
|
|
350
356
|
res.push(' '.repeat(indention) + str);
|
|
351
357
|
}
|
|
352
358
|
function dfs(node) {
|
|
353
|
-
const idStr = node.id ? `#${node.id}` : '';
|
|
359
|
+
const idStr = node._id ? `#${node._id}` : '';
|
|
354
360
|
const classStr = node.classList.length ? `.${node.classList.value.join('.')}` : ''; // eslint-disable-line @typescript-eslint/no-unsafe-member-access, @typescript-eslint/no-unsafe-member-access, @typescript-eslint/restrict-template-expressions, @typescript-eslint/no-unsafe-call
|
|
355
361
|
write(`${node.rawTagName}${idStr}${classStr}`);
|
|
356
362
|
indention++;
|
|
@@ -483,7 +489,7 @@ class HTMLElement extends node_1.default {
|
|
|
483
489
|
continue;
|
|
484
490
|
}
|
|
485
491
|
if (child.nodeType === type_1.default.ELEMENT_NODE) {
|
|
486
|
-
if (child.id === id) {
|
|
492
|
+
if (child._id === id) {
|
|
487
493
|
return child;
|
|
488
494
|
}
|
|
489
495
|
// if children are existing push the current status to the stack and keep searching for elements in the level below
|
|
@@ -623,9 +629,9 @@ class HTMLElement extends node_1.default {
|
|
|
623
629
|
return `${name}=${val}`;
|
|
624
630
|
})
|
|
625
631
|
.join(' ');
|
|
626
|
-
// Update this.id
|
|
632
|
+
// Update this._id
|
|
627
633
|
if (key === 'id') {
|
|
628
|
-
this.id = '';
|
|
634
|
+
this._id = '';
|
|
629
635
|
}
|
|
630
636
|
return this;
|
|
631
637
|
}
|
|
@@ -670,9 +676,9 @@ class HTMLElement extends node_1.default {
|
|
|
670
676
|
return `${name}=${val}`;
|
|
671
677
|
})
|
|
672
678
|
.join(' ');
|
|
673
|
-
// Update this.id
|
|
679
|
+
// Update this._id
|
|
674
680
|
if (key === 'id') {
|
|
675
|
-
this.id = value;
|
|
681
|
+
this._id = value;
|
|
676
682
|
}
|
|
677
683
|
return this;
|
|
678
684
|
}
|
|
@@ -698,6 +704,10 @@ class HTMLElement extends node_1.default {
|
|
|
698
704
|
return `${name}=${this.quoteAttribute(String(val))}`;
|
|
699
705
|
})
|
|
700
706
|
.join(' ');
|
|
707
|
+
// Update this._id
|
|
708
|
+
if ('id' in attributes) {
|
|
709
|
+
this._id = attributes['id'];
|
|
710
|
+
}
|
|
701
711
|
return this;
|
|
702
712
|
}
|
|
703
713
|
insertAdjacentHTML(where, html) {
|
|
@@ -905,6 +915,9 @@ const kElementsClosedByClosing = {
|
|
|
905
915
|
th: { tr: true, table: true, TR: true, TABLE: true },
|
|
906
916
|
TH: { tr: true, table: true, TR: true, TABLE: true },
|
|
907
917
|
};
|
|
918
|
+
const kElementsClosedByClosingExcept = {
|
|
919
|
+
p: { a: true, audio: true, del: true, ins: true, map: true, noscript: true, video: true },
|
|
920
|
+
};
|
|
908
921
|
const frameflag = 'documentfragmentcontainer';
|
|
909
922
|
/**
|
|
910
923
|
* Parses HTML and returns a root element
|
|
@@ -1049,6 +1062,25 @@ function base_parse(data, options = {}) {
|
|
|
1049
1062
|
continue;
|
|
1050
1063
|
}
|
|
1051
1064
|
}
|
|
1065
|
+
const openTag = currentParent.rawTagName ?
|
|
1066
|
+
currentParent.rawTagName.toLowerCase() :
|
|
1067
|
+
'';
|
|
1068
|
+
if (kElementsClosedByClosingExcept[openTag]) {
|
|
1069
|
+
const closingTag = tagName.toLowerCase();
|
|
1070
|
+
if (stack.length > 1) {
|
|
1071
|
+
const possibleContainer = stack[stack.length - 2];
|
|
1072
|
+
if (possibleContainer &&
|
|
1073
|
+
possibleContainer.rawTagName &&
|
|
1074
|
+
possibleContainer.rawTagName.toLowerCase() === closingTag &&
|
|
1075
|
+
!kElementsClosedByClosingExcept[openTag][closingTag]) {
|
|
1076
|
+
// Update range end for closed tag
|
|
1077
|
+
currentParent.range[1] = createRange(-1, Math.max(lastTextPos, tagEndPos))[1];
|
|
1078
|
+
stack.pop();
|
|
1079
|
+
currentParent = (0, back_1.default)(stack);
|
|
1080
|
+
continue;
|
|
1081
|
+
}
|
|
1082
|
+
}
|
|
1083
|
+
}
|
|
1052
1084
|
// Use aggressive strategy to handle unmatching markups.
|
|
1053
1085
|
break;
|
|
1054
1086
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "node-html-parser",
|
|
3
|
-
"version": "7.0.1",
|
|
3
|
+
"version": "7.0.2",
|
|
4
4
|
"description": "A very fast HTML parser, generating a simplified DOM, with basic element query support.",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"types": "dist/index.d.ts",
|
|
@@ -87,7 +87,8 @@
|
|
|
87
87
|
"standard-version": "^9.5.0",
|
|
88
88
|
"travis-cov": "latest",
|
|
89
89
|
"ts-node": "^10.9.1",
|
|
90
|
-
"typescript": "latest"
|
|
90
|
+
"typescript": "latest",
|
|
91
|
+
"yarn": "^1.22.22"
|
|
91
92
|
},
|
|
92
93
|
"config": {
|
|
93
94
|
"blanket": {
|